feat: RAG service (#1220)

This commit is contained in:
yetone
2025-02-23 01:37:26 +08:00
committed by GitHub
parent 437d36920d
commit fd84c91cdb
32 changed files with 2339 additions and 15 deletions

33
py/rag-service/Dockerfile Normal file
View File

@@ -0,0 +1,33 @@
# RAG service image: Debian slim + uv-managed Python 3.11 running FastAPI.
FROM debian:bookworm-slim

WORKDIR /app

# Build tools for native wheels (e.g. chroma-hnswlib) and curl to fetch uv;
# clean the apt lists in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

# The uv installer drops its binaries into /root/.local/bin.
ENV PATH="/root/.local/bin:$PATH"

RUN uv python install 3.11

# NOTE(review): uv installs interpreters under ~/.local/share/uv/python,
# not /root/.uv — this PATH entry is likely a no-op; confirm and drop if so.
ENV PATH="/root/.uv/python/3.11/bin:$PATH"

# Install dependencies before copying the source tree so code-only changes
# do not invalidate the (expensive) dependency layer.
COPY requirements.txt .
RUN uv venv --python 3.11
RUN uv pip install -r requirements.txt

ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=8000

EXPOSE ${PORT}

COPY . .

# `uv run` executes inside the project venv created above.
CMD ["uv", "run", "fastapi", "run", "src/main.py", "--workers", "3"]

View File

@@ -0,0 +1,162 @@
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.8.0
asgiref==3.8.1
asttokens==3.0.0
attrs==25.1.0
backoff==2.2.1
bcrypt==4.2.1
beautifulsoup4==4.13.3
build==1.2.2.post1
cachetools==5.5.1
certifi==2024.12.14
charset-normalizer==3.4.1
chroma-hnswlib==0.7.6
chromadb==0.6.3
click==8.1.8
coloredlogs==15.0.1
dataclasses-json==0.6.7
decorator==5.1.1
Deprecated==1.2.18
dirtyjson==1.0.8
distro==1.9.0
dnspython==2.7.0
durationpy==0.9
email_validator==2.2.0
executing==2.2.0
fastapi==0.115.8
fastapi-cli==0.0.7
filelock==3.17.0
filetype==1.2.0
flatbuffers==25.1.24
frozenlist==1.5.0
fsspec==2025.2.0
google-auth==2.38.0
googleapis-common-protos==1.66.0
greenlet==3.1.1
grpcio==1.70.0
h11==0.14.0
httpcore==1.0.7
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.28.1
humanfriendly==10.0
idna==3.10
importlib_metadata==8.5.0
importlib_resources==6.5.2
ipython==8.32.0
jedi==0.19.2
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
kubernetes==32.0.0
llama-cloud==0.1.11
llama-cloud-services==0.6.0
llama-index==0.12.16
llama-index-agent-openai==0.4.3
llama-index-cli==0.4.0
llama-index-core==0.12.16.post1
llama-index-embeddings-openai==0.3.1
llama-index-indices-managed-llama-cloud==0.6.4
llama-index-llms-openai==0.3.18
llama-index-multi-modal-llms-openai==0.4.3
llama-index-program-openai==0.3.1
llama-index-question-gen-openai==0.3.0
llama-index-readers-file==0.4.4
llama-index-readers-llama-parse==0.4.0
llama-index-vector-stores-chroma==0.4.1
llama-parse==0.6.0
markdown-it-py==3.0.0
markdownify==0.14.1
MarkupSafe==3.0.2
marshmallow==3.26.1
matplotlib-inline==0.1.7
mdurl==0.1.2
mmh3==5.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numpy==2.2.2
oauthlib==3.2.2
onnxruntime==1.20.1
openai==1.61.1
opentelemetry-api==1.30.0
opentelemetry-exporter-otlp-proto-common==1.30.0
opentelemetry-exporter-otlp-proto-grpc==1.30.0
opentelemetry-instrumentation==0.51b0
opentelemetry-instrumentation-asgi==0.51b0
opentelemetry-instrumentation-fastapi==0.51b0
opentelemetry-proto==1.30.0
opentelemetry-sdk==1.30.0
opentelemetry-semantic-conventions==0.51b0
opentelemetry-util-http==0.51b0
orjson==3.10.15
overrides==7.7.0
packaging==24.2
pandas==2.2.3
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.1.0
posthog==3.11.0
prompt_toolkit==3.0.50
propcache==0.2.1
protobuf==5.29.3
ptyprocess==0.7.0
pure_eval==0.2.3
pyasn1==0.6.1
pyasn1_modules==0.4.1
pydantic==2.10.6
pydantic_core==2.27.2
Pygments==2.19.1
pypdf==5.2.0
PyPika==0.48.9
pyproject_hooks==1.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.20
pytz==2025.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.9.4
rich-toolkit==0.13.2
rsa==4.9
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.38
stack-data==0.6.3
starlette==0.45.3
striprtf==0.0.26
sympy==1.13.3
tenacity==9.0.0
tiktoken==0.8.0
tokenizers==0.21.0
tqdm==4.67.1
traitlets==5.14.3
tree-sitter==0.21.3
tree-sitter-languages==1.10.2
typer==0.15.1
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
uvicorn==0.34.0
uvloop==0.21.0
watchdog==6.0.0
watchfiles==1.0.4
wcwidth==0.2.13
websocket-client==1.8.0
websockets==14.2
wrapt==1.17.2
yarl==1.18.3
zipp==3.21.0

View File

View File

@@ -0,0 +1,14 @@
import os
from pathlib import Path

# All service state (vector store, logs, sqlite db) lives under a single
# base directory, overridable via the DATA_DIR environment variable.
BASE_DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
CHROMA_PERSIST_DIR = BASE_DATA_DIR / "chroma_db"
LOG_DIR = BASE_DATA_DIR / "logs"
DB_FILE = BASE_DATA_DIR / "sqlite" / "indexing_history.db"

# Ensure every required directory exists up front
# (DB_FILE.parent is the sqlite directory).
for _required_dir in (BASE_DATA_DIR, LOG_DIR, DB_FILE.parent, CHROMA_PERSIST_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,60 @@
import sqlite3
from collections.abc import Generator
from contextlib import contextmanager
from libs.configs import DB_FILE
# SQLite table schemas.
# - indexing_history: one row per document indexing attempt; queries take the
#   latest row per document/URI.
# - resources: registry of sources to index, with lifecycle/indexing state.
CREATE_TABLES_SQL = """
CREATE TABLE IF NOT EXISTS indexing_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    uri TEXT NOT NULL,
    content_hash TEXT NOT NULL,
    status TEXT NOT NULL,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
    error_message TEXT,
    document_id TEXT,
    metadata TEXT
);

CREATE INDEX IF NOT EXISTS idx_uri ON indexing_history(uri);
CREATE INDEX IF NOT EXISTS idx_document_id ON indexing_history(document_id);
CREATE INDEX IF NOT EXISTS idx_content_hash ON indexing_history(content_hash);

CREATE TABLE IF NOT EXISTS resources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    uri TEXT NOT NULL UNIQUE,
    type TEXT NOT NULL, -- 'local' or 'remote' (matches models.resource.Resource.type)
    status TEXT NOT NULL DEFAULT 'active', -- 'active' or 'inactive'
    indexing_status TEXT NOT NULL DEFAULT 'pending', -- 'pending', 'indexing', 'indexed', 'failed'
    indexing_status_message TEXT,
    indexing_started_at DATETIME,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    last_indexed_at DATETIME,
    last_error TEXT
);

CREATE INDEX IF NOT EXISTS idx_resources_name ON resources(name);
CREATE INDEX IF NOT EXISTS idx_resources_uri ON resources(uri);
CREATE INDEX IF NOT EXISTS idx_resources_status ON resources(status);
CREATE INDEX IF NOT EXISTS idx_status ON indexing_history(status);
"""
@contextmanager
def get_db_connection() -> Generator[sqlite3.Connection, None, None]:
    """Yield a SQLite connection to DB_FILE, guaranteed to close on exit.

    Rows are returned as :class:`sqlite3.Row` so callers can access
    columns by name (``row["col"]``) or convert them with ``dict(row)``.
    """
    connection = sqlite3.connect(DB_FILE)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        connection.close()
def init_db() -> None:
    """Create all tables and indexes if they do not already exist."""
    with get_db_connection() as connection:
        # executescript runs the whole multi-statement schema in one call.
        connection.executescript(CREATE_TABLES_SQL)
        connection.commit()

View File

@@ -0,0 +1,16 @@
import logging
from datetime import datetime

from libs.configs import LOG_DIR

# One log file per local calendar day, alongside console output.
_log_file = LOG_DIR / f"rag_service_{datetime.now().astimezone().strftime('%Y%m%d')}.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(_log_file),
        logging.StreamHandler(),
    ],
)

# Module-level logger shared by the service.
logger = logging.getLogger(__name__)

View File

@@ -0,0 +1,66 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from llama_index.core.schema import BaseNode
# Matches compound doc ids of the form "<uri>__part_<n>" produced when a
# source document is split into parts; group "uri" recovers the source URI.
PATTERN_URI_PART = re.compile(r"(?P<uri>.+)__part_\d+")
# Metadata key under which a node stores its source URI.
METADATA_KEY_URI = "uri"
def uri_to_path(uri: str) -> Path:
    """Convert a file:// URI to a filesystem path.

    Only a *leading* "file://" scheme is stripped (the original
    ``str.replace`` removed the substring anywhere in the URI, corrupting
    paths that legitimately contain "file://"). No percent-decoding is
    performed; the remainder is treated as a plain path.
    """
    return Path(uri.removeprefix("file://"))
def path_to_uri(file_path: Path) -> str:
    """Convert a filesystem path to a file:// URI.

    Directories are given a trailing slash so they can be distinguished
    from file URIs.
    """
    result = file_path.as_uri()
    return result + "/" if file_path.is_dir() else result
def is_local_uri(uri: str) -> bool:
    """Return True when *uri* points at the local filesystem (file:// scheme)."""
    local_scheme = "file://"
    return uri.startswith(local_scheme)
def is_remote_uri(uri: str) -> bool:
    """Return True when *uri* is fetched over HTTP or HTTPS."""
    remote_schemes = ("https://", "http://")
    return uri.startswith(remote_schemes)
def is_path_node(node: BaseNode) -> bool:
    """Return True when *node* originates from a local file."""
    uri = get_node_uri(node)
    # A node without a resolvable URI cannot be a file node.
    return bool(uri) and is_local_uri(uri)
def get_node_uri(node: BaseNode) -> str | None:
    """Resolve the source URI of *node*.

    The explicit metadata entry wins; otherwise the URI is derived from the
    node's ``doc_id`` (stripping any "__part_N" suffix). Bare absolute
    paths are normalised to file:// URIs. Returns ``None`` when no URI can
    be determined.
    """
    uri = node.metadata.get(METADATA_KEY_URI)
    if not uri:
        doc_id = getattr(node, "doc_id", None)
        if doc_id:
            part_match = PATTERN_URI_PART.match(doc_id)
            uri = part_match.group("uri") if part_match else doc_id
    if not uri:
        return None
    # Absolute paths that leaked in without a scheme become file:// URIs.
    return f"file://{uri}" if uri.startswith("/") else uri
def inject_uri_to_node(node: BaseNode) -> None:
    """Store the node's resolved URI in its metadata (no-op when already set)."""
    if METADATA_KEY_URI in node.metadata:
        return
    resolved = get_node_uri(node)
    if resolved:
        node.metadata[METADATA_KEY_URI] = resolved

1114
py/rag-service/src/main.py Normal file

File diff suppressed because it is too large Load Diff

View File

View File

@@ -0,0 +1,19 @@
"""Indexing History Model."""
from datetime import datetime
from typing import Any
from pydantic import BaseModel, Field
class IndexingHistory(BaseModel):
    """Model for indexing history record.

    Mirrors one row of the ``indexing_history`` SQLite table.
    """

    # None until the row is persisted (SQLite assigns the id).
    id: int | None = Field(None, description="Record ID")
    uri: str = Field(..., description="URI of the indexed file")
    content_hash: str = Field(..., description="MD5 hash of the file content")
    status: str = Field(..., description="Indexing status (indexing/completed/failed)")
    # NOTE(review): naive local time; the DB default is CURRENT_TIMESTAMP (UTC)
    # — confirm the two are never compared directly.
    timestamp: datetime = Field(default_factory=datetime.now, description="Record timestamp")
    error_message: str | None = Field(None, description="Error message if failed")
    document_id: str | None = Field(None, description="Document ID in the index")
    # Stored as a JSON string in SQLite; parsed back to a dict on read.
    metadata: dict[str, Any] | None = Field(None, description="Additional metadata")

View File

@@ -0,0 +1,25 @@
"""Resource Model."""
from datetime import datetime
from typing import Literal
from pydantic import BaseModel, Field
class Resource(BaseModel):
    """Model for resource record.

    Mirrors one row of the ``resources`` SQLite table.
    """

    # None until the row is persisted (SQLite assigns the id).
    id: int | None = Field(None, description="Resource ID")
    name: str = Field(..., description="Name of the resource")
    uri: str = Field(..., description="URI of the resource")
    # NOTE(review): runtime values are "local"/"remote", but this description
    # and the DB schema comment say 'path'/'https' — align the wording.
    type: Literal["local", "remote"] = Field(..., description="Type of resource (path/https)")
    status: str = Field("active", description="Status of resource (active/inactive)")
    indexing_status: Literal["pending", "indexing", "indexed", "failed"] = Field(
        "pending",
        description="Indexing status (pending/indexing/indexed/failed)",
    )
    indexing_status_message: str | None = Field(None, description="Indexing status message")
    created_at: datetime = Field(default_factory=datetime.now, description="Creation timestamp")
    indexing_started_at: datetime | None = Field(None, description="Indexing start timestamp")
    last_indexed_at: datetime | None = Field(None, description="Last indexing timestamp")
    last_error: str | None = Field(None, description="Last error message if any")

View File

View File

@@ -0,0 +1,174 @@
import json
import os
from datetime import datetime
from typing import Any
from libs.db import get_db_connection
from libs.logger import logger
from libs.utils import get_node_uri
from llama_index.core.schema import Document
from models.indexing_history import IndexingHistory
class IndexingHistoryService:
    """CRUD helpers for the ``indexing_history`` SQLite table.

    Each indexed document gets one row keyed by ``document_id``; read
    queries return only the latest row per document or URI.
    """

    def delete_indexing_status(self, uri: str) -> None:
        """Delete all indexing records for a specific file URI."""
        with get_db_connection() as conn:
            conn.execute(
                """
                DELETE FROM indexing_history
                WHERE uri = ?
                """,
                (uri,),
            )
            conn.commit()

    def delete_indexing_status_by_document_id(self, document_id: str) -> None:
        """Delete all indexing records for a specific document."""
        with get_db_connection() as conn:
            conn.execute(
                """
                DELETE FROM indexing_history
                WHERE document_id = ?
                """,
                (document_id,),
            )
            conn.commit()

    def update_indexing_status(
        self,
        doc: Document,
        status: str,
        error_message: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Insert or update the indexing status row for *doc*.

        Args:
            doc: The document whose status is recorded (keyed by doc_id).
            status: New status value (e.g. "indexing"/"completed"/"failed").
            error_message: Optional failure detail.
            metadata: Optional extra data, JSON-serialized into the row.
        """
        content_hash = doc.hash
        # Get URI from metadata if available; without one the row would be
        # unqueryable, so skip (best-effort, matching callers' expectations).
        uri = get_node_uri(doc)
        if not uri:
            logger.warning("URI not found for document: %s", doc.doc_id)
            return
        record = IndexingHistory(
            id=None,
            uri=uri,
            content_hash=content_hash,
            status=status,
            error_message=error_message,
            document_id=doc.doc_id,
            metadata=metadata,
        )
        with get_db_connection() as conn:
            # Check if a record already exists for this document.
            existing = conn.execute(
                "SELECT id FROM indexing_history WHERE document_id = ?",
                (doc.doc_id,),
            ).fetchone()
            if existing:
                # Update by document_id — the same key used for the existence
                # check above. (Previously this matched WHERE uri = ?, which
                # clobbered sibling "__part_N" documents sharing the URI.)
                # NOTE(review): timestamp is not refreshed on update; rows
                # keep their insert time. Confirm that is intended, since
                # reads order by timestamp DESC.
                conn.execute(
                    """
                    UPDATE indexing_history
                    SET uri = ?, content_hash = ?, status = ?, error_message = ?, metadata = ?
                    WHERE document_id = ?
                    """,
                    (
                        record.uri,
                        record.content_hash,
                        record.status,
                        record.error_message,
                        json.dumps(record.metadata) if record.metadata else None,
                        record.document_id,
                    ),
                )
            else:
                # Insert a new record; timestamp defaults to CURRENT_TIMESTAMP.
                conn.execute(
                    """
                    INSERT INTO indexing_history
                    (uri, content_hash, status, error_message, document_id, metadata)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        record.uri,
                        record.content_hash,
                        record.status,
                        record.error_message,
                        record.document_id,
                        json.dumps(record.metadata) if record.metadata else None,
                    ),
                )
            conn.commit()

    def get_indexing_status(self, doc: Document | None = None, base_uri: str | None = None) -> list[IndexingHistory]:
        """Return the latest indexing status rows.

        Exactly one of three scopes applies:
        - ``doc`` given: the latest row for that document's (uri, hash).
        - ``base_uri`` given: latest row per document under that URI prefix.
        - neither: latest row per URI across the whole table.
        """
        with get_db_connection() as conn:
            if doc:
                uri = get_node_uri(doc)
                if not uri:
                    logger.warning("URI not found for document: %s", doc.doc_id)
                    return []
                content_hash = doc.hash
                # For a specific file, get its latest status.
                query = """
                    SELECT *
                    FROM indexing_history
                    WHERE uri = ? and content_hash = ?
                    ORDER BY timestamp DESC LIMIT 1
                """
                params = (uri, content_hash)
            elif base_uri:
                # For files under a directory prefix, get each document's
                # latest status (ROW_NUMBER picks the newest per document_id).
                query = """
                    WITH RankedHistory AS (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY timestamp DESC) as rn
                        FROM indexing_history
                        WHERE uri LIKE ? || '%'
                    )
                    SELECT id, uri, content_hash, status, timestamp, error_message, document_id, metadata
                    FROM RankedHistory
                    WHERE rn = 1
                    ORDER BY timestamp DESC
                """
                # Ensure the prefix ends with a separator so "foo" does not
                # also match "foobar".
                params = (base_uri,) if base_uri.endswith(os.path.sep) else (base_uri + os.path.sep,)
            else:
                # For all files, get the latest status per URI.
                query = """
                    WITH RankedHistory AS (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY uri ORDER BY timestamp DESC) as rn
                        FROM indexing_history
                    )
                    SELECT id, uri, content_hash, status, timestamp, error_message, document_id, metadata
                    FROM RankedHistory
                    WHERE rn = 1
                    ORDER BY timestamp DESC
                """
                params = ()
            rows = conn.execute(query, params).fetchall()
            result = []
            for row in rows:
                row_dict = dict(row)
                # Metadata is stored as JSON text; tolerate corrupt payloads.
                if row_dict.get("metadata"):
                    try:
                        row_dict["metadata"] = json.loads(row_dict["metadata"])
                    except json.JSONDecodeError:
                        row_dict["metadata"] = None
                # SQLite returns timestamps as ISO strings; coerce to datetime.
                if isinstance(row_dict.get("timestamp"), str):
                    row_dict["timestamp"] = datetime.fromisoformat(
                        row_dict["timestamp"].replace("Z", "+00:00"),
                    )
                result.append(IndexingHistory(**row_dict))
            return result


# Module-level singleton used by the rest of the service.
indexing_history_service = IndexingHistoryService()

View File

@@ -0,0 +1,104 @@
"""Resource Service."""
from libs.db import get_db_connection
from models.resource import Resource
class ResourceService:
    """CRUD helpers for the ``resources`` SQLite table."""

    def add_resource_to_db(self, resource: Resource) -> None:
        """Insert *resource* as a new row (name and uri must be unique)."""
        values = (
            resource.name,
            resource.uri,
            resource.type,
            resource.status,
            resource.indexing_status,
            resource.created_at,
        )
        with get_db_connection() as db:
            db.execute(
                """
                INSERT INTO resources (name, uri, type, status, indexing_status, created_at)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                values,
            )
            db.commit()

    def update_resource_indexing_status(self, uri: str, indexing_status: str, indexing_status_message: str) -> None:
        """Update a resource's indexing status and message by URI.

        Entering the "indexing" state stamps indexing_started_at; every
        other transition stamps last_indexed_at instead.
        """
        if indexing_status == "indexing":
            statement = """
                UPDATE resources
                SET indexing_status = ?, indexing_status_message = ?, indexing_started_at = CURRENT_TIMESTAMP
                WHERE uri = ?
                """
        else:
            statement = """
                UPDATE resources
                SET indexing_status = ?, indexing_status_message = ?, last_indexed_at = CURRENT_TIMESTAMP
                WHERE uri = ?
                """
        with get_db_connection() as db:
            db.execute(statement, (indexing_status, indexing_status_message, uri))
            db.commit()

    def update_resource_status(self, uri: str, status: str, error: str | None = None) -> None:
        """Update a resource's active/inactive status (and last error) by URI.

        Moving to "active" also stamps last_indexed_at.
        """
        if status == "active":
            statement = """
                UPDATE resources
                SET status = ?, last_indexed_at = CURRENT_TIMESTAMP, last_error = ?
                WHERE uri = ?
                """
        else:
            statement = """
                UPDATE resources
                SET status = ?, last_error = ?
                WHERE uri = ?
                """
        with get_db_connection() as db:
            db.execute(statement, (status, error, uri))
            db.commit()

    def get_resource(self, uri: str) -> Resource | None:
        """Look up a resource by URI; None when absent."""
        with get_db_connection() as db:
            row = db.execute(
                "SELECT * FROM resources WHERE uri = ?",
                (uri,),
            ).fetchone()
        return Resource(**dict(row)) if row else None

    def get_resource_by_name(self, name: str) -> Resource | None:
        """Look up a resource by name; None when absent."""
        with get_db_connection() as db:
            row = db.execute(
                "SELECT * FROM resources WHERE name = ?",
                (name,),
            ).fetchone()
        return Resource(**dict(row)) if row else None

    def get_all_resources(self) -> list[Resource]:
        """Return every resource, newest first."""
        with get_db_connection() as db:
            rows = db.execute("SELECT * FROM resources ORDER BY created_at DESC").fetchall()
        return [Resource(**dict(record)) for record in rows]


# Module-level singleton used by the rest of the service.
resource_service = ResourceService()