feat: RAG service (#1220)
This commit is contained in:
0
py/rag-service/src/libs/__init__.py
Normal file
0
py/rag-service/src/libs/__init__.py
Normal file
14
py/rag-service/src/libs/configs.py
Normal file
14
py/rag-service/src/libs/configs.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Storage layout. Everything lives under one base directory, taken from the
# DATA_DIR environment variable and falling back to a relative "data" folder.
BASE_DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
CHROMA_PERSIST_DIR = BASE_DATA_DIR.joinpath("chroma_db")
LOG_DIR = BASE_DATA_DIR.joinpath("logs")
DB_FILE = BASE_DATA_DIR.joinpath("sqlite", "indexing_history.db")

# Create every directory at import time, in a fixed order. DB_FILE.parent is
# the "sqlite" folder that holds the database file.
for _required_dir in (BASE_DATA_DIR, LOG_DIR, DB_FILE.parent, CHROMA_PERSIST_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)
del _required_dir  # do not leave the loop variable behind as a module attribute
|
||||
60
py/rag-service/src/libs/db.py
Normal file
60
py/rag-service/src/libs/db.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import sqlite3
|
||||
from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
|
||||
from libs.configs import DB_FILE
|
||||
|
||||
# SQLite schema script, run as a whole by init_db().
# Every statement uses IF NOT EXISTS, so running the script against an
# already-initialized database file is a no-op.
CREATE_TABLES_SQL = """
CREATE TABLE IF NOT EXISTS indexing_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    uri TEXT NOT NULL,
    content_hash TEXT NOT NULL,
    status TEXT NOT NULL,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
    error_message TEXT,
    document_id TEXT,
    metadata TEXT
);

CREATE INDEX IF NOT EXISTS idx_uri ON indexing_history(uri);
CREATE INDEX IF NOT EXISTS idx_document_id ON indexing_history(document_id);
CREATE INDEX IF NOT EXISTS idx_content_hash ON indexing_history(content_hash);

CREATE TABLE IF NOT EXISTS resources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    uri TEXT NOT NULL UNIQUE,
    type TEXT NOT NULL,  -- 'path' or 'https'
    status TEXT NOT NULL DEFAULT 'active',  -- 'active' or 'inactive'
    indexing_status TEXT NOT NULL DEFAULT 'pending',  -- 'pending', 'indexing', 'indexed', 'failed'
    indexing_status_message TEXT,
    indexing_started_at DATETIME,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    last_indexed_at DATETIME,
    last_error TEXT
);

CREATE INDEX IF NOT EXISTS idx_resources_name ON resources(name);
CREATE INDEX IF NOT EXISTS idx_resources_uri ON resources(uri);
CREATE INDEX IF NOT EXISTS idx_resources_status ON resources(status);
CREATE INDEX IF NOT EXISTS idx_status ON indexing_history(status);
"""
|
||||
|
||||
|
||||
@contextmanager
def get_db_connection() -> Generator[sqlite3.Connection, None, None]:
    """Yield a SQLite connection to ``DB_FILE`` and close it on exit.

    Rows come back as ``sqlite3.Row`` objects. The connection is closed when
    the ``with`` block ends, whether or not it raised. Nothing is committed
    here; callers commit their own work.
    """
    connection = sqlite3.connect(DB_FILE)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        connection.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """Run the schema script in ``CREATE_TABLES_SQL`` and commit the result."""
    with get_db_connection() as db:
        db.executescript(CREATE_TABLES_SQL)
        db.commit()
|
||||
16
py/rag-service/src/libs/logger.py
Normal file
16
py/rag-service/src/libs/logger.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from libs.configs import LOG_DIR
|
||||
|
||||
# Root logging setup, performed once at import time.
# Records go to a log file under LOG_DIR and to a stream handler. The date in
# the file name is computed once, when this module is first imported.
_log_file = LOG_DIR / f"rag_service_{datetime.now().astimezone().strftime('%Y%m%d')}.log"
_handlers = [
    logging.FileHandler(_log_file),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=_handlers,
)

logger = logging.getLogger(__name__)
|
||||
66
py/rag-service/src/libs/utils.py
Normal file
66
py/rag-service/src/libs/utils.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from llama_index.core.schema import BaseNode
|
||||
|
||||
# Metadata key under which a node's URI is stored.
METADATA_KEY_URI = "uri"
# Matches ids of the form "<uri>__part_<digits>"; the "uri" group captures
# everything before the "__part_" suffix.
PATTERN_URI_PART = re.compile(r"(?P<uri>.+)__part_\d+")
|
||||
|
||||
|
||||
def uri_to_path(uri: str) -> Path:
    """Convert a ``file://`` URI (or a plain path string) to a ``Path``.

    Only a leading ``file://`` scheme is stripped; the same text appearing
    later in the string is left alone. The remainder is percent-decoded so
    that URIs produced by ``Path.as_uri()`` (which encodes spaces and
    non-ASCII characters) map back to the real filesystem path.

    Args:
        uri: A ``file://`` URI, or a plain path string.

    Returns:
        The filesystem path. A string without the ``file://`` prefix is
        wrapped in ``Path`` unchanged.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    scheme_prefix = "file://"
    if not uri.startswith(scheme_prefix):
        # Not a file URI: treat as a plain path and do not percent-decode it.
        return Path(uri)
    return Path(unquote(uri[len(scheme_prefix):]))
|
||||
|
||||
|
||||
def path_to_uri(file_path: Path) -> str:
    """Return the ``file://`` URI for *file_path*.

    A trailing ``/`` is appended when the path is an existing directory.
    """
    base_uri = file_path.as_uri()
    return f"{base_uri}/" if file_path.is_dir() else base_uri
|
||||
|
||||
|
||||
def is_local_uri(uri: str) -> bool:
    """Return True when *uri* begins with the ``file://`` scheme."""
    local_scheme = "file://"
    return uri.startswith(local_scheme)
|
||||
|
||||
|
||||
def is_remote_uri(uri: str) -> bool:
    """Return True when *uri* begins with ``https://`` or ``http://``."""
    if uri.startswith("https://"):
        return True
    return uri.startswith("http://")
|
||||
|
||||
|
||||
def is_path_node(node: BaseNode) -> bool:
    """Return True when the node resolves to a local ``file://`` URI.

    A node for which no URI can be determined is not a path node.
    """
    node_uri = get_node_uri(node)
    return is_local_uri(node_uri) if node_uri else False
|
||||
|
||||
|
||||
def get_node_uri(node: BaseNode) -> str | None:
    """Resolve the URI of *node*, or return None when none can be found.

    The value stored under ``METADATA_KEY_URI`` in the node metadata is
    preferred. If it is missing or empty, the node's ``doc_id`` attribute is
    used, with any ``__part_<n>`` suffix removed. A result that starts with
    ``/`` is treated as an absolute path and prefixed with ``file://``.
    """
    resolved = node.metadata.get(METADATA_KEY_URI)

    if not resolved:
        # Fall back to the document id; not every node exposes one.
        doc_id = getattr(node, "doc_id", None)
        if doc_id:
            part_match = PATTERN_URI_PART.match(doc_id)
            resolved = doc_id if part_match is None else part_match.group("uri")

    if not resolved:
        return None
    return f"file://{resolved}" if resolved.startswith("/") else resolved
|
||||
|
||||
|
||||
def inject_uri_to_node(node: BaseNode) -> None:
    """Store the node's resolved URI in its metadata if it is not already set.

    An existing ``METADATA_KEY_URI`` entry is never overwritten, and nothing
    is written when no URI can be resolved.
    """
    if METADATA_KEY_URI not in node.metadata:
        resolved_uri = get_node_uri(node)
        if resolved_uri:
            node.metadata[METADATA_KEY_URI] = resolved_uri
|
||||
Reference in New Issue
Block a user