feat: RAG service (#1220)

This commit is contained in:
yetone
2025-02-23 01:37:26 +08:00
committed by GitHub
parent 437d36920d
commit fd84c91cdb
32 changed files with 2339 additions and 15 deletions

View File

View File

@@ -0,0 +1,174 @@
import json
import os
from datetime import datetime
from typing import Any
from libs.db import get_db_connection
from libs.logger import logger
from libs.utils import get_node_uri
from llama_index.core.schema import Document
from models.indexing_history import IndexingHistory
class IndexingHistoryService:
def delete_indexing_status(self, uri: str) -> None:
"""Delete indexing status for a specific file."""
with get_db_connection() as conn:
conn.execute(
"""
DELETE FROM indexing_history
WHERE uri = ?
""",
(uri,),
)
conn.commit()
def delete_indexing_status_by_document_id(self, document_id: str) -> None:
"""Delete indexing status for a specific document."""
with get_db_connection() as conn:
conn.execute(
"""
DELETE FROM indexing_history
WHERE document_id = ?
""",
(document_id,),
)
conn.commit()
def update_indexing_status(
self,
doc: Document,
status: str,
error_message: str | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
"""Update the indexing status in the database."""
content_hash = doc.hash
# Get URI from metadata if available
uri = get_node_uri(doc)
if not uri:
logger.warning("URI not found for document: %s", doc.doc_id)
return
record = IndexingHistory(
id=None,
uri=uri,
content_hash=content_hash,
status=status,
error_message=error_message,
document_id=doc.doc_id,
metadata=metadata,
)
with get_db_connection() as conn:
# Check if record exists
existing = conn.execute(
"SELECT id FROM indexing_history WHERE document_id = ?",
(doc.doc_id,),
).fetchone()
if existing:
# Update existing record
conn.execute(
"""
UPDATE indexing_history
SET content_hash = ?, status = ?, error_message = ?, document_id = ?, metadata = ?
WHERE uri = ?
""",
(
record.content_hash,
record.status,
record.error_message,
record.document_id,
json.dumps(record.metadata) if record.metadata else None,
record.uri,
),
)
else:
# Insert new record
conn.execute(
"""
INSERT INTO indexing_history
(uri, content_hash, status, error_message, document_id, metadata)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
record.uri,
record.content_hash,
record.status,
record.error_message,
record.document_id,
json.dumps(record.metadata) if record.metadata else None,
),
)
conn.commit()
def get_indexing_status(self, doc: Document | None = None, base_uri: str | None = None) -> list[IndexingHistory]:
"""Get indexing status from the database."""
with get_db_connection() as conn:
if doc:
uri = get_node_uri(doc)
if not uri:
logger.warning("URI not found for document: %s", doc.doc_id)
return []
content_hash = doc.hash
# For a specific file, get its latest status
query = """
SELECT *
FROM indexing_history
WHERE uri = ? and content_hash = ?
ORDER BY timestamp DESC LIMIT 1
"""
params = (uri, content_hash)
elif base_uri:
# For files in a specific directory, get their latest status
query = """
WITH RankedHistory AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY timestamp DESC) as rn
FROM indexing_history
WHERE uri LIKE ? || '%'
)
SELECT id, uri, content_hash, status, timestamp, error_message, document_id, metadata
FROM RankedHistory
WHERE rn = 1
ORDER BY timestamp DESC
"""
params = (base_uri,) if base_uri.endswith(os.path.sep) else (base_uri + os.path.sep,)
else:
# For all files, get their latest status
query = """
WITH RankedHistory AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY uri ORDER BY timestamp DESC) as rn
FROM indexing_history
)
SELECT id, uri, content_hash, status, timestamp, error_message, document_id, metadata
FROM RankedHistory
WHERE rn = 1
ORDER BY timestamp DESC
"""
params = ()
rows = conn.execute(query, params).fetchall()
result = []
for row in rows:
row_dict = dict(row)
# Parse metadata JSON if it exists
if row_dict.get("metadata"):
try:
row_dict["metadata"] = json.loads(row_dict["metadata"])
except json.JSONDecodeError:
row_dict["metadata"] = None
# Parse timestamp string to datetime if needed
if isinstance(row_dict.get("timestamp"), str):
row_dict["timestamp"] = datetime.fromisoformat(
row_dict["timestamp"].replace("Z", "+00:00"),
)
result.append(IndexingHistory(**row_dict))
return result
indexing_history_service = IndexingHistoryService()

View File

@@ -0,0 +1,104 @@
"""Resource Service."""
from libs.db import get_db_connection
from models.resource import Resource
class ResourceService:
"""Resource Service."""
def add_resource_to_db(self, resource: Resource) -> None:
"""Add a resource to the database."""
with get_db_connection() as conn:
conn.execute(
"""
INSERT INTO resources (name, uri, type, status, indexing_status, created_at)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
resource.name,
resource.uri,
resource.type,
resource.status,
resource.indexing_status,
resource.created_at,
),
)
conn.commit()
def update_resource_indexing_status(self, uri: str, indexing_status: str, indexing_status_message: str) -> None:
"""Update resource indexing status in the database."""
with get_db_connection() as conn:
if indexing_status == "indexing":
conn.execute(
"""
UPDATE resources
SET indexing_status = ?, indexing_status_message = ?, indexing_started_at = CURRENT_TIMESTAMP
WHERE uri = ?
""",
(indexing_status, indexing_status_message, uri),
)
else:
conn.execute(
"""
UPDATE resources
SET indexing_status = ?, indexing_status_message = ?, last_indexed_at = CURRENT_TIMESTAMP
WHERE uri = ?
""",
(indexing_status, indexing_status_message, uri),
)
conn.commit()
def update_resource_status(self, uri: str, status: str, error: str | None = None) -> None:
"""Update resource status in the database."""
with get_db_connection() as conn:
if status == "active":
conn.execute(
"""
UPDATE resources
SET status = ?, last_indexed_at = CURRENT_TIMESTAMP, last_error = ?
WHERE uri = ?
""",
(status, error, uri),
)
else:
conn.execute(
"""
UPDATE resources
SET status = ?, last_error = ?
WHERE uri = ?
""",
(status, error, uri),
)
conn.commit()
def get_resource(self, uri: str) -> Resource | None:
"""Get resource from the database."""
with get_db_connection() as conn:
row = conn.execute(
"SELECT * FROM resources WHERE uri = ?",
(uri,),
).fetchone()
if row:
return Resource(**dict(row))
return None
def get_resource_by_name(self, name: str) -> Resource | None:
"""Get resource by name from the database."""
with get_db_connection() as conn:
row = conn.execute(
"SELECT * FROM resources WHERE name = ?",
(name,),
).fetchone()
if row:
return Resource(**dict(row))
return None
def get_all_resources(self) -> list[Resource]:
"""Get all resources from the database."""
with get_db_connection() as conn:
rows = conn.execute("SELECT * FROM resources ORDER BY created_at DESC").fetchall()
return [Resource(**dict(row)) for row in rows]
resource_service = ResourceService()