fix: rag-service error with go files (#1624)
* fix: switching from python to go
* fix: type casting error
* Fix import order
This commit is contained in:
@@ -148,8 +148,8 @@ tiktoken==0.8.0
|
|||||||
tokenizers==0.21.0
|
tokenizers==0.21.0
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
traitlets==5.14.3
|
traitlets==5.14.3
|
||||||
tree-sitter==0.21.3
|
tree-sitter==0.24.0
|
||||||
tree-sitter-languages==1.10.2
|
tree-sitter-language-pack==0.6.1
|
||||||
typer==0.15.1
|
typer==0.15.1
|
||||||
typing-inspect==0.9.0
|
typing-inspect==0.9.0
|
||||||
typing_extensions==4.12.2
|
typing_extensions==4.12.2
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ from models.resource import Resource
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from services.indexing_history import indexing_history_service
|
from services.indexing_history import indexing_history_service
|
||||||
from services.resource import resource_service
|
from services.resource import resource_service
|
||||||
|
from tree_sitter_language_pack import SupportedLanguage, get_parser
|
||||||
from watchdog.events import FileSystemEvent, FileSystemEventHandler
|
from watchdog.events import FileSystemEvent, FileSystemEventHandler
|
||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
|
|
||||||
@@ -182,7 +183,7 @@ watched_resources: dict[str, BaseObserver] = {} # Directory path -> Observer in
|
|||||||
file_last_modified: dict[Path, float] = {} # File path -> Last modified time mapping
|
file_last_modified: dict[Path, float] = {} # File path -> Last modified time mapping
|
||||||
index_lock = threading.Lock()
|
index_lock = threading.Lock()
|
||||||
|
|
||||||
code_ext_map = {
|
code_ext_map: dict[str, SupportedLanguage] = {
|
||||||
".py": "python",
|
".py": "python",
|
||||||
".js": "javascript",
|
".js": "javascript",
|
||||||
".ts": "typescript",
|
".ts": "typescript",
|
||||||
@@ -803,12 +804,6 @@ def split_documents(documents: list[Document]) -> list[Document]:
|
|||||||
"""Split documents into code and non-code documents."""
|
"""Split documents into code and non-code documents."""
|
||||||
# Create file parser configuration
|
# Create file parser configuration
|
||||||
# Initialize CodeSplitter
|
# Initialize CodeSplitter
|
||||||
code_splitter = CodeSplitter(
|
|
||||||
language="python", # Default is python, will auto-detect based on file extension
|
|
||||||
chunk_lines=80, # Maximum number of lines per code block
|
|
||||||
chunk_lines_overlap=15, # Number of overlapping lines to maintain context
|
|
||||||
max_chars=1500, # Maximum number of characters per block
|
|
||||||
)
|
|
||||||
# Split code documents using CodeSplitter
|
# Split code documents using CodeSplitter
|
||||||
processed_documents = []
|
processed_documents = []
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
@@ -822,10 +817,18 @@ def split_documents(documents: list[Document]) -> list[Document]:
|
|||||||
file_ext = file_path.suffix.lower()
|
file_ext = file_path.suffix.lower()
|
||||||
if file_ext in code_ext_map:
|
if file_ext in code_ext_map:
|
||||||
# Apply CodeSplitter to code files
|
# Apply CodeSplitter to code files
|
||||||
code_splitter.language = code_ext_map.get(file_ext, "python")
|
language = code_ext_map.get(file_ext, "python")
|
||||||
|
parser = get_parser(language)
|
||||||
|
code_splitter = CodeSplitter(
|
||||||
|
language=language, # Default is python, will auto-detect based on file extension
|
||||||
|
chunk_lines=80, # Maximum number of lines per code block
|
||||||
|
chunk_lines_overlap=15, # Number of overlapping lines to maintain context
|
||||||
|
max_chars=1500, # Maximum number of characters per block
|
||||||
|
parser=parser,
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
texts = code_splitter.split_text(doc.get_content())
|
t = doc.get_content()
|
||||||
|
texts = code_splitter.split_text(t)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.error("Error splitting document: %s, so skipping split, error: %s", doc.doc_id, str(e))
|
logger.error("Error splitting document: %s, so skipping split, error: %s", doc.doc_id, str(e))
|
||||||
processed_documents.append(doc)
|
processed_documents.append(doc)
|
||||||
|
|||||||
Reference in New Issue
Block a user