fix: Avoid indexing temporary files (#2419)
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
This commit is contained in:
@@ -29,9 +29,24 @@ from fastapi import BackgroundTasks, FastAPI, HTTPException
|
|||||||
from libs.configs import BASE_DATA_DIR, CHROMA_PERSIST_DIR
|
from libs.configs import BASE_DATA_DIR, CHROMA_PERSIST_DIR
|
||||||
from libs.db import init_db
|
from libs.db import init_db
|
||||||
from libs.logger import logger
|
from libs.logger import logger
|
||||||
from libs.utils import get_node_uri, inject_uri_to_node, is_local_uri, is_path_node, is_remote_uri, path_to_uri, uri_to_path
|
from libs.utils import (
|
||||||
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex, load_index_from_storage
|
get_node_uri,
|
||||||
|
inject_uri_to_node,
|
||||||
|
is_local_uri,
|
||||||
|
is_path_node,
|
||||||
|
is_remote_uri,
|
||||||
|
path_to_uri,
|
||||||
|
uri_to_path,
|
||||||
|
)
|
||||||
|
from llama_index.core import (
|
||||||
|
Settings,
|
||||||
|
SimpleDirectoryReader,
|
||||||
|
StorageContext,
|
||||||
|
VectorStoreIndex,
|
||||||
|
load_index_from_storage,
|
||||||
|
)
|
||||||
from llama_index.core.node_parser import CodeSplitter
|
from llama_index.core.node_parser import CodeSplitter
|
||||||
|
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
|
||||||
from llama_index.core.schema import Document
|
from llama_index.core.schema import Document
|
||||||
from llama_index.vector_stores.chroma import ChromaVectorStore
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||||
from markdownify import markdownify as md
|
from markdownify import markdownify as md
|
||||||
@@ -527,9 +542,6 @@ def process_document_batch(documents: list[Document]) -> bool: # noqa: PLR0915,
|
|||||||
invalid_documents.append(doc_id)
|
invalid_documents.append(doc_id)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create new document object with cleaned content
|
|
||||||
from llama_index.core.schema import Document
|
|
||||||
|
|
||||||
cleaned_content = clean_text(content)
|
cleaned_content = clean_text(content)
|
||||||
metadata = getattr(doc, "metadata", {}).copy()
|
metadata = getattr(doc, "metadata", {}).copy()
|
||||||
|
|
||||||
@@ -583,7 +595,11 @@ def process_document_batch(documents: list[Document]) -> bool: # noqa: PLR0915,
|
|||||||
|
|
||||||
def get_gitignore_files(directory: Path) -> list[str]:
|
def get_gitignore_files(directory: Path) -> list[str]:
|
||||||
"""Get patterns from .gitignore file."""
|
"""Get patterns from .gitignore file."""
|
||||||
patterns = [".git/"]
|
patterns = []
|
||||||
|
|
||||||
|
# Add .git/ to the ignore patterns, but only when the directory actually contains one
|
||||||
|
if (directory / ".git").is_dir():
|
||||||
|
patterns.append(".git/")
|
||||||
|
|
||||||
# Check for .gitignore
|
# Check for .gitignore
|
||||||
gitignore_path = directory / ".gitignore"
|
gitignore_path = directory / ".gitignore"
|
||||||
@@ -796,6 +812,10 @@ def update_index_for_file(directory: Path, abs_file_path: Path) -> None:
|
|||||||
"""Update the index for a single file."""
|
"""Update the index for a single file."""
|
||||||
logger.debug("Starting to index file: %s", abs_file_path)
|
logger.debug("Starting to index file: %s", abs_file_path)
|
||||||
|
|
||||||
|
if not abs_file_path.is_file():
|
||||||
|
logger.debug("File does not exist or is not a file, skipping: %s", abs_file_path)
|
||||||
|
return
|
||||||
|
|
||||||
rel_file_path = abs_file_path.relative_to(directory)
|
rel_file_path = abs_file_path.relative_to(directory)
|
||||||
|
|
||||||
spec = get_pathspec(directory)
|
spec = get_pathspec(directory)
|
||||||
@@ -867,8 +887,6 @@ def split_documents(documents: list[Document]) -> list[Document]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
from llama_index.core.schema import Document
|
|
||||||
|
|
||||||
new_doc = Document(
|
new_doc = Document(
|
||||||
text=text,
|
text=text,
|
||||||
doc_id=f"{doc.doc_id}__part_{i}",
|
doc_id=f"{doc.doc_id}__part_{i}",
|
||||||
@@ -958,8 +976,6 @@ async def index_local_resource_async(resource: Resource) -> None:
|
|||||||
try:
|
try:
|
||||||
logger.info("Loading directory content: %s", directory_path)
|
logger.info("Loading directory content: %s", directory_path)
|
||||||
|
|
||||||
from llama_index.core.readers.file.base import SimpleDirectoryReader
|
|
||||||
|
|
||||||
documents = SimpleDirectoryReader(
|
documents = SimpleDirectoryReader(
|
||||||
input_files=scan_directory(directory_path),
|
input_files=scan_directory(directory_path),
|
||||||
filename_as_id=True,
|
filename_as_id=True,
|
||||||
@@ -1188,8 +1204,6 @@ async def retrieve(request: RetrieveRequest): # noqa: D103, ANN201, C901, PLR09
|
|||||||
base_uri += os.path.sep
|
base_uri += os.path.sep
|
||||||
return uri.startswith(base_uri)
|
return uri.startswith(base_uri)
|
||||||
|
|
||||||
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
|
|
||||||
|
|
||||||
# Create a custom post processor
|
# Create a custom post processor
|
||||||
class ResourceFilterPostProcessor(MetadataReplacementPostProcessor):
|
class ResourceFilterPostProcessor(MetadataReplacementPostProcessor):
|
||||||
"""Post-processor for filtering nodes based on directory."""
|
"""Post-processor for filtering nodes based on directory."""
|
||||||
@@ -1262,7 +1276,7 @@ async def retrieve(request: RetrieveRequest): # noqa: D103, ANN201, C901, PLR09
|
|||||||
doc_info = {
|
doc_info = {
|
||||||
"uri": uri,
|
"uri": uri,
|
||||||
"content": cleaned_content,
|
"content": cleaned_content,
|
||||||
"score": float(node.score) if hasattr(node, "score") else None,
|
"score": float(node.score) if node.score is not None else None,
|
||||||
}
|
}
|
||||||
sources.append(doc_info)
|
sources.append(doc_info)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user