From f9025ff4152760f6ea1bbc0c6bf72dfaee7e71bd Mon Sep 17 00:00:00 2001 From: wan Date: Mon, 10 Mar 2025 16:23:56 +0900 Subject: [PATCH] fix: skip audio and video files during directory scan in RAG Service. (#1509) --- py/rag-service/src/main.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/py/rag-service/src/main.py b/py/rag-service/src/main.py index c1e235d..d5477ee 100644 --- a/py/rag-service/src/main.py +++ b/py/rag-service/src/main.py @@ -660,14 +660,33 @@ def scan_directory(directory: Path) -> list[str]: """Scan directory and return a list of matched files.""" spec = get_pathspec(directory) + audio_video_exts = [ + ".mp3", + ".wav", + ".ogg", + ".flac", + ".aac", + ".m4a", + ".wma", + ".mp4", + ".avi", + ".mov", + ".wmv", + ".flv", + ".mkv", + ".webm", + ] + matched_files = [] for root, _, files in os.walk(directory): file_paths = [str(Path(root) / file) for file in files] - if not spec: - matched_files.extend(file_paths) - continue for file in file_paths: + file_ext = Path(file).suffix.lower() + if file_ext in audio_video_exts: + logger.info("Skipping audio/video file: %s", file) + continue + if spec and spec.match_file(os.path.relpath(file, directory)): logger.info("Ignoring file: %s", file) else: