fix: ignore git crypt files (#1465)

* Exclude file patterns from git-crypt in pathspec

git-crypt could be used to encrypt files in a repository.
These files should be excluded from the pathspec to avoid
sending them to the RAG service.
git-crypt relies on a filter attribute in the .gitattributes so we can
use ls-files to get the files that are encrypted.

* Add some logging about ignored files

The logging is quite verbose, given that it logs every ignored file, but I
think it is useful for the end user to have explicit feedback about
sensitive files that are being ignored.

* Fix lint errors

* Avoid Shell=true for subprocess.run() (S604)

Removing S604 "Avoid Shell=true for subprocess.run()" we get S603 "subprocess call: check for execution of untrusted input"
I did not find a way to fix this issue, so I'm putting it in the ignore list.
I also used shutil to retrieve the absolute git path to run the subprocess commands.
This commit is contained in:
Francesco Tassi
2025-03-06 11:34:12 +01:00
committed by GitHub
parent dec794ac85
commit 2b0e7e09ae
2 changed files with 99 additions and 6 deletions

View File

@@ -8,6 +8,8 @@ import json
import multiprocessing import multiprocessing
import os import os
import re import re
import shutil
import subprocess
import threading import threading
import time import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
@@ -552,15 +554,101 @@ def process_document_batch(documents: list[Document]) -> bool: # noqa: PLR0915,
return False return False
def get_gitignore_files(directory: Path) -> list[str]:
    """Collect ignore patterns for *directory*.

    The returned list always starts with ``.git/``. When
    ``directory/.gitignore`` exists, its lines follow unmodified (line
    endings included), in file order.
    """
    ignore_file = directory / ".gitignore"
    if not ignore_file.exists():
        return [".git/"]
    # Iterating the handle yields the same lines ``readlines()`` would.
    with ignore_file.open("r", encoding="utf-8") as handle:
        return [".git/", *handle]
def _escape_gitignore_path(path: str) -> str:
    """Backslash-escape gitignore glob metacharacters so *path* only matches itself."""
    return "".join("\\" + ch if ch in "\\*?[" else ch for ch in path)


def get_gitcrypt_files(directory: Path) -> list[str]:
    """Get gitignore-style patterns for git-crypt encrypted files under *directory*.

    git-crypt marks encrypted files with a ``filter`` attribute
    (``git-crypt`` for the default key, ``git-crypt-<KEYNAME>`` for a named
    key), so the tracked files are listed with ``git ls-files`` and their
    ``filter`` attribute is queried with ``git check-attr``.

    Args:
        directory: Directory being scanned; it may be the repository root or
            any directory inside a git work tree.

    Returns:
        One anchored, escaped pattern per encrypted file, expressed relative
        to *directory* (e.g. ``/secrets/key.txt``) so that it matches exactly
        that file when the spec is evaluated against paths relative to
        *directory*. The list is empty when git is unavailable, *directory*
        is not inside a git repository, or any git command fails.
    """
    git_crypt_patterns: list[str] = []

    git_executable = shutil.which("git")
    if not git_executable:
        logger.warning("git command not found, git-crypt files will not be excluded")
        return git_crypt_patterns

    try:
        # Find git root directory
        git_root_cmd = subprocess.run(
            [git_executable, "-C", str(directory), "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=False,
        )
        if git_root_cmd.returncode != 0:
            logger.warning("Not a git repository or git command failed: %s", git_root_cmd.stderr.strip())
            return git_crypt_patterns

        # git prints an absolute toplevel path, so resolve both sides before
        # comparing; a relative or symlinked ``directory`` would otherwise
        # make relative_to() raise.
        git_root = Path(git_root_cmd.stdout.strip()).resolve()
        rel_path = directory.resolve().relative_to(git_root)

        # NUL-separated list of every tracked file (paths relative to git root)
        git_ls_files = subprocess.run(
            [git_executable, "-C", str(git_root), "ls-files", "-z"],
            capture_output=True,
            text=False,
            check=False,
        )
        if git_ls_files.returncode != 0:
            return git_crypt_patterns

        # Ask git for the "filter" attribute of each tracked file
        git_check_attr = subprocess.run(
            [git_executable, "-C", str(git_root), "check-attr", "filter", "--stdin", "-z"],
            input=git_ls_files.stdout,
            capture_output=True,
            text=False,
            check=False,
        )
        if git_check_attr.returncode != 0:
            return git_crypt_patterns

        # With -z the output is a flat sequence of NUL-terminated
        # (path, attribute, value) triples. surrogateescape keeps
        # non-UTF-8 file names decodable instead of raising.
        fields = git_check_attr.stdout.decode("utf-8", errors="surrogateescape").split("\0")
        for file_path, _attribute, value in zip(fields[0::3], fields[1::3], fields[2::3], strict=False):
            # Named git-crypt keys use the filter "git-crypt-<KEYNAME>"
            if value != "git-crypt" and not value.startswith("git-crypt-"):
                continue
            file_path_obj = Path(file_path)
            # Only include files that are in our directory or subdirectories
            if not file_path_obj.is_relative_to(rel_path):
                continue
            # Re-root the path at ``directory``, then anchor ("/") and escape
            # it so it is treated as one literal path, not a glob.
            local_path = file_path_obj.relative_to(rel_path).as_posix()
            git_crypt_patterns.append("/" + _escape_gitignore_path(local_path))

        # Log if git-crypt patterns were found
        if git_crypt_patterns:
            logger.info("Excluding git-crypt encrypted files: %s", git_crypt_patterns)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # ValueError: ``directory`` is not located under the reported git root
        logger.warning("Error getting git-crypt files: %s", e)

    return git_crypt_patterns
def get_pathspec(directory: Path) -> pathspec.PathSpec | None: def get_pathspec(directory: Path) -> pathspec.PathSpec | None:
"""Get pathspec for the directory.""" """Get pathspec for the directory."""
gitignore_path = directory / ".gitignore" # Collect patterns from both sources
if not gitignore_path.exists(): patterns = get_gitignore_files(directory)
patterns.extend(get_gitcrypt_files(directory))
# Return None if no patterns were found
if len(patterns) <= 1: # Only .git/ is in the list
return None return None
# Read gitignore patterns return pathspec.GitIgnoreSpec.from_lines(patterns)
with gitignore_path.open("r", encoding="utf-8") as f:
return pathspec.GitIgnoreSpec.from_lines([*f.readlines(), ".git/"])
def scan_directory(directory: Path) -> list[str]: def scan_directory(directory: Path) -> list[str]:
@@ -574,7 +662,11 @@ def scan_directory(directory: Path) -> list[str]:
if not spec: if not spec:
matched_files.extend(file_paths) matched_files.extend(file_paths)
continue continue
matched_files.extend([file for file in file_paths if not spec.match_file(os.path.relpath(file, directory))]) for file in file_paths:
if spec and spec.match_file(os.path.relpath(file, directory)):
logger.info("Ignoring file: %s", file)
else:
matched_files.append(file)
return matched_files return matched_files

View File

@@ -28,6 +28,7 @@ ignore = [
"D101", "D101",
"D203", # 1 blank line required before class docstring "D203", # 1 blank line required before class docstring
"D212", # Multi-line docstring summary should start at the first line "D212", # Multi-line docstring summary should start at the first line
"S603",
"TRY300", "TRY300",
"TRY400", "TRY400",
"PGH003", "PGH003",