fix: ignore git crypt files (#1465)
* Exclude file patterns from git-crypt in pathspec git-crypt could be used to encrypt files in a repository. These files should be excluded from the pathspec to avoid sending them to the RAG service. git-crypt relies on a filter attribute in the .gitattributes so we can use ls-files to get the files that are encrypted. * Add some logging about ignored file The logging is quite verbose, given it logs every ignored file but I think it useful for the end user to have an explicit feedback about sensitive files that are being ignored. * Fix lint errors * Avoid Shell=true for subprocess.run() (S604) Removing S604 "Avoid Shell=true for subprocess.run()" we get S603 "subprocess call: check for execution of untrusted input" I dit not found a way to fix this issue, so I'm putting it in the ignore list. I also used shutil to retrieve the absolute git path to run the subprocess commands.
This commit is contained in:
@@ -8,6 +8,8 @@ import json
|
|||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
@@ -552,15 +554,101 @@ def process_document_batch(documents: list[Document]) -> bool: # noqa: PLR0915,
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_gitignore_files(directory: Path) -> list[str]:
|
||||||
|
"""Get patterns from .gitignore file."""
|
||||||
|
patterns = [".git/"]
|
||||||
|
|
||||||
|
# Check for .gitignore
|
||||||
|
gitignore_path = directory / ".gitignore"
|
||||||
|
if gitignore_path.exists():
|
||||||
|
with gitignore_path.open("r", encoding="utf-8") as f:
|
||||||
|
patterns.extend(f.readlines())
|
||||||
|
|
||||||
|
return patterns
|
||||||
|
|
||||||
|
|
||||||
|
def get_gitcrypt_files(directory: Path) -> list[str]:
|
||||||
|
"""Get patterns of git-crypt encrypted files using git command."""
|
||||||
|
git_crypt_patterns = []
|
||||||
|
git_executable = shutil.which("git")
|
||||||
|
|
||||||
|
if not git_executable:
|
||||||
|
logger.warning("git command not found, git-crypt files will not be excluded")
|
||||||
|
return git_crypt_patterns
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Find git root directory
|
||||||
|
git_root_cmd = subprocess.run(
|
||||||
|
[git_executable, "-C", str(directory), "rev-parse", "--show-toplevel"],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
if git_root_cmd.returncode != 0:
|
||||||
|
logger.warning("Not a git repository or git command failed: %s", git_root_cmd.stderr.strip())
|
||||||
|
return git_crypt_patterns
|
||||||
|
|
||||||
|
git_root = Path(git_root_cmd.stdout.strip())
|
||||||
|
|
||||||
|
# Get relative path from git root to our directory
|
||||||
|
rel_path = directory.relative_to(git_root) if directory != git_root else Path()
|
||||||
|
|
||||||
|
# Execute git commands separately and pipe the results
|
||||||
|
git_ls_files = subprocess.run(
|
||||||
|
[git_executable, "-C", str(git_root), "ls-files", "-z"],
|
||||||
|
capture_output=True,
|
||||||
|
text=False,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
if git_ls_files.returncode != 0:
|
||||||
|
return git_crypt_patterns
|
||||||
|
|
||||||
|
# Use Python to process the output instead of xargs, grep, and cut
|
||||||
|
git_check_attr = subprocess.run(
|
||||||
|
[git_executable, "-C", str(git_root), "check-attr", "filter", "--stdin", "-z"],
|
||||||
|
input=git_ls_files.stdout,
|
||||||
|
capture_output=True,
|
||||||
|
text=False,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
if git_check_attr.returncode != 0:
|
||||||
|
return git_crypt_patterns
|
||||||
|
|
||||||
|
# Process the output in Python to find git-crypt files
|
||||||
|
output = git_check_attr.stdout.decode("utf-8")
|
||||||
|
lines = output.split("\0")
|
||||||
|
|
||||||
|
for i in range(0, len(lines) - 2, 3):
|
||||||
|
if i + 2 < len(lines) and lines[i + 2] == "git-crypt":
|
||||||
|
file_path = lines[i]
|
||||||
|
# Only include files that are in our directory or subdirectories
|
||||||
|
file_path_obj = Path(file_path)
|
||||||
|
if str(rel_path) == "." or file_path_obj.is_relative_to(rel_path):
|
||||||
|
git_crypt_patterns.append(file_path)
|
||||||
|
|
||||||
|
# Log if git-crypt patterns were found
|
||||||
|
if git_crypt_patterns:
|
||||||
|
logger.info("Excluding git-crypt encrypted files: %s", git_crypt_patterns)
|
||||||
|
except (subprocess.SubprocessError, OSError) as e:
|
||||||
|
logger.warning("Error getting git-crypt files: %s", str(e))
|
||||||
|
|
||||||
|
return git_crypt_patterns
|
||||||
|
|
||||||
|
|
||||||
def get_pathspec(directory: Path) -> pathspec.PathSpec | None:
|
def get_pathspec(directory: Path) -> pathspec.PathSpec | None:
|
||||||
"""Get pathspec for the directory."""
|
"""Get pathspec for the directory."""
|
||||||
gitignore_path = directory / ".gitignore"
|
# Collect patterns from both sources
|
||||||
if not gitignore_path.exists():
|
patterns = get_gitignore_files(directory)
|
||||||
|
patterns.extend(get_gitcrypt_files(directory))
|
||||||
|
|
||||||
|
# Return None if no patterns were found
|
||||||
|
if len(patterns) <= 1: # Only .git/ is in the list
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Read gitignore patterns
|
return pathspec.GitIgnoreSpec.from_lines(patterns)
|
||||||
with gitignore_path.open("r", encoding="utf-8") as f:
|
|
||||||
return pathspec.GitIgnoreSpec.from_lines([*f.readlines(), ".git/"])
|
|
||||||
|
|
||||||
|
|
||||||
def scan_directory(directory: Path) -> list[str]:
|
def scan_directory(directory: Path) -> list[str]:
|
||||||
@@ -574,7 +662,11 @@ def scan_directory(directory: Path) -> list[str]:
|
|||||||
if not spec:
|
if not spec:
|
||||||
matched_files.extend(file_paths)
|
matched_files.extend(file_paths)
|
||||||
continue
|
continue
|
||||||
matched_files.extend([file for file in file_paths if not spec.match_file(os.path.relpath(file, directory))])
|
for file in file_paths:
|
||||||
|
if spec and spec.match_file(os.path.relpath(file, directory)):
|
||||||
|
logger.info("Ignoring file: %s", file)
|
||||||
|
else:
|
||||||
|
matched_files.append(file)
|
||||||
|
|
||||||
return matched_files
|
return matched_files
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ ignore = [
|
|||||||
"D101",
|
"D101",
|
||||||
"D203", # 1 blank line required before class docstring
|
"D203", # 1 blank line required before class docstring
|
||||||
"D212", # Multi-line docstring summary should start at the first line
|
"D212", # Multi-line docstring summary should start at the first line
|
||||||
|
"S603",
|
||||||
"TRY300",
|
"TRY300",
|
||||||
"TRY400",
|
"TRY400",
|
||||||
"PGH003",
|
"PGH003",
|
||||||
|
|||||||
Reference in New Issue
Block a user