From 418aba9231baa5c1c435dad668da19da4a9580e4 Mon Sep 17 00:00:00 2001 From: hauntsaninja Date: Sat, 10 Feb 2024 20:11:49 -0800 Subject: [PATCH] Fix ignoring input files for symlink reasons This relates to #4015, #4161 and the behaviour of os.getcwd() Black is a big user of pathlib and as such loves doing `.resolve()`, since for a long time it was the only good way of getting an absolute path in pathlib. However, this has two problems: The first minor problem is performance, e.g. in #3751 I (safely) got rid of a bunch of `.resolve()` which made Black 40% faster on cached runs. The second more important problem is that always resolving symlinks results in unintuitive exclusion behaviour. For instance, a gitignored symlink should never alter formatting of your actual code. This kind of thing was reported by users a few times. In #3846, I improved the exclusion rule logic for symlinks in `gen_python_files` and everything was good. But `gen_python_files` isn't enough, there's also `get_sources`, which handles user specified paths directly (instead of files Black discovers). So in #4015, I made a very similar change to #3846 for `get_sources`, and this is where some problems began. The core issue was the line: ``` root_relative_path = path.absolute().relative_to(root).as_posix() ``` The first issue is that despite root being computed from user inputs, we call `.resolve()` while computing it (likely unecessarily). Which means that `path` may not actually be relative to `root`. So I started off this PR trying to fix that, when I ran into the second issue. Which is that `os.getcwd()` (as called by `os.path.abspath` or `Path.absolute` or `Path.cwd`) also often resolves symlinks! ``` >>> import os >>> os.environ.get("PWD") '/Users/shantanu/dev/black/symlink/bug' >>> os.getcwd() '/Users/shantanu/dev/black/actual/bug' ``` This also meant that the breakage often would not show up when input relative paths. This doesn't affect `gen_python_files` / #3846 because things are always absolute and known to be relative to `root`. Anyway, it looks like #4161 fixed the crash by just swallowing the error and ignoring the file. Instead, we should just try to compute the actual relative path. I think this PR should be quite safe, but we could also consider reverting some of the previous changes; the associated issues weren't too popular. At the same time, I think there's still behaviour that can be improved and I kind of want to make larger changes, but maybe I'll save that for if we do something like #3952 Hopefully fixes #4205, fixes #4209, actual fix for #4077 --- src/black/__init__.py | 15 +++--- src/black/files.py | 44 ++++++++++------ tests/test_black.py | 120 ++++++++++++++++++++++++++++++++---------- 3 files changed, 126 insertions(+), 53 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 2d4c7f655ad..f82b9fec5b7 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -44,12 +44,12 @@ STDIN_PLACEHOLDER, ) from black.files import ( + best_effort_relative_path, find_project_root, find_pyproject_toml, find_user_pyproject_toml, gen_python_files, get_gitignore, - get_root_relative_path, parse_pyproject_toml, path_is_excluded, resolves_outside_root_or_cannot_stat, @@ -734,6 +734,7 @@ def get_sources( """Compute the set of files to be formatted.""" sources: Set[Path] = set() + assert root.is_absolute(), f"INTERNAL ERROR: `root` must be absolute but is {root}" using_default_exclude = exclude is None exclude = re_compile_maybe_verbose(DEFAULT_EXCLUDES) if exclude is None else exclude gitignore: Optional[Dict[Path, PathSpec]] = None @@ -749,11 +750,12 @@ def get_sources( # Compare the logic here to the logic in `gen_python_files`. if is_stdin or path.is_file(): - root_relative_path = get_root_relative_path(path, root, report) - - if root_relative_path is None: + if resolves_outside_root_or_cannot_stat(path, root, report): + if verbose: + out(f'Skipping invalid source: "{path}"', fg="red") continue + root_relative_path = best_effort_relative_path(path, root).as_posix() root_relative_path = "/" + root_relative_path # Hard-exclude any files that matches the `--force-exclude` regex. @@ -763,11 +765,6 @@ def get_sources( ) continue - if resolves_outside_root_or_cannot_stat(path, root, report): - if verbose: - out(f'Skipping invalid source: "{path}"', fg="red") - continue - if is_stdin: path = Path(f"{STDIN_PLACEHOLDER}{str(path)}") diff --git a/src/black/files.py b/src/black/files.py index 6c05105450c..08258dd8eb8 100644 --- a/src/black/files.py +++ b/src/black/files.py @@ -48,6 +48,11 @@ def _load_toml(path: Union[Path, str]) -> Dict[str, Any]: return tomllib.load(f) +@lru_cache +def _cached_resolve(path: Path) -> Path: + return path.resolve() + + @lru_cache def find_project_root( srcs: Sequence[str], stdin_filename: Optional[str] = None @@ -67,9 +72,9 @@ def find_project_root( if stdin_filename is not None: srcs = tuple(stdin_filename if s == "-" else s for s in srcs) if not srcs: - srcs = [str(Path.cwd().resolve())] + srcs = [str(_cached_resolve(Path.cwd()))] - path_srcs = [Path(Path.cwd(), src).resolve() for src in srcs] + path_srcs = [_cached_resolve(Path(Path.cwd(), src)) for src in srcs] # A list of lists of parents for each 'src'. 'src' is included as a # "parent" of itself if it is a directory @@ -236,7 +241,7 @@ def find_user_pyproject_toml() -> Path: else: config_root = os.environ.get("XDG_CONFIG_HOME", "~/.config") user_config_path = Path(config_root).expanduser() / "black" - return user_config_path.resolve() + return _cached_resolve(user_config_path) @lru_cache @@ -266,27 +271,31 @@ def resolves_outside_root_or_cannot_stat( try: if sys.version_info < (3, 8, 6): path = path.absolute() # https://bugs.python.org/issue33660 - resolved_path = path.resolve() - return get_root_relative_path(resolved_path, root, report) is None + resolved_path = _cached_resolve(path) except OSError as e: if report: report.path_ignored(path, f"cannot be read because {e}") return True - - -def get_root_relative_path( - path: Path, - root: Path, - report: Optional[Report] = None, -) -> Optional[str]: - """Returns the file path relative to the 'root' directory""" try: - root_relative_path = path.absolute().relative_to(root).as_posix() + resolved_path.relative_to(root).as_posix() except ValueError: if report: report.path_ignored(path, f"is a symbolic link that points outside {root}") - return None - return root_relative_path + return True + return False + + +def best_effort_relative_path(path: Path, root: Path) -> Path: + # Precondition: resolves_outside_root_or_cannot_stat(path, root) is False + try: + return path.absolute().relative_to(root) + except ValueError: + pass + root_parent = next((p for p in path.parents if _cached_resolve(p) == root), None) + if root_parent is not None: + return path.relative_to(root_parent) + # something adversarial, fallback to path guaranteed by precondition + return _cached_resolve(path).relative_to(root) def _path_is_ignored( @@ -339,7 +348,8 @@ def gen_python_files( assert root.is_absolute(), f"INTERNAL ERROR: `root` must be absolute but is {root}" for child in paths: - root_relative_path = child.absolute().relative_to(root).as_posix() + assert child.is_absolute() + root_relative_path = child.relative_to(root).as_posix() # First ignore files matching .gitignore, if passed if gitignore_dict and _path_is_ignored( diff --git a/tests/test_black.py b/tests/test_black.py index 5c6920c2b77..260106700fa 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -2567,32 +2567,32 @@ def test_symlinks(self) -> None: gitignore = PathSpec.from_lines("gitwildmatch", []) regular = MagicMock() - regular.absolute.return_value = root / "regular.py" + regular.relative_to.return_value = Path("regular.py") regular.resolve.return_value = root / "regular.py" regular.is_dir.return_value = False regular.is_file.return_value = True outside_root_symlink = MagicMock() - outside_root_symlink.absolute.return_value = root / "symlink.py" + outside_root_symlink.relative_to.return_value = Path("symlink.py") outside_root_symlink.resolve.return_value = Path("/nowhere") outside_root_symlink.is_dir.return_value = False outside_root_symlink.is_file.return_value = True ignored_symlink = MagicMock() - ignored_symlink.absolute.return_value = root / ".mypy_cache" / "symlink.py" + ignored_symlink.relative_to.return_value = Path(".mypy_cache") / "symlink.py" ignored_symlink.is_dir.return_value = False ignored_symlink.is_file.return_value = True # A symlink that has an excluded name, but points to an included name symlink_excluded_name = MagicMock() - symlink_excluded_name.absolute.return_value = root / "excluded_name" + symlink_excluded_name.relative_to.return_value = Path("excluded_name") symlink_excluded_name.resolve.return_value = root / "included_name.py" symlink_excluded_name.is_dir.return_value = False symlink_excluded_name.is_file.return_value = True # A symlink that has an included name, but points to an excluded name symlink_included_name = MagicMock() - symlink_included_name.absolute.return_value = root / "included_name.py" + symlink_included_name.relative_to.return_value = Path("included_name.py") symlink_included_name.resolve.return_value = root / "excluded_name" symlink_included_name.is_dir.return_value = False symlink_included_name.is_file.return_value = True @@ -2626,39 +2626,100 @@ def test_symlinks(self) -> None: outside_root_symlink.resolve.assert_called_once() ignored_symlink.resolve.assert_not_called() + def test_get_sources_symlink_and_force_exclude(self) -> None: + with TemporaryDirectory() as tempdir: + tmp = Path(tempdir).resolve() + actual = tmp / "actual" + actual.mkdir() + symlink = tmp / "symlink" + symlink.symlink_to(actual) + + actual_proj = actual / "project" + actual_proj.mkdir() + (actual_proj / "module.py").write_text("print('hello')") + + symlink_proj = symlink / "project" + + with change_directory(symlink_proj): + assert_collected_sources( + src=["module.py"], + root=symlink_proj.resolve(), + expected=["module.py"], + ) + + absolute_module = symlink_proj / "module.py" + assert_collected_sources( + src=[absolute_module], + root=symlink_proj.resolve(), + expected=[absolute_module], + ) + + # a few tricky tests for force_exclude + flat_symlink = symlink_proj / "symlink_module.py" + flat_symlink.symlink_to(actual_proj / "module.py") + assert_collected_sources( + src=[flat_symlink], + root=symlink_proj.resolve(), + force_exclude=r"/symlink_module.py", + expected=[], + ) + + target = actual_proj / "target" + target.mkdir() + (target / "another.py").write_text("print('hello')") + (symlink_proj / "nested").symlink_to(target) + + assert_collected_sources( + src=[symlink_proj / "nested" / "another.py"], + root=symlink_proj.resolve(), + force_exclude=r"nested", + expected=[], + ) + assert_collected_sources( + src=[symlink_proj / "nested" / "another.py"], + root=symlink_proj.resolve(), + force_exclude=r"target", + expected=[symlink_proj / "nested" / "another.py"], + ) + def test_get_sources_with_stdin_symlink_outside_root( self, ) -> None: path = THIS_DIR / "data" / "include_exclude_tests" stdin_filename = str(path / "b/exclude/a.py") outside_root_symlink = Path("/target_directory/a.py") + root = Path("target_directory/").resolve() with patch("pathlib.Path.resolve", return_value=outside_root_symlink): assert_collected_sources( - root=Path("target_directory/"), + root=root, src=["-"], expected=[], stdin_filename=stdin_filename, ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin(self) -> None: src = ["-"] expected = ["-"] - assert_collected_sources(src, expected, include="", exclude=r"/exclude/|a\.py") + assert_collected_sources( + src, + root=THIS_DIR.resolve(), + expected=expected, + include="", + exclude=r"/exclude/|a\.py", + ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin_filename(self) -> None: src = ["-"] stdin_filename = str(THIS_DIR / "data/collections.py") expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] assert_collected_sources( src, - expected, + root=THIS_DIR.resolve(), + expected=expected, exclude=r"/exclude/a\.py", stdin_filename=stdin_filename, ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin_filename_and_exclude(self) -> None: # Exclude shouldn't exclude stdin_filename since it is mimicking the # file being passed directly. This is the same as @@ -2669,12 +2730,12 @@ def test_get_sources_with_stdin_filename_and_exclude(self) -> None: expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] assert_collected_sources( src, - expected, + root=THIS_DIR.resolve(), + expected=expected, exclude=r"/exclude/|a\.py", stdin_filename=stdin_filename, ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin_filename_and_extend_exclude(self) -> None: # Extend exclude shouldn't exclude stdin_filename since it is mimicking the # file being passed directly. This is the same as @@ -2685,12 +2746,12 @@ def test_get_sources_with_stdin_filename_and_extend_exclude(self) -> None: expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] assert_collected_sources( src, - expected, + root=THIS_DIR.resolve(), + expected=expected, extend_exclude=r"/exclude/|a\.py", stdin_filename=stdin_filename, ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin_filename_and_force_exclude(self) -> None: # Force exclude should exclude the file when passing it through # stdin_filename @@ -2698,27 +2759,32 @@ def test_get_sources_with_stdin_filename_and_force_exclude(self) -> None: stdin_filename = str(path / "b/exclude/a.py") assert_collected_sources( src=["-"], + root=THIS_DIR.resolve(), expected=[], force_exclude=r"/exclude/|a\.py", stdin_filename=stdin_filename, ) - @patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None)) def test_get_sources_with_stdin_filename_and_force_exclude_and_symlink( self, ) -> None: # Force exclude should exclude a symlink based on the symlink, not its target - path = THIS_DIR / "data" / "include_exclude_tests" - stdin_filename = str(path / "symlink.py") - expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] - target = path / "b/exclude/a.py" - with patch("pathlib.Path.resolve", return_value=target): - assert_collected_sources( - src=["-"], - expected=expected, - force_exclude=r"exclude/a\.py", - stdin_filename=stdin_filename, - ) + with TemporaryDirectory() as tempdir: + tmp = Path(tempdir).resolve() + (tmp / "exclude").mkdir() + (tmp / "exclude" / "a.py").write_text("print('hello')") + (tmp / "symlink.py").symlink_to(tmp / "exclude" / "a.py") + + stdin_filename = str(tmp / "symlink.py") + expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] + with change_directory(tmp): + assert_collected_sources( + src=["-"], + root=tmp, + expected=expected, + force_exclude=r"exclude/a\.py", + stdin_filename=stdin_filename, + ) class TestDeFactoAPI: