Skip to content

Commit 6990afd

Browse files
authoredAug 22, 2024··
fix: similarity detection
Previously it would incorrectly count only the last batch of removed bytes, and now it will count all of them. This leads to realistic results with complex diffs, even though it's probably still not en-par with Git which uses more complex heuristics.
2 parents 25a3f1b + f8c5d9c commit 6990afd

File tree

2 files changed

+36
-66
lines changed

2 files changed

+36
-66
lines changed
 

Diff for: ‎gix-diff/src/rewrites/tracker.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -561,10 +561,10 @@ mod diff {
561561
type Out = usize;
562562

563563
fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
564-
self.removed_bytes = self.input.before[before.start as usize..before.end as usize]
564+
self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
565565
.iter()
566566
.map(|token| self.input.interner[*token].len())
567-
.sum();
567+
.sum::<usize>();
568568
}
569569

570570
fn finish(self) -> Self::Out {

Diff for: ‎gix/tests/object/tree/diff.rs

+34-64
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ fn tree_named(repo: &gix::Repository, rev_spec: impl AsRef<str>) -> gix::Tree {
175175
}
176176

177177
mod track_rewrites {
178+
use std::collections::HashMap;
178179
use std::convert::Infallible;
179180

180181
use gix::{
@@ -398,34 +399,27 @@ mod track_rewrites {
398399
// It's like the fixture doesn't get setup correctly.
399400
return Ok(());
400401
}
402+
403+
let mut expected = HashMap::<&BStr, (&BStr, u32)>::new();
404+
expected.insert(
405+
"cli/src/commands/file/chmod.rs".into(),
406+
("cli/src/commands/chmod.rs".into(), 90),
407+
);
408+
expected.insert(
409+
"cli/src/commands/file/print.rs".into(),
410+
("cli/src/commands/cat.rs".into(), 90),
411+
);
412+
expected.insert(
413+
"cli/tests/test_file_chmod_command.rs".into(),
414+
("cli/tests/test_chmod_command.rs".into(), 88),
415+
);
416+
expected.insert(
417+
"cli/tests/test_file_print_command.rs".into(),
418+
("cli/tests/test_cat_command.rs".into(), 77),
419+
);
420+
401421
let from = tree_named(&repo, "@~1");
402422
let to = tree_named(&repo, "@");
403-
404-
let mut count = 0;
405-
let expected_location = [
406-
"cli",
407-
"cli/src",
408-
"cli/tests",
409-
"cli/src/commands",
410-
"cli/src/commands/file",
411-
"cli/src/commands/file/print.rs",
412-
"cli/tests/test_file_print_command.rs",
413-
"cli/src/commands/file/chmod.rs",
414-
"cli/src/commands/file/mod.rs",
415-
"cli/tests/test_file_chmod_command.rs",
416-
"CHANGELOG.md",
417-
"cli/src/commands/mod.rs",
418-
"cli/tests/cli-reference@.md.snap",
419-
"cli/tests/runner.rs",
420-
"cli/tests/test_acls.rs",
421-
"cli/tests/test_diffedit_command.rs",
422-
"cli/tests/test_fix_command.rs",
423-
"cli/tests/test_global_opts.rs",
424-
"cli/tests/test_move_command.rs",
425-
"cli/tests/test_new_command.rs",
426-
"cli/tests/test_squash_command.rs",
427-
"cli/tests/test_unsquash_command.rs",
428-
];
429423
let out = from
430424
.changes()?
431425
.track_path()
@@ -441,48 +435,24 @@ mod track_rewrites {
441435
.into(),
442436
)
443437
.for_each_to_obtain_tree(&to, |change| -> Result<_, Infallible> {
444-
// TODO: all rewrites here don't match what Git produces right now.
445-
// dbg!(change);
446-
assert_eq!(change.location, expected_location[count]);
447-
match count {
448-
0 | 1 | 2 | 3 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 => {
449-
assert!(
450-
matches!(change.event, Event::Modification { .. }),
451-
"ignore modifications"
452-
);
453-
}
454-
4 => {
455-
assert!(matches!(change.event, Event::Addition { entry_mode, .. } if entry_mode.is_tree()));
456-
assert_eq!(change.location, "cli/src/commands/file");
457-
}
458-
5 => {
459-
assert_eq!(change.location, "cli/src/commands/file/print.rs");
460-
assert!(matches!(change.event, Event::Rewrite {source_location, ..} if source_location == "cli/tests/test_cat_command.rs"));
461-
}
462-
6 => {
463-
assert_eq!(change.location, "cli/tests/test_file_print_command.rs");
464-
assert!(matches!(change.event, Event::Rewrite {source_location, ..} if source_location == "cli/tests/test_chmod_command.rs"));
465-
}
466-
7 => {
467-
assert_eq!(change.location, "cli/src/commands/file/chmod.rs");
468-
assert!(matches!(change.event, Event::Rewrite {source_location, ..} if source_location == "cli/src/commands/chmod.rs"));
469-
}
470-
8 => {
471-
assert_eq!(change.location, "cli/src/commands/file/mod.rs");
472-
assert!(matches!(change.event, Event::Rewrite {source_location, ..} if source_location == "cli/src/commands/cat.rs"));
473-
}
474-
9 => {
475-
assert_eq!(change.location, "cli/tests/test_file_chmod_command.rs");
476-
assert!(matches!(change.event, Event::Rewrite {source_location, ..} if source_location == "cli/tests/test_immutable_commits.rs"));
477-
}
478-
n => unreachable!("unexpected call: {n}"),
479-
};
480-
count += 1;
438+
if let Event::Rewrite {
439+
source_location,
440+
diff: Some(diff),
441+
..
442+
} = change.event
443+
{
444+
// Round to percentage points to avoid floating point error.
445+
let similarity = (diff.similarity * 100.0) as u32;
446+
let v = expected.remove(change.location);
447+
assert_eq!(v, Some((source_location, similarity)));
448+
}
481449
Ok(Default::default())
482450
})?;
451+
452+
assert_eq!(expected, HashMap::new());
483453
let out = out.rewrites.expect("tracking enabled");
484454
assert_eq!(
485-
out.num_similarity_checks, 8,
455+
out.num_similarity_checks, 21,
486456
"this probably increases once the algorithm improves"
487457
);
488458
assert_eq!(

0 commit comments

Comments
 (0)
Please sign in to comment.