Skip to content

Commit

Permalink
plumbing: blame, Complete rewrite. Fixes go-git#603
Browse files Browse the repository at this point in the history
Signed-off-by: Arieh Schneier <15041913+AriehSchneier@users.noreply.github.com>
  • Loading branch information
AriehSchneier committed Jun 7, 2023
1 parent 4211278 commit a53784a
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 704 deletions.
246 changes: 114 additions & 132 deletions blame.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package git

import (
"bytes"
"errors"
"fmt"
"strconv"
"strings"
Expand All @@ -12,6 +11,7 @@ import (
"github.com/go-git/go-git/v5/plumbing"
"github.com/go-git/go-git/v5/plumbing/object"
"github.com/go-git/go-git/v5/utils/diff"
"github.com/sergi/go-diff/diffmatchpatch"
)

// BlameResult represents the result of a Blame operation.
Expand All @@ -30,66 +30,36 @@ func Blame(c *object.Commit, path string) (*BlameResult, error) {
// The file to blame is identified by the input arguments:
// commit and path. commit is a Commit object obtained from a Repository. Path
// represents a path to a specific file contained in the repository.
//
// Blaming a file is a two step process:
//
// 1. Create a linear history of the commits affecting a file. We use
// revlist.New for that.
//
// 2. Then build a graph with a node for every line in every file in
// the history of the file.
//
// Each node is assigned a commit: Start with the nodes in the first
// commit. Assign that commit as the creator of all its lines.
//
// Then jump to the nodes in the next commit, and calculate the diff
// between the two files. Newly created lines get
// assigned the new commit as its origin. Modified lines also get
// this new commit. Untouched lines retain the old commit.
//
// All this work is done in the assignOrigin function which holds all
// the internal relevant data in a "blame" struct, that is not
// exported.
//
// TODO: ways to improve the efficiency of this function:
// 1. Improve revlist
// 2. Improve how to traverse the history (for example, a backward traversal will
// be much more efficient)
//
// TODO: ways to improve the function in general:
// 1. Add memoization between revlist and assign.
// 2. It is using much more memory than needed, see the TODOs below.

b := new(blame)
b.fRev = c
b.path = path

// get all the file revisions
if err := b.fillRevs(); err != nil {
file, err := b.fRev.File(b.path)
if err != nil {
return nil, err
}

// calculate the line tracking graph and fill in
// file contents in data.
if err := b.fillGraphAndData(); err != nil {
finalLines, err := file.Lines()
if err != nil {
return nil, err
}
finalLength := len(finalLines)

file, err := b.fRev.File(b.path)
b.lineToCommit = make([]*object.Commit, finalLength)
needsMap := make([]*lineMap, finalLength)
for i := range needsMap {
needsMap[i] = &lineMap{i, i}
}
contents, err := file.Contents()
if err != nil {
return nil, err
}
finalLines, err := file.Lines()
err = b.addBlames(c, contents, needsMap)
if err != nil {
return nil, err
}

// Each node (line) holds the commit where it was introduced or
// last modified. To achieve that we use the FORWARD algorithm
// described in Zimmermann, et al. "Mining Version Archives for
// Co-changed Lines", in proceedings of the Mining Software
// Repositories workshop, Shanghai, May 22-23, 2006.
lines, err := newLines(finalLines, b.sliceGraph(len(b.graph)-1))
lines, err := newLines(finalLines, b.lineToCommit)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -123,18 +93,7 @@ func newLine(author, text string, date time.Time, hash plumbing.Hash) *Line {
}

func newLines(contents []string, commits []*object.Commit) ([]*Line, error) {
lcontents := len(contents)
lcommits := len(commits)

if lcontents != lcommits {
if lcontents == lcommits-1 && contents[lcontents-1] != "\n" {
contents = append(contents, "\n")
} else {
return nil, errors.New("contents and commits have different length")
}
}

result := make([]*Line, 0, lcontents)
result := make([]*Line, 0, len(contents))
for i := range contents {
result = append(result, newLine(
commits[i].Author.Email, contents[i],
Expand All @@ -152,96 +111,119 @@ type blame struct {
path string
// the commit of the final revision of the file to blame
fRev *object.Commit
// the chain of revisions affecting the file to blame
revs []*object.Commit
// the contents of the file across all its revisions
data []string
// the graph of the lines in the file across all the revisions
graph [][]*object.Commit
// resolved lines
lineToCommit []*object.Commit
}

// calculate the history of a file "path", starting from commit "from", sorted by commit date.
func (b *blame) fillRevs() error {
var err error
// lineMap ties a line that still needs a blame to two positions: its index
// in the final file and its index in the revision currently being examined.
//
// The field order matters: callers build values with positional literals
// (Orig first, then Cur).
type lineMap struct {
	// Orig is the line's index in the final file; it is used to index the
	// resolved per-line commit slice.
	Orig int
	// Cur is the line's index in the contents of the revision being diffed.
	Cur int
}

b.revs, err = references(b.fRev, b.path)
return err
// firstNeed returns the index of the first non-nil entry in needsMap, or -1
// when there is none (including when needsMap is empty).
func firstNeed(needsMap []*lineMap) int {
	// nextNeed scans from the position after its cursor, so a cursor one
	// before index 0 makes it examine the whole slice.
	const beforeStart = -1
	return nextNeed(needsMap, beforeStart)
}

// nextNeed returns the index of the first non-nil entry of needsMap located
// strictly after position c, or -1 if no such entry exists.
func nextNeed(needsMap []*lineMap, c int) int {
	for idx := c + 1; idx < len(needsMap); idx++ {
		if needsMap[idx] != nil {
			return idx
		}
	}
	return -1
}

// build graph of a file from its revision history
func (b *blame) fillGraphAndData() error {
//TODO: not all commits are needed, only the current rev and the prev
b.graph = make([][]*object.Commit, len(b.revs))
b.data = make([]string, len(b.revs)) // file contents in all the revisions
// for every revision of the file, starting with the first
// one...
for i, rev := range b.revs {
func (b *blame) addBlames(curCommit *object.Commit, currentContents string, needsMap []*lineMap) error {
// TODO: optimise, keep record of seen commits and don't process them again
// TODO: optimise, check if any parent is identical and just pass straight to it

parents, err := parentsContainingPath(b.path, curCommit)
if err != nil {
return err
}
if len(parents) != 0 {
// Sorting here to ensure consistent results as we are just picking the branch of the older parent when both
// branches add the same line
// TODO: Find out if parents is guaranteed to be sorted
sortCommits(parents)
}

for _, prev := range parents {
// get the contents of the file
file, err := rev.File(b.path)
file, err := prev.File(b.path)
if err != nil {
return nil
}
b.data[i], err = file.Contents()
prevContents, err := file.Contents()
if err != nil {
return err
}
nLines := countLines(b.data[i])
// create a node for each line
b.graph[i] = make([]*object.Commit, nLines)
// assign a commit to each node
// if this is the first revision, then the node is assigned to
// this first commit.
if i == 0 {
for j := 0; j < nLines; j++ {
b.graph[i][j] = b.revs[i]

hunks := diff.Do(prevContents, currentContents)
// TODO: optimise, work with hunks rather than line by line
// (needsMap would also be a list of hunks and then you would match hunks, splitting where necessary)
prevl := -1
curl := -1
need := firstNeed(needsMap)
if need < 0 {
// found everything
break
}
// TODO: optimise, is it worth walking the hunks to calculate the required capacity here
getFromParent := make([]*lineMap, 0)
out:
for h := range hunks {
hLines := countLines(hunks[h].Text)
for hl := 0; hl < hLines; hl++ {
switch {
case hunks[h].Type == diffmatchpatch.DiffEqual:
prevl++
curl++
if curl == needsMap[need].Cur {
// assign current line to the parent and remove it from our needs
getFromParent = append(getFromParent, &lineMap{needsMap[need].Orig, prevl})
needsMap[need] = nil
need = nextNeed(needsMap, need)
if need < 0 {
break out
}
}
case hunks[h].Type == diffmatchpatch.DiffInsert:
curl++
if curl == needsMap[need].Cur {
// the line we want is added, it may be modified here (or another parent), skip it for now
need = nextNeed(needsMap, need)
if need < 0 {
break out
}
}
case hunks[h].Type == diffmatchpatch.DiffDelete:
prevl++
default:
panic("unreachable")
}
}
} else {
// if this is not the first commit, then assign to the old
// commit or to the new one, depending on what the diff
// says.
b.assignOrigin(i, i-1)
}
}
return nil
}

// sliceGraph returns the per-line commits recorded for revision i of the
// file (0=first revision). Every commit is copied, so the returned pointers
// do not alias the entries stored in the graph.
func (b *blame) sliceGraph(i int) []*object.Commit {
	row := b.graph[i]
	copies := make([]*object.Commit, len(row))
	for idx := range row {
		// Dereference into a fresh variable so each element gets its own copy.
		dup := *row[idx]
		copies[idx] = &dup
	}
	return copies
}

// Assigns origin to vertexes in current (c) rev from data in its previous (p)
// revision
func (b *blame) assignOrigin(c, p int) {
// assign origin based on diff info
hunks := diff.Do(b.data[p], b.data[c])
sl := -1 // source line
dl := -1 // destination line
for h := range hunks {
hLines := countLines(hunks[h].Text)
for hl := 0; hl < hLines; hl++ {
switch {
case hunks[h].Type == 0:
sl++
dl++
b.graph[c][dl] = b.graph[p][sl]
case hunks[h].Type == 1:
dl++
b.graph[c][dl] = b.revs[c]
case hunks[h].Type == -1:
sl++
default:
panic("unreachable")
if len(getFromParent) > 0 {
// Resolve any lines assigned to this parent
err = b.addBlames(prev, prevContents, getFromParent)
if err != nil {
return err
}
}
}

// any needs left in the needsMap must have come from this revision
need := firstNeed(needsMap)
for need >= 0 {
b.lineToCommit[needsMap[need].Orig] = curCommit
need = nextNeed(needsMap, need)
}

return nil
}

// GoString prints the results of a Blame using git-blame's style.
Expand All @@ -265,9 +247,9 @@ func (b *blame) GoString() string {
format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n",
mal, mlnl)

fVs := b.graph[len(b.graph)-1]
fVs := b.lineToCommit
for ln, v := range fVs {
fmt.Fprintf(&buf, format, v.Hash.String()[:8],
_, _ = fmt.Fprintf(&buf, format, v.Hash.String()[:8],
prettyPrintAuthor(fVs[ln]), ln+1, lines[ln])
}
return buf.String()
Expand All @@ -281,8 +263,8 @@ func prettyPrintAuthor(c *object.Commit) string {
// utility function to calculate the number of runes needed
// to print the longest author name in the blame of a file.
func (b *blame) maxAuthorLength() int {
memo := make(map[plumbing.Hash]struct{}, len(b.graph)-1)
fVs := b.graph[len(b.graph)-1]
fVs := b.lineToCommit
memo := make(map[plumbing.Hash]struct{}, len(fVs))
m := 0
for ln := range fVs {
if _, ok := memo[fVs[ln].Hash]; ok {
Expand Down

0 comments on commit a53784a

Please sign in to comment.