Merge pull request #135 from MichaelMure/github-import

Michael Muré created

Enhancement to Github iterator and importer

Change summary

bridge/github/import.go      | 95 ++++++++++++++++++-------------------
bridge/github/import_test.go |  2 
bridge/github/iterator.go    | 75 ++++++++++++++++++++---------
3 files changed, 100 insertions(+), 72 deletions(-)

Detailed changes

bridge/github/import.go 🔗

@@ -23,6 +23,15 @@ const (
 // githubImporter implement the Importer interface
 type githubImporter struct {
 	conf core.Configuration
+
+	// iterator
+	iterator *iterator
+
+	// number of imported issues
+	importedIssues int
+
+	// number of imported identities
+	importedIdentities int
 }
 
 func (gi *githubImporter) Init(conf core.Configuration) error {
@@ -30,54 +39,26 @@ func (gi *githubImporter) Init(conf core.Configuration) error {
 	return nil
 }
 
-// ImportAll .
+// ImportAll iterates over all the configured repository issues and ensures the creation of the
+// missing issues / timeline items / edits / label events ...
 func (gi *githubImporter) ImportAll(repo *cache.RepoCache, since time.Time) error {
-	iterator := NewIterator(gi.conf[keyUser], gi.conf[keyProject], gi.conf[keyToken], since)
+	gi.iterator = NewIterator(gi.conf[keyUser], gi.conf[keyProject], gi.conf[keyToken], since)
 
 	// Loop over all matching issues
-	for iterator.NextIssue() {
-		issue := iterator.IssueValue()
-
-		fmt.Printf("importing issue: %v %v\n", iterator.importedIssues, issue.Title)
-		// get issue edits
-		issueEdits := []userContentEdit{}
-		for iterator.NextIssueEdit() {
-			// issueEdit.Diff == nil happen if the event is older than early 2018, Github doesn't have the data before that.
-			// Best we can do is to ignore the event.
-			if issueEdit := iterator.IssueEditValue(); issueEdit.Diff != nil && string(*issueEdit.Diff) != "" {
-				issueEdits = append(issueEdits, issueEdit)
-			}
-		}
+	for gi.iterator.NextIssue() {
+		issue := gi.iterator.IssueValue()
+		fmt.Printf("importing issue: %v\n", issue.Title)
 
 		// create issue
-		b, err := gi.ensureIssue(repo, issue, issueEdits)
+		b, err := gi.ensureIssue(repo, issue)
 		if err != nil {
 			return fmt.Errorf("issue creation: %v", err)
 		}
 
 		// loop over timeline items
-		for iterator.NextTimeline() {
-			item := iterator.TimelineValue()
-
-			// if item is comment
-			if item.Typename == "IssueComment" {
-				// collect all edits
-				commentEdits := []userContentEdit{}
-				for iterator.NextCommentEdit() {
-					if commentEdit := iterator.CommentEditValue(); commentEdit.Diff != nil && string(*commentEdit.Diff) != "" {
-						commentEdits = append(commentEdits, commentEdit)
-					}
-				}
-
-				err := gi.ensureTimelineComment(repo, b, item.IssueComment, commentEdits)
-				if err != nil {
-					return fmt.Errorf("timeline comment creation: %v", err)
-				}
-
-			} else {
-				if err := gi.ensureTimelineItem(repo, b, item); err != nil {
-					return fmt.Errorf("timeline event creation: %v", err)
-				}
+		for gi.iterator.NextTimelineItem() {
+			if err := gi.ensureTimelineItem(repo, b, gi.iterator.TimelineItemValue()); err != nil {
+				return fmt.Errorf("timeline item creation: %v", err)
 			}
 		}
 
@@ -87,16 +68,16 @@ func (gi *githubImporter) ImportAll(repo *cache.RepoCache, since time.Time) erro
 		}
 	}
 
-	if err := iterator.Error(); err != nil {
+	if err := gi.iterator.Error(); err != nil {
 		fmt.Printf("import error: %v\n", err)
 		return err
 	}
 
-	fmt.Printf("Successfully imported %v issues from Github\n", iterator.ImportedIssues())
+	fmt.Printf("Successfully imported %d issues and %d identities from Github\n", gi.importedIssues, gi.importedIdentities)
 	return nil
 }
 
-func (gi *githubImporter) ensureIssue(repo *cache.RepoCache, issue issueTimeline, issueEdits []userContentEdit) (*cache.BugCache, error) {
+func (gi *githubImporter) ensureIssue(repo *cache.RepoCache, issue issueTimeline) (*cache.BugCache, error) {
 	// ensure issue author
 	author, err := gi.ensurePerson(repo, issue.Author)
 	if err != nil {
@@ -109,6 +90,12 @@ func (gi *githubImporter) ensureIssue(repo *cache.RepoCache, issue issueTimeline
 		return nil, err
 	}
 
+	// get issue edits
+	issueEdits := []userContentEdit{}
+	for gi.iterator.NextIssueEdit() {
+		issueEdits = append(issueEdits, gi.iterator.IssueEditValue())
+	}
+
 	// if issueEdits is empty
 	if len(issueEdits) == 0 {
 		if err == bug.ErrBugNotExist {
@@ -131,6 +118,9 @@ func (gi *githubImporter) ensureIssue(repo *cache.RepoCache, issue issueTimeline
 			if err != nil {
 				return nil, err
 			}
+
+			// importing a new bug
+			gi.importedIssues++
 		}
 
 	} else {
@@ -165,6 +155,9 @@ func (gi *githubImporter) ensureIssue(repo *cache.RepoCache, issue issueTimeline
 					return nil, err
 				}
 
+				// importing a new bug
+				gi.importedIssues++
+
 				continue
 			}
 
@@ -189,6 +182,16 @@ func (gi *githubImporter) ensureTimelineItem(repo *cache.RepoCache, b *cache.Bug
 
 	switch item.Typename {
 	case "IssueComment":
+		// collect all comment edits
+		commentEdits := []userContentEdit{}
+		for gi.iterator.NextCommentEdit() {
+			commentEdits = append(commentEdits, gi.iterator.CommentEditValue())
+		}
+
+		err := gi.ensureTimelineComment(repo, b, item.IssueComment, commentEdits)
+		if err != nil {
+			return fmt.Errorf("timeline comment creation: %v", err)
+		}
 
 	case "LabeledEvent":
 		id := parseId(item.LabeledEvent.Id)
@@ -455,6 +458,9 @@ func (gi *githubImporter) ensurePerson(repo *cache.RepoCache, actor *actor) (*ca
 		return nil, err
 	}
 
+	// importing a new identity
+	gi.importedIdentities++
+
 	var name string
 	var email string
 
@@ -528,10 +534,3 @@ func (gi *githubImporter) getGhost(repo *cache.RepoCache) (*cache.IdentityCache,
 func parseId(id githubv4.ID) string {
 	return fmt.Sprintf("%v", id)
 }
-
-func reverseEdits(edits []userContentEdit) []userContentEdit {
-	for i, j := 0, len(edits)-1; i < j; i, j = i+1, j-1 {
-		edits[i], edits[j] = edits[j], edits[i]
-	}
-	return edits
-}

bridge/github/import_test.go 🔗

@@ -152,7 +152,7 @@ func Test_Importer(t *testing.T) {
 
 	fmt.Printf("test repository imported in %f seconds\n", time.Since(start).Seconds())
 
-	require.Len(t, backend.AllBugsIds(), 9)
+	require.Len(t, backend.AllBugsIds(), len(tests))
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {

bridge/github/iterator.go 🔗

@@ -49,9 +49,6 @@ type iterator struct {
 	// sticky error
 	err error
 
-	// number of imported issues
-	importedIssues int
-
 	// timeline iterator
 	timeline timelineIterator
 
@@ -62,8 +59,9 @@ type iterator struct {
 	commentEdit commentEditIterator
 }
 
+// NewIterator creates and initializes a new iterator
 func NewIterator(user, project, token string, since time.Time) *iterator {
-	return &iterator{
+	i := &iterator{
 		gc:       buildClient(token),
 		since:    since,
 		capacity: 10,
@@ -91,6 +89,9 @@ func NewIterator(user, project, token string, since time.Time) *iterator {
 			},
 		},
 	}
+
+	i.initTimelineQueryVariables()
+	return i
 }
 
 // init issue timeline variables
@@ -145,11 +146,6 @@ func (i *iterator) Error() error {
 	return i.err
 }
 
-// ImportedIssues return the number of issues we iterated over
-func (i *iterator) ImportedIssues() int {
-	return i.importedIssues
-}
-
 func (i *iterator) queryIssue() bool {
 	if err := i.gc.Query(context.TODO(), &i.timeline.query, i.timeline.variables); err != nil {
 		i.err = err
@@ -161,18 +157,18 @@ func (i *iterator) queryIssue() bool {
 	}
 
 	i.reverseTimelineEditNodes()
-	i.importedIssues++
 	return true
 }
 
-// Next issue
+// NextIssue tries to query the next issue and returns true on success. Only one issue is
+// queried at each call.
 func (i *iterator) NextIssue() bool {
-	// we make the first move
-	if i.importedIssues == 0 {
-
-		// init variables and goto queryIssue block
-		i.initTimelineQueryVariables()
-		return i.queryIssue()
+	// if $issueAfter variable is nil we can directly make the first query
+	if i.timeline.variables["issueAfter"] == (*githubv4.String)(nil) {
+		nextIssue := i.queryIssue()
+		// prevent an infinite loop by setting a non-nil cursor
+		i.timeline.variables["issueAfter"] = i.timeline.query.Repository.Issues.PageInfo.EndCursor
+		return nextIssue
 	}
 
 	if i.err != nil {
@@ -195,11 +191,14 @@ func (i *iterator) NextIssue() bool {
 	return i.queryIssue()
 }
 
+// IssueValue returns the current issue value
 func (i *iterator) IssueValue() issueTimeline {
 	return i.timeline.query.Repository.Issues.Nodes[0]
 }
 
-func (i *iterator) NextTimeline() bool {
+// NextTimelineItem returns true if there is a next timeline item and increments the index by one.
+// It is used to iterate over all the timeline items. Extra queries are made if necessary.
+func (i *iterator) NextTimelineItem() bool {
 	if i.err != nil {
 		return false
 	}
@@ -231,7 +230,8 @@ func (i *iterator) NextTimeline() bool {
 	return true
 }
 
-func (i *iterator) TimelineValue() timelineItem {
+// TimelineItemValue returns the current timeline item value
+func (i *iterator) TimelineItemValue() timelineItem {
 	return i.timeline.query.Repository.Issues.Nodes[0].Timeline.Edges[i.timeline.index].Node
 }
 
@@ -253,9 +253,20 @@ func (i *iterator) queryIssueEdit() bool {
 
 	i.issueEdit.index = 0
 	i.timeline.issueEdit.index = -2
+	return i.nextValidIssueEdit()
+}
+
+func (i *iterator) nextValidIssueEdit() bool {
+	// issueEdit.Diff == nil happens if the event is older than early 2018; GitHub doesn't have the data before that.
+	// The best we can do is to ignore the event.
+	if issueEdit := i.IssueEditValue(); issueEdit.Diff == nil || string(*issueEdit.Diff) == "" {
+		return i.NextIssueEdit()
+	}
 	return true
 }
 
+// NextIssueEdit returns true if there is a next issue edit and increments the index by one.
+// It is used to iterate over all the issue edits. Extra queries are made if necessary.
 func (i *iterator) NextIssueEdit() bool {
 	if i.err != nil {
 		return false
@@ -266,7 +277,7 @@ func (i *iterator) NextIssueEdit() bool {
 	if i.timeline.issueEdit.index == -2 {
 		if i.issueEdit.index < min(i.capacity, len(i.issueEdit.query.Repository.Issues.Nodes[0].UserContentEdits.Nodes))-1 {
 			i.issueEdit.index++
-			return true
+			return i.nextValidIssueEdit()
 		}
 
 		if !i.issueEdit.query.Repository.Issues.Nodes[0].UserContentEdits.PageInfo.HasPreviousPage {
@@ -297,7 +308,7 @@ func (i *iterator) NextIssueEdit() bool {
 	// loop over them timeline comment edits
 	if i.timeline.issueEdit.index < min(i.capacity, len(i.timeline.query.Repository.Issues.Nodes[0].UserContentEdits.Nodes))-1 {
 		i.timeline.issueEdit.index++
-		return true
+		return i.nextValidIssueEdit()
 	}
 
 	if !i.timeline.query.Repository.Issues.Nodes[0].UserContentEdits.PageInfo.HasPreviousPage {
@@ -311,6 +322,7 @@ func (i *iterator) NextIssueEdit() bool {
 	return i.queryIssueEdit()
 }
 
+// IssueEditValue returns the current issue edit value
 func (i *iterator) IssueEditValue() userContentEdit {
 	// if we are using issue edit query
 	if i.timeline.issueEdit.index == -2 {
@@ -337,9 +349,19 @@ func (i *iterator) queryCommentEdit() bool {
 
 	i.commentEdit.index = 0
 	i.timeline.commentEdit.index = -2
+	return i.nextValidCommentEdit()
+}
+
+func (i *iterator) nextValidCommentEdit() bool {
+	// if the comment edit diff is a nil pointer or points to an empty string, look for the next value
+	if commentEdit := i.CommentEditValue(); commentEdit.Diff == nil || string(*commentEdit.Diff) == "" {
+		return i.NextCommentEdit()
+	}
 	return true
 }
 
+// NextCommentEdit returns true if there is a next comment edit and increments the index by one.
+// It is used to iterate over all the comment edits. Extra queries are made if necessary.
 func (i *iterator) NextCommentEdit() bool {
 	if i.err != nil {
 		return false
@@ -350,7 +372,7 @@ func (i *iterator) NextCommentEdit() bool {
 
 		if i.commentEdit.index < min(i.capacity, len(i.commentEdit.query.Repository.Issues.Nodes[0].Timeline.Nodes[0].IssueComment.UserContentEdits.Nodes))-1 {
 			i.commentEdit.index++
-			return true
+			return i.nextValidCommentEdit()
 		}
 
 		if !i.commentEdit.query.Repository.Issues.Nodes[0].Timeline.Nodes[0].IssueComment.UserContentEdits.PageInfo.HasPreviousPage {
@@ -372,7 +394,7 @@ func (i *iterator) NextCommentEdit() bool {
 	// loop over them timeline comment edits
 	if i.timeline.commentEdit.index < min(i.capacity, len(i.timeline.query.Repository.Issues.Nodes[0].Timeline.Edges[i.timeline.index].Node.IssueComment.UserContentEdits.Nodes))-1 {
 		i.timeline.commentEdit.index++
-		return true
+		return i.nextValidCommentEdit()
 	}
 
 	if !i.timeline.query.Repository.Issues.Nodes[0].Timeline.Edges[i.timeline.index].Node.IssueComment.UserContentEdits.PageInfo.HasPreviousPage {
@@ -392,6 +414,7 @@ func (i *iterator) NextCommentEdit() bool {
 	return i.queryCommentEdit()
 }
 
+// CommentEditValue returns the current comment edit value
 func (i *iterator) CommentEditValue() userContentEdit {
 	if i.timeline.commentEdit.index == -2 {
 		return i.commentEdit.query.Repository.Issues.Nodes[0].Timeline.Nodes[0].IssueComment.UserContentEdits.Nodes[i.commentEdit.index]
@@ -407,3 +430,9 @@ func min(a, b int) int {
 
 	return a
 }
+
+func reverseEdits(edits []userContentEdit) {
+	for i, j := 0, len(edits)-1; i < j; i, j = i+1, j-1 {
+		edits[i], edits[j] = edits[j], edits[i]
+	}
+}