repo: proper reduced interface for full-text indexing

Michael Muré created

Additionally, remove and concentrate quite a lot of complexity from the cache layer
into a "per app" single site where to configure how indexing is done.

Change summary

repository/gogit.go        |  55 +------------
repository/gogit_test.go   |  11 --
repository/index_bleve.go  | 154 ++++++++++++++++++++++++++++++++++++++++
repository/mock_repo.go    |  82 +++++++++++++++-----
repository/repo.go         |  35 +++++++--
repository/repo_testing.go |  47 +++++++++++
6 files changed, 297 insertions(+), 87 deletions(-)

Detailed changes

repository/gogit.go 🔗

@@ -12,7 +12,6 @@ import (
 	"time"
 
 	"github.com/ProtonMail/go-crypto/openpgp"
-	"github.com/blevesearch/bleve"
 	"github.com/go-git/go-billy/v5"
 	"github.com/go-git/go-billy/v5/osfs"
 	gogit "github.com/go-git/go-git/v5"
@@ -45,7 +44,7 @@ type GoGitRepo struct {
 	clocks      map[string]lamport.Clock
 
 	indexesMutex sync.Mutex
-	indexes      map[string]bleve.Index
+	indexes      map[string]Index
 
 	keyring      Keyring
 	localStorage billy.Filesystem
@@ -75,7 +74,7 @@ func OpenGoGitRepo(path, namespace string, clockLoaders []ClockLoader) (*GoGitRe
 		r:            r,
 		path:         path,
 		clocks:       make(map[string]lamport.Clock),
-		indexes:      make(map[string]bleve.Index),
+		indexes:      make(map[string]Index),
 		keyring:      k,
 		localStorage: osfs.New(filepath.Join(path, namespace)),
 	}
@@ -129,7 +128,7 @@ func InitGoGitRepo(path, namespace string) (*GoGitRepo, error) {
 		r:            r,
 		path:         filepath.Join(path, ".git"),
 		clocks:       make(map[string]lamport.Clock),
-		indexes:      make(map[string]bleve.Index),
+		indexes:      make(map[string]Index),
 		keyring:      k,
 		localStorage: osfs.New(filepath.Join(path, ".git", namespace)),
 	}, nil
@@ -154,7 +153,7 @@ func InitBareGoGitRepo(path, namespace string) (*GoGitRepo, error) {
 		r:            r,
 		path:         path,
 		clocks:       make(map[string]lamport.Clock),
-		indexes:      make(map[string]bleve.Index),
+		indexes:      make(map[string]Index),
 		keyring:      k,
 		localStorage: osfs.New(filepath.Join(path, namespace)),
 	}, nil
@@ -323,8 +322,7 @@ func (repo *GoGitRepo) LocalStorage() billy.Filesystem {
 	return repo.localStorage
 }
 
-// GetBleveIndex return a bleve.Index that can be used to index documents
-func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) {
+func (repo *GoGitRepo) GetIndex(name string) (Index, error) {
 	repo.indexesMutex.Lock()
 	defer repo.indexesMutex.Unlock()
 
@@ -334,50 +332,11 @@ func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) {
 
 	path := filepath.Join(repo.localStorage.Root(), indexPath, name)
 
-	index, err := bleve.Open(path)
+	index, err := openBleveIndex(path)
 	if err == nil {
 		repo.indexes[name] = index
-		return index, nil
-	}
-
-	err = os.MkdirAll(path, os.ModePerm)
-	if err != nil {
-		return nil, err
-	}
-
-	mapping := bleve.NewIndexMapping()
-	mapping.DefaultAnalyzer = "en"
-
-	index, err = bleve.New(path, mapping)
-	if err != nil {
-		return nil, err
-	}
-
-	repo.indexes[name] = index
-
-	return index, nil
-}
-
-// ClearBleveIndex will wipe the given index
-func (repo *GoGitRepo) ClearBleveIndex(name string) error {
-	repo.indexesMutex.Lock()
-	defer repo.indexesMutex.Unlock()
-
-	if index, ok := repo.indexes[name]; ok {
-		err := index.Close()
-		if err != nil {
-			return err
-		}
-		delete(repo.indexes, name)
 	}
-
-	path := filepath.Join(repo.localStorage.Root(), indexPath, name)
-	err := os.RemoveAll(path)
-	if err != nil {
-		return err
-	}
-
-	return nil
+	return index, err
 }
 
 // FetchRefs fetch git refs matching a directory prefix to a remote

repository/gogit_test.go 🔗

@@ -65,24 +65,19 @@ func TestGoGitRepo_Indexes(t *testing.T) {
 	plainRoot := goGitRepoDir(t, repo)
 
 	// Can create indices
-	indexA, err := repo.GetBleveIndex("a")
+	indexA, err := repo.GetIndex("a")
 	require.NoError(t, err)
 	require.NotZero(t, indexA)
 	require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "index_meta.json"))
 	require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "store"))
 
-	indexB, err := repo.GetBleveIndex("b")
+	indexB, err := repo.GetIndex("b")
 	require.NoError(t, err)
 	require.NotZero(t, indexB)
 	require.DirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "b"))
 
 	// Can get an existing index
-	indexA, err = repo.GetBleveIndex("a")
+	indexA, err = repo.GetIndex("a")
 	require.NoError(t, err)
 	require.NotZero(t, indexA)
-
-	// Can delete an index
-	err = repo.ClearBleveIndex("a")
-	require.NoError(t, err)
-	require.NoDirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a"))
 }

repository/index_bleve.go 🔗

@@ -0,0 +1,154 @@
+package repository
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"sync"
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve"
+)
+
+var _ Index = &bleveIndex{}
+
+type bleveIndex struct {
+	path string
+
+	mu    sync.RWMutex
+	index bleve.Index
+}
+
+func openBleveIndex(path string) (*bleveIndex, error) {
+	index, err := bleve.Open(path)
+	if err == nil {
+		return &bleveIndex{path: path, index: index}, nil
+	}
+
+	b := &bleveIndex{path: path}
+	err = b.makeIndex()
+	if err != nil {
+		return nil, err
+	}
+
+	return b, nil
+}
+
+func (b *bleveIndex) makeIndex() error {
+	err := os.MkdirAll(b.path, os.ModePerm)
+	if err != nil {
+		return err
+	}
+
+	// TODO: follow https://github.com/blevesearch/bleve/issues/1576 recommendations
+
+	mapping := bleve.NewIndexMapping()
+	mapping.DefaultAnalyzer = "en"
+
+	index, err := bleve.New(b.path, mapping)
+	if err != nil {
+		return err
+	}
+	b.index = index
+	return nil
+}
+
+func (b *bleveIndex) IndexOne(id string, texts []string) error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b._index(b.index.Index, id, texts)
+}
+
+func (b *bleveIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	batch := b.index.NewBatch()
+
+	indexer = func(id string, texts []string) error {
+		return b._index(batch.Index, id, texts)
+	}
+
+	closer = func() error {
+		return b.index.Batch(batch)
+	}
+
+	return indexer, closer
+}
+
+func (b *bleveIndex) _index(indexer func(string, interface{}) error, id string, texts []string) error {
+	searchable := struct{ Text []string }{Text: texts}
+
+	// See https://github.com/blevesearch/bleve/issues/1576
+	var sb strings.Builder
+	normalize := func(text string) string {
+		sb.Reset()
+		for _, field := range strings.Fields(text) {
+			if utf8.RuneCountInString(field) < 100 {
+				sb.WriteString(field)
+				sb.WriteRune(' ')
+			}
+		}
+		return sb.String()
+	}
+
+	for i, s := range searchable.Text {
+		searchable.Text[i] = normalize(s)
+	}
+
+	return indexer(id, searchable)
+}
+
+func (b *bleveIndex) Search(terms []string) ([]string, error) {
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	for i, term := range terms {
+		if strings.Contains(term, " ") {
+			terms[i] = fmt.Sprintf("\"%s\"", term)
+		}
+	}
+
+	query := bleve.NewQueryStringQuery(strings.Join(terms, " "))
+	search := bleve.NewSearchRequest(query)
+
+	res, err := b.index.Search(search)
+	if err != nil {
+		return nil, err
+	}
+
+	ids := make([]string, len(res.Hits))
+	for i, hit := range res.Hits {
+		ids[i] = hit.ID
+	}
+
+	return ids, nil
+}
+
+func (b *bleveIndex) DocCount() (uint64, error) {
+	return b.index.DocCount()
+}
+
+func (b *bleveIndex) Clear() error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	err := b.index.Close()
+	if err != nil {
+		return err
+	}
+
+	err = os.RemoveAll(b.path)
+	if err != nil {
+		return err
+	}
+
+	return b.makeIndex()
+}
+
+func (b *bleveIndex) Close() error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	return b.index.Close()
+}

repository/mock_repo.go 🔗

@@ -9,7 +9,6 @@ import (
 
 	"github.com/99designs/keyring"
 	"github.com/ProtonMail/go-crypto/openpgp"
-	"github.com/blevesearch/bleve"
 	"github.com/go-git/go-billy/v5"
 	"github.com/go-git/go-billy/v5/memfs"
 
@@ -25,7 +24,7 @@ type mockRepo struct {
 	*mockRepoKeyring
 	*mockRepoCommon
 	*mockRepoStorage
-	*mockRepoBleve
+	*mockRepoIndex
 	*mockRepoData
 	*mockRepoClock
 	*mockRepoTest
@@ -39,7 +38,7 @@ func NewMockRepo() *mockRepo {
 		mockRepoKeyring: NewMockRepoKeyring(),
 		mockRepoCommon:  NewMockRepoCommon(),
 		mockRepoStorage: NewMockRepoStorage(),
-		mockRepoBleve:   newMockRepoBleve(),
+		mockRepoIndex:   newMockRepoIndex(),
 		mockRepoData:    NewMockRepoData(),
 		mockRepoClock:   NewMockRepoClock(),
 		mockRepoTest:    NewMockRepoTest(),
@@ -135,20 +134,20 @@ func (m *mockRepoStorage) LocalStorage() billy.Filesystem {
 	return m.localFs
 }
 
-var _ RepoBleve = &mockRepoBleve{}
+var _ RepoIndex = &mockRepoIndex{}
 
-type mockRepoBleve struct {
+type mockRepoIndex struct {
 	indexesMutex sync.Mutex
-	indexes      map[string]bleve.Index
+	indexes      map[string]Index
 }
 
-func newMockRepoBleve() *mockRepoBleve {
-	return &mockRepoBleve{
-		indexes: make(map[string]bleve.Index),
+func newMockRepoIndex() *mockRepoIndex {
+	return &mockRepoIndex{
+		indexes: make(map[string]Index),
 	}
 }
 
-func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) {
+func (m *mockRepoIndex) GetIndex(name string) (Index, error) {
 	m.indexesMutex.Lock()
 	defer m.indexesMutex.Unlock()
 
@@ -156,24 +155,63 @@ func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) {
 		return index, nil
 	}
 
-	mapping := bleve.NewIndexMapping()
-	mapping.DefaultAnalyzer = "en"
+	index := newIndex()
+	m.indexes[name] = index
+	return index, nil
+}
 
-	index, err := bleve.NewMemOnly(mapping)
-	if err != nil {
-		return nil, err
-	}
+var _ Index = &mockIndex{}
 
-	m.indexes[name] = index
+type mockIndex map[string][]string
 
-	return index, nil
+func newIndex() *mockIndex {
+	m := make(map[string][]string)
+	return (*mockIndex)(&m)
 }
 
-func (m *mockRepoBleve) ClearBleveIndex(name string) error {
-	m.indexesMutex.Lock()
-	defer m.indexesMutex.Unlock()
+func (m *mockIndex) IndexOne(id string, texts []string) error {
+	(*m)[id] = texts
+	return nil
+}
+
+func (m *mockIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) {
+	indexer = func(id string, texts []string) error {
+		(*m)[id] = texts
+		return nil
+	}
+	closer = func() error { return nil }
+	return indexer, closer
+}
+
+func (m *mockIndex) Search(terms []string) (ids []string, err error) {
+loop:
+	for id, texts := range *m {
+		for _, text := range texts {
+			for _, s := range strings.Fields(text) {
+				for _, term := range terms {
+					if s == term {
+						ids = append(ids, id)
+						continue loop
+					}
+				}
+			}
+		}
+	}
+	return ids, nil
+}
+
+func (m *mockIndex) DocCount() (uint64, error) {
+	return uint64(len(*m)), nil
+}
+
+func (m *mockIndex) Clear() error {
+	for k, _ := range *m {
+		delete(*m, k)
+	}
+	return nil
+}
 
-	delete(m.indexes, name)
+func (m *mockIndex) Close() error {
 	return nil
 }
 

repository/repo.go 🔗

@@ -6,7 +6,6 @@ import (
 	"io"
 
 	"github.com/ProtonMail/go-crypto/openpgp"
-	"github.com/blevesearch/bleve"
 	"github.com/go-git/go-billy/v5"
 
 	"github.com/MichaelMure/git-bug/util/lamport"
@@ -25,7 +24,7 @@ type Repo interface {
 	RepoKeyring
 	RepoCommon
 	RepoStorage
-	RepoBleve
+	RepoIndex
 	RepoData
 
 	Close() error
@@ -81,13 +80,33 @@ type RepoStorage interface {
 	LocalStorage() billy.Filesystem
 }
 
-// RepoBleve give access to Bleve to implement full-text search indexes.
-type RepoBleve interface {
-	// GetBleveIndex return a bleve.Index that can be used to index documents
-	GetBleveIndex(name string) (bleve.Index, error)
+// RepoIndex gives access to full-text search indexes
+type RepoIndex interface {
+	GetIndex(name string) (Index, error)
+}
+
+// Index is a full-text search index
+type Index interface {
+	// IndexOne indexes one document, for the given ID. If the document already exist,
+	// it replaces it.
+	IndexOne(id string, texts []string) error
+
+	// IndexBatch start a batch indexing. The returned indexer function is used the same
+	// way as IndexOne, and the closer function complete the batch insertion.
+	IndexBatch() (indexer func(id string, texts []string) error, closer func() error)
+
+	// Search returns the list of IDs matching the given terms.
+	Search(terms []string) (ids []string, err error)
 
-	// ClearBleveIndex will wipe the given index
-	ClearBleveIndex(name string) error
+	// DocCount returns the number of document in the index.
+	DocCount() (uint64, error)
+
+	// Clear empty the index.
+	Clear() error
+
+	// Close closes the index and make sure everything is safely written. After this call
+	// the index can't be used anymore.
+	Close() error
 }
 
 type Commit struct {

repository/repo_testing.go 🔗

@@ -10,7 +10,6 @@ import (
 	"github.com/MichaelMure/git-bug/util/lamport"
 )
 
-// TODO: add tests for RepoBleve
 // TODO: add tests for RepoStorage
 
 type RepoCreator func(t testing.TB, bare bool) TestedRepo
@@ -33,6 +32,10 @@ func RepoTest(t *testing.T, creator RepoCreator) {
 				RepoConfigTest(t, repo)
 			})
 
+			t.Run("Index", func(t *testing.T) {
+				RepoIndexTest(t, repo)
+			})
+
 			t.Run("Clocks", func(t *testing.T) {
 				RepoClockTest(t, repo)
 			})
@@ -234,6 +237,48 @@ func RepoDataSignatureTest(t *testing.T, repo RepoData) {
 	require.Error(t, err)
 }
 
+func RepoIndexTest(t *testing.T, repo RepoIndex) {
+	idx, err := repo.GetIndex("a")
+	require.NoError(t, err)
+
+	// simple indexing
+	err = idx.IndexOne("id1", []string{"foo", "bar", "foobar barfoo"})
+	require.NoError(t, err)
+
+	// batched indexing
+	indexer, closer := idx.IndexBatch()
+	err = indexer("id2", []string{"hello", "foo bar"})
+	require.NoError(t, err)
+	err = indexer("id3", []string{"Hola", "Esta bien"})
+	require.NoError(t, err)
+	err = closer()
+	require.NoError(t, err)
+
+	// search
+	res, err := idx.Search([]string{"foobar"})
+	require.NoError(t, err)
+	require.ElementsMatch(t, []string{"id1"}, res)
+
+	res, err = idx.Search([]string{"foo"})
+	require.NoError(t, err)
+	require.ElementsMatch(t, []string{"id1", "id2"}, res)
+
+	// re-indexing an item replace previous versions
+	err = idx.IndexOne("id2", []string{"hello"})
+	require.NoError(t, err)
+
+	res, err = idx.Search([]string{"foo"})
+	require.NoError(t, err)
+	require.ElementsMatch(t, []string{"id1"}, res)
+
+	err = idx.Clear()
+	require.NoError(t, err)
+
+	res, err = idx.Search([]string{"foo"})
+	require.NoError(t, err)
+	require.Empty(t, res)
+}
+
 // helper to test a RepoClock
 func RepoClockTest(t *testing.T, repo RepoClock) {
 	allClocks, err := repo.AllClocks()