scanner.go

  1package lfs
  2
  3import (
  4	"bufio"
  5	"bytes"
  6	"context"
  7	"fmt"
  8	"io"
  9	"strconv"
 10	"strings"
 11	"sync"
 12
 13	gitm "github.com/aymanbagabas/git-module"
 14	"github.com/charmbracelet/soft-serve/git"
 15)
 16
 17// SearchPointerBlobs scans the whole repository for LFS pointer files.
 18func SearchPointerBlobs(ctx context.Context, repo *git.Repository, pointerChan chan<- PointerBlob, errChan chan<- error) {
 19	basePath := repo.Path
 20
 21	catFileCheckReader, catFileCheckWriter := io.Pipe()
 22	shasToBatchReader, shasToBatchWriter := io.Pipe()
 23	catFileBatchReader, catFileBatchWriter := io.Pipe()
 24
 25	wg := sync.WaitGroup{}
 26	wg.Add(6)
 27
 28	// Create the go-routines in reverse order.
 29
 30	// 4. Take the output of cat-file --batch and check if each file in turn
 31	// to see if they're pointers to files in the LFS store
 32	go createPointerResultsFromCatFileBatch(ctx, catFileBatchReader, &wg, pointerChan)
 33
 34	// 3. Take the shas of the blobs and batch read them
 35	go catFileBatch(ctx, shasToBatchReader, catFileBatchWriter, &wg, basePath)
 36
 37	// 2. From the provided objects restrict to blobs <=1k
 38	go blobsLessThan1024FromCatFileBatchCheck(catFileCheckReader, shasToBatchWriter, &wg)
 39
 40	// 1. Run batch-check on all objects in the repository
 41	revListReader, revListWriter := io.Pipe()
 42	shasToCheckReader, shasToCheckWriter := io.Pipe()
 43	go catFileBatchCheck(ctx, shasToCheckReader, catFileCheckWriter, &wg, basePath)
 44	go blobsFromRevListObjects(revListReader, shasToCheckWriter, &wg)
 45	go revListAllObjects(ctx, revListWriter, &wg, basePath, errChan)
 46	wg.Wait()
 47
 48	close(pointerChan)
 49	close(errChan)
 50}
 51
 52func createPointerResultsFromCatFileBatch(ctx context.Context, catFileBatchReader *io.PipeReader, wg *sync.WaitGroup, pointerChan chan<- PointerBlob) {
 53	defer wg.Done()
 54	defer catFileBatchReader.Close() //nolint: errcheck
 55
 56	bufferedReader := bufio.NewReader(catFileBatchReader)
 57	buf := make([]byte, 1025)
 58
 59loop:
 60	for {
 61		select {
 62		case <-ctx.Done():
 63			break loop
 64		default:
 65		}
 66
 67		// File descriptor line: sha
 68		sha, err := bufferedReader.ReadString(' ')
 69		if err != nil {
 70			_ = catFileBatchReader.CloseWithError(err)
 71			break
 72		}
 73		sha = strings.TrimSpace(sha)
 74		// Throw away the blob
 75		if _, err := bufferedReader.ReadString(' '); err != nil {
 76			_ = catFileBatchReader.CloseWithError(err)
 77			break
 78		}
 79		sizeStr, err := bufferedReader.ReadString('\n')
 80		if err != nil {
 81			_ = catFileBatchReader.CloseWithError(err)
 82			break
 83		}
 84		size, err := strconv.Atoi(sizeStr[:len(sizeStr)-1])
 85		if err != nil {
 86			_ = catFileBatchReader.CloseWithError(err)
 87			break
 88		}
 89		pointerBuf := buf[:size+1]
 90		if _, err := io.ReadFull(bufferedReader, pointerBuf); err != nil {
 91			_ = catFileBatchReader.CloseWithError(err)
 92			break
 93		}
 94		pointerBuf = pointerBuf[:size]
 95		// Now we need to check if the pointerBuf is an LFS pointer
 96		pointer, _ := ReadPointerFromBuffer(pointerBuf)
 97		if !pointer.IsValid() {
 98			continue
 99		}
100
101		pointerChan <- PointerBlob{Hash: sha, Pointer: pointer}
102	}
103}
104
105func catFileBatch(ctx context.Context, shasToBatchReader *io.PipeReader, catFileBatchWriter *io.PipeWriter, wg *sync.WaitGroup, basePath string) {
106	defer wg.Done()
107	defer shasToBatchReader.Close()  //nolint: errcheck
108	defer catFileBatchWriter.Close() //nolint: errcheck
109
110	stderr := new(bytes.Buffer)
111	var errbuf strings.Builder
112	if err := gitm.NewCommandWithContext(ctx, "cat-file", "--batch").
113		WithTimeout(-1).
114		RunInDirWithOptions(basePath, gitm.RunInDirOptions{
115			Stdout: catFileBatchWriter,
116			Stdin:  shasToBatchReader,
117			Stderr: stderr,
118		}); err != nil {
119		_ = shasToBatchReader.CloseWithError(fmt.Errorf("git rev-list [%s]: %w - %s", basePath, err, errbuf.String()))
120	}
121}
122
// blobsLessThan1024FromCatFileBatchCheck filters `git cat-file --batch-check`
// output. It reads lines of the form "<sha> <type> <size>" from
// catFileCheckReader and writes "<sha>\n" to shasToBatchWriter for every line
// whose type is "blob" and whose size is between 0 and 1024 bytes inclusive.
//
// Lines with fewer than three fields, a different type, or a size that is not
// a valid number are skipped. If writing fails, the reader is closed with the
// write error and the function returns immediately instead of retrying the
// write for every line still buffered in the scanner.
//
// On return shasToBatchWriter is closed with the scanner's error (nil on a
// clean end of input), catFileCheckReader is closed and wg.Done is called.
func blobsLessThan1024FromCatFileBatchCheck(catFileCheckReader *io.PipeReader, shasToBatchWriter *io.PipeWriter, wg *sync.WaitGroup) {
	defer wg.Done()
	defer catFileCheckReader.Close() //nolint: errcheck
	scanner := bufio.NewScanner(catFileCheckReader)
	defer func() {
		_ = shasToBatchWriter.CloseWithError(scanner.Err())
	}()

	for scanner.Scan() {
		// An empty line splits into a single field and is skipped below.
		fields := strings.Split(scanner.Text(), " ")
		if len(fields) < 3 || fields[1] != "blob" {
			continue
		}
		size, err := strconv.Atoi(fields[2])
		if err != nil || size < 0 || size > 1024 {
			continue
		}
		// A pipe write either delivers everything or returns an error, so a
		// single call is sufficient.
		if _, err := shasToBatchWriter.Write([]byte(fields[0] + "\n")); err != nil {
			_ = catFileCheckReader.CloseWithError(err)
			return
		}
	}
}
154
155func catFileBatchCheck(ctx context.Context, shasToCheckReader *io.PipeReader, catFileCheckWriter *io.PipeWriter, wg *sync.WaitGroup, basePath string) {
156	defer wg.Done()
157	defer shasToCheckReader.Close()  //nolint: errcheck
158	defer catFileCheckWriter.Close() //nolint: errcheck
159
160	stderr := new(bytes.Buffer)
161	var errbuf strings.Builder
162	if err := gitm.NewCommandWithContext(ctx, "cat-file", "--batch-check").
163		WithTimeout(-1).
164		RunInDirWithOptions(basePath, gitm.RunInDirOptions{
165			Stdout: catFileCheckWriter,
166			Stdin:  shasToCheckReader,
167			Stderr: stderr,
168		}); err != nil {
169		_ = shasToCheckReader.CloseWithError(fmt.Errorf("git rev-list [%s]: %w - %s", basePath, err, errbuf.String()))
170	}
171}
172
// blobsFromRevListObjects filters `git rev-list --objects` output. It reads
// lines of the form "<sha> <path>" from revListReader and writes "<sha>\n" to
// shasToCheckWriter for every line that has a non-empty path.
//
// Lines without a space (no path) and lines whose path is empty are skipped,
// as are lines with an empty object name. The line is cut at the first space
// only, so a path that itself starts with or contains spaces still counts as
// a path. If writing fails, the reader is closed with the write error and the
// function returns immediately instead of retrying the write for every line
// still buffered in the scanner.
//
// On return shasToCheckWriter is closed with the scanner's error (nil on a
// clean end of input), revListReader is closed and wg.Done is called.
func blobsFromRevListObjects(revListReader *io.PipeReader, shasToCheckWriter *io.PipeWriter, wg *sync.WaitGroup) {
	defer wg.Done()
	defer revListReader.Close() //nolint: errcheck
	scanner := bufio.NewScanner(revListReader)
	defer func() {
		_ = shasToCheckWriter.CloseWithError(scanner.Err())
	}()

	for scanner.Scan() {
		line := scanner.Text()
		sep := strings.IndexByte(line, ' ')
		// sep < 0: no path; sep == 0: no object name; sep at the end: empty path.
		if sep <= 0 || sep == len(line)-1 {
			continue
		}
		// A pipe write either delivers everything or returns an error, so a
		// single call is sufficient.
		if _, err := shasToCheckWriter.Write([]byte(line[:sep] + "\n")); err != nil {
			_ = revListReader.CloseWithError(err)
			return
		}
	}
}
201
202func revListAllObjects(ctx context.Context, revListWriter *io.PipeWriter, wg *sync.WaitGroup, basePath string, errChan chan<- error) {
203	defer wg.Done()
204	defer revListWriter.Close() //nolint: errcheck
205
206	stderr := new(bytes.Buffer)
207	var errbuf strings.Builder
208	if err := gitm.NewCommandWithContext(ctx, "rev-list", "--objects", "--all").
209		WithTimeout(-1).
210		RunInDirWithOptions(basePath, gitm.RunInDirOptions{
211			Stdout: revListWriter,
212			Stderr: stderr,
213		}); err != nil {
214		errChan <- fmt.Errorf("git rev-list [%s]: %w - %s", basePath, err, errbuf.String())
215	}
216}