repo: ReadData return a reader, improve the http handler (#1549)

Michael Muré created

Change summary

api/http/git_file_handler.go       | 46 +++++++++++++++++++++++++++----
api/http/git_file_handlers_test.go |  2 +
cache/repo_cache_common.go         |  2 
entities/identity/identity.go      |  3 +
entity/dag/operation_pack.go       |  8 ++++
repository/gogit.go                | 12 ++------
repository/mock_repo.go            |  4 +-
repository/repo.go                 |  5 ++-
repository/repo_testing.go         |  5 ++
9 files changed, 63 insertions(+), 24 deletions(-)

Detailed changes

api/http/git_file_handler.go 🔗

@@ -2,8 +2,9 @@ package http
 
 import (
 	"bytes"
+	"errors"
+	"io"
 	"net/http"
-	"time"
 
 	"github.com/gorilla/mux"
 
@@ -47,15 +48,46 @@ func (gfh *gitFileHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// TODO: this mean that the whole file will be buffered in memory
-	// This can be a problem for big files. There might be a way around
-	// that by implementing a io.ReadSeeker that would read and discard
-	// data when a seek is called.
-	data, err := repo.ReadData(hash)
+	reader, err := repo.ReadData(hash)
+	if errors.Is(err, repository.ErrNotFound) {
+		http.Error(rw, "blob not found", http.StatusNotFound)
+		return
+	}
 	if err != nil {
 		http.Error(rw, err.Error(), http.StatusInternalServerError)
 		return
 	}
+	defer func() {
+		_ = reader.Close()
+	}()
+
+	serveContent(rw, r, reader)
+}
+
+// serveContent is a somewhat equivalent of http.serveContent, without support for range request.
+// This is necessary as the repo (and go-git)'s data reader doesn't support Seek().
+func serveContent(w http.ResponseWriter, r *http.Request, content io.Reader) {
+	if w.Header().Get("Content-Type") == "" {
+		// Sniff the type from the first up to 512 bytes.
+		var buf [512]byte
+		n, err := io.ReadFull(content, buf[:])
+		switch err {
+		case nil:
+			w.Header().Set("Content-Type", http.DetectContentType(buf[:n]))
+			content = io.MultiReader(bytes.NewReader(buf[:n]), content)
+		case io.ErrUnexpectedEOF, io.EOF:
+			w.Header().Set("Content-Type", http.DetectContentType(buf[:n]))
+			content = bytes.NewReader(buf[:n])
+		default:
+			http.Error(w, "internal server error", http.StatusInternalServerError)
+			return
+		}
+	}
+
+	w.WriteHeader(http.StatusOK)
+	if r.Method == http.MethodHead {
+		return
+	}
 
-	http.ServeContent(rw, r, "", time.Now(), bytes.NewReader(data))
+	_, _ = io.Copy(w, content)
 }

api/http/git_file_handlers_test.go 🔗

@@ -87,6 +87,8 @@ func TestGitFileHandlers(t *testing.T) {
 
 	downloadHandler.ServeHTTP(w, r)
 	assert.Equal(t, http.StatusOK, w.Code)
+	// check if content type detection is working
+	assert.Equal(t, "image/png", w.Header().Get("Content-Type"))
 
 	assert.Equal(t, data.Bytes(), w.Body.Bytes())
 }

cache/repo_cache_common.go 🔗

@@ -78,7 +78,7 @@ func (c *RepoCache) LocalStorage() repository.LocalStorage {
 }
 
 // ReadData will attempt to read arbitrary data from the given hash
-func (c *RepoCache) ReadData(hash repository.Hash) ([]byte, error) {
+func (c *RepoCache) ReadData(hash repository.Hash) (io.ReadCloser, error) {
 	return c.repo.ReadData(hash)
 }
 

entities/identity/identity.go 🔗

@@ -136,7 +136,8 @@ func read(repo repository.Repo, ref string) (*Identity, error) {
 		}
 
 		var version version
-		err = json.Unmarshal(data, &version)
+		err = json.NewDecoder(data).Decode(&version)
+		_ = data.Close()
 		if err != nil {
 			return nil, errors.Wrapf(err, "failed to decode Identity version json %s", hash)
 		}

entity/dag/operation_pack.go 🔗

@@ -3,6 +3,7 @@ package dag
 import (
 	"encoding/json"
 	"fmt"
+	"io"
 	"strconv"
 	"strings"
 
@@ -242,7 +243,12 @@ func readOperationPack(def Definition, repo repository.RepoData, resolvers entit
 	for _, entry := range entries {
 		switch {
 		case entry.Name == opsEntryName:
-			data, err := repo.ReadData(entry.Hash)
+			r, err := repo.ReadData(entry.Hash)
+			if err != nil {
+				return nil, errors.Wrap(err, "failed to read git blob data")
+			}
+			data, err := io.ReadAll(r)
+			_ = r.Close()
 			if err != nil {
 				return nil, errors.Wrap(err, "failed to read git blob data")
 			}

repository/gogit.go 🔗

@@ -550,25 +550,19 @@ func (repo *GoGitRepo) StoreData(data []byte) (Hash, error) {
 }
 
 // ReadData will attempt to read arbitrary data from the given hash
-func (repo *GoGitRepo) ReadData(hash Hash) ([]byte, error) {
+func (repo *GoGitRepo) ReadData(hash Hash) (io.ReadCloser, error) {
 	repo.rMutex.Lock()
 	defer repo.rMutex.Unlock()
 
 	obj, err := repo.r.BlobObject(plumbing.NewHash(hash.String()))
-	if err == plumbing.ErrObjectNotFound {
+	if errors.Is(err, plumbing.ErrObjectNotFound) {
 		return nil, ErrNotFound
 	}
 	if err != nil {
 		return nil, err
 	}
 
-	r, err := obj.Reader()
-	if err != nil {
-		return nil, err
-	}
-
-	// TODO: return a io.Reader instead
-	return io.ReadAll(r)
+	return obj.Reader()
 }
 
 // StoreTree will store a mapping key-->Hash as a Git tree

repository/mock_repo.go 🔗

@@ -266,13 +266,13 @@ func (r *mockRepoDataBrowse) StoreData(data []byte) (Hash, error) {
 	return hash, nil
 }
 
-func (r *mockRepoDataBrowse) ReadData(hash Hash) ([]byte, error) {
+func (r *mockRepoDataBrowse) ReadData(hash Hash) (io.ReadCloser, error) {
 	data, ok := r.blobs[hash]
 	if !ok {
 		return nil, ErrNotFound
 	}
 
-	return data, nil
+	return io.NopCloser(bytes.NewReader(data)), nil
 }
 
 func (r *mockRepoDataBrowse) StoreTree(entries []TreeEntry) (Hash, error) {

repository/repo.go 🔗

@@ -148,9 +148,10 @@ type RepoData interface {
 	// StoreData will store arbitrary data and return the corresponding hash
 	StoreData(data []byte) (Hash, error)
 
-	// ReadData will attempt to read arbitrary data from the given hash
+	// ReadData returns a reader for arbitrary data associated with the given hash.
 	// Returns ErrNotFound if not found.
-	ReadData(hash Hash) ([]byte, error)
+	// The caller must close the reader.
+	ReadData(hash Hash) (io.ReadCloser, error)
 
 	// StoreTree will store a mapping key-->Hash as a Git tree
 	StoreTree(mapping []TreeEntry) (Hash, error)

repository/repo_testing.go 🔗

@@ -107,7 +107,10 @@ func RepoDataTest(t *testing.T, repo RepoData) {
 	require.NoError(t, err)
 	require.True(t, blobHash1.IsValid())
 
-	blob1Read, err := repo.ReadData(blobHash1)
+	blob1Reader, err := repo.ReadData(blobHash1)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, blob1Reader.Close()) }()
+	blob1Read, err := io.ReadAll(blob1Reader)
 	require.NoError(t, err)
 	require.Equal(t, data, blob1Read)