fastwalk.go

  1// Copyright 2016 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5// Package fastwalk provides a faster version of filepath.Walk for file system
  6// scanning tools.
  7package fastwalk
  8
  9import (
 10	"errors"
 11	"os"
 12	"path/filepath"
 13	"runtime"
 14	"sync"
 15)
 16
 17// TraverseLink is used as a return value from WalkFuncs to indicate that the
 18// symlink named in the call may be traversed.
 19var TraverseLink = errors.New("fastwalk: traverse symlink, assuming target is a directory")
 20
 21// Walk is a faster implementation of filepath.Walk.
 22//
 23// filepath.Walk's design necessarily calls os.Lstat on each file,
 24// even if the caller needs less info.
 25// Many tools need only the type of each file.
 26// On some platforms, this information is provided directly by the readdir
 27// system call, avoiding the need to stat each file individually.
 28// fastwalk_unix.go contains a fork of the syscall routines.
 29//
 30// See golang.org/issue/16399
 31//
 32// Walk walks the file tree rooted at root, calling walkFn for
 33// each file or directory in the tree, including root.
 34//
 35// If fastWalk returns filepath.SkipDir, the directory is skipped.
 36//
 37// Unlike filepath.Walk:
 38//   * file stat calls must be done by the user.
 39//     The only provided metadata is the file type, which does not include
 40//     any permission bits.
 41//   * multiple goroutines stat the filesystem concurrently. The provided
 42//     walkFn must be safe for concurrent use.
 43//   * fastWalk can follow symlinks if walkFn returns the TraverseLink
 44//     sentinel error. It is the walkFn's responsibility to prevent
 45//     fastWalk from going into symlink cycles.
 46func Walk(root string, walkFn func(path string, typ os.FileMode) error) error {
 47	// TODO(bradfitz): make numWorkers configurable? We used a
 48	// minimum of 4 to give the kernel more info about multiple
 49	// things we want, in hopes its I/O scheduling can take
 50	// advantage of that. Hopefully most are in cache. Maybe 4 is
 51	// even too low of a minimum. Profile more.
 52	numWorkers := 4
 53	if n := runtime.NumCPU(); n > numWorkers {
 54		numWorkers = n
 55	}
 56
 57	// Make sure to wait for all workers to finish, otherwise
 58	// walkFn could still be called after returning. This Wait call
 59	// runs after close(e.donec) below.
 60	var wg sync.WaitGroup
 61	defer wg.Wait()
 62
 63	w := &walker{
 64		fn:       walkFn,
 65		enqueuec: make(chan walkItem, numWorkers), // buffered for performance
 66		workc:    make(chan walkItem, numWorkers), // buffered for performance
 67		donec:    make(chan struct{}),
 68
 69		// buffered for correctness & not leaking goroutines:
 70		resc: make(chan error, numWorkers),
 71	}
 72	defer close(w.donec)
 73
 74	for i := 0; i < numWorkers; i++ {
 75		wg.Add(1)
 76		go w.doWork(&wg)
 77	}
 78	todo := []walkItem{{dir: root}}
 79	out := 0
 80	for {
 81		workc := w.workc
 82		var workItem walkItem
 83		if len(todo) == 0 {
 84			workc = nil
 85		} else {
 86			workItem = todo[len(todo)-1]
 87		}
 88		select {
 89		case workc <- workItem:
 90			todo = todo[:len(todo)-1]
 91			out++
 92		case it := <-w.enqueuec:
 93			todo = append(todo, it)
 94		case err := <-w.resc:
 95			out--
 96			if err != nil {
 97				return err
 98			}
 99			if out == 0 && len(todo) == 0 {
100				// It's safe to quit here, as long as the buffered
101				// enqueue channel isn't also readable, which might
102				// happen if the worker sends both another unit of
103				// work and its result before the other select was
104				// scheduled and both w.resc and w.enqueuec were
105				// readable.
106				select {
107				case it := <-w.enqueuec:
108					todo = append(todo, it)
109				default:
110					return nil
111				}
112			}
113		}
114	}
115}
116
117// doWork reads directories as instructed (via workc) and runs the
118// user's callback function.
119func (w *walker) doWork(wg *sync.WaitGroup) {
120	defer wg.Done()
121	for {
122		select {
123		case <-w.donec:
124			return
125		case it := <-w.workc:
126			select {
127			case <-w.donec:
128				return
129			case w.resc <- w.walk(it.dir, !it.callbackDone):
130			}
131		}
132	}
133}
134
135type walker struct {
136	fn func(path string, typ os.FileMode) error
137
138	donec    chan struct{} // closed on fastWalk's return
139	workc    chan walkItem // to workers
140	enqueuec chan walkItem // from workers
141	resc     chan error    // from workers
142}
143
144type walkItem struct {
145	dir          string
146	callbackDone bool // callback already called; don't do it again
147}
148
149func (w *walker) enqueue(it walkItem) {
150	select {
151	case w.enqueuec <- it:
152	case <-w.donec:
153	}
154}
155
156func (w *walker) onDirEnt(dirName, baseName string, typ os.FileMode) error {
157	joined := dirName + string(os.PathSeparator) + baseName
158	if typ == os.ModeDir {
159		w.enqueue(walkItem{dir: joined})
160		return nil
161	}
162
163	err := w.fn(joined, typ)
164	if typ == os.ModeSymlink {
165		if err == TraverseLink {
166			// Set callbackDone so we don't call it twice for both the
167			// symlink-as-symlink and the symlink-as-directory later:
168			w.enqueue(walkItem{dir: joined, callbackDone: true})
169			return nil
170		}
171		if err == filepath.SkipDir {
172			// Permit SkipDir on symlinks too.
173			return nil
174		}
175	}
176	return err
177}
178
179func (w *walker) walk(root string, runUserCallback bool) error {
180	if runUserCallback {
181		err := w.fn(root, os.ModeDir)
182		if err == filepath.SkipDir {
183			return nil
184		}
185		if err != nil {
186			return err
187		}
188	}
189
190	return readDir(root, w.onDirEnt)
191}