diff --git a/internal/cache/clockpro.go b/internal/cache/clockpro.go index cdd4b629150..746c072d896 100644 --- a/internal/cache/clockpro.go +++ b/internal/cache/clockpro.go @@ -19,7 +19,9 @@ package cache // import "github.com/cockroachdb/pebble/internal/cache" import ( "fmt" + "os" "runtime" + "runtime/debug" "sync" "sync/atomic" @@ -37,6 +39,17 @@ type key struct { offset uint64 } +// file returns the "file key" for the receiver. This is the key used for the +// shard.files map. +func (k key) file() key { + k.offset = 0 + return k +} + +func (k key) String() string { + return fmt.Sprintf("%d/%d/%d", k.id, k.fileNum, k.offset) +} + // Handle provides a strong reference to an entry in the cache. The reference // does not pin the entry in the cache, but it does prevent the underlying byte // slice from being reused. When entry is non-nil, value is initialized to @@ -107,8 +120,17 @@ type shard struct { reservedSize int64 maxSize int64 coldTarget int64 - blocks map[key]*entry // fileNum+offset -> block - files map[fileKey]*entry // fileNum -> list of blocks + blocks robinHoodMap // fileNum+offset -> block + files robinHoodMap // fileNum -> list of blocks + + // The blocks and files maps store values in manually managed memory that is + // invisible to the Go GC. This is fine for Value and entry objects that are + // stored in manually managed memory. Auto Values and the associated auto + // entries need to have a reference that the Go GC is aware of to prevent + // them from being reclaimed. The entries map provides this reference. When + // the "invariants" build tag is set, all Value and entry objects are Go + // allocated and the entries map will contain a reference to every entry. + entries map[*entry]struct{} handHot *entry handCold *entry @@ -121,7 +143,7 @@ type shard struct { func (c *shard) Get(id, fileNum, offset uint64) Handle { c.mu.RLock() - e := c.blocks[key{fileKey{id, fileNum}, offset}] + e := c.blocks.Get(key{fileKey{id, fileNum}, offset}) var value *Value if e != nil { value = e.getValue() @@ -160,7 +182,7 @@ func (c *shard) Set(id, fileNum, offset uint64, value *Value) Handle { defer c.mu.Unlock() k := key{fileKey{id, fileNum}, offset} - e := c.blocks[k] + e := c.blocks.Get(k) if e != nil && e.manual != value.manual() { panic(fmt.Sprintf("pebble: inconsistent caching of manual Value: entry=%t vs value=%t", e.manual, value.manual())) @@ -231,7 +253,7 @@ func (c *shard) Delete(id, fileNum, offset uint64) { c.mu.Lock() defer c.mu.Unlock() - e := c.blocks[key{fileKey{id, fileNum}, offset}] + e := c.blocks.Get(key{fileKey{id, fileNum}, offset}) if e == nil { return } @@ -243,7 +265,8 @@ func (c *shard) EvictFile(id, fileNum uint64) { c.mu.Lock() defer c.mu.Unlock() - blocks := c.files[fileKey{id, fileNum}] + fkey := key{fileKey{id, fileNum}, 0} + blocks := c.files.Get(fkey) if blocks == nil { return } @@ -291,7 +314,12 @@ func (c *shard) metaAdd(key key, e *entry) bool { return false } - c.blocks[key] = e + c.blocks.Put(key, e) + if !e.managed { + // Go allocated entries need to be referenced from Go memory. The entries + // map provides that reference. + c.entries[e] = struct{}{} + } if c.handHot == nil { // first element @@ -306,8 +334,9 @@ func (c *shard) metaAdd(key key, e *entry) bool { c.handCold = c.handCold.prev() } - if fileBlocks := c.files[key.fileKey]; fileBlocks == nil { - c.files[key.fileKey] = e + fkey := key.file() + if fileBlocks := c.files.Get(fkey); fileBlocks == nil { + c.files.Put(fkey, e) } else { fileBlocks.linkFile(e) } @@ -323,7 +352,12 @@ func (c *shard) metaDel(e *entry) { } e.setValue(nil) - delete(c.blocks, e.key) + c.blocks.Delete(e.key) + if !e.managed { + // Go allocated entries need to be referenced from Go memory. The entries + // map provides that reference. + delete(c.entries, e) + } if e == c.handHot { c.handHot = c.handHot.prev() @@ -342,10 +376,11 @@ func (c *shard) metaDel(e *entry) { c.handTest = nil } + fkey := e.key.file() if next := e.unlinkFile(); e == next { - delete(c.files, e.key.fileKey) + c.files.Delete(fkey) } else { - c.files[e.key.fileKey] = next + c.files.Put(fkey, next) } c.metaCheck(e) @@ -354,21 +389,28 @@ func (c *shard) metaDel(e *entry) { // Check that the specified entry is not referenced by the cache. func (c *shard) metaCheck(e *entry) { if invariants.Enabled { - for _, t := range c.blocks { - if e == t { - panic("not reached") - } + if _, ok := c.entries[e]; ok { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in entries map\n%s", + e, e.key, debug.Stack()) + os.Exit(1) } - for _, t := range c.files { - if e == t { - panic("not reached") - } + if c.blocks.findByValue(e) != nil { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks map\n%s\n%s", + e, e.key, &c.blocks, debug.Stack()) + os.Exit(1) + } + if c.files.findByValue(e) != nil { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in files map\n%s\n%s", + e, e.key, &c.files, debug.Stack()) + os.Exit(1) } // NB: c.hand{Hot,Cold,Test} are pointers into a single linked list. We // only have to traverse one of them to check all of them. for t := c.handHot.next(); t != c.handHot; t = t.next() { if e == t { - panic("not reached") + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks list\n%s", + e, e.key, debug.Stack()) + os.Exit(1) } } } @@ -553,6 +595,8 @@ func clearCache(obj interface{}) { s.mu.Lock() s.maxSize = 0 s.evict() + s.blocks.free() + s.files.free() s.mu.Unlock() } } @@ -567,9 +611,10 @@ func newShards(size int64, shards int) *Cache { c.shards[i] = shard{ maxSize: size / int64(len(c.shards)), coldTarget: size / int64(len(c.shards)), - blocks: make(map[key]*entry), - files: make(map[fileKey]*entry), + entries: make(map[*entry]struct{}), } + c.shards[i].blocks.init(16) + c.shards[i].files.init(16) } // TODO(peter): This finalizer is used to clear the cache when the Cache // itself is GC'd. Investigate making this explicit, and then changing the @@ -703,7 +748,7 @@ func (c *Cache) Metrics() Metrics { for i := range c.shards { s := &c.shards[i] s.mu.RLock() - m.Count += int64(len(s.blocks)) + m.Count += int64(s.blocks.Count()) m.Size += s.sizeHot + s.sizeCold s.mu.RUnlock() m.Hits += atomic.LoadInt64(&s.hits) diff --git a/internal/cache/entry.go b/internal/cache/entry.go index 8b9721ff64c..7b429e8be61 100644 --- a/internal/cache/entry.go +++ b/internal/cache/entry.go @@ -59,9 +59,13 @@ type entry struct { } size int64 ptype entryType - // Is the memory for the entry manually managed? A manually managed entry can - // only store manually managed values (Value.manual() is true). + // Can the entry hold a manual Value? Only a manually managed entry can store + // manually managed values (Value.manual() is true). manual bool + // Was the entry allocated using the Go allocator or the manual + // allocator. This can differ from the setting of the manual field due when + // the "invariants" build tag is set. + managed bool // referenced is atomically set to indicate that this entry has been accessed // since the last time one of the clock hands swept it. referenced int32 @@ -78,11 +82,12 @@ func newEntry(s *shard, key key, size int64, manual bool) *entry { e = &entry{} } *e = entry{ - key: key, - size: size, - ptype: etCold, - manual: manual, - shard: s, + key: key, + size: size, + ptype: etCold, + manual: manual, + managed: e.managed, + shard: s, } e.blockLink.next = e e.blockLink.prev = e diff --git a/internal/cache/entry_normal.go b/internal/cache/entry_normal.go index c2c2950cab5..a2b9e60ed5d 100644 --- a/internal/cache/entry_normal.go +++ b/internal/cache/entry_normal.go @@ -24,6 +24,7 @@ func entryAllocNew() *entry { a := entryAllocPool.Get().(*entryAllocCache) e := a.alloc() entryAllocPool.Put(a) + e.managed = true return e } diff --git a/internal/cache/robin_hood.go b/internal/cache/robin_hood.go new file mode 100644 index 00000000000..1d1667e9a2e --- /dev/null +++ b/internal/cache/robin_hood.go @@ -0,0 +1,303 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "fmt" + "math/bits" + "os" + "runtime" + "runtime/debug" + "strings" + "time" + "unsafe" + + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manual" +) + +var hashSeed = uintptr(time.Now().UnixNano()) + +//go:noescape +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr + +func robinHoodHash(k key, shift uint32) uint32 { + h := memhash(unsafe.Pointer(&k), hashSeed, unsafe.Sizeof(k)) + return uint32(h >> shift) +} + +type robinHoodEntry struct { + key key + value *entry + // The distance the entry is from its desired position. + dist uint32 +} + +type robinHoodEntries struct { + ptr unsafe.Pointer + len uint32 +} + +func newRobinHoodEntries(n uint32) robinHoodEntries { + size := uintptr(n) * unsafe.Sizeof(robinHoodEntry{}) + return robinHoodEntries{ + ptr: unsafe.Pointer(&(manual.New(int(size)))[0]), + len: n, + } +} + +func (e robinHoodEntries) at(i uint32) *robinHoodEntry { + return (*robinHoodEntry)(unsafe.Pointer(uintptr(e.ptr) + + uintptr(i)*unsafe.Sizeof(robinHoodEntry{}))) +} + +func (e robinHoodEntries) free() { + size := uintptr(e.len) * unsafe.Sizeof(robinHoodEntry{}) + buf := (*[manual.MaxArrayLen]byte)(e.ptr)[:size:size] + manual.Free(buf) +} + +// robinHoodMap is an implementation of Robin Hood hashing. Robin Hood hashing +// is an open-address hash table using linear probing. The twist is that the +// linear probe distance is reduced by moving existing entries when inserting +// and deleting. This is accomplished by keeping track of how far an entry is +// from its "desired" slot (hash of key modulo number of slots). During +// insertion, if the new entry being inserted is farther from its desired slot +// than the target entry, we swap the target and new entry. This effectively +// steals from the "rich" target entry and gives to the "poor" new entry (thus +// the origin of the name). +// +// An extension over the base Robin Hood hashing idea comes from +// https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/. A cap +// is placed on the max distance an entry can be from its desired slot. When +// this threshold is reached during insertion, the size of the table is doubled +// and insertion is restarted. Additionally, the entries slice is given "max +// dist" extra entries on the end. The very last entry in the entries slice is +// never used and acts as a sentinel which terminates loops. The previous +// maxDist-1 entries act as the extra entries. For example, if the size of the +// table is 2, maxDist is computed as 4 and the actual size of the entry slice +// is 6. +// +// +---+---+---+---+---+---+ +// | 0 | 1 | 2 | 3 | 4 | 5 | +// +---+---+---+---+---+---+ +// ^ +// size +// +// In this scenario, the target entry for a key will always be in the range +// [0,1]. Valid entries may reside in the range [0,4] due to the linear probing +// of up to maxDist entries. The entry at index 5 will never contain a value, +// and instead acts as a sentinel (its distance is always 0). The max distance +// threshold is set to log2(num-entries). This ensures that retrieval is O(log +// N), though note that N is the number of total entries, not the count of +// valid entries. +// +// Deletion is implemented via the backward shift delete mechanism instead of +// tombstones. This preserves the performance of the table in the presence of +// deletions. See +// http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion +// for details. +type robinHoodMap struct { + entries robinHoodEntries + size uint32 + shift uint32 + count uint32 + maxDist uint32 +} + +func maxDistForSize(size uint32) uint32 { + desired := uint32(bits.Len32(size)) + if desired < 4 { + desired = 4 + } + return desired +} + +func newRobinHoodMap(initialCapacity int) *robinHoodMap { + m := &robinHoodMap{} + m.init(initialCapacity) + runtime.SetFinalizer(m, clearRobinHoodMap) + return m +} + +func clearRobinHoodMap(obj interface{}) { + m := obj.(*robinHoodMap) + m.free() +} + +func (m *robinHoodMap) init(initialCapacity int) { + if initialCapacity < 1 { + initialCapacity = 1 + } + targetSize := 1 << (uint(bits.Len(uint(2*initialCapacity-1))) - 1) + m.rehash(uint32(targetSize)) +} + +func (m *robinHoodMap) free() { + if m.entries.ptr != nil { + m.entries.free() + m.entries.ptr = nil + } +} + +func (m *robinHoodMap) rehash(size uint32) { + oldEntries := m.entries + + m.size = size + m.shift = uint32(64 - bits.Len32(m.size-1)) + m.maxDist = maxDistForSize(size) + m.entries = newRobinHoodEntries(size + m.maxDist) + m.count = 0 + + for i := uint32(0); i < oldEntries.len; i++ { + e := oldEntries.at(i) + if e.value != nil { + m.Put(e.key, e.value) + } + } + + if oldEntries.ptr != nil { + oldEntries.free() + } +} + +// Find an entry containing the specified value. This is intended to be used +// from debug and test code. +func (m *robinHoodMap) findByValue(v *entry) *robinHoodEntry { + for i := uint32(0); i < m.entries.len; i++ { + e := m.entries.at(i) + if e.value == v { + return e + } + } + return nil +} + +func (m *robinHoodMap) Count() int { + return int(m.count) +} + +func (m *robinHoodMap) Put(k key, v *entry) { + maybeExists := true + n := robinHoodEntry{key: k, value: v, dist: 0} + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if maybeExists && k == e.key { + // Entry already exists: overwrite. + e.value = n.value + m.checkEntry(i) + return + } + + if e.value == nil { + // Found an empty entry: insert here. + *e = n + m.count++ + m.checkEntry(i) + return + } + + if e.dist < n.dist { + // Swap the new entry with the current entry because the current is + // rich. We then continue to loop, looking for a new location for the + // current entry. Note that this is also the not-found condition for + // retrieval, which means that "k" is not present in the map. See Get(). + n, *e = *e, n + m.checkEntry(i) + maybeExists = false + } + + // The new entry gradually moves away from its ideal position. + n.dist++ + + // If we've reached the max distance threshold, grow the table and restart + // the insertion. + if n.dist == m.maxDist { + m.rehash(2 * m.size) + i = robinHoodHash(n.key, m.shift) - 1 + n.dist = 0 + maybeExists = false + } + } +} + +func (m *robinHoodMap) Get(k key) *entry { + var dist uint32 + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if k == e.key { + // Found. + return e.value + } + if e.dist < dist { + // Not found. + return nil + } + dist++ + } +} + +func (m *robinHoodMap) Delete(k key) { + var dist uint32 + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if k == e.key { + m.checkEntry(i) + // We found the entry to delete. Shift the following entries backwards + // until the next empty value or entry with a zero distance. Note that + // empty values are guaranteed to have "dist == 0". + m.count-- + for j := i + 1; ; j++ { + t := m.entries.at(j) + if t.dist == 0 { + *e = robinHoodEntry{} + return + } + e.key = t.key + e.value = t.value + e.dist = t.dist - 1 + e = t + m.checkEntry(j) + } + } + if dist > e.dist { + // Not found. + return + } + dist++ + } +} + +func (m *robinHoodMap) checkEntry(i uint32) { + if invariants.Enabled { + e := m.entries.at(i) + if e.value != nil { + pos := robinHoodHash(e.key, m.shift) + if (uint32(i) - pos) != e.dist { + fmt.Fprintf(os.Stderr, "%d: invalid dist=%d, expected %d: %s\n%s", + i, e.dist, uint32(i)-pos, e.key, debug.Stack()) + os.Exit(1) + } + if e.dist > m.maxDist { + fmt.Fprintf(os.Stderr, "%d: invalid dist=%d > maxDist=%d: %s\n%s", + i, e.dist, m.maxDist, e.key, debug.Stack()) + os.Exit(1) + } + } + } +} + +func (m *robinHoodMap) String() string { + var buf strings.Builder + fmt.Fprintf(&buf, "count: %d\n", m.count) + for i := uint32(0); i < m.entries.len; i++ { + e := m.entries.at(i) + if e.value != nil { + fmt.Fprintf(&buf, "%d: [%s,%p,%d]\n", i, e.key, e.value, e.dist) + } + } + return buf.String() +} diff --git a/internal/cache/robin_hood_test.go b/internal/cache/robin_hood_test.go new file mode 100644 index 00000000000..b107b11621d --- /dev/null +++ b/internal/cache/robin_hood_test.go @@ -0,0 +1,82 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "testing" + "time" + + "golang.org/x/exp/rand" +) + +func TestRobinHoodMap(t *testing.T) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + rhMap := newRobinHoodMap(0) + goMap := make(map[key]*entry) + + randomKey := func() key { + n := rng.Intn(len(goMap)) + for k := range goMap { + if n == 0 { + return k + } + n-- + } + return key{} + } + + ops := 10000 + rng.Intn(10000) + for i := 0; i < ops; i++ { + var which float64 + if len(goMap) > 0 { + which = rng.Float64() + } + + switch { + case which < 0.4: + // 40% insert. + var k key + k.id = rng.Uint64() + k.fileNum = rng.Uint64() + k.offset = rng.Uint64() + e := &entry{} + goMap[k] = e + rhMap.Put(k, e) + if len(goMap) != int(rhMap.Count()) { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + case which < 0.1: + // 10% overwrite. + k := randomKey() + e := &entry{} + goMap[k] = e + rhMap.Put(k, e) + if len(goMap) != int(rhMap.Count()) { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + case which < 0.75: + // 25% delete. + k := randomKey() + delete(goMap, k) + rhMap.Delete(k) + if len(goMap) != int(rhMap.Count()) { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + default: + // 25% lookup. + k := randomKey() + v := goMap[k] + u := rhMap.Get(k) + if v != u { + t.Fatalf("%s: expected %p, but found %p", k, v, u) + } + } + } + + t.Logf("map size: %d", len(goMap)) +}