Re-use the hashtable.

This is a rebase of https://github.com/golang/snappy/pull/23

Just using a sync.Pool by itself didn't seem to affect the numbers much.
The table was previously stack-allocated, not heap-allocated, so pooling it
doesn't save a malloc or any GC work on its own.
The point of re-using the hashtable via a sync.Pool is that you can avoid
zero-initializing it on every call: stale entries left over from earlier
blocks are instead invalidated by offsetting stored positions with the
ever-growing e.cur, and rejected by a bounds check on the candidate.
However, the benchmarks are mixed: some are better, some are worse. I think
that the net result is actually worse, but this commit is provided to aid
discussion of that PR and of other changes such as
https://go-review.googlesource.com/#/c/21021/ in the standard library.
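For reviewers, here is a minimal, self-contained sketch of that invalidation
trick, separate from the diff below; blockMax, cache and lookupAndStore are
illustrative names, not part of this package:

    package main

    import (
    	"fmt"
    	"sync"
    )

    const blockMax = 65536 // stand-in for the package's maxBlockSize

    type cache struct {
    	table [8]int32 // tiny table, just for illustration
    	cur   int32    // always bigger than every live table entry
    }

    var pool = sync.Pool{New: func() interface{} { return &cache{cur: blockMax} }}

    // lookupAndStore returns the candidate previously stored in slot h, then
    // records position s there. Entries written before the last cur bump
    // decode to negative candidates, and a single unsigned comparison rejects
    // both those and any candidate >= s, just like uint(candidate) < uint(s)
    // in the diff below.
    func (c *cache) lookupAndStore(h int, s int32) (int32, bool) {
    	candidate := c.table[h] - c.cur
    	c.table[h] = s + c.cur
    	return candidate, uint32(candidate) < uint32(s)
    }

    func main() {
    	c := pool.Get().(*cache)
    	c.table[3] = 100 + c.cur // a match seen at position 100 in this block
    	c.cur += blockMax        // next block: all old entries become stale
    	if _, ok := c.lookupAndStore(3, 50); !ok {
    		fmt.Println("stale entry rejected without zeroing the table")
    	}
    	pool.Put(c)
    }

The cost is an extra subtraction and bounds check on every table probe, in
exchange for skipping the table clear.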
benchmark                    old MB/s   new MB/s   speedup
BenchmarkWordsEncode1e1-4      466.23     177.91     0.38x
BenchmarkWordsEncode1e2-4       60.08     191.31     3.18x
BenchmarkWordsEncode1e3-4      174.56     215.35     1.23x
BenchmarkWordsEncode1e4-4      172.29     166.81     0.97x
BenchmarkWordsEncode1e5-4      134.37     127.43     0.95x
BenchmarkWordsEncode1e6-4      152.59     148.35     0.97x
BenchmarkRandomEncode-4       6826.46    7249.27     1.06x
Benchmark_ZFlat0-4             310.41     299.86     0.97x
Benchmark_ZFlat1-4             198.37     165.81     0.84x
Benchmark_ZFlat2-4            8046.09    9300.88     1.16x
Benchmark_ZFlat3-4             123.41     245.15     1.99x
Benchmark_ZFlat4-4            2200.95    2346.55     1.07x
Benchmark_ZFlat5-4             307.16     294.54     0.96x
Benchmark_ZFlat6-4             136.35     130.99     0.96x
Benchmark_ZFlat7-4             130.63     125.27     0.96x
Benchmark_ZFlat8-4             143.56     137.13     0.96x
Benchmark_ZFlat9-4             125.47     120.34     0.96x
Benchmark_ZFlat10-4            365.85     348.07     0.95x
Benchmark_ZFlat11-4            200.01     194.52     0.97x
diff --git a/encode.go b/encode.go
index 1a8a821..2e255fb 100644
--- a/encode.go
+++ b/encode.go
@@ -8,6 +8,7 @@
 	"encoding/binary"
 	"errors"
 	"io"
+	"sync"
 )
 
 // maxOffset limits how far copy back-references can go, the same as the C++
@@ -104,12 +105,32 @@
 	return i + 2
 }
 
+var encPool = sync.Pool{New: func() interface{} { return &encoder{cur: maxBlockSize} }}
+
+const (
+	tableBits = 14             // Bits used in the table
+	tableSize = 1 << tableBits // Size of the table
+)
+
+type encoder struct {
+	table [tableSize]int32
+	// cur is bigger than every element of table.
+	cur int32
+}
+
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // It is valid to pass a nil dst.
 func Encode(dst, src []byte) []byte {
+	e := encPool.Get().(*encoder)
+	dst = e.encode(dst, src)
+	encPool.Put(e)
+	return dst
+}
+
+func (e *encoder) encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if len(dst) < n {
@@ -128,7 +149,7 @@
 		if len(p) < minNonLiteralBlockSize {
 			d += emitLiteral(dst[d:], p)
 		} else {
-			d += encodeBlock(dst[d:], p)
+			d += e.encodeBlock(dst[d:], p)
 		}
 	}
 	return dst[:d]
@@ -176,17 +197,13 @@
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
 //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const maxTableSize = 1 << 14
-	shift, tableSize := uint32(32-8), 1<<8
-	for tableSize < maxTableSize && tableSize < len(src) {
+func (e *encoder) encodeBlock(dst, src []byte) (d int) {
+	shift := uint32(32 - 8)
+	for n := 1 << 8; n < tableSize && n < len(src); n *= 2 {
 		shift--
-		tableSize *= 2
 	}
-	var table [maxTableSize]uint16
+
+	// TODO: something about e.cur wrapping.
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -198,7 +215,9 @@
 
 	// The encoded form must start with a literal, as there are no previous
 	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
+	s := 0
+	e.table[hash(load32(src, s), shift)] = e.cur
+	s = 1
 	nextHash := hash(load32(src, s), shift)
 
 	for {
@@ -229,10 +248,10 @@
 			if nextS > sLimit {
 				goto emitRemainder
 			}
-			candidate = int(table[nextHash])
-			table[nextHash] = uint16(s)
+			candidate = int(e.table[nextHash]) - int(e.cur)
+			e.table[nextHash] = int32(s) + e.cur
 			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
+			if uint(candidate) < uint(s) && load32(src, s) == load32(src, candidate) {
 				break
 			}
 		}
@@ -271,10 +290,13 @@
 		// three load32 calls.
 		x := load64(src, s-1)
 		prevHash := hash(uint32(x>>0), shift)
-		table[prevHash] = uint16(s - 1)
+		e.table[prevHash] = int32(s-1) + e.cur
 		currHash := hash(uint32(x>>8), shift)
-		candidate = int(table[currHash])
-		table[currHash] = uint16(s)
+		candidate = int(e.table[currHash]) - int(e.cur)
+		e.table[currHash] = int32(s) + e.cur
+		if uint(candidate) >= uint(s) {
+			candidate = 0
+		}
 		if uint32(x>>8) != load32(src, candidate) {
 			nextHash = hash(uint32(x>>16), shift)
 			s++
@@ -287,6 +309,7 @@
 	if nextEmit < len(src) {
 		d += emitLiteral(dst[d:], src[nextEmit:])
 	}
+	e.cur += maxBlockSize
 	return d
 }
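
As a usage note, not part of this commit: each Encode call takes a private
*encoder from the pool, so concurrent callers never share a table, and the
cur offsets keep the output byte-for-byte deterministic. A hypothetical smoke
test of that property, assuming the diff above is applied:

    package snappy

    import (
    	"bytes"
    	"sync"
    	"testing"
    )

    // Hypothetical test: sync.Pool hands each goroutine its own *encoder,
    // so concurrent Encode calls never share a table, and the output does
    // not depend on the value of e.cur an encoder starts a call with.
    func TestEncodeConcurrentPool(t *testing.T) {
    	src := bytes.Repeat([]byte("the quick brown fox "), 512)
    	want := Encode(nil, src)
    	var wg sync.WaitGroup
    	for i := 0; i < 8; i++ {
    		wg.Add(1)
    		go func() {
    			defer wg.Done()
    			for j := 0; j < 100; j++ {
    				if got := Encode(nil, src); !bytes.Equal(got, want) {
    					t.Error("concurrent Encode gave inconsistent output")
    				}
    			}
    		}()
    	}
    	wg.Wait()
    }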