Re-use the hashtable.

This is a rebase of https://github.com/golang/snappy/pull/23

Just using a sync.Pool by itself didn't seem to affect the numbers much. The
table was previously stack-allocated rather than heap-allocated, so a
sync.Pool doesn't save an allocation or any GC work.
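
For context, the table in the existing code is a function-local array inside
encodeBlock (the removed lines in the diff below), so there is no per-call
heap allocation for a Pool to elide:

    // Existing code: the hash table is a local array, zero-initialized on
    // every call to encodeBlock.
    const maxTableSize = 1 << 14
    var table [maxTableSize]uint16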

The point of re-using the hashtable via a sync.Pool is that it lets you avoid
zero-initializing the table on every call. However, the benchmarks are mixed:
some are better, some are worse. I think the net result is actually worse, but
this commit is provided to aid discussion of that PR and of other changes such
as https://go-review.googlesource.com/#/c/21021/ in the standard library.
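
Concretely, the change stores each table entry with an e.cur bias and bumps
e.cur after every block, so entries left behind by a previous Encode call
decode to out-of-range candidates and are rejected by an unsigned bounds check
rather than having to be zeroed. A condensed excerpt (full context in the diff
below):

    e := encPool.Get().(*encoder)                   // table holds whatever the last caller left behind
    ...
    candidate = int(e.table[nextHash]) - int(e.cur) // a stale entry yields candidate < 0
    e.table[nextHash] = int32(s) + e.cur
    if uint(candidate) < uint(s) && load32(src, s) == load32(src, candidate) {
            break                                   // matches can only come from the current block
    }
    ...
    e.cur += maxBlockSize                           // invalidates every entry written above
    encPool.Put(e)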

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-4     466.23       177.91       0.38x
BenchmarkWordsEncode1e2-4     60.08        191.31       3.18x
BenchmarkWordsEncode1e3-4     174.56       215.35       1.23x
BenchmarkWordsEncode1e4-4     172.29       166.81       0.97x
BenchmarkWordsEncode1e5-4     134.37       127.43       0.95x
BenchmarkWordsEncode1e6-4     152.59       148.35       0.97x
BenchmarkRandomEncode-4       6826.46      7249.27      1.06x
Benchmark_ZFlat0-4            310.41       299.86       0.97x
Benchmark_ZFlat1-4            198.37       165.81       0.84x
Benchmark_ZFlat2-4            8046.09      9300.88      1.16x
Benchmark_ZFlat3-4            123.41       245.15       1.99x
Benchmark_ZFlat4-4            2200.95      2346.55      1.07x
Benchmark_ZFlat5-4            307.16       294.54       0.96x
Benchmark_ZFlat6-4            136.35       130.99       0.96x
Benchmark_ZFlat7-4            130.63       125.27       0.96x
Benchmark_ZFlat8-4            143.56       137.13       0.96x
Benchmark_ZFlat9-4            125.47       120.34       0.96x
Benchmark_ZFlat10-4           365.85       348.07       0.95x
Benchmark_ZFlat11-4           200.01       194.52       0.97x
diff --git a/encode.go b/encode.go
index 1a8a821..2e255fb 100644
--- a/encode.go
+++ b/encode.go
@@ -8,6 +8,7 @@
 	"encoding/binary"
 	"errors"
 	"io"
+	"sync"
 )
 
 // maxOffset limits how far copy back-references can go, the same as the C++
@@ -104,12 +105,32 @@
 	return i + 2
 }
 
+var encPool = sync.Pool{New: func() interface{} { return &encoder{cur: maxBlockSize} }}
+
+const (
+	tableBits = 14             // Bits used in the table
+	tableSize = 1 << tableBits // Size of the table
+)
+
+type encoder struct {
+	table [tableSize]int32
+	// cur is bigger than every element of table.
+	cur int32
+}
+
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // It is valid to pass a nil dst.
 func Encode(dst, src []byte) []byte {
+	e := encPool.Get().(*encoder)
+	dst = e.encode(dst, src)
+	encPool.Put(e)
+	return dst
+}
+
+func (e *encoder) encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if len(dst) < n {
@@ -128,7 +149,7 @@
 		if len(p) < minNonLiteralBlockSize {
 			d += emitLiteral(dst[d:], p)
 		} else {
-			d += encodeBlock(dst[d:], p)
+			d += e.encodeBlock(dst[d:], p)
 		}
 	}
 	return dst[:d]
@@ -176,17 +197,13 @@
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
 // 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const maxTableSize = 1 << 14
-	shift, tableSize := uint32(32-8), 1<<8
-	for tableSize < maxTableSize && tableSize < len(src) {
+func (e *encoder) encodeBlock(dst, src []byte) (d int) {
+	shift := uint32(32 - 8)
+	for n := 1 << 8; n < tableSize && n < len(src); n *= 2 {
 		shift--
-		tableSize *= 2
 	}
-	var table [maxTableSize]uint16
+
+	// TODO: something about e.cur wrapping.
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -198,7 +215,9 @@
 
 	// The encoded form must start with a literal, as there are no previous
 	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
+	s := 0
+	e.table[hash(load32(src, s), shift)] = e.cur
+	s = 1
 	nextHash := hash(load32(src, s), shift)
 
 	for {
@@ -229,10 +248,10 @@
 			if nextS > sLimit {
 				goto emitRemainder
 			}
-			candidate = int(table[nextHash])
-			table[nextHash] = uint16(s)
+			candidate = int(e.table[nextHash]) - int(e.cur)
+			e.table[nextHash] = int32(s) + e.cur
 			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
+			if uint(candidate) < uint(s) && load32(src, s) == load32(src, candidate) {
 				break
 			}
 		}
@@ -271,10 +290,13 @@
 			// three load32 calls.
 			x := load64(src, s-1)
 			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash] = uint16(s - 1)
+			e.table[prevHash] = int32(s-1) + e.cur
 			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash])
-			table[currHash] = uint16(s)
+			candidate = int(e.table[currHash]) - int(e.cur)
+			e.table[currHash] = int32(s) + e.cur
+			if uint(candidate) >= uint(s) {
+				candidate = 0
+			}
 			if uint32(x>>8) != load32(src, candidate) {
 				nextHash = hash(uint32(x>>16), shift)
 				s++
@@ -287,6 +309,7 @@
 	if nextEmit < len(src) {
 		d += emitLiteral(dst[d:], src[nextEmit:])
 	}
+	e.cur += maxBlockSize
 	return d
 }