Re-use the hashtable.

This is a rebase of https://github.com/golang/snappy/pull/23

Just using a sync.Pool by itself didn't seem to affect the numbers much. The
table was previously stack-allocated rather than heap-allocated, so a
sync.Pool doesn't save an allocation or any GC work.
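
For context, the table in the existing code is a function-local array inside
encodeBlock (the removed lines in the diff below), so there is no per-call
heap allocation for a Pool to elide:

    // Existing code: the hash table is a local array, zero-initialized on
    // every call to encodeBlock.
    const maxTableSize = 1 << 14
    var table [maxTableSize]uint16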

The point of re-using the hashtable via a sync.Pool is that it lets you avoid
zero-initializing the table on every call. However, the benchmarks are mixed:
some are better, some are worse. I think the net result is actually worse, but
this commit is provided to aid discussion of that PR and of other changes such
as https://go-review.googlesource.com/#/c/21021/ in the standard library.
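
Concretely, the change stores each table entry with an e.cur bias and bumps
e.cur after every block, so entries left behind by a previous Encode call
decode to out-of-range candidates and are rejected by an unsigned bounds check
rather than having to be zeroed. A condensed excerpt (full context in the diff
below):

    e := encPool.Get().(*encoder)                   // table holds whatever the last caller left behind
    ...
    candidate = int(e.table[nextHash]) - int(e.cur) // a stale entry yields candidate < 0
    e.table[nextHash] = int32(s) + e.cur
    if uint(candidate) < uint(s) && load32(src, s) == load32(src, candidate) {
            break                                   // matches can only come from the current block
    }
    ...
    e.cur += maxBlockSize                           // invalidates every entry written above
    encPool.Put(e)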

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-4     466.23       177.91       0.38x
BenchmarkWordsEncode1e2-4     60.08        191.31       3.18x
BenchmarkWordsEncode1e3-4     174.56       215.35       1.23x
BenchmarkWordsEncode1e4-4     172.29       166.81       0.97x
BenchmarkWordsEncode1e5-4     134.37       127.43       0.95x
BenchmarkWordsEncode1e6-4     152.59       148.35       0.97x
BenchmarkRandomEncode-4       6826.46      7249.27      1.06x
Benchmark_ZFlat0-4            310.41       299.86       0.97x
Benchmark_ZFlat1-4            198.37       165.81       0.84x
Benchmark_ZFlat2-4            8046.09      9300.88      1.16x
Benchmark_ZFlat3-4            123.41       245.15       1.99x
Benchmark_ZFlat4-4            2200.95      2346.55      1.07x
Benchmark_ZFlat5-4            307.16       294.54       0.96x
Benchmark_ZFlat6-4            136.35       130.99       0.96x
Benchmark_ZFlat7-4            130.63       125.27       0.96x
Benchmark_ZFlat8-4            143.56       137.13       0.96x
Benchmark_ZFlat9-4            125.47       120.34       0.96x
Benchmark_ZFlat10-4           365.85       348.07       0.95x
Benchmark_ZFlat11-4           200.01       194.52       0.97x
diff --git a/encode.go b/encode.go
index 1a8a821..2e255fb 100644
--- a/encode.go
+++ b/encode.go
@@ -8,6 +8,7 @@
 	"encoding/binary"
 	"errors"
 	"io"
+	"sync"
 )
 
 // maxOffset limits how far copy back-references can go, the same as the C++
@@ -104,12 +105,32 @@
 	return i + 2
 }
 
+var encPool = sync.Pool{New: func() interface{} { return &encoder{cur: maxBlockSize} }}
+
+const (
+	tableBits = 14             // Bits used in the table
+	tableSize = 1 << tableBits // Size of the table
+)
+
+type encoder struct {
+	table [tableSize]int32
+	// cur is bigger than every element of table.
+	cur int32
+}
+
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // It is valid to pass a nil dst.
 func Encode(dst, src []byte) []byte {
+	e := encPool.Get().(*encoder)
+	dst = e.encode(dst, src)
+	encPool.Put(e)
+	return dst
+}
+
+func (e *encoder) encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if len(dst) < n {
@@ -128,7 +149,7 @@
 		if len(p) < minNonLiteralBlockSize {
 			d += emitLiteral(dst[d:], p)
 		} else {
-			d += encodeBlock(dst[d:], p)
+			d += e.encodeBlock(dst[d:], p)
 		}
 	}
 	return dst[:d]
@@ -176,17 +197,13 @@
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
 // 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const maxTableSize = 1 << 14
-	shift, tableSize := uint32(32-8), 1<<8
-	for tableSize < maxTableSize && tableSize < len(src) {
+func (e *encoder) encodeBlock(dst, src []byte) (d int) {
+	shift := uint32(32 - 8)
+	for n := 1 << 8; n < tableSize && n < len(src); n *= 2 {
 		shift--
-		tableSize *= 2
 	}
-	var table [maxTableSize]uint16
+
+	// TODO: something about e.cur wrapping.
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -198,7 +215,9 @@
 
 	// The encoded form must start with a literal, as there are no previous
 	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
+	s := 0
+	e.table[hash(load32(src, s), shift)] = e.cur
+	s = 1
 	nextHash := hash(load32(src, s), shift)
 
 	for {
@@ -229,10 +248,10 @@
 			if nextS > sLimit {
 				goto emitRemainder
 			}
-			candidate = int(table[nextHash])
-			table[nextHash] = uint16(s)
+			candidate = int(e.table[nextHash]) - int(e.cur)
+			e.table[nextHash] = int32(s) + e.cur
 			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
+			if uint(candidate) < uint(s) && load32(src, s) == load32(src, candidate) {
 				break
 			}
 		}
@@ -271,10 +290,13 @@
 			// three load32 calls.
 			x := load64(src, s-1)
 			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash] = uint16(s - 1)
+			e.table[prevHash] = int32(s-1) + e.cur
 			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash])
-			table[currHash] = uint16(s)
+			candidate = int(e.table[currHash]) - int(e.cur)
+			e.table[currHash] = int32(s) + e.cur
+			if uint(candidate) >= uint(s) {
+				candidate = 0
+			}
 			if uint32(x>>8) != load32(src, candidate) {
 				nextHash = hash(uint32(x>>16), shift)
 				s++
@@ -287,6 +309,7 @@
 	if nextEmit < len(src) {
 		d += emitLiteral(dst[d:], src[nextEmit:])
 	}
+	e.cur += maxBlockSize
 	return d
 }