Migrate to containerd v1.7.0 and update dependencies

* Updates containerd to v1.7.0 and adds a new binary for 32-bit
Arm OSes.
* Updates Go dependencies - both OpenFaaS and external

Signed-off-by: Alex Ellis (OpenFaaS Ltd) <alexellis2@gmail.com>
Author: Alex Ellis (OpenFaaS Ltd)
Date: 2023-03-19 10:55:53 +00:00
Committed by: Alex Ellis
Parent commit: 9efd019e86
This commit: c41c2cd9fc
1133 changed files with 104391 additions and 75499 deletions

View File

@ -3,7 +3,7 @@
before:
hooks:
- ./gen.sh
- go install mvdan.cc/garble@latest
- go install mvdan.cc/garble@v0.9.3
builds:
-

View File

@ -9,7 +9,6 @@ This package provides various compression algorithms.
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
@ -17,6 +16,77 @@ This package provides various compression algorithms.
# changelog
* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)
* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776
* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767
* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766
* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773
* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774
* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)
* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support. https://github.com/klauspost/compress/pull/685
* s2: Add Compression Size Estimate. https://github.com/klauspost/compress/pull/752
* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755
* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748
* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747
* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746
* Jan 21st, 2023 (v1.15.15)
* deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739
* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728
* zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745
* gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740
* Jan 3rd, 2023 (v1.15.14)
* flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718
* zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720
* export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722
* s2: Add example for indexing an existing stream https://github.com/klauspost/compress/pull/723
* Dec 11, 2022 (v1.15.13)
* zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder https://github.com/klauspost/compress/pull/691
* zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708
* Oct 26, 2022 (v1.15.12)
* zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680
* gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683
* Sept 26, 2022 (v1.15.11)
* flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678
* zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677
* zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668
* zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667
* Sept 16, 2022 (v1.15.10)
* zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649
* Add Go 1.19 - deprecate Go 1.16 https://github.com/klauspost/compress/pull/651
* flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656
* zstd: Improve "better" compression https://github.com/klauspost/compress/pull/657
* s2: Improve "best" compression https://github.com/klauspost/compress/pull/658
* s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635
* s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646
* Use arrays for constant size copies https://github.com/klauspost/compress/pull/659
* July 21, 2022 (v1.15.9)
* zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645
* zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644
* zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643
* July 13, 2022 (v1.15.8)
* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
* June 29, 2022 (v1.15.7)
* s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
@ -81,15 +151,15 @@ This package provides various compression algorithms.
* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
<details>
<summary>See Details</summary>
Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
Stream decompression is now faster in asynchronous mode, since the goroutine allocation splits the workload much more effectively. On typical streams this will use 2 cores fully for decompression. When a stream has finished decoding, no goroutines are left over, so decoders can now safely be pooled and still be garbage collected.
While the release has been extensively tested, it is recommended to test when upgrading.
</details>
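As a hedged illustration of the synchronous mode described above (using the zstd package's concurrency options; the sample data is illustrative), setting concurrency to 1 on both encoder and decoder keeps all work on the calling goroutine:

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Encode with concurrency 1: no encoder goroutines are spawned.
	var buf bytes.Buffer
	enc, _ := zstd.NewWriter(&buf, zstd.WithEncoderConcurrency(1))
	_, _ = enc.Write([]byte("hello, synchronous zstd"))
	_ = enc.Close()

	// Decode with concurrency 1: the decoder also runs synchronously, so
	// it can be pooled and garbage collected with no goroutines left over.
	dec, _ := zstd.NewReader(&buf, zstd.WithDecoderConcurrency(1))
	defer dec.Close()
	out, _ := io.ReadAll(dec)
	fmt.Println(string(out))
}
```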
<details>
<summary>See changes to v1.14.x</summary>
* Feb 22, 2022 (v1.14.4)
* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
@ -115,6 +185,7 @@ While the release has been extensively tested, it is recommended to test when
* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
</details>
<details>
<summary>See changes to v1.13.x</summary>

View File

@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
c1.encodeZero(tt[src[ip-2]])
ip -= 2
}
src = src[:ip]
// Main compression loop.
switch {
case !s.zeroBits && s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush.
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case !s.zeroBits:
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
s.bw.flush32()
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
default:
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
s.bw.flush32()
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
}
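The rewrite above replaces the `ip` index with re-slicing of `src`; a minimal, hedged sketch of that loop shape (names are illustrative, not part of the package):

```go
// consumeReverse walks src from the end in groups of four by re-slicing,
// mirroring the loop above. Keeping the bound inside len(src) lets the
// compiler hoist the bounds checks out of the loop body.
func consumeReverse(src []byte, emit func(byte)) {
	for ; len(src) >= 4; src = src[:len(src)-4] {
		v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
		emit(v0)
		emit(v1)
		emit(v2)
		emit(v3)
	}
	// The caller handles the remaining 0-3 bytes, as compress() does
	// before entering the main loop.
}
```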
@ -459,15 +456,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
for _, v := range in {
s.count[v]++
}
m := uint32(0)
m, symlen := uint32(0), s.symbolLen
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
}
symlen = uint16(i) + 1
}
s.symbolLen = symlen
return int(m)
}

View File

@ -260,7 +260,9 @@ func (s *Scratch) buildDtable() error {
// If the buffer is over-read an error is returned.
func (s *Scratch) decompress() error {
br := &s.bits
br.init(s.br.unread())
if err := br.init(s.br.unread()); err != nil {
return err
}
var s1, s2 decoder
// Initialize and decode first state and symbol.

View File

@ -67,7 +67,6 @@ func (b *bitReaderBytes) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@ -88,8 +87,7 @@ func (b *bitReaderBytes) fill() {
return
}
if b.off > 4 {
v := b.in[b.off-4:]
v = v[:4]
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@ -179,7 +177,6 @@ func (b *bitReaderShifted) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
@ -200,8 +197,7 @@ func (b *bitReaderShifted) fill() {
return
}
if b.off > 4 {
v := b.in[b.off-4:]
v = v[:4]
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
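The fill paths above hand-roll a 4-byte little-endian load from a full slice expression; a hedged standalone equivalent using encoding/binary (names illustrative):

```go
import "encoding/binary"

// fill32 mirrors the fill step above: the full slice expression
// in[off-4:off] fixes the window length, so the load compiles to a
// single 32-bit little-endian read with one bounds check.
func fill32(in []byte, off uint, value *uint64, bitsRead *uint8) {
	low := binary.LittleEndian.Uint32(in[off-4 : off])
	*value |= uint64(low) << ((*bitsRead - 32) & 63)
	*bitsRead -= 32
}
```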

View File

@ -60,6 +60,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
// encFourSymbols adds up to 32 bits from four symbols.
// It will not check if there is space for them,
// so the caller must ensure that b has been flushed recently.
func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
bitsA := encA.nBits
bitsB := bitsA + encB.nBits
bitsC := bitsB + encC.nBits
bitsD := bitsC + encD.nBits
combined := uint64(encA.val) |
(uint64(encB.val) << (bitsA & 63)) |
(uint64(encC.val) << (bitsB & 63)) |
(uint64(encD.val) << (bitsC & 63))
b.bitContainer |= combined << (b.nBits & 63)
b.nBits += bitsD
}
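A hedged worked example of the packing performed by encFourSymbols above, with concrete code widths (values illustrative):

```go
// Four codes of 5, 6, 4 and 7 bits are shifted past one another and OR'ed
// into one value, so only a single append to bitContainer is needed. The
// total here is 22 bits; since up to 32 bits can accumulate, the caller
// must flush32() first.
func packFourExample() (combined uint64, width uint8) {
	type entry struct {
		val   uint16
		nBits uint8
	}
	a, b, c, d := entry{0x1F, 5}, entry{0x2A, 6}, entry{0x0F, 4}, entry{0x55, 7}
	bitsA := a.nBits
	bitsB := bitsA + b.nBits
	bitsC := bitsB + c.nBits
	combined = uint64(a.val) |
		uint64(b.val)<<(bitsA&63) |
		uint64(c.val)<<(bitsB&63) |
		uint64(d.val)<<(bitsC&63)
	return combined, bitsC + d.nBits // 22 bits total, safely under 32
}
```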
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {

View File

@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
tmp := src[n : n+4]
// tmp should be len 4
bw.flush32()
bw.encTwoSymbols(cTable, tmp[3], tmp[2])
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
}
} else {
for ; n >= 0; n -= 4 {
@ -365,29 +364,29 @@ func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
m := uint32(0)
if len(s.prevTable) > 0 {
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
if i >= len(s.prevTable) {
reuse = false
} else {
if s.prevTable[i].nBits == 0 {
reuse = false
}
}
s.symbolLen = uint16(i) + 1
if i >= len(s.prevTable) {
reuse = false
} else if s.prevTable[i].nBits == 0 {
reuse = false
}
}
return int(m), reuse
}
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
}
s.symbolLen = uint16(i) + 1
}
return int(m), false
}
@ -484,34 +483,35 @@ func (s *Scratch) buildCTable() error {
// Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1]
for huffNode[nonNullRank].count == 0 {
for huffNode[nonNullRank].count() == 0 {
nonNullRank--
}
lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1
lowN := nodeNb
huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
huffNode[lowS].setParent(nodeNb)
huffNode[lowS-1].setParent(nodeNb)
nodeNb++
lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ {
huffNode[n].count = 1 << 30
huffNode[n].setCount(1 << 30)
}
// fake entry, strong barrier
huffNode0[0].count = 1 << 31
huffNode0[0].setCount(1 << 31)
// create parents
for nodeNb <= nodeRoot {
var n1, n2 int16
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS
lowS--
} else {
n1 = lowN
lowN++
}
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS
lowS--
} else {
@ -519,18 +519,19 @@ func (s *Scratch) buildCTable() error {
lowN++
}
huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
huffNode0[n1+1].setParent(nodeNb)
huffNode0[n2+1].setParent(nodeNb)
nodeNb++
}
// distribute weights (unlimited tree height)
huffNode[nodeRoot].nbBits = 0
huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
for n := uint16(0); n <= nonNullRank; n++ {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog
@ -542,7 +543,7 @@ func (s *Scratch) buildCTable() error {
var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] {
nbPerRank[v.nbBits]++
nbPerRank[v.nbBits()]++
}
// determine starting value per rank
{
@ -557,7 +558,7 @@ func (s *Scratch) buildCTable() error {
// push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] {
s.cTable[v.symbol].nBits = v.nbBits
s.cTable[v.symbol()].nBits = v.nbBits()
}
// assign value within rank, symbol order
@ -603,12 +604,12 @@ func (s *Scratch) huffSort() {
pos := rank[r].current
rank[r].current++
prev := nodes[(pos-1)&huffNodesMask]
for pos > rank[r].base && c > prev.count {
for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev
pos--
prev = nodes[(pos-1)&huffNodesMask]
}
nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
}
}
@ -617,7 +618,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen]
largestBits := huffNode[lastNonNull].nbBits
largestBits := huffNode[lastNonNull].nbBits()
// early exit : no elt > maxNbBits
if largestBits <= maxNbBits {
@ -627,14 +628,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull)
for huffNode[n].nbBits > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
huffNode[n].nbBits = maxNbBits
for huffNode[n].nbBits() > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
huffNode[n].setNbBits(maxNbBits)
n--
}
// n stops at huffNode[n].nbBits <= maxNbBits
for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
// n ends at index of smallest symbol using < maxNbBits
@ -655,10 +656,10 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
{
currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- {
if huffNode[pos].nbBits >= currentNbBits {
if huffNode[pos].nbBits() >= currentNbBits {
continue
}
currentNbBits = huffNode[pos].nbBits // < maxNbBits
currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos)
}
}
@ -675,8 +676,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
if lowPos == noSymbol {
break
}
highTotal := huffNode[highPos].count
lowTotal := 2 * huffNode[lowPos].count
highTotal := huffNode[highPos].count()
lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal {
break
}
@ -692,13 +693,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
// this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
}
huffNode[rankLast[nBitsToDecrease]].nbBits++
huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol
} else {
rankLast[nBitsToDecrease]--
if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
}
}
@ -706,15 +708,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
huffNode[n+1].nbBits--
huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1
totalCost++
continue
}
huffNode[rankLast[1]+1].nbBits--
huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++
totalCost++
}
@ -722,9 +724,26 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
return maxNbBits
}
type nodeElt struct {
count uint32
parent uint16
symbol byte
nbBits uint8
// A nodeElt is the fields
//
// count uint32
// parent uint16
// symbol byte
// nbBits uint8
//
// in some order, all squashed into an integer so that the compiler
// always loads and stores entire nodeElts instead of separate fields.
type nodeElt uint64
func makeNodeElt(count uint32, symbol byte) nodeElt {
return nodeElt(count) | nodeElt(symbol)<<48
}
func (e *nodeElt) count() uint32 { return uint32(*e) }
func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
func (e *nodeElt) symbol() byte { return byte(*e >> 48) }
func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) }
func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
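A quick hedged sanity check of the packed layout (assumes the accessors defined above and an fmt import; values are illustrative):

```go
// Layout: count in bits 0-31, parent in 32-47, symbol in 48-55, nbBits in 56-63.
func nodeEltRoundTrip() {
	e := makeNodeElt(1<<20, 0xAB)
	e.setParent(42)
	e.setNbBits(11)
	fmt.Println(e.count(), e.parent(), e.symbol(), e.nbBits())
	// prints: 1048576 42 171 11
}
```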

View File

@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
b, err := fse.Decompress(in[:iSize], s.fse)
s.fse.Out = nil
if err != nil {
return s, nil, err
return s, nil, fmt.Errorf("fse decompress returned: %w", err)
}
if len(b) > 255 {
return s, nil, errors.New("corrupt input: output table too large")
@ -763,17 +763,20 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {
@ -997,17 +1000,22 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
// copy(out[dstEvery*3:], buf[3][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {

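Both hunks above replace four `copy` calls with direct array stores; a hedged standalone sketch of the conversion (Go 1.17+; `bufoff` is 256 in this package):

```go
const bufoff = 256

// storeFour writes four decoded buffers with single array stores.
// (*[bufoff]byte)(out) converts the slice to an array pointer; unlike
// copy, it panics if len(out) < bufoff, which is why the stream-overrun
// check now runs before the stores instead of after them.
func storeFour(out []byte, buf *[4][bufoff]byte, dstEvery int) {
	*(*[bufoff]byte)(out) = buf[0]
	*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
	*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
	*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
}
```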
View File

@ -14,12 +14,14 @@ import (
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
@ -145,11 +147,13 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
// decompress1x_main_loop_amd64 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)

View File

@ -1,364 +1,352 @@
// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI
MOVQ 16(AX), SI
MOVQ 48(AX), BX
MOVQ 24(AX), R9
MOVQ 32(AX), R10
MOVQ (AX), R11
MOVQ 16(AX), BX
MOVQ 48(AX), SI
MOVQ 24(AX), R8
MOVQ 32(AX), R9
MOVQ (AX), R10
// Main loop
main_loop:
MOVQ SI, R8
CMPQ R8, BX
XORL DX, DX
CMPQ BX, SI
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
MOVQ 32(R10), R11
MOVBQZX 40(R10), R12
CMPQ R12, $0x20
JBE skip_fill0
MOVQ 24(R11), AX
SUBQ $0x20, R13
MOVQ 24(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ (R11), R14
MOVQ (R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 24(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 24(R10)
ORQ R13, R11
// exhausted = exhausted || (br0.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br0.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
MOVQ R11, 32(R10)
MOVB R12, 40(R10)
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
MOVQ 80(R10), R11
MOVBQZX 88(R10), R12
CMPQ R12, $0x20
JBE skip_fill1
MOVQ 72(R11), AX
SUBQ $0x20, R13
MOVQ 72(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 48(R11), R14
MOVQ 48(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 72(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 72(R10)
ORQ R13, R11
// exhausted = exhausted || (br1.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br1.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)(R8*1)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
MOVQ R11, 80(R10)
MOVB R12, 88(R10)
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
MOVQ 128(R10), R11
MOVBQZX 136(R10), R12
CMPQ R12, $0x20
JBE skip_fill2
MOVQ 120(R11), AX
SUBQ $0x20, R13
MOVQ 120(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 96(R11), R14
MOVQ 96(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 120(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 120(R10)
ORQ R13, R11
// exhausted = exhausted || (br2.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br2.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)(R8*2)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
MOVQ R11, 128(R10)
MOVB R12, 136(R10)
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
MOVQ 176(R10), R11
MOVBQZX 184(R10), R12
CMPQ R12, $0x20
JBE skip_fill3
MOVQ 168(R11), AX
SUBQ $0x20, R13
MOVQ 168(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 144(R11), R14
MOVQ 144(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 168(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 168(R10)
ORQ R13, R11
// exhausted = exhausted || (br3.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br3.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
LEAQ (R8)(R8*2), CX
MOVW AX, (BX)(CX*1)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x02, SI
MOVQ R11, 176(R10)
MOVB R12, 184(R10)
ADDQ $0x02, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
SUBQ 16(AX), SI
SHLQ $0x02, SI
MOVQ SI, 40(AX)
SUBQ 16(AX), BX
SHLQ $0x02, BX
MOVQ BX, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
MOVQ 24(CX), R9
MOVQ 32(CX), R10
MOVQ (CX), R11
MOVQ 24(CX), R8
MOVQ 32(CX), R9
MOVQ (CX), R10
// Main loop
main_loop:
MOVQ BX, R8
CMPQ R8, SI
XORL DX, DX
CMPQ BX, SI
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
MOVQ 32(R10), R11
MOVBQZX 40(R10), R12
CMPQ R12, $0x20
JBE skip_fill0
MOVQ 24(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ (R11), R15
MOVQ 24(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ (R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 24(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 24(R10)
ORQ R14, R11
// exhausted = exhausted || (br0.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br0.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -366,88 +354,86 @@ skip_fill0:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
MOVQ R11, 32(R10)
MOVB R12, 40(R10)
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
MOVQ 80(R10), R11
MOVBQZX 88(R10), R12
CMPQ R12, $0x20
JBE skip_fill1
MOVQ 72(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 48(R11), R15
MOVQ 72(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 48(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 72(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 72(R10)
ORQ R14, R11
// exhausted = exhausted || (br1.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br1.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -455,88 +441,86 @@ skip_fill1:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)(R8*1)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
MOVQ R11, 80(R10)
MOVB R12, 88(R10)
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
MOVQ 128(R10), R11
MOVBQZX 136(R10), R12
CMPQ R12, $0x20
JBE skip_fill2
MOVQ 120(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 96(R11), R15
MOVQ 120(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 96(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 120(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 120(R10)
ORQ R14, R11
// exhausted = exhausted || (br2.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br2.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -544,88 +528,86 @@ skip_fill2:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)(R8*2)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
MOVQ R11, 128(R10)
MOVB R12, 136(R10)
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
MOVQ 176(R10), R11
MOVBQZX 184(R10), R12
CMPQ R12, $0x20
JBE skip_fill3
MOVQ 168(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 144(R11), R15
MOVQ 168(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 144(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 168(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 168(R10)
ORQ R14, R11
// exhausted = exhausted || (br3.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br3.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -633,11 +615,12 @@ skip_fill3:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
LEAQ (R8)(R8*2), CX
MOVL AX, (BX)(CX*1)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
MOVQ R11, 176(R10)
MOVB R12, 184(R10)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
@ -653,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@ -668,7 +651,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@ -745,7 +728,7 @@ loop_condition:
RET
// Report error
error_max_decoded_size_exeeded:
error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
@ -758,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@ -773,7 +756,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@ -840,7 +823,7 @@ loop_condition:
RET
// Report error
error_max_decoded_size_exeeded:
error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)

View File

@ -122,17 +122,21 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
//copy(out[dstEvery*3:], buf[3][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {

View File

@ -18,6 +18,7 @@ func load64(b []byte, i int) uint64 {
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
//
// dst is long enough to hold the encoded bytes
// 1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
@ -42,6 +43,7 @@ func emitLiteral(dst, lit []byte) int {
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= 65535
// 4 <= length && length <= 65535
@ -89,6 +91,7 @@ func emitCopy(dst []byte, offset, length int) int {
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
//
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
@ -100,13 +103,36 @@ func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// EncodeBlockInto exposes encodeBlock but checks dst size.
func EncodeBlockInto(dst, src []byte) (d int) {
if MaxEncodedLen(len(src)) > len(dst) {
return 0
}
// encodeBlock breaks on too big blocks, so split.
for len(src) > 0 {
p := src
src = nil
if len(p) > maxBlockSize {
p, src = p[:maxBlockSize], p[maxBlockSize:]
}
if len(p) < minNonLiteralBlockSize {
d += emitLiteral(dst[d:], p)
} else {
d += encodeBlock(dst[d:], p)
}
}
return d
}
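A hedged usage sketch for EncodeBlockInto; MaxEncodedLen is assumed to be the sizing helper from the same package:

```go
// encodeWhole sizes dst up front. EncodeBlockInto returns 0 when dst is
// too small, and transparently splits src into maxBlockSize chunks.
func encodeWhole(src []byte) []byte {
	dst := make([]byte, MaxEncodedLen(len(src)))
	n := EncodeBlockInto(dst, src)
	return dst[:n]
}
```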
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)

View File

@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen
Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
## Installation
Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.

View File

@ -9,8 +9,8 @@ import (
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
@ -83,8 +83,9 @@ type blockDec struct {
err error
// Check against this crc
checkCRC []byte
// Check against this crc, if hasCRC is true.
checkCRC uint32
hasCRC bool
// Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame.
@ -192,16 +193,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
// Read block data.
if cap(b.dataStorage) < cSize {
if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
// byteBuf doesn't need a destination buffer.
if b.lowMem || cSize > maxCompressedBlockSize {
b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil {
if debugDecoder {
@ -210,6 +209,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
return err
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
return nil
}
@ -233,7 +235,7 @@ func (b *blockDec) decodeBuf(hist *history) error {
if b.lowMem {
b.dst = make([]byte, b.RLESize)
} else {
b.dst = make([]byte, maxBlockSize)
b.dst = make([]byte, maxCompressedBlockSize)
}
}
b.dst = b.dst[:b.RLESize]
@ -441,6 +443,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
}
var err error
if debugDecoder {
println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
}
huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil {
println("reading huffman table:", err)
@ -651,7 +656,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
buf.Write(in)
ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
}
return nil

View File

@ -7,7 +7,6 @@ package zstd
import (
"fmt"
"io"
"io/ioutil"
)
type byteBuffer interface {
@ -23,7 +22,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
skipN(n int) error
skipN(n int64) error
}
// in-memory buffer
@ -55,16 +54,19 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
return 0, nil
return 0, io.ErrUnexpectedEOF
}
r := bb[0]
*b = bb[1:]
return r, nil
}
func (b *byteBuf) skipN(n int) error {
func (b *byteBuf) skipN(n int64) error {
bb := *b
if len(bb) < n {
if n < 0 {
return fmt.Errorf("negative skip (%d) requested", n)
}
if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@ -120,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
func (r *readerWrapper) skipN(n int) error {
n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
if n2 != int64(n) {
func (r *readerWrapper) skipN(n int64) error {
n2, err := io.CopyN(io.Discard, r.r, n)
if n2 != n {
err = io.ErrUnexpectedEOF
}
return err
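For context on the changes above: io/ioutil has been deprecated since Go 1.16, and the replacements are drop-in. A hedged sketch (file path illustrative):

```go
package main

import (
	"io"
	"os"
	"strings"
)

func main() {
	// was: ioutil.WriteFile(name, data, perm)
	_ = os.WriteFile("/tmp/example.txt", []byte("data"), 0o644)
	// was: io.CopyN(ioutil.Discard, r, n)
	_, _ = io.CopyN(io.Discard, strings.NewReader("skip me"), 4)
}
```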

View File

@ -4,7 +4,6 @@
package zstd
import (
"bytes"
"encoding/binary"
"errors"
"io"
@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error {
}
h.HeaderSize += 4
b, in := in[:4], in[4:]
if !bytes.Equal(b, frameMagic) {
if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
if string(b) != frameMagic {
if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch
}
if len(in) < 4 {
@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:size], in[size:]
h.HeaderSize += int(size)
switch size {
switch len(b) {
case 1:
h.DictionaryID = uint32(b[0])
case 2:
@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:fcsSize], in[fcsSize:]
h.HeaderSize += int(fcsSize)
switch fcsSize {
switch len(b) {
case 1:
h.FrameContentSize = uint64(b[0])
case 2:

View File

@ -5,7 +5,6 @@
package zstd
import (
"bytes"
"context"
"encoding/binary"
"io"
@ -35,13 +34,13 @@ type Decoder struct {
br readerWrapper
enabled bool
inFrame bool
dstBuf []byte
}
frame *frameDec
// Custom dictionaries.
// Always uses copies.
dicts map[uint32]dict
dicts map[uint32]*dict
// streamWg is the waitgroup for all streams
streamWg sync.WaitGroup
@ -103,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
}
// Transfer option dicts.
d.dicts = make(map[uint32]dict, len(d.o.dicts))
d.dicts = make(map[uint32]*dict, len(d.o.dicts))
for _, dc := range d.o.dicts {
d.dicts[dc.id] = dc
}
@ -187,21 +186,23 @@ func (d *Decoder) Reset(r io.Reader) error {
}
// If bytes buffer and < 5MB, do sync decoding anyway.
if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
bb2 := bb
if debugDecoder {
println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
}
b := bb2.Bytes()
var dst []byte
if cap(d.current.b) > 0 {
dst = d.current.b
if cap(d.syncStream.dstBuf) > 0 {
dst = d.syncStream.dstBuf[:0]
}
dst, err := d.DecodeAll(b, dst[:0])
dst, err := d.DecodeAll(b, dst)
if err == nil {
err = io.EOF
}
// Save output buffer
d.syncStream.dstBuf = dst
d.current.b = dst
d.current.err = err
d.current.flushed = true
@ -216,6 +217,7 @@ func (d *Decoder) Reset(r io.Reader) error {
d.current.err = nil
d.current.flushed = false
d.current.d = nil
d.syncStream.dstBuf = nil
// Ensure no-one else is still running...
d.streamWg.Wait()
@ -312,6 +314,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
// Grab a block decoder and frame decoder.
block := <-d.decoders
frame := block.localFrame
initialSize := len(dst)
defer func() {
if debugDecoder {
printf("re-adding decoder: %p", block)
@ -337,21 +340,26 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
return dst, err
}
if frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
return nil, ErrUnknownDictionary
}
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(&dict)
if err = d.setDict(frame); err != nil {
return nil, err
}
if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
}
return dst, ErrWindowSizeExceeded
}
if frame.FrameContentSize != fcsUnknown {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
@ -361,7 +369,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
}
if cap(dst) == 0 {
if cap(dst) == 0 && !d.o.limitToCap {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
size := len(input) * 2
@ -379,6 +387,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
if err != nil {
return dst, err
}
if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
return dst, ErrDecoderSizeExceeded
}
if len(frame.bBuf) == 0 {
if debugDecoder {
println("frame dbuf empty")
@ -439,7 +450,11 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
if !d.o.ignoreChecksum && len(next.b) > 0 {
if d.o.ignoreChecksum {
return true
}
if len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
@ -447,18 +462,16 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
}
}
}
if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
got := d.current.crc.Sum64()
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if next.err == nil && next.d != nil && next.d.hasCRC {
got := uint32(d.current.crc.Sum64())
if got != next.d.checkCRC {
if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
}
d.current.err = ErrCRCMismatch
} else {
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
}
}
@ -474,18 +487,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame {
d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br)
if d.current.err == nil {
d.current.err = d.setDict(d.frame)
}
if d.current.err != nil {
return false
}
if d.frame.DictionaryID != nil {
dict, ok := d.dicts[*d.frame.DictionaryID]
if !ok {
d.current.err = ErrUnknownDictionary
return false
} else {
d.frame.history.setDict(&dict)
}
}
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded
return false
@ -664,6 +671,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
hist.reset()
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
hist.windowSize = block.async.newHist.windowSize
@ -695,6 +703,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
seqExecute <- block
}
close(seqExecute)
hist.reset()
}()
var wg sync.WaitGroup
@ -718,6 +727,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 2: new history")
}
hist.reset()
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
if block.async.newHist.dict != nil {
@ -747,7 +757,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if block.lowMem {
block.dst = make([]byte, block.RLESize)
} else {
block.dst = make([]byte, maxBlockSize)
block.dst = make([]byte, maxCompressedBlockSize)
}
}
block.dst = block.dst[:block.RLESize]
@ -799,13 +809,14 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("decoder goroutines finished")
}
hist.reset()
}()
var hist history
decodeStream:
for {
var hist history
var hasErr bool
hist.reset()
decodeBlock := func(block *blockDec) {
if hasErr {
if block != nil {
@ -840,15 +851,14 @@ decodeStream:
if debugDecoder && err != nil {
println("Frame decoder returned", err)
}
if err == nil && frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
err = ErrUnknownDictionary
} else {
frame.history.setDict(&dict)
}
if err == nil {
err = d.setDict(frame)
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
}
err = ErrDecoderSizeExceeded
}
if err != nil {
@ -890,18 +900,22 @@ decodeStream:
println("next block returned error:", err)
}
dec.err = err
dec.checkCRC = nil
dec.hasCRC = false
if dec.Last && frame.HasCheckSum && err == nil {
crc, err := frame.rawInput.readSmall(4)
if err != nil {
if len(crc) < 4 {
if err == nil {
err = io.ErrUnexpectedEOF
}
println("CRC missing?", err)
dec.err = err
}
var tmp [4]byte
copy(tmp[:], crc)
dec.checkCRC = tmp[:]
if debugDecoder {
println("found crc to check:", dec.checkCRC)
} else {
dec.checkCRC = binary.LittleEndian.Uint32(crc)
dec.hasCRC = true
if debugDecoder {
printf("found crc to check: %08x\n", dec.checkCRC)
}
}
}
err = dec.err
@ -917,5 +931,23 @@ decodeStream:
}
close(seqDecode)
wg.Wait()
hist.reset()
d.frame.history.b = frameHistCache
}
func (d *Decoder) setDict(frame *frameDec) (err error) {
dict, ok := d.dicts[frame.DictionaryID]
if ok {
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(dict)
} else if frame.DictionaryID != 0 {
// A zero or missing dictionary id is ambiguous:
// either dictionary zero, or no dictionary. In particular,
// zstd --patch-from uses this id for the source file,
// so only return an error if the dictionary id is not zero.
err = ErrUnknownDictionary
}
return err
}
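
The rule above is easy to mis-read, so here is a standalone sketch of the same lookup (illustrative code, not part of this change; the map and error value stand in for d.dicts and ErrUnknownDictionary):

```go
package main

import (
	"errors"
	"fmt"
)

var errUnknownDictionary = errors.New("unknown dictionary")

// resolve mirrors setDict: a registered ID wins, an unregistered non-zero
// ID is an error, and ID 0 falls back to "no dictionary".
func resolve(dicts map[uint32][]byte, id uint32) ([]byte, error) {
	if d, ok := dicts[id]; ok {
		return d, nil
	}
	if id != 0 {
		return nil, errUnknownDictionary
	}
	return nil, nil
}

func main() {
	dicts := map[uint32][]byte{7: []byte("dict-7")}
	for _, id := range []uint32{7, 0, 9} {
		d, err := resolve(dicts, id)
		fmt.Printf("id=%d dict=%q err=%v\n", id, d, err)
	}
}
```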


@ -6,6 +6,8 @@ package zstd
import (
"errors"
"fmt"
"math/bits"
"runtime"
)
@ -14,20 +16,23 @@ type DOption func(*decoderOptions) error
// options retains accumulated state of multiple options.
type decoderOptions struct {
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
ignoreChecksum bool
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []*dict
ignoreChecksum bool
limitToCap bool
decodeBufsBelow int
}
func (o *decoderOptions) setDefault() {
*o = decoderOptions{
// use less ram: true for now, but may change.
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
decodeBufsBelow: 128 << 10,
}
if o.concurrent > 4 {
o.concurrent = 4
@ -82,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
}
// WithDecoderDicts allows registering one or more dictionaries for the decoder.
// If several dictionaries with the same ID is provided the last one will be used.
//
// Each slice in dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// If several dictionaries with the same ID are provided, the last one will be used.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error {
for _, b := range dicts {
@ -90,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
if err != nil {
return err
}
o.dicts = append(o.dicts, *d)
o.dicts = append(o.dicts, d)
}
return nil
}
}
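
A minimal usage sketch for the option above; `dictionary.bin` is a hypothetical path assumed to hold a dictionary produced by `zstd --train`:

```go
package main

import (
	"fmt"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Hypothetical path; must contain a trained zstd dictionary.
	dictBytes, err := os.ReadFile("dictionary.bin")
	if err != nil {
		panic(err)
	}
	dec, err := zstd.NewReader(nil, zstd.WithDecoderDicts(dictBytes))
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	var frame []byte // a frame compressed with the same dictionary
	out, err := dec.DecodeAll(frame, nil)
	fmt.Println(len(out), err)
}
```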
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
// The slice content can be arbitrary data.
func WithDecoderDictRaw(id uint32, content []byte) DOption {
return func(o *decoderOptions) error {
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
}
o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
return nil
}
}
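
A sketch of registering a raw (content-only) dictionary; the ID and content are arbitrary values chosen for illustration and must match what the encoder used:

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	raw := []byte("shared boilerplate both sides agree on") // arbitrary bytes
	dec, err := zstd.NewReader(nil, zstd.WithDecoderDictRaw(42, raw))
	if err != nil {
		panic(err)
	}
	defer dec.Close()
	fmt.Println("ready to decode frames written with raw dictionary ID 42")
}
```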
// WithDecoderMaxWindow allows setting a maximum window size for decodes.
// This allows rejecting frames that would require large amounts of memory.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
@ -114,6 +137,29 @@ func WithDecoderMaxWindow(size uint64) DOption {
}
}
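
A sketch of combining the window limit with a total memory limit when decoding untrusted input (the 8 MiB and 64 MiB figures are illustrative):

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	dec, err := zstd.NewReader(nil,
		zstd.WithDecoderMaxWindow(8<<20),  // reject windows above 8 MiB
		zstd.WithDecoderMaxMemory(64<<20), // cap total decoded size
	)
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	var frame []byte // untrusted compressed input
	_, err = dec.DecodeAll(frame, nil)
	fmt.Println(err) // ErrWindowSizeExceeded for frames demanding a larger window
}
```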
// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes,
// or any size set in WithDecoderMaxMemory.
// This can be used to limit decoding to a specific maximum output size.
// Disabled by default.
func WithDecodeAllCapLimit(b bool) DOption {
return func(o *decoderOptions) error {
o.limitToCap = b
return nil
}
}
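
A sketch of bounding DecodeAll output via dst capacity, as the option above describes (the 1 MiB cap is an illustrative value):

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	dec, err := zstd.NewReader(nil, zstd.WithDecodeAllCapLimit(true))
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	var frame []byte              // some compressed frame
	dst := make([]byte, 0, 1<<20) // output may not grow beyond cap(dst)
	out, err := dec.DecodeAll(frame, dst)
	// err is ErrDecoderSizeExceeded if the frame decodes to more than 1 MiB.
	fmt.Println(len(out), err)
}
```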
// WithDecodeBuffersBelow will fully decode readers that have a
// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer.
// This typically uses fewer allocations, but the full decompressed object will be held in memory.
// Note that enabling DecodeAllCapLimit disables this, as does setting a size of 0 or less.
// Default is 128KiB.
func WithDecodeBuffersBelow(size int) DOption {
return func(o *decoderOptions) error {
o.decodeBufsBelow = size
return nil
}
}
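
A sketch of the fast path described above: a *bytes.Buffer exposes Bytes() and Len(), so payloads under the threshold can be decoded in one shot:

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/klauspost/compress/zstd"
)

func main() {
	dec, err := zstd.NewReader(nil, zstd.WithDecodeBuffersBelow(256<<10))
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	var compressed bytes.Buffer // assumed to hold a zstd frame
	if err := dec.Reset(&compressed); err != nil {
		panic(err)
	}
	out, err := io.ReadAll(dec)
	fmt.Println(len(out), err)
}
```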
// IgnoreChecksum allows forcibly ignoring checksum checking.
func IgnoreChecksum(b bool) DOption {
return func(o *decoderOptions) error {


@ -1,7 +1,6 @@
package zstd
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
@ -20,7 +19,10 @@ type dict struct {
content []byte
}
var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
const dictMagic = "\x37\xa4\x30\xec"
// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
const dictMaxLength = 1 << 31
// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
@ -30,14 +32,38 @@ func (d *dict) ID() uint32 {
return d.id
}
// DictContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) DictContentSize() int {
// ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) ContentSize() int {
if d == nil {
return 0
}
return len(d.content)
}
// Content returns the dictionary content.
func (d *dict) Content() []byte {
if d == nil {
return nil
}
return d.content
}
// Offsets returns the initial offsets.
func (d *dict) Offsets() [3]int {
if d == nil {
return [3]int{}
}
return d.offsets
}
// LitEncoder returns the literal encoder.
func (d *dict) LitEncoder() *huff0.Scratch {
if d == nil {
return nil
}
return d.litEnc
}
// Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) {
@ -50,7 +76,7 @@ func loadDict(b []byte) (*dict, error) {
ofDec: sequenceDec{fse: &fseDecoder{}},
mlDec: sequenceDec{fse: &fseDecoder{}},
}
if !bytes.Equal(b[:4], dictMagic[:]) {
if string(b[:4]) != dictMagic {
return nil, ErrMagicMismatch
}
d.id = binary.LittleEndian.Uint32(b[4:8])
@ -62,7 +88,7 @@ func loadDict(b []byte) (*dict, error) {
var err error
d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
if err != nil {
return nil, err
return nil, fmt.Errorf("loading literal table: %w", err)
}
d.litEnc.Reuse = huff0.ReusePolicyMust
@ -120,3 +146,16 @@ func loadDict(b []byte) (*dict, error) {
return &d, nil
}
// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
func InspectDictionary(b []byte) (interface {
ID() uint32
ContentSize() int
Content() []byte
Offsets() [3]int
LitEncoder() *huff0.Scratch
}, error) {
initPredefined()
d, err := loadDict(b)
return d, err
}
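
A usage sketch for InspectDictionary; `dictionary.bin` is a hypothetical path to a trained dictionary:

```go
package main

import (
	"fmt"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	b, err := os.ReadFile("dictionary.bin") // hypothetical path
	if err != nil {
		panic(err)
	}
	info, err := zstd.InspectDictionary(b)
	if err != nil {
		panic(err)
	}
	fmt.Printf("id=%d content=%d bytes offsets=%v\n",
		info.ID(), info.ContentSize(), info.Offsets())
}
```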


@ -16,6 +16,7 @@ type fastBase struct {
cur int32
// maximum offset. Should be at least 2x block size.
maxMatchOff int32
bufferReset int32
hist []byte
crc *xxhash.Digest
tmp [8]byte
@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc {
}
func (e *fastBase) addBlock(src []byte) int32 {
if debugAsserts && e.cur > bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
if debugAsserts && e.cur > e.bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
}
// check if we have space already
if len(e.hist)+len(src) > cap(e.hist) {
@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
}
}
a := src[s:]
b := src[t:]
b = b[:len(a)]
end := int32((len(a) >> 3) << 3)
for i := int32(0); i < end; i += 8 {
if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
return i + int32(bits.TrailingZeros64(diff)>>3)
}
}
a = a[end:]
b = b[end:]
for i := range a {
if a[i] != b[i] {
return int32(i) + end
}
}
return int32(len(a)) + end
return int32(matchLen(src[s:], src[t:]))
}
// Reset the encoding table.
@ -165,13 +149,13 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
if singleBlock {
e.lowMem = true
}
e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
e.lowMem = low
}
// We offset current position so everything will be out of reach.
// If above reset line, history will be purged.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += e.maxMatchOff + int32(len(e.hist))
}
e.hist = e.hist[:0]


@ -84,14 +84,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = prevEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [bestShortTableSize]prevEntry{}
e.longTable = [bestLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -192,12 +188,6 @@ encodeLoop:
panic("offset0 was 0")
}
bestOf := func(a, b match) match {
if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
}
const goodEnough = 100
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@ -205,36 +195,41 @@ encodeLoop:
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
matchAt := func(offset int32, s int32, first uint32, rep int32) match {
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
return
}
if debugAsserts {
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
}
best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
best := match{s: s, est: highScore}
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
// Load next and check...
@ -261,28 +256,30 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
if false {
// Short at s+3.
// Too often worse...
best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
// Start check at a fixed offset to allow for a few mismatches.
// For this compression level 2 yields the best results.
const skipBeginning = 2
if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
}
best = bestEnd
}
}
}


@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [betterShortTableSize]tableEntry{}
e.longTable = [betterLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -416,15 +412,23 @@ encodeLoop:
// Try to find a better match by searching for a long match at the end of the current best match
if s+matched < sLimit {
// Allow some bytes at the beginning to mismatch.
// Sweet spot is around 3 bytes, but depends on input.
// The skipped bytes are tested when the match is extended backwards,
// and are still picked up as part of the match if they do match.
const skipBeginning = 3
nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
cv := load3232(src, s)
s2 := s + skipBeginning
cv := load3232(src, s2)
candidateL := e.longTable[nextHashL]
coffsetL := candidateL.offset - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL := candidateL.offset - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("long match at end-of-match")
@ -434,12 +438,13 @@ encodeLoop:
// Check prev long...
if true {
coffsetL = candidateL.prev - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL = candidateL.prev - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("prev long match at end-of-match")
@ -578,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}


@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = tableEntry{}
}
e.table = [dFastShortTableSize]tableEntry{}
e.longTable = [dFastLongTableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -685,7 +681,7 @@ encodeLoop:
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -1103,7 +1099,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
}
if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
copy(e.longTable[:], e.dictLongTable)
//copy(e.longTable[:], e.dictLongTable)
e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
for i := range e.longTableShardDirty {
e.longTableShardDirty[i] = false
}
@ -1114,7 +1111,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
e.longTableShardDirty[i] = false
}
}


@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -304,13 +304,13 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
if debugEncoder {
if len(src) > maxBlockSize {
if len(src) > maxCompressedBlockSize {
panic("src too big")
}
}
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -538,7 +538,7 @@ encodeLoop:
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -555,11 +555,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
return
}
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
e.table = [tableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -871,7 +869,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
const shardCnt = tableShardCnt
const shardSize = tableShardSize
if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
copy(e.table[:], e.dictTable)
//copy(e.table[:], e.dictTable)
e.table = *(*[tableSize]tableEntry)(e.dictTable)
for i := range e.tableShardDirty {
e.tableShardDirty[i] = false
}
@ -883,7 +882,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
//copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
*(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
e.tableShardDirty[i] = false
}
e.allDirty = false


@ -8,6 +8,7 @@ import (
"crypto/rand"
"fmt"
"io"
"math"
rdebug "runtime/debug"
"sync"
@ -528,8 +529,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
// Use single segments when above minimum window and below 1MB.
single := len(src) < 1<<20 && len(src) > MinWindowSize
// Use single segments when above minimum window and below window size.
single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
@ -639,3 +640,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
return dst
}
// MaxEncodedSize returns the expected maximum
// size of an encoded block or stream.
func (e *Encoder) MaxEncodedSize(size int) int {
frameHeader := 4 + 2 // magic + frame header & window descriptor
if e.o.dict != nil {
frameHeader += 4
}
// Frame content size:
if size < 256 {
frameHeader++
} else if size < 65536+256 {
frameHeader += 2
} else if size < math.MaxInt32 {
frameHeader += 4
} else {
frameHeader += 8
}
// Final crc
if e.o.crc {
frameHeader += 4
}
// Max overhead is 3 bytes/block.
// There cannot be 0 blocks.
blocks := (size + e.o.blockSize) / e.o.blockSize
// Combine, add padding.
maxSz := frameHeader + 3*blocks + size
if e.o.pad > 1 {
maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad))
}
return maxSz
}
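
The main use of the bound above is pre-sizing the destination so EncodeAll never reallocates; a sketch:

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	enc, err := zstd.NewWriter(nil) // nil writer: EncodeAll-only usage
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	src := make([]byte, 1<<20)
	// Worst case: frame header + 3 bytes per block + the input itself.
	dst := make([]byte, 0, enc.MaxEncodedSize(len(src)))
	dst = enc.EncodeAll(src, dst)
	fmt.Println(len(dst), cap(dst))
}
```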


@ -3,6 +3,8 @@ package zstd
import (
"errors"
"fmt"
"math"
"math/bits"
"runtime"
"strings"
)
@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder {
switch o.level {
case SpeedFastest:
if o.dict != nil {
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedDefault:
if o.dict != nil {
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
case SpeedBetterCompression:
if o.dict != nil {
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedBestCompression:
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
}
panic("unknown compression level")
}
@ -283,7 +285,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
// If this is not specified, block encodes will automatically choose this based on the input size.
// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
}
// WithEncoderDict allows registering a dictionary that will be used for the encode.
//
// The slice dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// The encoder *may* choose to use no dictionary instead for certain payloads.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error {
d, err := loadDict(dict)
@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
return nil
}
}
// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
//
// The slice content may contain arbitrary data. It will be used as an initial
// history.
func WithEncoderDictRaw(id uint32, content []byte) EOption {
return func(o *encoderOptions) error {
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
}
o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
return nil
}
}
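
A round-trip sketch pairing WithEncoderDictRaw with the matching decoder option; the ID (42) and history bytes are arbitrary, and a trained dictionary would use WithEncoderDict instead:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	const dictID = 42
	history := bytes.Repeat([]byte("common prefix "), 100) // arbitrary bytes

	enc, err := zstd.NewWriter(nil, zstd.WithEncoderDictRaw(dictID, history))
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	dec, err := zstd.NewReader(nil, zstd.WithDecoderDictRaw(dictID, history))
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	payload := bytes.Repeat([]byte("common prefix "), 10)
	frame := enc.EncodeAll(payload, nil)
	out, err := dec.DecodeAll(frame, nil)
	fmt.Println(bytes.Equal(out, payload), err)
}
```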


@ -5,7 +5,7 @@
package zstd
import (
"bytes"
"encoding/binary"
"encoding/hex"
"errors"
"io"
@ -29,7 +29,7 @@ type frameDec struct {
FrameContentSize uint64
DictionaryID *uint32
DictionaryID uint32
HasCheckSum bool
SingleSegment bool
}
@ -43,9 +43,9 @@ const (
MaxWindowSize = 1 << 29
)
var (
frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}
const (
frameMagic = "\x28\xb5\x2f\xfd"
skippableFrameMagic = "\x2a\x4d\x18"
)
func newFrameDec(o decoderOptions) *frameDec {
@ -89,9 +89,9 @@ func (d *frameDec) reset(br byteBuffer) error {
copy(signature[1:], b)
}
if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
if debugDecoder {
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
}
// Break if not skippable frame.
break
@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
err = br.skipN(int(n))
err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
}
if !bytes.Equal(signature[:], frameMagic) {
if string(signature[:]) != frameMagic {
if debugDecoder {
println("Got magic numbers: ", signature, "want:", frameMagic)
println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
}
return ErrMagicMismatch
}
@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
// Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
d.DictionaryID = nil
d.DictionaryID = 0
if size := fhd & 3; size != 0 {
if size == 3 {
size = 4
@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
var id uint32
switch size {
switch len(b) {
case 1:
id = uint32(b[0])
case 2:
@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder {
println("Dict size", size, "ID:", id)
}
if id > 0 {
// ID 0 means "sorry, no dictionary anyway".
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
d.DictionaryID = &id
}
d.DictionaryID = id
}
// Read Frame_Content_Size
@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error {
println("Reading Frame content", err)
return err
}
switch fcsSize {
switch len(b) {
case 1:
d.FrameContentSize = uint64(b[0])
case 2:
@ -231,20 +227,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
if d.WindowSize > d.o.maxWindowSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
if d.WindowSize > d.o.maxDecodedSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrDecoderSizeExceeded
}
}
if d.WindowSize > uint64(d.o.maxWindowSize) {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
@ -254,11 +257,16 @@ func (d *frameDec) reset(br byteBuffer) error {
}
d.history.windowSize = int(d.WindowSize)
if !d.o.lowMem || d.history.windowSize < maxBlockSize {
// Alloc 2x window size if not low-mem, or very small window size.
// Alloc 2x window size if not low-mem, or window size below 2MB.
d.history.allocFrameBuffer = d.history.windowSize * 2
} else {
// Alloc with one additional block
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
if d.o.lowMem {
// Alloc with 1MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
} else {
// Alloc with 2MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
}
if debugDecoder {
@ -293,7 +301,7 @@ func (d *frameDec) checkCRC() error {
}
// We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4)
buf, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
return err
@ -303,22 +311,17 @@ func (d *frameDec) checkCRC() error {
return nil
}
var tmp [4]byte
got := d.crc.Sum64()
// Flip to match file order.
tmp[0] = byte(got >> 0)
tmp[1] = byte(got >> 8)
tmp[2] = byte(got >> 16)
tmp[3] = byte(got >> 24)
want := binary.LittleEndian.Uint32(buf[:4])
got := uint32(d.crc.Sum64())
if !bytes.Equal(tmp[:], want) {
if got != want {
if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
printf("CRC check failed: got %08x, want %08x\n", got, want)
}
return ErrCRCMismatch
}
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
return nil
}
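
For reference, the content checksum being verified here is the low 32 bits of XXH64 over the decoded data, stored little-endian after the last block. A sketch using the upstream github.com/cespare/xxhash/v2 module (the vendored copy lives under an internal path):

```go
package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
)

func main() {
	content := []byte("decoded frame content")

	// Checksum as the encoder would write it: low 32 bits, little-endian.
	var onWire [4]byte
	binary.LittleEndian.PutUint32(onWire[:], uint32(xxhash.Sum64(content)))

	// Check as the decoder does: recompute and compare.
	got := uint32(xxhash.Sum64(content))
	want := binary.LittleEndian.Uint32(onWire[:])
	fmt.Println(got == want) // false here would mean ErrCRCMismatch
}
```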
@ -336,7 +339,7 @@ func (d *frameDec) consumeCRC() error {
return nil
}
// runDecoder will create a sync decoder that will decode a block of data.
// runDecoder will run the decoder for the remainder of the frame.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@ -346,12 +349,23 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
// Store input length, so we only check new data.
crcStart := len(dst)
d.history.decoders.maxSyncLen = 0
if d.o.limitToCap {
d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
}
if d.FrameContentSize != fcsUnknown {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
}
if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
}
return dst, ErrDecoderSizeExceeded
}
if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen)
}
if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
// Alloc for output
dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
copy(dst2, dst)
@ -371,7 +385,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if err != nil {
break
}
if uint64(len(d.history.b)) > d.o.maxDecodedSize {
if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
err = ErrDecoderSizeExceeded
break
}
if d.o.limitToCap && len(d.history.b) > cap(dst) {
println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
err = ErrDecoderSizeExceeded
break
}


@ -21,7 +21,8 @@ type buildDtableAsmContext struct {
// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
// Function returns non-zero exit code on error.
// go:noescape
//
//go:noescape
func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
// please keep in sync with _generate/gen_fse.go
@ -34,8 +35,8 @@ const (
// buildDtable will build the decoding table.
func (s *fseDecoder) buildDtable() error {
ctx := buildDtableAsmContext{
stateTable: (*uint16)(&s.stateTable[0]),
norm: (*int16)(&s.norm[0]),
stateTable: &s.stateTable[0],
norm: &s.norm[0],
dt: (*uint64)(&s.dt[0]),
}
code := buildDtable_asm(s, &ctx)


@ -1,7 +1,6 @@
// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
TEXT ·buildDtable_asm(SB), $0-24


@ -37,26 +37,23 @@ func (h *history) reset() {
h.ignoreBuffer = 0
h.error = false
h.recentOffsets = [3]int{1, 4, 8}
if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
h.decoders.freeDecoders()
h.decoders = sequenceDecs{br: h.decoders.br}
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
}
}
h.freeHuffDecoder()
h.huffTree = nil
h.dict = nil
//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
}
func (h *history) freeHuffDecoder() {
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
h.huffTree = nil
}
}
}
func (h *history) setDict(dict *dict) {
if dict == nil {
return


@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
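
A small usage sketch of the API above (one-shot and streaming give identical results; with the `purego` build tag the pure-Go paths are used even on amd64/arm64):

```go
package main

import (
	"fmt"

	"github.com/cespare/xxhash/v2"
)

func main() {
	// One-shot hashing.
	fmt.Printf("%016x\n", xxhash.Sum64String("hello"))

	// Streaming hashing over multiple writes.
	d := xxhash.New()
	d.WriteString("hel")
	d.WriteString("lo")
	fmt.Printf("%016x\n", d.Sum64()) // matches the one-shot value
}
```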
This implementation provides a fast pure-Go implementation and an even faster
assembly implementation for amd64.
The package is written with optimized pure Go and also contains even faster
assembly implementations for amd64 and arm64. If desired, the `purego` build tag
opts into using the Go code even on those architectures.
[xxHash]: http://cyan4973.github.io/xxHash/
## Compatibility
This package is in a module and the latest code is in version 2 of the module.
You need a version of Go with at least "minimal module compatibility" to use
github.com/cespare/xxhash/v2:
* 1.9.7+ for Go 1.9
* 1.10.3+ for Go 1.10
* Go 1.11 or later
I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
| input size | purego | asm |
| --- | --- | --- |
| 5 B | 979.66 MB/s | 1291.17 MB/s |
| 100 B | 7475.26 MB/s | 7973.40 MB/s |
| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
| input size | purego | asm |
| ---------- | --------- | --------- |
| 4 B | 1.3 GB/s | 1.2 GB/s |
| 16 B | 2.9 GB/s | 3.5 GB/s |
| 100 B | 6.9 GB/s | 8.1 GB/s |
| 4 KB | 11.7 GB/s | 16.7 GB/s |
| 10 MB | 12.0 GB/s | 17.3 GB/s |
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
the following commands under Go 1.11.2:
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
CPU using the following commands under Go 1.19.2:
```
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache)
- [FastCache](https://github.com/VictoriaMetrics/fastcache)


@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261
)
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
// possible in the Go code is worth a small (but measurable) performance boost
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
// convenience in the Go code in a few places where we need to intentionally
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
// result overflows a uint64).
var (
prime1v = prime1
prime2v = prime2
prime3v = prime3
prime4v = prime4
prime5v = prime5
)
// Store the primes in an array as well.
//
// The consts are used when possible in Go code to avoid MOVs but we need a
// contiguous array of the assembly code.
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Digest implements hash.Hash64.
type Digest struct {
@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
d.v1 = prime1v + prime2
d.v1 = primes[0] + prime2
d.v2 = prime2
d.v3 = 0
d.v4 = -prime1v
d.v4 = -primes[0]
d.total = 0
d.n = 0
}
@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
memleft := d.mem[d.n&(len(d.mem)-1):]
if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(d.mem[d.n:], b)
copy(memleft, b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
copy(d.mem[d.n:], b)
c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[32-d.n:]
b = b[c:]
d.n = 0
}
@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total
i, end := 0, d.n
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(d.mem[i:i+8]))
b := d.mem[:d.n&(len(d.mem)-1)]
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(d.mem[i:i+4])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for i < end {
h ^= uint64(d.mem[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
i++
}
h ^= h >> 33


@ -1,3 +1,4 @@
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
@ -5,212 +6,205 @@
#include "textflag.h"
// Register allocation:
// AX h
// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// DI prime4v
// Registers:
#define h AX
#define d AX
#define p SI // pointer to advance through b
#define n DX
#define end BX // loop end
#define v1 R8
#define v2 R9
#define v3 R10
#define v4 R11
#define x R12
#define prime1 R13
#define prime2 R14
#define prime4 DI
// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
MOVQ (SI), R12 \
ADDQ $8, SI \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
#define round(acc, x) \
IMULQ prime2, x \
ADDQ x, acc \
ROLQ $31, acc \
IMULQ prime1, acc
// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
ADDQ DI, acc
// round0 performs the operation x = round(0, x).
#define round0(x) \
IMULQ prime2, x \
ROLQ $31, x \
IMULQ prime1, x
// mergeRound applies a merge round on the two registers acc and x.
// It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
round0(x) \
XORQ x, acc \
IMULQ prime1, acc \
ADDQ prime4, acc
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
JLE loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·prime4v(SB), DI
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
MOVQ ·primes+24(SB), prime4
// Load slice.
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), DX
LEAQ (SI)(DX*1), BX
MOVQ b_base+0(FP), p
MOVQ b_len+8(FP), n
LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32.
SUBQ $32, BX
SUBQ $32, end
// Check whether we have at least one block.
CMPQ DX, $32
CMPQ n, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
MOVQ R13, R8
ADDQ R14, R8
MOVQ R14, R9
XORQ R10, R10
XORQ R11, R11
SUBQ R13, R11
MOVQ prime1, v1
ADDQ prime2, v1
MOVQ prime2, v2
XORQ v3, v3
XORQ v4, v4
SUBQ prime1, v4
// Loop until SI > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
blockLoop()
CMPQ SI, BX
JLE blockLoop
MOVQ v1, h
ROLQ $1, h
MOVQ v2, x
ROLQ $7, x
ADDQ x, h
MOVQ v3, x
ROLQ $12, x
ADDQ x, h
MOVQ v4, x
ROLQ $18, x
ADDQ x, h
MOVQ R8, AX
ROLQ $1, AX
MOVQ R9, R12
ROLQ $7, R12
ADDQ R12, AX
MOVQ R10, R12
ROLQ $12, R12
ADDQ R12, AX
MOVQ R11, R12
ROLQ $18, R12
ADDQ R12, AX
mergeRound(AX, R8)
mergeRound(AX, R9)
mergeRound(AX, R10)
mergeRound(AX, R11)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
JMP afterBlocks
noBlocks:
MOVQ ·prime5v(SB), AX
MOVQ ·primes+32(SB), h
afterBlocks:
ADDQ DX, AX
ADDQ n, h
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
ADDQ $24, BX
ADDQ $24, end
CMPQ p, end
JG try4
CMPQ SI, BX
JG fourByte
loop8:
MOVQ (p), x
ADDQ $8, p
round0(x)
XORQ x, h
ROLQ $27, h
IMULQ prime1, h
ADDQ prime4, h
wordLoop:
// Calculate k1.
MOVQ (SI), R8
ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
CMPQ p, end
JLE loop8
XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
ADDQ DI, AX
try4:
ADDQ $4, end
CMPQ p, end
JG try1
CMPQ SI, BX
JLE wordLoop
MOVL (p), x
ADDQ $4, p
IMULQ prime1, x
XORQ x, h
fourByte:
ADDQ $4, BX
CMPQ SI, BX
JG singles
ROLQ $23, h
IMULQ prime2, h
ADDQ ·primes+16(SB), h
MOVL (SI), R8
ADDQ $4, SI
IMULQ R13, R8
XORQ R8, AX
ROLQ $23, AX
IMULQ R14, AX
ADDQ ·prime3v(SB), AX
singles:
ADDQ $4, BX
CMPQ SI, BX
try1:
ADDQ $4, end
CMPQ p, end
JGE finalize
singlesLoop:
MOVBQZX (SI), R12
ADDQ $1, SI
IMULQ ·prime5v(SB), R12
XORQ R12, AX
loop1:
MOVBQZX (p), x
ADDQ $1, p
IMULQ ·primes+32(SB), x
XORQ x, h
ROLQ $11, h
IMULQ prime1, h
ROLQ $11, AX
IMULQ R13, AX
CMPQ SI, BX
JL singlesLoop
CMPQ p, end
JL loop1
finalize:
MOVQ AX, R12
SHRQ $33, R12
XORQ R12, AX
IMULQ R14, AX
MOVQ AX, R12
SHRQ $29, R12
XORQ R12, AX
IMULQ ·prime3v(SB), AX
MOVQ AX, R12
SHRQ $32, R12
XORQ R12, AX
MOVQ h, x
SHRQ $33, x
XORQ x, h
IMULQ prime2, h
MOVQ h, x
SHRQ $29, x
XORQ x, h
IMULQ ·primes+16(SB), h
MOVQ h, x
SHRQ $32, x
XORQ x, h
MOVQ AX, ret+24(FP)
MOVQ h, ret+24(FP)
RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
// Load slice.
MOVQ b_base+8(FP), SI
MOVQ b_len+16(FP), DX
LEAQ (SI)(DX*1), BX
SUBQ $32, BX
MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end
// Load vN from d.
MOVQ d+0(FP), AX
MOVQ 0(AX), R8 // v1
MOVQ 8(AX), R9 // v2
MOVQ 16(AX), R10 // v3
MOVQ 24(AX), R11 // v4
MOVQ s+0(FP), d
MOVQ 0(d), v1
MOVQ 8(d), v2
MOVQ 16(d), v3
MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ SI, BX
JLE blockLoop
blockLoop()
// Copy vN back to d.
MOVQ R8, 0(AX)
MOVQ R9, 8(AX)
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)
MOVQ v1, 0(d)
MOVQ v2, 8(d)
MOVQ v3, 16(d)
MOVQ v4, 24(d)
// The number of bytes written is SI minus the old base pointer.
SUBQ b_base+8(FP), SI
MOVQ SI, ret+32(FP)
// The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), p
MOVQ p, ret+32(FP)
RET


@ -1,13 +1,17 @@
// +build gc,!purego,!noasm
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm
#include "textflag.h"
// Register allocation.
// Registers:
#define digest R1
#define h R2 // Return value.
#define p R3 // Input pointer.
#define len R4
#define nblocks R5 // len / 32.
#define h R2 // return value
#define p R3 // input pointer
#define n R4 // input length
#define nblocks R5 // n / 32
#define prime1 R7
#define prime2 R8
#define prime3 R9
@ -25,60 +29,52 @@
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
MUL prime1, acc \
MUL prime1, acc
// x = round(0, x).
// round0 performs the operation x = round(0, x).
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
MUL prime1, x \
MUL prime1, x
#define mergeRound(x) \
round0(x) \
EOR x, h \
MADD h, prime4, prime1, h \
#define mergeRound(acc, x) \
round0(x) \
EOR x, acc \
MADD acc, prime4, prime1, acc
// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
#define blocksLoop() \
LSR $5, len, nblocks \
PCALIGN $16 \
loop: \
LDP.P 32(p), (x1, x2) \
round(v1, x1) \
LDP -16(p), (x3, x4) \
round(v2, x2) \
SUB $1, nblocks \
round(v3, x3) \
round(v4, x4) \
CBNZ nblocks, loop \
// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), NOPTR+RODATA, $40
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
#define blockLoop() \
LSR $5, n, nblocks \
PCALIGN $16 \
loop: \
LDP.P 16(p), (x1, x2) \
LDP.P 16(p), (x3, x4) \
round(v1, x1) \
round(v2, x2) \
round(v3, x3) \
round(v4, x4) \
SUB $1, nblocks \
CBNZ nblocks, loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
LDP b_base+0(FP), (p, len)
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
LDP b_base+0(FP), (p, n)
LDP primes<> +0(SB), (prime1, prime2)
LDP primes<>+16(SB), (prime3, prime4)
MOVD primes<>+32(SB), prime5
LDP ·primes+0(SB), (prime1, prime2)
LDP ·primes+16(SB), (prime3, prime4)
MOVD ·primes+32(SB), prime5
CMP $32, len
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
BLO afterLoop
CMP $32, n
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
BLT afterLoop
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4
blocksLoop()
blockLoop()
ROR $64-1, v1, x1
ROR $64-7, v2, x2
@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4
ADD x2, x4, h
mergeRound(v1)
mergeRound(v2)
mergeRound(v3)
mergeRound(v4)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
afterLoop:
ADD len, h
ADD n, h
TBZ $4, len, try8
TBZ $4, n, try8
LDP.P 16(p), (x1, x2)
round0(x1)
// NOTE: here and below, sequencing the EOR after the ROR (using a
// rotated register) is worth a small but measurable speedup for small
// inputs.
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
round0(x2)
ROR $64-27, h
EOR x2 @> 64-27, h
EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h
try8:
TBZ $3, len, try4
TBZ $3, n, try4
MOVD.P 8(p), x1
round0(x1)
ROR $64-27, h
EOR x1 @> 64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
try4:
TBZ $2, len, try2
TBZ $2, n, try2
MOVWU.P 4(p), x2
MUL prime1, x2
ROR $64-23, h
EOR x2 @> 64-23, h
EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h
try2:
TBZ $1, len, try1
TBZ $1, n, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2
MUL prime5, x1
ROR $64-11, h
EOR x1 @> 64-11, h
EOR x1 @> 64-11, h, h
MUL prime1, h
MUL prime5, x2
ROR $64-11, h
EOR x2 @> 64-11, h
EOR x2 @> 64-11, h, h
MUL prime1, h
try1:
TBZ $0, len, end
TBZ $0, n, finalize
MOVBU (p), x4
MUL prime5, x4
ROR $64-11, h
EOR x4 @> 64-11, h
EOR x4 @> 64-11, h, h
MUL prime1, h
end:
finalize:
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
@ -163,24 +163,22 @@ end:
RET
// func writeBlocks(d *Digest, b []byte) int
//
// Assumes len(b) >= 32.
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
LDP primes<>(SB), (prime1, prime2)
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
LDP b_base+8(FP), (p, len)
LDP b_base+8(FP), (p, n)
blocksLoop()
blockLoop()
// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)
BIC $31, len
MOVD len, ret+32(FP)
BIC $31, n
MOVD n, ret+32(FP)
RET


@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64
//go:noescape
func writeBlocks(d *Digest, b []byte) int
func writeBlocks(s *Digest, b []byte) int


@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64
if n >= 32 {
v1 := prime1v + prime2
v1 := primes[0] + prime2
v2 := prime2
v3 := uint64(0)
v4 := -prime1v
v4 := -primes[0]
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n)
i, end := 0, len(b)
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(b[i:i+8:len(b)]))
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for ; i < end; i++ {
h ^= uint64(b[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
}


@ -99,6 +99,21 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
return nil
}
func (s *sequenceDecs) freeDecoders() {
if f := s.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.litLengths.fse = nil
}
if f := s.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.offsets.fse = nil
}
if f := s.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.matchLengths.fse = nil
}
}
// execute will execute the decoded sequence with the provided history.
// The sequence must be evaluated before being sent.
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
@ -299,7 +314,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
@ -409,9 +424,8 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
}
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
// Add final literals


@ -32,18 +32,22 @@ type decodeSyncAsmContext struct {
// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write beyond the output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write beyond the output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
@ -55,16 +59,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
return false, nil
}
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
// FIXME: Using unsafe memory copies leads to rare, random crashes
// with fuzz testing. It is therefore disabled for now.
const useSafe = true
/*
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
*/
br := s.br
@ -129,7 +139,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if debugDecoder {
println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
}
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
default:
return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
@ -137,7 +147,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -195,20 +205,24 @@ const errorNotEnoughSpace = 5
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
@ -275,7 +289,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -302,10 +316,12 @@ type executeAsmContext struct {
// Returns false if a match offset is too big.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Same as above, but with safe memcopies
//
//go:noescape
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool


@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
@ -52,34 +51,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte:
sequenceDecs_decode_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_of_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_of_update_zero:
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_ml_update_zero:
MOVQ AX, 8(R10)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -107,19 +118,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte:
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_ll_update_zero:
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
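Note: in each of the three "Update offset/match length/literal length" blocks above, the branch-free NEGL/CMOVQEQ sequence was replaced by explicit branches that skip the shift when nBits is zero and treat the bits as zero when the read would run past the 64-bit buffer (the new JA/JAE guards). A hedged Go rendering of the new control flow; the state packing assumed below (byte 1 = bit count, high 32 bits = baseline) follows what the asm reads via MOVB AH, CL and SHRQ $0x20, AX, but the function itself is illustrative:

func fseStep(state, bitbuf uint64, bitsRead uint) (value uint64, newBitsRead uint) {
	nBits := uint(state>>8) & 0xff
	value = state >> 32 // baseline
	if nBits == 0 {
		return value, bitsRead // previously handled via CMOVQEQ
	}
	bitbuf <<= bitsRead % 64 // drop bits already consumed (x86 shifts mod 64)
	bitsRead += nBits
	if bitsRead > 64 || nBits >= 64 {
		// Over-read near the end of the stream: the missing bits read
		// as zero rather than shifting by an out-of-range count.
		return value, bitsRead
	}
	return value + bitbuf>>(64-nBits), bitsRead
}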
@ -198,7 +215,7 @@ sequenceDecs_decode_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_amd64_adjust_end
JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@ -210,7 +227,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_amd64_adjust_end
JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -247,7 +264,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_amd64_adjust_end:
sequenceDecs_decode_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@ -303,10 +320,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
@ -356,49 +369,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte:
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_of_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_of_update_zero:
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_ml_update_zero:
MOVQ AX, 8(R10)
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_ll_update_zero:
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@ -477,7 +508,7 @@ sequenceDecs_decode_56_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_56_amd64_adjust_end
JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@ -489,7 +520,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_56_amd64_adjust_end
JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -526,7 +557,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_56_amd64_adjust_end:
sequenceDecs_decode_56_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@ -582,10 +613,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
@ -757,7 +784,7 @@ sequenceDecs_decode_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_bmi2_adjust_end
JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@ -769,7 +796,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_bmi2_adjust_end
JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -806,7 +833,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_bmi2_adjust_end:
sequenceDecs_decode_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@ -862,10 +889,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
@ -1012,7 +1035,7 @@ sequenceDecs_decode_56_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_56_bmi2_adjust_end
JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@ -1024,7 +1047,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_56_bmi2_adjust_end
JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -1061,7 +1084,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_56_bmi2_adjust_end:
sequenceDecs_decode_56_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@ -1117,10 +1140,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
@ -1354,8 +1373,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1367,8 +1385,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1712,8 +1729,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1725,8 +1741,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1749,6 +1764,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
XORQ CX, CX
MOVQ CX, 8(SP)
MOVQ CX, 16(SP)
MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
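Note: the added XORQ/MOVQ prologue clears the three stack slots at 8(SP), 16(SP) and 24(SP) — the current sequence's offset, match length and literal length — before the decode loop runs, so early exit paths never observe stale stack contents. In Go terms it is simply (illustrative):

var mo, ml, ll uint64 // zeroed before the first sequence is decoded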
@ -1798,34 +1817,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_of_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_of_update_zero:
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_ml_update_zero:
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -1853,19 +1884,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_ll_update_zero:
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@ -1945,7 +1982,7 @@ sequenceDecs_decodeSync_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_amd64_adjust_end
JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -1957,7 +1994,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_amd64_adjust_end
JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@ -1966,8 +2003,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@ -1983,7 +2019,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_amd64_adjust_end:
sequenceDecs_decodeSync_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -2280,6 +2316,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
XORQ R9, R9
MOVQ R9, 8(SP)
MOVQ R9, 16(SP)
MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@ -2452,7 +2492,7 @@ sequenceDecs_decodeSync_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_bmi2_adjust_end
JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -2464,7 +2504,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_bmi2_adjust_end
JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@ -2473,8 +2513,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@ -2490,7 +2529,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_bmi2_adjust_end:
sequenceDecs_decodeSync_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -2787,6 +2826,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
XORQ CX, CX
MOVQ CX, 8(SP)
MOVQ CX, 16(SP)
MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@ -2836,34 +2879,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_of_update_zero:
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -2891,19 +2946,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@ -2983,7 +3044,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -2995,7 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@ -3004,8 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@ -3021,7 +3081,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_amd64_adjust_end:
sequenceDecs_decodeSync_safe_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -3420,6 +3480,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
XORQ R9, R9
MOVQ R9, 8(SP)
MOVQ R9, 16(SP)
MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@ -3592,7 +3656,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -3604,7 +3668,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@ -3613,8 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@ -3630,7 +3693,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_bmi2_adjust_end:
sequenceDecs_decodeSync_safe_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values


@ -111,7 +111,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += ll + ml
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
litRemain -= ll
if litRemain < 0 {
@ -149,7 +149,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {


@ -36,9 +36,6 @@ const forcePreDef = false
// zstdMinMatch is the minimum zstd match length.
const zstdMinMatch = 3
// Reset the buffer offset when reaching this.
const bufferReset = math.MaxInt32 - MaxWindowSize
// fcsUnknown is used for unknown frame content size.
const fcsUnknown = math.MaxUint64
@ -75,7 +72,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
// ErrUnknownDictionary is returned if the dictionary ID is unknown.
// For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary")
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@ -110,26 +106,25 @@ func printf(format string, a ...interface{}) {
}
}
// matchLen returns the maximum length.
// matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.
func matchLen(a, b []byte) int {
b = b[:len(a)]
for i := 0; i < len(a)-7; i += 8 {
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
return i + (bits.TrailingZeros64(diff) >> 3)
func matchLen(a, b []byte) (n int) {
for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
if diff != 0 {
return n + bits.TrailingZeros64(diff)>>3
}
n += 8
}
checked := (len(a) >> 3) << 3
a = a[checked:]
b = b[checked:]
for i := range a {
if a[i] != b[i] {
return i + checked
break
}
n++
}
return len(a) + checked
return n
}
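Note: the rewritten matchLen replaces the load64 helper and the len(a)-7 index arithmetic with plain reslicing, and accumulates into a named return. A standalone check of its behaviour — the function body is copied from the diff so the sketch runs on its own:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

func matchLen(a, b []byte) (n int) {
	for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
		diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
		if diff != 0 {
			return n + bits.TrailingZeros64(diff)>>3
		}
		n += 8
	}
	for i := range a {
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	a := []byte("zstandard stream")
	b := []byte("zstandard block!")
	fmt.Println(matchLen(a, b)) // 10 — common prefix "zstandard "
}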
func load3232(b []byte, i int32) uint32 {
@ -140,10 +135,6 @@ func load6432(b []byte, i int32) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
func load64(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
type byter interface {
Bytes() []byte
Len() int