Migrate to containerd v1.7.0 and update dependencies

* Updates containerd to v1.7.0 and new binary for 32-bit
Arm OSes.
* Updates Go dependencies - openfaas and external

Signed-off-by: Alex Ellis (OpenFaaS Ltd) <alexellis2@gmail.com>
This commit is contained in:
Alex Ellis (OpenFaaS Ltd)
2023-03-19 10:55:53 +00:00
committed by Alex Ellis
parent 9efd019e86
commit c41c2cd9fc
1133 changed files with 104391 additions and 75499 deletions

View File

@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen
Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
## Installation
Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.

View File

@ -9,8 +9,8 @@ import (
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
@ -83,8 +83,9 @@ type blockDec struct {
err error
// Check against this crc
checkCRC []byte
// Check against this crc, if hasCRC is true.
checkCRC uint32
hasCRC bool
// Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame.
@ -192,16 +193,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
// Read block data.
if cap(b.dataStorage) < cSize {
if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
// byteBuf doesn't need a destination buffer.
if b.lowMem || cSize > maxCompressedBlockSize {
b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil {
if debugDecoder {
@ -210,6 +209,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
return err
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
return nil
}
@ -233,7 +235,7 @@ func (b *blockDec) decodeBuf(hist *history) error {
if b.lowMem {
b.dst = make([]byte, b.RLESize)
} else {
b.dst = make([]byte, maxBlockSize)
b.dst = make([]byte, maxCompressedBlockSize)
}
}
b.dst = b.dst[:b.RLESize]
@ -441,6 +443,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
}
var err error
if debugDecoder {
println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
}
huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil {
println("reading huffman table:", err)
@ -651,7 +656,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
buf.Write(in)
ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
}
return nil

View File

@ -7,7 +7,6 @@ package zstd
import (
"fmt"
"io"
"io/ioutil"
)
type byteBuffer interface {
@ -23,7 +22,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
skipN(n int) error
skipN(n int64) error
}
// in-memory buffer
@ -55,16 +54,19 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
return 0, nil
return 0, io.ErrUnexpectedEOF
}
r := bb[0]
*b = bb[1:]
return r, nil
}
func (b *byteBuf) skipN(n int) error {
func (b *byteBuf) skipN(n int64) error {
bb := *b
if len(bb) < n {
if n < 0 {
return fmt.Errorf("negative skip (%d) requested", n)
}
if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@ -120,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
func (r *readerWrapper) skipN(n int) error {
n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
if n2 != int64(n) {
func (r *readerWrapper) skipN(n int64) error {
n2, err := io.CopyN(io.Discard, r.r, n)
if n2 != n {
err = io.ErrUnexpectedEOF
}
return err

View File

@ -4,7 +4,6 @@
package zstd
import (
"bytes"
"encoding/binary"
"errors"
"io"
@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error {
}
h.HeaderSize += 4
b, in := in[:4], in[4:]
if !bytes.Equal(b, frameMagic) {
if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
if string(b) != frameMagic {
if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch
}
if len(in) < 4 {
@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:size], in[size:]
h.HeaderSize += int(size)
switch size {
switch len(b) {
case 1:
h.DictionaryID = uint32(b[0])
case 2:
@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:fcsSize], in[fcsSize:]
h.HeaderSize += int(fcsSize)
switch fcsSize {
switch len(b) {
case 1:
h.FrameContentSize = uint64(b[0])
case 2:

View File

@ -5,7 +5,6 @@
package zstd
import (
"bytes"
"context"
"encoding/binary"
"io"
@ -35,13 +34,13 @@ type Decoder struct {
br readerWrapper
enabled bool
inFrame bool
dstBuf []byte
}
frame *frameDec
// Custom dictionaries.
// Always uses copies.
dicts map[uint32]dict
dicts map[uint32]*dict
// streamWg is the waitgroup for all streams
streamWg sync.WaitGroup
@ -103,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
}
// Transfer option dicts.
d.dicts = make(map[uint32]dict, len(d.o.dicts))
d.dicts = make(map[uint32]*dict, len(d.o.dicts))
for _, dc := range d.o.dicts {
d.dicts[dc.id] = dc
}
@ -187,21 +186,23 @@ func (d *Decoder) Reset(r io.Reader) error {
}
// If bytes buffer and < 5MB, do sync decoding anyway.
if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
bb2 := bb
if debugDecoder {
println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
}
b := bb2.Bytes()
var dst []byte
if cap(d.current.b) > 0 {
dst = d.current.b
if cap(d.syncStream.dstBuf) > 0 {
dst = d.syncStream.dstBuf[:0]
}
dst, err := d.DecodeAll(b, dst[:0])
dst, err := d.DecodeAll(b, dst)
if err == nil {
err = io.EOF
}
// Save output buffer
d.syncStream.dstBuf = dst
d.current.b = dst
d.current.err = err
d.current.flushed = true
@ -216,6 +217,7 @@ func (d *Decoder) Reset(r io.Reader) error {
d.current.err = nil
d.current.flushed = false
d.current.d = nil
d.syncStream.dstBuf = nil
// Ensure no-one else is still running...
d.streamWg.Wait()
@ -312,6 +314,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
// Grab a block decoder and frame decoder.
block := <-d.decoders
frame := block.localFrame
initialSize := len(dst)
defer func() {
if debugDecoder {
printf("re-adding decoder: %p", block)
@ -337,21 +340,26 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
return dst, err
}
if frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
return nil, ErrUnknownDictionary
}
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(&dict)
if err = d.setDict(frame); err != nil {
return nil, err
}
if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
}
return dst, ErrWindowSizeExceeded
}
if frame.FrameContentSize != fcsUnknown {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
@ -361,7 +369,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
}
if cap(dst) == 0 {
if cap(dst) == 0 && !d.o.limitToCap {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
size := len(input) * 2
@ -379,6 +387,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
if err != nil {
return dst, err
}
if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
return dst, ErrDecoderSizeExceeded
}
if len(frame.bBuf) == 0 {
if debugDecoder {
println("frame dbuf empty")
@ -439,7 +450,11 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
if !d.o.ignoreChecksum && len(next.b) > 0 {
if d.o.ignoreChecksum {
return true
}
if len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
@ -447,18 +462,16 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
}
}
}
if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
got := d.current.crc.Sum64()
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if next.err == nil && next.d != nil && next.d.hasCRC {
got := uint32(d.current.crc.Sum64())
if got != next.d.checkCRC {
if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
}
d.current.err = ErrCRCMismatch
} else {
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
}
}
@ -474,18 +487,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame {
d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br)
if d.current.err == nil {
d.current.err = d.setDict(d.frame)
}
if d.current.err != nil {
return false
}
if d.frame.DictionaryID != nil {
dict, ok := d.dicts[*d.frame.DictionaryID]
if !ok {
d.current.err = ErrUnknownDictionary
return false
} else {
d.frame.history.setDict(&dict)
}
}
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded
return false
@ -664,6 +671,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
hist.reset()
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
hist.windowSize = block.async.newHist.windowSize
@ -695,6 +703,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
seqExecute <- block
}
close(seqExecute)
hist.reset()
}()
var wg sync.WaitGroup
@ -718,6 +727,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 2: new history")
}
hist.reset()
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
if block.async.newHist.dict != nil {
@ -747,7 +757,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if block.lowMem {
block.dst = make([]byte, block.RLESize)
} else {
block.dst = make([]byte, maxBlockSize)
block.dst = make([]byte, maxCompressedBlockSize)
}
}
block.dst = block.dst[:block.RLESize]
@ -799,13 +809,14 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("decoder goroutines finished")
}
hist.reset()
}()
var hist history
decodeStream:
for {
var hist history
var hasErr bool
hist.reset()
decodeBlock := func(block *blockDec) {
if hasErr {
if block != nil {
@ -840,15 +851,14 @@ decodeStream:
if debugDecoder && err != nil {
println("Frame decoder returned", err)
}
if err == nil && frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
err = ErrUnknownDictionary
} else {
frame.history.setDict(&dict)
}
if err == nil {
err = d.setDict(frame)
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
}
err = ErrDecoderSizeExceeded
}
if err != nil {
@ -890,18 +900,22 @@ decodeStream:
println("next block returned error:", err)
}
dec.err = err
dec.checkCRC = nil
dec.hasCRC = false
if dec.Last && frame.HasCheckSum && err == nil {
crc, err := frame.rawInput.readSmall(4)
if err != nil {
if len(crc) < 4 {
if err == nil {
err = io.ErrUnexpectedEOF
}
println("CRC missing?", err)
dec.err = err
}
var tmp [4]byte
copy(tmp[:], crc)
dec.checkCRC = tmp[:]
if debugDecoder {
println("found crc to check:", dec.checkCRC)
} else {
dec.checkCRC = binary.LittleEndian.Uint32(crc)
dec.hasCRC = true
if debugDecoder {
printf("found crc to check: %08x\n", dec.checkCRC)
}
}
}
err = dec.err
@ -917,5 +931,23 @@ decodeStream:
}
close(seqDecode)
wg.Wait()
hist.reset()
d.frame.history.b = frameHistCache
}
// setDict attaches the registered dictionary matching frame.DictionaryID
// to the frame's history.
//
// It returns ErrUnknownDictionary when no dictionary is registered under a
// non-zero id. An id of zero with no registered dictionary is not an error.
func (d *Decoder) setDict(frame *frameDec) (err error) {
	if found, registered := d.dicts[frame.DictionaryID]; registered {
		if debugDecoder {
			println("setting dict", frame.DictionaryID)
		}
		frame.history.setDict(found)
		return nil
	}
	// Id zero cannot be told apart from "no dictionary": it either means
	// dictionary zero or that none was used. zstd --patch-from uses this id
	// for the source file, so a lookup miss is only fatal for non-zero ids.
	if frame.DictionaryID == 0 {
		return nil
	}
	return ErrUnknownDictionary
}

View File

@ -6,6 +6,8 @@ package zstd
import (
"errors"
"fmt"
"math/bits"
"runtime"
)
@ -14,20 +16,23 @@ type DOption func(*decoderOptions) error
// options retains accumulated state of multiple options.
type decoderOptions struct {
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
ignoreChecksum bool
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []*dict
ignoreChecksum bool
limitToCap bool
decodeBufsBelow int
}
func (o *decoderOptions) setDefault() {
*o = decoderOptions{
// use less ram: true for now, but may change.
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
decodeBufsBelow: 128 << 10,
}
if o.concurrent > 4 {
o.concurrent = 4
@ -82,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
}
// WithDecoderDicts allows to register one or more dictionaries for the decoder.
// If several dictionaries with the same ID is provided the last one will be used.
//
// Each slice in dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// If several dictionaries with the same ID are provided, the last one will be used.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error {
for _, b := range dicts {
@ -90,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
if err != nil {
return err
}
o.dicts = append(o.dicts, *d)
o.dicts = append(o.dicts, d)
}
return nil
}
}
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
// The slice content can be arbitrary data.
//
// The dictionary is registered under the given id with the initial repeat
// offsets {1, 4, 8}. The content slice is stored as provided, without copying.
//
// The returned option fails if content is larger than dictMaxLength.
func WithDecoderDictRaw(id uint32, content []byte) DOption {
	return func(o *decoderOptions) error {
		// The size limit is only checked where uint is wider than 32 bits.
		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
		}
		o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
		return nil
	}
}
// WithDecoderMaxWindow allows to set a maximum window size for decodes.
// This allows rejecting packets that will cause big memory usage.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
@ -114,6 +137,29 @@ func WithDecoderMaxWindow(size uint64) DOption {
}
}
// WithDecodeAllCapLimit restricts DecodeAll to producing at most
// cap(dst)-len(dst) bytes, or the size set with WithDecoderMaxMemory.
// Use it to cap decoding at a specific maximum output size.
// The limit is disabled by default.
func WithDecodeAllCapLimit(b bool) DOption {
	apply := func(opts *decoderOptions) error {
		opts.limitToCap = b
		return nil
	}
	return apply
}
// WithDecodeBuffersBelow sets the size below which readers exposing
// `Bytes() []byte` and `Len() int` (like bytes.Buffer) are decoded fully
// in one go.
// This typically needs fewer allocations, but the whole decompressed
// object is held in memory.
// Enabling WithDecodeAllCapLimit disables this, as does a size of 0 or less.
// Default is 128KiB.
func WithDecodeBuffersBelow(size int) DOption {
	apply := func(opts *decoderOptions) error {
		opts.decodeBufsBelow = size
		return nil
	}
	return apply
}
// IgnoreChecksum allows to forcibly ignore checksum checking.
func IgnoreChecksum(b bool) DOption {
return func(o *decoderOptions) error {

View File

@ -1,7 +1,6 @@
package zstd
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
@ -20,7 +19,10 @@ type dict struct {
content []byte
}
var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
const dictMagic = "\x37\xa4\x30\xec"
// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
const dictMaxLength = 1 << 31
// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
@ -30,14 +32,38 @@ func (d *dict) ID() uint32 {
return d.id
}
// DictContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) DictContentSize() int {
// ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) ContentSize() int {
if d == nil {
return 0
}
return len(d.content)
}
// Content returns the raw dictionary content, or nil if d is nil.
func (d *dict) Content() []byte {
	if d != nil {
		return d.content
	}
	return nil
}
// Offsets returns the initial offsets stored in the dictionary,
// or all zeroes if d is nil.
func (d *dict) Offsets() [3]int {
	var initial [3]int
	if d != nil {
		initial = d.offsets
	}
	return initial
}
// LitEncoder returns the literal encoder held by the dictionary,
// or nil if d is nil.
func (d *dict) LitEncoder() *huff0.Scratch {
	if d != nil {
		return d.litEnc
	}
	return nil
}
// Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) {
@ -50,7 +76,7 @@ func loadDict(b []byte) (*dict, error) {
ofDec: sequenceDec{fse: &fseDecoder{}},
mlDec: sequenceDec{fse: &fseDecoder{}},
}
if !bytes.Equal(b[:4], dictMagic[:]) {
if string(b[:4]) != dictMagic {
return nil, ErrMagicMismatch
}
d.id = binary.LittleEndian.Uint32(b[4:8])
@ -62,7 +88,7 @@ func loadDict(b []byte) (*dict, error) {
var err error
d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
if err != nil {
return nil, err
return nil, fmt.Errorf("loading literal table: %w", err)
}
d.litEnc.Reuse = huff0.ReusePolicyMust
@ -120,3 +146,16 @@ func loadDict(b []byte) (*dict, error) {
return &d, nil
}
// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
//
// When loading fails, the first result holds a nil *dict rather than being a
// nil interface, so callers must check the returned error instead of
// comparing the value against nil.
func InspectDictionary(b []byte) (interface {
	ID() uint32
	ContentSize() int
	Content() []byte
	Offsets() [3]int
	LitEncoder() *huff0.Scratch
}, error) {
	// NOTE(review): presumably prepares the predefined tables loadDict relies on — confirm.
	initPredefined()
	return loadDict(b)
}

View File

@ -16,6 +16,7 @@ type fastBase struct {
cur int32
// maximum offset. Should be at least 2x block size.
maxMatchOff int32
bufferReset int32
hist []byte
crc *xxhash.Digest
tmp [8]byte
@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc {
}
func (e *fastBase) addBlock(src []byte) int32 {
if debugAsserts && e.cur > bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
if debugAsserts && e.cur > e.bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
}
// check if we have space already
if len(e.hist)+len(src) > cap(e.hist) {
@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
}
}
a := src[s:]
b := src[t:]
b = b[:len(a)]
end := int32((len(a) >> 3) << 3)
for i := int32(0); i < end; i += 8 {
if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
return i + int32(bits.TrailingZeros64(diff)>>3)
}
}
a = a[end:]
b = b[end:]
for i := range a {
if a[i] != b[i] {
return int32(i) + end
}
}
return int32(len(a)) + end
return int32(matchLen(src[s:], src[t:]))
}
// Reset the encoding table.
@ -165,13 +149,13 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
if singleBlock {
e.lowMem = true
}
e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
e.lowMem = low
}
// We offset current position so everything will be out of reach.
// If above reset line, history will be purged.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += e.maxMatchOff + int32(len(e.hist))
}
e.hist = e.hist[:0]

View File

@ -84,14 +84,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = prevEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [bestShortTableSize]prevEntry{}
e.longTable = [bestLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -192,12 +188,6 @@ encodeLoop:
panic("offset0 was 0")
}
bestOf := func(a, b match) match {
if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
}
const goodEnough = 100
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@ -205,36 +195,41 @@ encodeLoop:
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
matchAt := func(offset int32, s int32, first uint32, rep int32) match {
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
return
}
if debugAsserts {
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
}
best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
best := match{s: s, est: highScore}
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
// Load next and check...
@ -261,28 +256,30 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
if false {
// Short at s+3.
// Too often worse...
best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
// Start check at a fixed offset to allow for a few mismatches.
// For this compression level 2 yields the best results.
const skipBeginning = 2
if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
}
best = bestEnd
}
}
}

View File

@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [betterShortTableSize]tableEntry{}
e.longTable = [betterLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -416,15 +412,23 @@ encodeLoop:
// Try to find a better match by searching for a long match at the end of the current best match
if s+matched < sLimit {
// Allow some bytes at the beginning to mismatch.
// Sweet spot is around 3 bytes, but depends on input.
// The skipped bytes are tested in Extend backwards,
// and still picked up as part of the match if they do.
const skipBeginning = 3
nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
cv := load3232(src, s)
s2 := s + skipBeginning
cv := load3232(src, s2)
candidateL := e.longTable[nextHashL]
coffsetL := candidateL.offset - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL := candidateL.offset - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("long match at end-of-match")
@ -434,12 +438,13 @@ encodeLoop:
// Check prev long...
if true {
coffsetL = candidateL.prev - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL = candidateL.prev - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("prev long match at end-of-match")
@ -578,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}

View File

@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = tableEntry{}
}
e.table = [dFastShortTableSize]tableEntry{}
e.longTable = [dFastLongTableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -685,7 +681,7 @@ encodeLoop:
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -1103,7 +1099,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
}
if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
copy(e.longTable[:], e.dictLongTable)
//copy(e.longTable[:], e.dictLongTable)
e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
for i := range e.longTableShardDirty {
e.longTableShardDirty[i] = false
}
@ -1114,7 +1111,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
e.longTableShardDirty[i] = false
}
}

View File

@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -304,13 +304,13 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
if debugEncoder {
if len(src) > maxBlockSize {
if len(src) > maxCompressedBlockSize {
panic("src too big")
}
}
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -538,7 +538,7 @@ encodeLoop:
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -555,11 +555,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
return
}
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
e.table = [tableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -871,7 +869,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
const shardCnt = tableShardCnt
const shardSize = tableShardSize
if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
copy(e.table[:], e.dictTable)
//copy(e.table[:], e.dictTable)
e.table = *(*[tableSize]tableEntry)(e.dictTable)
for i := range e.tableShardDirty {
e.tableShardDirty[i] = false
}
@ -883,7 +882,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
//copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
*(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
e.tableShardDirty[i] = false
}
e.allDirty = false

View File

@ -8,6 +8,7 @@ import (
"crypto/rand"
"fmt"
"io"
"math"
rdebug "runtime/debug"
"sync"
@ -528,8 +529,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
// Use single segments when above minimum window and below 1MB.
single := len(src) < 1<<20 && len(src) > MinWindowSize
// Use single segments when above minimum window and below window size.
single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
@ -639,3 +640,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
return dst
}
// MaxEncodedSize reports an upper bound on the number of bytes produced when
// size input bytes are encoded as a block or stream.
//
// The bound is the frame header, plus 3 bytes of overhead per block, plus the
// input size itself, plus the skippable padding frame when padding is enabled.
func (e *Encoder) MaxEncodedSize(size int) int {
	// Magic (4 bytes) + frame header & window descriptor (2 bytes).
	header := 4 + 2
	// A configured dictionary adds 4 bytes.
	if e.o.dict != nil {
		header += 4
	}
	// Frame content size: the field grows with the input size.
	switch {
	case size < 256:
		header++
	case size < 65536+256:
		header += 2
	case size < math.MaxInt32:
		header += 4
	default:
		header += 8
	}
	// Final crc.
	if e.o.crc {
		header += 4
	}

	// Each block costs at most 3 bytes of overhead, and the count is never
	// zero: even an empty input is counted as one block.
	blockSize := e.o.blockSize
	nBlocks := (size + blockSize) / blockSize

	total := header + 3*nBlocks + size
	if e.o.pad <= 1 {
		// No padding configured.
		return total
	}
	// Account for the skippable frame appended as padding.
	return total + calcSkippableFrame(int64(total), int64(e.o.pad))
}

View File

@ -3,6 +3,8 @@ package zstd
import (
"errors"
"fmt"
"math"
"math/bits"
"runtime"
"strings"
)
@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder {
switch o.level {
case SpeedFastest:
if o.dict != nil {
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedDefault:
if o.dict != nil {
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
case SpeedBetterCompression:
if o.dict != nil {
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedBestCompression:
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
}
panic("unknown compression level")
}
@ -283,7 +285,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
// If this is not specified, block encodes will automatically choose this based on the input size.
// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
}
// WithEncoderDict allows to register a dictionary that will be used for the encode.
//
// The slice dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// The encoder *may* choose to use no dictionary instead for certain payloads.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error {
d, err := loadDict(dict)
@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
return nil
}
}
// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
//
// content may contain arbitrary data and is used as the initial history.
// id is stored as the ID of the resulting dictionary.
//
// Applying the returned option fails when content is longer than
// dictMaxLength.
func WithEncoderDictRaw(id uint32, content []byte) EOption {
	return func(o *encoderOptions) error {
		// The length is only compared against the limit when uint is wider
		// than 32 bits.
		tooLarge := bits.UintSize > 32 && uint(len(content)) > dictMaxLength
		if tooLarge {
			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
		}
		raw := dict{
			id:      id,
			content: content,
			offsets: [3]int{1, 4, 8},
		}
		o.dict = &raw
		return nil
	}
}

View File

@ -5,7 +5,7 @@
package zstd
import (
"bytes"
"encoding/binary"
"encoding/hex"
"errors"
"io"
@ -29,7 +29,7 @@ type frameDec struct {
FrameContentSize uint64
DictionaryID *uint32
DictionaryID uint32
HasCheckSum bool
SingleSegment bool
}
@ -43,9 +43,9 @@ const (
MaxWindowSize = 1 << 29
)
var (
frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}
const (
frameMagic = "\x28\xb5\x2f\xfd"
skippableFrameMagic = "\x2a\x4d\x18"
)
func newFrameDec(o decoderOptions) *frameDec {
@ -89,9 +89,9 @@ func (d *frameDec) reset(br byteBuffer) error {
copy(signature[1:], b)
}
if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
if debugDecoder {
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
}
// Break if not skippable frame.
break
@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
err = br.skipN(int(n))
err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
}
if !bytes.Equal(signature[:], frameMagic) {
if string(signature[:]) != frameMagic {
if debugDecoder {
println("Got magic numbers: ", signature, "want:", frameMagic)
println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
}
return ErrMagicMismatch
}
@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
// Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
d.DictionaryID = nil
d.DictionaryID = 0
if size := fhd & 3; size != 0 {
if size == 3 {
size = 4
@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
var id uint32
switch size {
switch len(b) {
case 1:
id = uint32(b[0])
case 2:
@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder {
println("Dict size", size, "ID:", id)
}
if id > 0 {
// ID 0 means "sorry, no dictionary anyway".
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
d.DictionaryID = &id
}
d.DictionaryID = id
}
// Read Frame_Content_Size
@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error {
println("Reading Frame content", err)
return err
}
switch fcsSize {
switch len(b) {
case 1:
d.FrameContentSize = uint64(b[0])
case 2:
@ -231,20 +227,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
if d.WindowSize > d.o.maxWindowSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
if d.WindowSize > d.o.maxDecodedSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrDecoderSizeExceeded
}
}
if d.WindowSize > uint64(d.o.maxWindowSize) {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
@ -254,11 +257,16 @@ func (d *frameDec) reset(br byteBuffer) error {
}
d.history.windowSize = int(d.WindowSize)
if !d.o.lowMem || d.history.windowSize < maxBlockSize {
// Alloc 2x window size if not low-mem, or very small window size.
// Alloc 2x window size if not low-mem, or window size below 2MB.
d.history.allocFrameBuffer = d.history.windowSize * 2
} else {
// Alloc with one additional block
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
if d.o.lowMem {
// Alloc with 1MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
} else {
// Alloc with 2MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
}
if debugDecoder {
@ -293,7 +301,7 @@ func (d *frameDec) checkCRC() error {
}
// We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4)
buf, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
return err
@ -303,22 +311,17 @@ func (d *frameDec) checkCRC() error {
return nil
}
var tmp [4]byte
got := d.crc.Sum64()
// Flip to match file order.
tmp[0] = byte(got >> 0)
tmp[1] = byte(got >> 8)
tmp[2] = byte(got >> 16)
tmp[3] = byte(got >> 24)
want := binary.LittleEndian.Uint32(buf[:4])
got := uint32(d.crc.Sum64())
if !bytes.Equal(tmp[:], want) {
if got != want {
if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
printf("CRC check failed: got %08x, want %08x\n", got, want)
}
return ErrCRCMismatch
}
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
return nil
}
@ -336,7 +339,7 @@ func (d *frameDec) consumeCRC() error {
return nil
}
// runDecoder will create a sync decoder that will decode a block of data.
// runDecoder will run the decoder for the remainder of the frame.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@ -346,12 +349,23 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
// Store input length, so we only check new data.
crcStart := len(dst)
d.history.decoders.maxSyncLen = 0
if d.o.limitToCap {
d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
}
if d.FrameContentSize != fcsUnknown {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
}
if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
}
return dst, ErrDecoderSizeExceeded
}
if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen)
}
if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
// Alloc for output
dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
copy(dst2, dst)
@ -371,7 +385,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if err != nil {
break
}
if uint64(len(d.history.b)) > d.o.maxDecodedSize {
if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
err = ErrDecoderSizeExceeded
break
}
if d.o.limitToCap && len(d.history.b) > cap(dst) {
println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
err = ErrDecoderSizeExceeded
break
}

View File

@ -21,7 +21,8 @@ type buildDtableAsmContext struct {
// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
// Function returns non-zero exit code on error.
// go:noescape
//
//go:noescape
func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
// please keep in sync with _generate/gen_fse.go
@ -34,8 +35,8 @@ const (
// buildDtable will build the decoding table.
func (s *fseDecoder) buildDtable() error {
ctx := buildDtableAsmContext{
stateTable: (*uint16)(&s.stateTable[0]),
norm: (*int16)(&s.norm[0]),
stateTable: &s.stateTable[0],
norm: &s.norm[0],
dt: (*uint64)(&s.dt[0]),
}
code := buildDtable_asm(s, &ctx)

View File

@ -1,7 +1,6 @@
// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
TEXT ·buildDtable_asm(SB), $0-24

View File

@ -37,26 +37,23 @@ func (h *history) reset() {
h.ignoreBuffer = 0
h.error = false
h.recentOffsets = [3]int{1, 4, 8}
if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
h.decoders.freeDecoders()
h.decoders = sequenceDecs{br: h.decoders.br}
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
}
}
h.freeHuffDecoder()
h.huffTree = nil
h.dict = nil
//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
}
// freeHuffDecoder hands h.huffTree back to huffDecoderPool and clears the
// field. Nothing happens when there is no tree, or when the tree is the
// literal decoder (litEnc) of the current dictionary.
func (h *history) freeHuffDecoder() {
	tree := h.huffTree
	if tree == nil {
		return
	}
	if h.dict != nil && h.dict.litEnc == tree {
		// The tree is the dictionary's litEnc; it is neither pooled nor cleared.
		return
	}
	huffDecoderPool.Put(tree)
	h.huffTree = nil
}
func (h *history) setDict(dict *dict) {
if dict == nil {
return

View File

@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
This implementation provides a fast pure-Go implementation and an even faster
assembly implementation for amd64.
The package is written with optimized pure Go and also contains even faster
assembly implementations for amd64 and arm64. If desired, the `purego` build tag
opts into using the Go code even on those architectures.
[xxHash]: http://cyan4973.github.io/xxHash/
## Compatibility
This package is in a module and the latest code is in version 2 of the module.
You need a version of Go with at least "minimal module compatibility" to use
github.com/cespare/xxhash/v2:
* 1.9.7+ for Go 1.9
* 1.10.3+ for Go 1.10
* Go 1.11 or later
I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
| input size | purego | asm |
| --- | --- | --- |
| 5 B | 979.66 MB/s | 1291.17 MB/s |
| 100 B | 7475.26 MB/s | 7973.40 MB/s |
| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
| input size | purego | asm |
| ---------- | --------- | --------- |
| 4 B | 1.3 GB/s | 1.2 GB/s |
| 16 B | 2.9 GB/s | 3.5 GB/s |
| 100 B | 6.9 GB/s | 8.1 GB/s |
| 4 KB | 11.7 GB/s | 16.7 GB/s |
| 10 MB | 12.0 GB/s | 17.3 GB/s |
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
the following commands under Go 1.11.2:
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
CPU using the following commands under Go 1.19.2:
```
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache)
- [FastCache](https://github.com/VictoriaMetrics/fastcache)

View File

@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261
)
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
// possible in the Go code is worth a small (but measurable) performance boost
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
// convenience in the Go code in a few places where we need to intentionally
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
// result overflows a uint64).
var (
prime1v = prime1
prime2v = prime2
prime3v = prime3
prime4v = prime4
prime5v = prime5
)
// Store the primes in an array as well.
//
// The consts are used when possible in Go code to avoid MOVs but we need a
// contiguous array for the assembly code.
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Digest implements hash.Hash64.
type Digest struct {
@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
d.v1 = prime1v + prime2
d.v1 = primes[0] + prime2
d.v2 = prime2
d.v3 = 0
d.v4 = -prime1v
d.v4 = -primes[0]
d.total = 0
d.n = 0
}
@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
memleft := d.mem[d.n&(len(d.mem)-1):]
if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(d.mem[d.n:], b)
copy(memleft, b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
copy(d.mem[d.n:], b)
c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[32-d.n:]
b = b[c:]
d.n = 0
}
@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total
i, end := 0, d.n
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(d.mem[i:i+8]))
b := d.mem[:d.n&(len(d.mem)-1)]
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(d.mem[i:i+4])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for i < end {
h ^= uint64(d.mem[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
i++
}
h ^= h >> 33

View File

@ -1,3 +1,4 @@
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
@ -5,212 +6,205 @@
#include "textflag.h"
// Register allocation:
// AX h
// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// DI prime4v
// Registers:
#define h AX
#define d AX
#define p SI // pointer to advance through b
#define n DX
#define end BX // loop end
#define v1 R8
#define v2 R9
#define v3 R10
#define v4 R11
#define x R12
#define prime1 R13
#define prime2 R14
#define prime4 DI
// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
MOVQ (SI), R12 \
ADDQ $8, SI \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
#define round(acc, x) \
IMULQ prime2, x \
ADDQ x, acc \
ROLQ $31, acc \
IMULQ prime1, acc
// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
ADDQ DI, acc
// round0 performs the operation x = round(0, x).
#define round0(x) \
IMULQ prime2, x \
ROLQ $31, x \
IMULQ prime1, x
// mergeRound applies a merge round on the two registers acc and x.
// It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
round0(x) \
XORQ x, acc \
IMULQ prime1, acc \
ADDQ prime4, acc
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
JLE loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·prime4v(SB), DI
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
MOVQ ·primes+24(SB), prime4
// Load slice.
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), DX
LEAQ (SI)(DX*1), BX
MOVQ b_base+0(FP), p
MOVQ b_len+8(FP), n
LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32.
SUBQ $32, BX
SUBQ $32, end
// Check whether we have at least one block.
CMPQ DX, $32
CMPQ n, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
MOVQ R13, R8
ADDQ R14, R8
MOVQ R14, R9
XORQ R10, R10
XORQ R11, R11
SUBQ R13, R11
MOVQ prime1, v1
ADDQ prime2, v1
MOVQ prime2, v2
XORQ v3, v3
XORQ v4, v4
SUBQ prime1, v4
// Loop until SI > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
blockLoop()
CMPQ SI, BX
JLE blockLoop
MOVQ v1, h
ROLQ $1, h
MOVQ v2, x
ROLQ $7, x
ADDQ x, h
MOVQ v3, x
ROLQ $12, x
ADDQ x, h
MOVQ v4, x
ROLQ $18, x
ADDQ x, h
MOVQ R8, AX
ROLQ $1, AX
MOVQ R9, R12
ROLQ $7, R12
ADDQ R12, AX
MOVQ R10, R12
ROLQ $12, R12
ADDQ R12, AX
MOVQ R11, R12
ROLQ $18, R12
ADDQ R12, AX
mergeRound(AX, R8)
mergeRound(AX, R9)
mergeRound(AX, R10)
mergeRound(AX, R11)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
JMP afterBlocks
noBlocks:
MOVQ ·prime5v(SB), AX
MOVQ ·primes+32(SB), h
afterBlocks:
ADDQ DX, AX
ADDQ n, h
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
ADDQ $24, BX
ADDQ $24, end
CMPQ p, end
JG try4
CMPQ SI, BX
JG fourByte
loop8:
MOVQ (p), x
ADDQ $8, p
round0(x)
XORQ x, h
ROLQ $27, h
IMULQ prime1, h
ADDQ prime4, h
wordLoop:
// Calculate k1.
MOVQ (SI), R8
ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
CMPQ p, end
JLE loop8
XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
ADDQ DI, AX
try4:
ADDQ $4, end
CMPQ p, end
JG try1
CMPQ SI, BX
JLE wordLoop
MOVL (p), x
ADDQ $4, p
IMULQ prime1, x
XORQ x, h
fourByte:
ADDQ $4, BX
CMPQ SI, BX
JG singles
ROLQ $23, h
IMULQ prime2, h
ADDQ ·primes+16(SB), h
MOVL (SI), R8
ADDQ $4, SI
IMULQ R13, R8
XORQ R8, AX
ROLQ $23, AX
IMULQ R14, AX
ADDQ ·prime3v(SB), AX
singles:
ADDQ $4, BX
CMPQ SI, BX
try1:
ADDQ $4, end
CMPQ p, end
JGE finalize
singlesLoop:
MOVBQZX (SI), R12
ADDQ $1, SI
IMULQ ·prime5v(SB), R12
XORQ R12, AX
loop1:
MOVBQZX (p), x
ADDQ $1, p
IMULQ ·primes+32(SB), x
XORQ x, h
ROLQ $11, h
IMULQ prime1, h
ROLQ $11, AX
IMULQ R13, AX
CMPQ SI, BX
JL singlesLoop
CMPQ p, end
JL loop1
finalize:
MOVQ AX, R12
SHRQ $33, R12
XORQ R12, AX
IMULQ R14, AX
MOVQ AX, R12
SHRQ $29, R12
XORQ R12, AX
IMULQ ·prime3v(SB), AX
MOVQ AX, R12
SHRQ $32, R12
XORQ R12, AX
MOVQ h, x
SHRQ $33, x
XORQ x, h
IMULQ prime2, h
MOVQ h, x
SHRQ $29, x
XORQ x, h
IMULQ ·primes+16(SB), h
MOVQ h, x
SHRQ $32, x
XORQ x, h
MOVQ AX, ret+24(FP)
MOVQ h, ret+24(FP)
RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
// Load slice.
MOVQ b_base+8(FP), SI
MOVQ b_len+16(FP), DX
LEAQ (SI)(DX*1), BX
SUBQ $32, BX
MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end
// Load vN from d.
MOVQ d+0(FP), AX
MOVQ 0(AX), R8 // v1
MOVQ 8(AX), R9 // v2
MOVQ 16(AX), R10 // v3
MOVQ 24(AX), R11 // v4
MOVQ s+0(FP), d
MOVQ 0(d), v1
MOVQ 8(d), v2
MOVQ 16(d), v3
MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ SI, BX
JLE blockLoop
blockLoop()
// Copy vN back to d.
MOVQ R8, 0(AX)
MOVQ R9, 8(AX)
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)
MOVQ v1, 0(d)
MOVQ v2, 8(d)
MOVQ v3, 16(d)
MOVQ v4, 24(d)
// The number of bytes written is SI minus the old base pointer.
SUBQ b_base+8(FP), SI
MOVQ SI, ret+32(FP)
// The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), p
MOVQ p, ret+32(FP)
RET

View File

@ -1,13 +1,17 @@
// +build gc,!purego,!noasm
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm
#include "textflag.h"
// Register allocation.
// Registers:
#define digest R1
#define h R2 // Return value.
#define p R3 // Input pointer.
#define len R4
#define nblocks R5 // len / 32.
#define h R2 // return value
#define p R3 // input pointer
#define n R4 // input length
#define nblocks R5 // n / 32
#define prime1 R7
#define prime2 R8
#define prime3 R9
@ -25,60 +29,52 @@
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
MUL prime1, acc \
MUL prime1, acc
// x = round(0, x).
// round0 performs the operation x = round(0, x).
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
MUL prime1, x \
MUL prime1, x
#define mergeRound(x) \
round0(x) \
EOR x, h \
MADD h, prime4, prime1, h \
#define mergeRound(acc, x) \
round0(x) \
EOR x, acc \
MADD acc, prime4, prime1, acc
// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
#define blocksLoop() \
LSR $5, len, nblocks \
PCALIGN $16 \
loop: \
LDP.P 32(p), (x1, x2) \
round(v1, x1) \
LDP -16(p), (x3, x4) \
round(v2, x2) \
SUB $1, nblocks \
round(v3, x3) \
round(v4, x4) \
CBNZ nblocks, loop \
// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), NOPTR+RODATA, $40
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
#define blockLoop() \
LSR $5, n, nblocks \
PCALIGN $16 \
loop: \
LDP.P 16(p), (x1, x2) \
LDP.P 16(p), (x3, x4) \
round(v1, x1) \
round(v2, x2) \
round(v3, x3) \
round(v4, x4) \
SUB $1, nblocks \
CBNZ nblocks, loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
LDP b_base+0(FP), (p, len)
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
LDP b_base+0(FP), (p, n)
LDP primes<> +0(SB), (prime1, prime2)
LDP primes<>+16(SB), (prime3, prime4)
MOVD primes<>+32(SB), prime5
LDP ·primes+0(SB), (prime1, prime2)
LDP ·primes+16(SB), (prime3, prime4)
MOVD ·primes+32(SB), prime5
CMP $32, len
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
BLO afterLoop
CMP $32, n
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
BLT afterLoop
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4
blocksLoop()
blockLoop()
ROR $64-1, v1, x1
ROR $64-7, v2, x2
@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4
ADD x2, x4, h
mergeRound(v1)
mergeRound(v2)
mergeRound(v3)
mergeRound(v4)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
afterLoop:
ADD len, h
ADD n, h
TBZ $4, len, try8
TBZ $4, n, try8
LDP.P 16(p), (x1, x2)
round0(x1)
// NOTE: here and below, sequencing the EOR after the ROR (using a
// rotated register) is worth a small but measurable speedup for small
// inputs.
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
round0(x2)
ROR $64-27, h
EOR x2 @> 64-27, h
EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h
try8:
TBZ $3, len, try4
TBZ $3, n, try4
MOVD.P 8(p), x1
round0(x1)
ROR $64-27, h
EOR x1 @> 64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
try4:
TBZ $2, len, try2
TBZ $2, n, try2
MOVWU.P 4(p), x2
MUL prime1, x2
ROR $64-23, h
EOR x2 @> 64-23, h
EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h
try2:
TBZ $1, len, try1
TBZ $1, n, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2
MUL prime5, x1
ROR $64-11, h
EOR x1 @> 64-11, h
EOR x1 @> 64-11, h, h
MUL prime1, h
MUL prime5, x2
ROR $64-11, h
EOR x2 @> 64-11, h
EOR x2 @> 64-11, h, h
MUL prime1, h
try1:
TBZ $0, len, end
TBZ $0, n, finalize
MOVBU (p), x4
MUL prime5, x4
ROR $64-11, h
EOR x4 @> 64-11, h
EOR x4 @> 64-11, h, h
MUL prime1, h
end:
finalize:
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
@ -163,24 +163,22 @@ end:
RET
// func writeBlocks(d *Digest, b []byte) int
//
// Assumes len(b) >= 32.
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
LDP primes<>(SB), (prime1, prime2)
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
LDP b_base+8(FP), (p, len)
LDP b_base+8(FP), (p, n)
blocksLoop()
blockLoop()
// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)
BIC $31, len
MOVD len, ret+32(FP)
BIC $31, n
MOVD n, ret+32(FP)
RET

View File

@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64
//go:noescape
func writeBlocks(d *Digest, b []byte) int
func writeBlocks(s *Digest, b []byte) int

View File

@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64
if n >= 32 {
v1 := prime1v + prime2
v1 := primes[0] + prime2
v2 := prime2
v3 := uint64(0)
v4 := -prime1v
v4 := -primes[0]
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n)
i, end := 0, len(b)
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(b[i:i+8:len(b)]))
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for ; i < end; i++ {
h ^= uint64(b[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
}

View File

@ -99,6 +99,21 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
return nil
}
// freeDecoders hands the FSE decoders referenced by s back to fseDecoderPool
// and clears the corresponding fields on s.
// Decoders that are nil or flagged as preDefined are left untouched and are
// never put into the pool.
func (s *sequenceDecs) freeDecoders() {
// Literal lengths decoder.
if f := s.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
// Clear the field so s no longer references the pooled decoder.
s.litLengths.fse = nil
}
// Offsets decoder.
if f := s.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.offsets.fse = nil
}
// Match lengths decoder.
if f := s.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.matchLengths.fse = nil
}
}
// execute will execute the decoded sequence with the provided history.
// The sequence must be evaluated before being sent.
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
@ -299,7 +314,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
@ -409,9 +424,8 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
}
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
// Add final literals

View File

@ -32,18 +32,22 @@ type decodeSyncAsmContext struct {
// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
// The result is an integer status code.
// NOTE(review): presumably 0 means success and any other value is one of the
// error* codes that decodeSyncSimple converts into an error - confirm.
//
//go:noescape
func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
//
// It has the same signature as sequenceDecs_decodeSync_amd64.
//
//go:noescape
func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_amd64 does the same as sequenceDecs_decodeSync_amd64,
// but does not write beyond the output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_bmi2 is the BMI2 counterpart of
// sequenceDecs_decodeSync_safe_amd64: it does not write beyond the output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
@ -55,16 +59,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
return false, nil
}
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
// FIXME: Using unsafe memory copies leads to rare, random crashes
// with fuzz testing. It is therefore disabled for now.
const useSafe = true
/*
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
*/
br := s.br
@ -129,7 +139,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if debugDecoder {
println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
}
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
default:
return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
@ -137,7 +147,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -195,20 +205,24 @@ const errorNotEnoughSpace = 5
// sequenceDecs_decode_amd64 implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode_56_amd64 implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
// NOTE(review): presumably the "56" variant is used when a single bit-reader
// fill is enough for all reads of one sequence - confirm with the caller.
//
//go:noescape
func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode_bmi2 implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode_56_bmi2 implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
// NOTE(review): presumably the "56" variant is used when a single bit-reader
// fill is enough for all reads of one sequence - confirm with the caller.
//
//go:noescape
func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
@ -275,7 +289,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -302,10 +316,12 @@ type executeAsmContext struct {
// Returns false if a match offset is too big.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// sequenceDecs_executeSimple_safe_amd64 is the same as
// sequenceDecs_executeSimple_amd64, but with safe memcopies.
//
//go:noescape
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool

View File

@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
@ -52,34 +51,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte:
sequenceDecs_decode_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_of_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_of_update_zero:
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_ml_update_zero:
MOVQ AX, 8(R10)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -107,19 +118,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte:
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_amd64_ll_update_zero:
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@ -198,7 +215,7 @@ sequenceDecs_decode_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_amd64_adjust_end
JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@ -210,7 +227,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_amd64_adjust_end
JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -247,7 +264,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_amd64_adjust_end:
sequenceDecs_decode_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@ -303,10 +320,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
@ -356,49 +369,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte:
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_of_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_of_update_zero:
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_ml_update_zero:
MOVQ AX, 8(R10)
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decode_56_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decode_56_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decode_56_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R15
ADDQ R15, AX
sequenceDecs_decode_56_amd64_ll_update_zero:
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@ -477,7 +508,7 @@ sequenceDecs_decode_56_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_56_amd64_adjust_end
JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@ -489,7 +520,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_56_amd64_adjust_end
JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -526,7 +557,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_56_amd64_adjust_end:
sequenceDecs_decode_56_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@ -582,10 +613,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
@ -757,7 +784,7 @@ sequenceDecs_decode_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_bmi2_adjust_end
JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@ -769,7 +796,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_bmi2_adjust_end
JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -806,7 +833,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_bmi2_adjust_end:
sequenceDecs_decode_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@ -862,10 +889,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
@ -1012,7 +1035,7 @@ sequenceDecs_decode_56_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_56_bmi2_adjust_end
JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@ -1024,7 +1047,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_56_bmi2_adjust_end
JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@ -1061,7 +1084,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_56_bmi2_adjust_end:
sequenceDecs_decode_56_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@ -1117,10 +1140,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
@ -1354,8 +1373,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1367,8 +1385,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1712,8 +1729,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1725,8 +1741,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1749,6 +1764,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
XORQ CX, CX
MOVQ CX, 8(SP)
MOVQ CX, 16(SP)
MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@ -1798,34 +1817,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_of_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_of_update_zero:
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_ml_update_zero:
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -1853,19 +1884,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_amd64_ll_update_zero:
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@ -1945,7 +1982,7 @@ sequenceDecs_decodeSync_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_amd64_adjust_end
JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -1957,7 +1994,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_amd64_adjust_end
JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@ -1966,8 +2003,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@ -1983,7 +2019,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_amd64_adjust_end:
sequenceDecs_decodeSync_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -2280,6 +2316,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
XORQ R9, R9
MOVQ R9, 8(SP)
MOVQ R9, 16(SP)
MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@ -2452,7 +2492,7 @@ sequenceDecs_decodeSync_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_bmi2_adjust_end
JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -2464,7 +2504,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_bmi2_adjust_end
JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@ -2473,8 +2513,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@ -2490,7 +2529,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_bmi2_adjust_end:
sequenceDecs_decodeSync_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -2787,6 +2826,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
XORQ CX, CX
MOVQ CX, 8(SP)
MOVQ CX, 16(SP)
MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@ -2836,34 +2879,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_of_update_zero:
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@ -2891,19 +2946,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@ -2983,7 +3044,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -2995,7 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@ -3004,8 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@ -3021,7 +3081,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_amd64_adjust_end:
sequenceDecs_decodeSync_safe_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@ -3420,6 +3480,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
XORQ R9, R9
MOVQ R9, 8(SP)
MOVQ R9, 16(SP)
MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@ -3592,7 +3656,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@ -3604,7 +3668,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@ -3613,8 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@ -3630,7 +3693,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_bmi2_adjust_end:
sequenceDecs_decodeSync_safe_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values

View File

@ -111,7 +111,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += ll + ml
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
litRemain -= ll
if litRemain < 0 {
@ -149,7 +149,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {

View File

@ -36,9 +36,6 @@ const forcePreDef = false
// zstdMinMatch is the minimum zstd match length.
const zstdMinMatch = 3
// Reset the buffer offset when reaching this.
const bufferReset = math.MaxInt32 - MaxWindowSize
// fcsUnknown is used for unknown frame content size.
const fcsUnknown = math.MaxUint64
@ -75,7 +72,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
// ErrUnknownDictionary is returned if the dictionary ID is unknown.
// For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary")
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@ -110,26 +106,25 @@ func printf(format string, a ...interface{}) {
}
}
// matchLen returns the maximum length.
// matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.
func matchLen(a, b []byte) int {
b = b[:len(a)]
for i := 0; i < len(a)-7; i += 8 {
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
return i + (bits.TrailingZeros64(diff) >> 3)
func matchLen(a, b []byte) (n int) {
for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
if diff != 0 {
return n + bits.TrailingZeros64(diff)>>3
}
n += 8
}
checked := (len(a) >> 3) << 3
a = a[checked:]
b = b[checked:]
for i := range a {
if a[i] != b[i] {
return i + checked
break
}
n++
}
return len(a) + checked
return n
}
func load3232(b []byte, i int32) uint32 {
@ -140,10 +135,6 @@ func load6432(b []byte, i int32) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
func load64(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
type byter interface {
Bytes() []byte
Len() int