Update dependencies

* Updates netlink/netns
* Updates x/sys, arkade and apimachinery

Build passes, minor updates.

Signed-off-by: Alex Ellis (OpenFaaS Ltd) <alexellis2@gmail.com>
This commit is contained in:
Alex Ellis (OpenFaaS Ltd)
2022-10-03 16:43:35 +01:00
parent 60b724f014
commit 95792f8d58
438 changed files with 19023 additions and 20599 deletions

View File

@ -8,115 +8,10 @@ package huff0
import (
"encoding/binary"
"errors"
"fmt"
"io"
)
// bitReader reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
type bitReader struct {
in []byte
off uint // next byte to read is at in[off - 1]
value uint64
bitsRead uint8
}
// init initializes and resets the bit reader.
func (b *bitReader) init(in []byte) error {
if len(in) < 1 {
return errors.New("corrupt stream: too short")
}
b.in = in
b.off = uint(len(in))
// The highest bit of the last byte indicates where to start
v := in[len(in)-1]
if v == 0 {
return errors.New("corrupt stream, did not find end of stream")
}
b.bitsRead = 64
b.value = 0
if len(in) >= 8 {
b.fillFastStart()
} else {
b.fill()
b.fill()
}
b.bitsRead += 8 - uint8(highBit32(uint32(v)))
return nil
}
// peekBitsFast requires that at least one bit is requested every time.
// There are no checks if the buffer is filled.
func (b *bitReader) peekBitsFast(n uint8) uint16 {
const regMask = 64 - 1
v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
return v
}
// fillFast() will make sure at least 32 bits are available.
// There must be at least 4 bytes available.
func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
}
func (b *bitReader) advance(n uint8) {
b.bitsRead += n
}
// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
func (b *bitReader) fillFastStart() {
// Do single re-slice to avoid bounds checks.
b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
b.bitsRead = 0
b.off -= 8
}
// fill() will make sure at least 32 bits are available.
func (b *bitReader) fill() {
if b.bitsRead < 32 {
return
}
if b.off > 4 {
v := b.in[b.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
return
}
for b.off > 0 {
b.value = (b.value << 8) | uint64(b.in[b.off-1])
b.bitsRead -= 8
b.off--
}
}
// finished returns true if all bits have been read from the bit stream.
func (b *bitReader) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReader) close() error {
// Release reference.
b.in = nil
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
return nil
}
// bitReader reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}
func (b *bitReaderBytes) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderBytes) close() error {
// Release reference.
b.in = nil
if b.remaining() > 0 {
return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
}
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
@ -313,15 +215,17 @@ func (b *bitReaderShifted) fill() {
}
}
// finished returns true if all bits have been read from the bit stream.
func (b *bitReaderShifted) finished() bool {
return b.off == 0 && b.bitsRead >= 64
func (b *bitReaderShifted) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderShifted) close() error {
// Release reference.
b.in = nil
if b.remaining() > 0 {
return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
}
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}

View File

@ -5,8 +5,6 @@
package huff0
import "fmt"
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
// addBits16NC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
b.nBits += bits
}
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
// addBits16ZeroNC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
// This is fastest if bits can be zero.
func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
if bits == 0 {
return
}
value <<= (16 - bits) & 15
value >>= (16 - bits) & 15
b.bitContainer |= uint64(value) << (b.nBits & 63)
b.nBits += bits
}
// flush will flush all pending full bytes.
// There will be at least 56 bits available for writing when this has been called.
// Using flush32 is faster, but leaves less space for writing.
func (b *bitWriter) flush() {
v := b.nBits >> 3
switch v {
case 0:
return
case 1:
b.out = append(b.out,
byte(b.bitContainer),
)
b.bitContainer >>= 1 << 3
case 2:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
)
b.bitContainer >>= 2 << 3
case 3:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
)
b.bitContainer >>= 3 << 3
case 4:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
)
b.bitContainer >>= 4 << 3
case 5:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
)
b.bitContainer >>= 5 << 3
case 6:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
)
b.bitContainer >>= 6 << 3
case 7:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
)
b.bitContainer >>= 7 << 3
case 8:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
byte(b.bitContainer>>56),
)
b.bitContainer = 0
b.nBits = 0
return
default:
panic(fmt.Errorf("bits (%d) > 64", b.nBits))
}
b.nBits &= 7
}
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
b.flushAlign()
return nil
}
// reset and continue writing by appending to out.
func (b *bitWriter) reset(out []byte) {
b.bitContainer = 0
b.nBits = 0
b.out = out
}

View File

@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
b.off = 0
}
// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
}
// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}
// unread returns the unread portion of the input.
func (b byteReader) unread() []byte {
return b.b[b.off:]
}
// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off

View File

@ -2,6 +2,7 @@ package huff0
import (
"fmt"
"math"
"runtime"
"sync"
)
@ -161,6 +162,70 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
return s.Out, false, nil
}
// EstimateSizes will estimate the data sizes
func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
s, err = s.prepare(in)
if err != nil {
return 0, 0, 0, err
}
// Create histogram, if none was provided.
tableSz, dataSz, reuseSz = -1, -1, -1
maxCount := s.maxCount
var canReuse = false
if maxCount == 0 {
maxCount, canReuse = s.countSimple(in)
} else {
canReuse = s.canUseTable(s.prevTable)
}
// We want the output size to be less than this:
wantSize := len(in)
if s.WantLogLess > 0 {
wantSize -= wantSize >> s.WantLogLess
}
// Reset for next run.
s.clearCount = true
s.maxCount = 0
if maxCount >= len(in) {
if maxCount > len(in) {
return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
}
if len(in) == 1 {
return 0, 0, 0, ErrIncompressible
}
// One symbol, use RLE
return 0, 0, 0, ErrUseRLE
}
if maxCount == 1 || maxCount < (len(in)>>7) {
// Each symbol present maximum once or too well distributed.
return 0, 0, 0, ErrIncompressible
}
// Calculate new table.
err = s.buildCTable()
if err != nil {
return 0, 0, 0, err
}
if false && !s.canUseTable(s.cTable) {
panic("invalid table generated")
}
tableSz, err = s.cTable.estTableSize(s)
if err != nil {
return 0, 0, 0, err
}
if canReuse {
reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
}
dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
// Restore
return tableSz, dataSz, reuseSz, nil
}
func (s *Scratch) compress1X(src []byte) ([]byte, error) {
return s.compress1xDo(s.Out, src)
}
@ -225,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
if err != nil {
return nil, err
}
if len(s.Out)-idx > math.MaxUint16 {
// We cannot store the size in the jump table
return nil, ErrIncompressible
}
// Write compressed length as little endian before block.
if i < 3 {
// Last length is not written.
@ -268,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
return nil, errs[i]
}
o := s.tmpOut[i]
if len(o) > math.MaxUint16 {
// We cannot store the size in the jump table
return nil, ErrIncompressible
}
// Write compressed length as little endian before block.
if i < 3 {
// Last length is not written.
@ -331,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
return true
}
//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false
@ -536,7 +610,6 @@ func (s *Scratch) huffSort() {
}
nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
}
return
}
func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,222 @@
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
// and Decoder.Decompress1X that use an asm implementation of thir main loops.
package huff0
import (
"errors"
"fmt"
"github.com/klauspost/compress/internal/cpuinfo"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
type decompress4xContext struct {
pbr *[4]bitReaderShifted
peekBits uint8
out *byte
dstEvery int
tbl *dEntrySingle
decoded int
limit *byte
}
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
use8BitTables := d.actualTableLog <= 8
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}
// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
var decoded int
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
ctx := decompress4xContext{
pbr: &br,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
out: &out[0],
dstEvery: dstEvery,
tbl: &single[0],
limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
}
if use8BitTables {
decompress4x_8b_main_loop_amd64(&ctx)
} else {
decompress4x_main_loop_amd64(&ctx)
}
decoded = ctx.decoded
out = out[decoded/4:]
}
// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
type decompress1xContext struct {
pbr *bitReaderShifted
peekBits uint8
out *byte
outCap int
tbl *dEntrySingle
decoded int
}
// Error reported by asm implementations
const error_max_decoded_size_exeeded = -1
// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
var br bitReaderShifted
err := br.init(src)
if err != nil {
return dst, err
}
maxDecodedSize := cap(dst)
dst = dst[:maxDecodedSize]
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
if maxDecodedSize >= 4 {
ctx := decompress1xContext{
pbr: &br,
out: &dst[0],
outCap: maxDecodedSize,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
tbl: &d.dt.single[0],
}
if cpuinfo.HasBMI2() {
decompress1x_main_loop_bmi2(&ctx)
} else {
decompress1x_main_loop_amd64(&ctx)
}
if ctx.decoded == error_max_decoded_size_exeeded {
return nil, ErrMaxDecodedSizeExceeded
}
dst = dst[:ctx.decoded]
}
// br < 8, so uint8 is fine
bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
for bitsLeft > 0 {
br.fill()
if len(dst) >= maxDecodedSize {
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
return dst, br.close()
}

View File

@ -0,0 +1,847 @@
// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI
MOVQ 16(AX), SI
MOVQ 48(AX), BX
MOVQ 24(AX), R9
MOVQ 32(AX), R10
MOVQ (AX), R11
// Main loop
main_loop:
MOVQ SI, R8
CMPQ R8, BX
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
JBE skip_fill0
MOVQ 24(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ (R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 24(R11)
ORQ R14, R12
// exhausted = exhausted || (br0.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
JBE skip_fill1
MOVQ 72(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ 48(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 72(R11)
ORQ R14, R12
// exhausted = exhausted || (br1.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
JBE skip_fill2
MOVQ 120(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ 96(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 120(R11)
ORQ R14, R12
// exhausted = exhausted || (br2.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
JBE skip_fill3
MOVQ 168(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ 144(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 168(R11)
ORQ R14, R12
// exhausted = exhausted || (br3.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x02, SI
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
SUBQ 16(AX), SI
SHLQ $0x02, SI
MOVQ SI, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
MOVQ 24(CX), R9
MOVQ 32(CX), R10
MOVQ (CX), R11
// Main loop
main_loop:
MOVQ BX, R8
CMPQ R8, SI
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
JBE skip_fill0
MOVQ 24(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ (R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 24(R11)
ORQ R15, R12
// exhausted = exhausted || (br0.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// val3 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
JBE skip_fill1
MOVQ 72(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 48(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 72(R11)
ORQ R15, R12
// exhausted = exhausted || (br1.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// val3 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
JBE skip_fill2
MOVQ 120(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 96(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 120(R11)
ORQ R15, R12
// exhausted = exhausted || (br2.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// val3 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
JBE skip_fill3
MOVQ 168(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 144(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 168(R11)
ORQ R15, R12
// exhausted = exhausted || (br3.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
// val3 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
SUBQ 16(AX), BX
SHLQ $0x02, BX
MOVQ BX, 40(AX)
RET
// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ ctx+0(FP), CX
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
MOVQ 24(SI), R9
MOVQ 32(SI), R10
MOVBQZX 40(SI), R11
MOVQ 32(CX), SI
MOVBQZX 8(CX), DI
JMP loop_condition
main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
// Decode 4 values
CMPQ R11, $0x20
JL bitReader_fillFast_1_end
SUBQ $0x20, R11
SUBQ $0x04, R9
MOVL (R8)(R9*1), R12
MOVQ R11, CX
SHLQ CL, R12
ORQ R12, R10
bitReader_fillFast_1_end:
MOVQ DI, CX
MOVQ R10, R12
SHRQ CL, R12
MOVW (SI)(R12*2), CX
MOVB CH, AL
MOVBQZX CL, CX
ADDQ CX, R11
SHLQ CL, R10
MOVQ DI, CX
MOVQ R10, R12
SHRQ CL, R12
MOVW (SI)(R12*2), CX
MOVB CH, AH
MOVBQZX CL, CX
ADDQ CX, R11
SHLQ CL, R10
BSWAPL AX
CMPQ R11, $0x20
JL bitReader_fillFast_2_end
SUBQ $0x20, R11
SUBQ $0x04, R9
MOVL (R8)(R9*1), R12
MOVQ R11, CX
SHLQ CL, R12
ORQ R12, R10
bitReader_fillFast_2_end:
MOVQ DI, CX
MOVQ R10, R12
SHRQ CL, R12
MOVW (SI)(R12*2), CX
MOVB CH, AH
MOVBQZX CL, CX
ADDQ CX, R11
SHLQ CL, R10
MOVQ DI, CX
MOVQ R10, R12
SHRQ CL, R12
MOVW (SI)(R12*2), CX
MOVB CH, AL
MOVBQZX CL, CX
ADDQ CX, R11
SHLQ CL, R10
BSWAPL AX
// Store the decoded values
MOVL AX, (DX)
ADDQ $0x04, DX
loop_condition:
CMPQ R9, $0x08
JGE main_loop
// Update ctx structure
MOVQ ctx+0(FP), AX
SUBQ 16(AX), DX
MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)
MOVB R11, 40(AX)
RET
// Report error
error_max_decoded_size_exeeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
RET
// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// Requires: BMI2
TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ ctx+0(FP), CX
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
MOVQ 24(SI), R9
MOVQ 32(SI), R10
MOVBQZX 40(SI), R11
MOVQ 32(CX), SI
MOVBQZX 8(CX), DI
JMP loop_condition
main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
// Decode 4 values
CMPQ R11, $0x20
JL bitReader_fillFast_1_end
SUBQ $0x20, R11
SUBQ $0x04, R9
MOVL (R8)(R9*1), CX
SHLXQ R11, CX, CX
ORQ CX, R10
bitReader_fillFast_1_end:
SHRXQ DI, R10, CX
MOVW (SI)(CX*2), CX
MOVB CH, AL
MOVBQZX CL, CX
ADDQ CX, R11
SHLXQ CX, R10, R10
SHRXQ DI, R10, CX
MOVW (SI)(CX*2), CX
MOVB CH, AH
MOVBQZX CL, CX
ADDQ CX, R11
SHLXQ CX, R10, R10
BSWAPL AX
CMPQ R11, $0x20
JL bitReader_fillFast_2_end
SUBQ $0x20, R11
SUBQ $0x04, R9
MOVL (R8)(R9*1), CX
SHLXQ R11, CX, CX
ORQ CX, R10
bitReader_fillFast_2_end:
SHRXQ DI, R10, CX
MOVW (SI)(CX*2), CX
MOVB CH, AH
MOVBQZX CL, CX
ADDQ CX, R11
SHLXQ CX, R10, R10
SHRXQ DI, R10, CX
MOVW (SI)(CX*2), CX
MOVB CH, AL
MOVBQZX CL, CX
ADDQ CX, R11
SHLXQ CX, R10, R10
BSWAPL AX
// Store the decoded values
MOVL AX, (DX)
ADDQ $0x04, DX
loop_condition:
CMPQ R9, $0x08
JGE main_loop
// Update ctx structure
MOVQ ctx+0(FP), AX
SUBQ 16(AX), DX
MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)
MOVB R11, 40(AX)
RET
// Report error
error_max_decoded_size_exeeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
RET

View File

@ -0,0 +1,295 @@
//go:build !amd64 || appengine || !gc || noasm
// +build !amd64 appengine !gc noasm
// This file contains a generic implementation of Decoder.Decompress4X.
package huff0
import (
"errors"
"fmt"
)
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}
// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int
// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}
{
const stream = 0
const stream2 = 1
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
{
const stream = 2
const stream2 = 3
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
off += 2
if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress1X8Bit(dst, src)
}
var br bitReaderShifted
err := br.init(src)
if err != nil {
return dst, err
}
maxDecodedSize := cap(dst)
dst = dst[:0]
// Avoid bounds check by always having full sized table.
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
dt := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
bufs := d.buffer()
buf := &bufs[0]
var off uint8
for br.off >= 8 {
br.fillFast()
v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+0] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+1] = uint8(v.entry >> 8)
// Refill
br.fillFast()
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+2] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+3] = uint8(v.entry >> 8)
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
}
}
if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:off]...)
// br < 8, so uint8 is fine
bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
for bitsLeft > 0 {
br.fill()
if false && br.bitsRead >= 32 {
if br.off >= 4 {
v := br.in[br.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
br.value = (br.value << 32) | uint64(low)
br.bitsRead -= 32
br.off -= 4
} else {
for br.off > 0 {
br.value = (br.value << 8) | uint64(br.in[br.off-1])
br.bitsRead -= 8
br.off--
}
}
}
if len(dst) >= maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}

View File

@ -8,6 +8,7 @@ import (
"fmt"
"math"
"math/bits"
"sync"
"github.com/klauspost/compress/fse"
)
@ -116,6 +117,7 @@ type Scratch struct {
nodes []nodeElt
tmpOut [4][]byte
fse *fse.Scratch
decPool sync.Pool // *[4][256]byte buffers.
huffWeight [maxSymbolValue + 1]byte
}
@ -245,6 +247,68 @@ func (c cTable) write(s *Scratch) error {
return nil
}
func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
var (
// precomputed conversion table
bitsToWeight [tableLogMax + 1]byte
huffLog = s.actualTableLog
// last weight is not saved.
maxSymbolValue = uint8(s.symbolLen - 1)
huffWeight = s.huffWeight[:256]
)
const (
maxFSETableLog = 6
)
// convert to weight
bitsToWeight[0] = 0
for n := uint8(1); n < huffLog+1; n++ {
bitsToWeight[n] = huffLog + 1 - n
}
// Acquire histogram for FSE.
hist := s.fse.Histogram()
hist = hist[:256]
for i := range hist[:16] {
hist[i] = 0
}
for n := uint8(0); n < maxSymbolValue; n++ {
v := bitsToWeight[c[n].nBits] & 15
huffWeight[n] = v
hist[v]++
}
// FSE compress if feasible.
if maxSymbolValue >= 2 {
huffMaxCnt := uint32(0)
huffMax := uint8(0)
for i, v := range hist[:16] {
if v == 0 {
continue
}
huffMax = byte(i)
if v > huffMaxCnt {
huffMaxCnt = v
}
}
s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
s.fse.TableLog = maxFSETableLog
b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
if err == nil && len(b) < int(s.symbolLen>>1) {
sz += 1 + len(b)
return sz, nil
}
// Unable to compress (RLE/uncompressible)
}
// write raw values as 4-bits (max : 15)
if maxSymbolValue > (256 - 128) {
// should not happen : likely means source cannot be compressed
return 0, ErrIncompressible
}
// special case, pack weights 4 bits/weight.
sz += 1 + int(maxSymbolValue/2)
return sz, nil
}
// estimateSize returns the estimated size in bytes of the input represented in the
// histogram supplied.
func (c cTable) estimateSize(hist []uint32) int {