Updates for text streaming functions

Signed-off-by: Alex Ellis (OpenFaaS Ltd) <alexellis2@gmail.com>
This commit is contained in:
Alex Ellis (OpenFaaS Ltd)
2024-01-11 18:13:34 +00:00
parent f17a25f3e8
commit 4a3fa684e2
59 changed files with 845 additions and 816 deletions

View File

@ -259,7 +259,7 @@ nyc-taxi-data-10M.csv gzkp 1 3325605752 922273214 13929 227.68
## Decompressor
Staus: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
Status: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
kindly supplied by [fuzzit.dev](https://fuzzit.dev/).

View File

@ -43,7 +43,7 @@ func (m *match) estBits(bitsPerByte int32) {
if m.rep < 0 {
ofc = ofCode(uint32(m.s-m.offset) + 3)
} else {
ofc = ofCode(uint32(m.rep))
ofc = ofCode(uint32(m.rep) & 3)
}
// Cost, excluding
ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
@ -197,12 +197,13 @@ encodeLoop:
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
delta := s - offset
if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first {
return
}
if debugAsserts {
if offset <= 0 {
panic(offset)
if offset >= s {
panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff))
}
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
@ -226,7 +227,7 @@ encodeLoop:
}
}
l := 4 + e.matchlen(s+4, offset+4, src)
if rep < 0 {
if true {
// Extend candidate match backwards as far as possible.
tMin := s - e.maxMatchOff
if tMin < 0 {
@ -281,6 +282,7 @@ encodeLoop:
// Load next and check...
e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
index0 := s + 1
// Look far ahead, unless we have a really long match already...
if best.length < goodEnough {
@ -343,8 +345,8 @@ encodeLoop:
if best.rep > 0 {
var seq seq
seq.matchLen = uint32(best.length - zstdMinMatch)
if debugAsserts && s <= nextEmit {
panic("s <= nextEmit")
if debugAsserts && s < nextEmit {
panic("s < nextEmit")
}
addLiterals(&seq, best.s)
@ -356,19 +358,16 @@ encodeLoop:
blk.sequences = append(blk.sequences, seq)
// Index old s + 1 -> s - 1
index0 := s + 1
s = best.s + best.length
nextEmit = s
if s >= sLimit {
if debugEncoder {
println("repeat ended", s, best.length)
}
break encodeLoop
}
// Index skipped...
end := s
if s > sLimit+4 {
end = sLimit + 4
}
off := index0 + e.cur
for index0 < s {
for index0 < end {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
@ -377,6 +376,7 @@ encodeLoop:
off++
index0++
}
switch best.rep {
case 2, 4 | 1:
offset1, offset2 = offset2, offset1
@ -385,12 +385,17 @@ encodeLoop:
case 4 | 3:
offset1, offset2, offset3 = offset1-1, offset1, offset2
}
if s >= sLimit {
if debugEncoder {
println("repeat ended", s, best.length)
}
break encodeLoop
}
continue
}
// A 4-byte match has been found. Update recent offsets.
// We'll later see if more than 4 bytes.
index0 := s + 1
s = best.s
t := best.offset
offset1, offset2, offset3 = s-t, offset1, offset2
@ -418,19 +423,25 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)
nextEmit = s
if s >= sLimit {
break encodeLoop
// Index old s + 1 -> s - 1 or sLimit
end := s
if s > sLimit-4 {
end = sLimit - 4
}
// Index old s + 1 -> s - 1
for index0 < s {
off := index0 + e.cur
for index0 < end {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
index0++
off++
}
if s >= sLimit {
break encodeLoop
}
}

View File

@ -145,7 +145,7 @@ encodeLoop:
var t int32
// We allow the encoder to optionally turn off repeat offsets across blocks
canRepeat := len(blk.sequences) > 2
var matched int32
var matched, index0 int32
for {
if debugAsserts && canRepeat && offset1 == 0 {
@ -162,6 +162,7 @@ encodeLoop:
off := s + e.cur
e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
index0 = s + 1
if canRepeat {
if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
@ -258,7 +259,6 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)
index0 := s + repOff2
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
@ -498,15 +498,15 @@ encodeLoop:
}
// Index match start+1 (long) -> s - 1
index0 := s - l + 1
off := index0 + e.cur
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
index0 += 2
off += 2
}
cv = load6432(src, s)
@ -672,7 +672,7 @@ encodeLoop:
var t int32
// We allow the encoder to optionally turn off repeat offsets across blocks
canRepeat := len(blk.sequences) > 2
var matched int32
var matched, index0 int32
for {
if debugAsserts && canRepeat && offset1 == 0 {
@ -691,6 +691,7 @@ encodeLoop:
e.markLongShardDirty(nextHashL)
e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
e.markShortShardDirty(nextHashS)
index0 = s + 1
if canRepeat {
if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
@ -726,7 +727,6 @@ encodeLoop:
blk.sequences = append(blk.sequences, seq)
// Index match start+1 (long) -> s - 1
index0 := s + repOff
s += lenght + repOff
nextEmit = s
@ -790,7 +790,6 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)
index0 := s + repOff2
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
@ -1024,18 +1023,18 @@ encodeLoop:
}
// Index match start+1 (long) -> s - 1
index0 := s - l + 1
off := index0 + e.cur
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.markLongShardDirty(h0)
h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
e.markShortShardDirty(h1)
index0 += 2
off += 2
}
cv = load6432(src, s)