#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "../../bitreader"
require "./crc32"
module RemiLib::Compression::BZip2
# Reads and decompresses a single BZip2 block.
#
# Block decoding consists of the following stages:
# 1. Read block header
# 2. Read Huffman tables
# 3. Read and decode Huffman encoded data
# 4. Run-Length Decoding[2]
# 5. Inverse Move To Front Transform
# 6. Inverse Burrows Wheeler Transform
# 7. Run-Length Decoding[1]
# 8. Optional Block De-Randomization
private class BlockDecompressor
@input : BitReader
# Used to calculate this block's CRC for the fully decoded data.
@crc : CRC32 = CRC32.new
# The CRC of the current block as read from the block header.
@blockCRC : UInt32
# `true` if the current block is randomised, or `false` otherwise.
@blockRandomized : Bool
# The end-of-block Huffman symbol. Decoding of the block ends when this is
# encountered.
@huffmanEndOfBlockSymbol : Int32 = 0
# A map from Huffman symbol index to output character. Some types of data
# (e.g. ASCII text) may contain only a limited number of byte values;
# Huffman symbols are only allocated to those values that actually occur in
# the uncompressed data.
@huffmanSymbolMap : Bytes = Bytes.new(256)
# Counts of each byte value within the BWT-transformed array data. Collected
# at the Move To Front stage, consumed by the Inverse Burrows Wheeler
# Transform stage.
@bwtByteCounts : Array(Int32) = Array(Int32).new(256, 0i32)
# The Burrows-Wheeler Transform processed data. Read at the Move To Front
# stage, consumed by the Inverse Burrows Wheeler Transform stage.
@bwtBlock : Bytes
# At each position contains the union of :
#
# * An output character (8 bits).
# * A pointer from each position to its successor (24 bits, left shifted 8 bits).
#
# As the pointer cannot exceed the maximum block size of 900k, 24 bits is
# more than enough to hold it; Folding the character data into the spare
# bits while performing the inverse BWT, when both pieces of information are
# available, saves a large number of memory accesses in the final decoding
# stages.
@bwtMergedPointers : Array(Int32) = [] of Int32
# The current merged pointer into the Burrow-Wheeler Transform array.
@bwtCurrentMergedPointer : Int32 = 0
# The actual length in bytes of the current block at the Inverse Burrows
# Wheeler Transform stage (before final Run-Length Decoding).
@bwtBlockLength : Int32 = 0
# The number of output bytes that have been decoded up to the Inverse
# Burrows Wheeler Transform stage.
@bwtBytesDecoded : Int32 = 0
# The most recently RLE decoded byte.
@rleLastDecodedByte : Int16 = -1i16
# The number of previous identical output bytes decoded. After 4 identical
# bytes, the next byte decoded is an RLE repeat count.
@rleAccumulator : Int32 = 0
# The RLE repeat count of the current decoded byte. When this reaches zero,
# a new byte is decoded.
@rleRepeat : UInt16 = 0
# If the current block is randomised, the position within the RNUMS
# randomisation array.
@randomIndex : Int32 = 0
# If the current block is randomised, the remaining count at the current
# RNUMS position.
@randomCount : Int32 = RNUMS[0] - 1
# Creates a new `BlockDecompressor` instance that will read bits from
# *input*.
def initialize(@input : BitReader, blockSize : Int)
@bwtBlock = Bytes.new(blockSize.to_u32!)
# Read block header
@blockCRC = @input.read(32).to_u32!
@blockRandomized = @input.read(1) != 0
bwtStartPointer : UInt32 = @input.read(24).to_u32!
# Read block data and decode through to the Inverse Burrows Wheeler Transform stage.
huffmanDecoder = readHuffmanTables
decodeHuffmanData(huffmanDecoder)
initializeInverseBWT(bwtStartPointer)
end
# Decodes a byte from the final Run-Length Encoding stage, pulling a new
# byte from the Burrows-Wheeler Transform stage when required. Returns -1
# when there are no more bytes to decode.
def read : Int16
nextByte : UInt8 = 0
while @rleRepeat < 1
return -1i16 if @bwtBytesDecoded == @bwtBlockLength
nextByte = decodeNextBWTByte
if nextByte != @rleLastDecodedByte
# New byte, restart accumulation
@rleLastDecodedByte = nextByte.to_i16!
@rleRepeat = 1
@rleAccumulator = 1
@crc.update(nextByte)
else
@rleAccumulator += 1
if @rleAccumulator == 4
# Accumulation complete, start repetition
@rleRepeat = decodeNextBWTByte.to_u16! + 1
@rleAccumulator = 0
@crc.update(nextByte, @rleRepeat)
else
@rleRepeat = 1
@crc.update(nextByte)
end
end
end
@rleRepeat -= 1
@rleLastDecodedByte
end
# Decodes multiple bytes from the final Run-Length Encoding stage, pulling
# new bytes from the Burrows-Wheeler Transform stage when required. This
# stores the results in *buf* and returns the index of the first element of
# *buf* that was unmodified.
@[AlwaysInline]
def read(buf : Bytes) : Int32
decoded : Int16 = -1i16
buf.size.times do |i|
decoded = read
return i if decoded == -1
buf.unsafe_put(i, decoded.to_u8!)
end
buf.size
end
# Verifies and returns the block's CRC. This method must only be called
# after all of the block's bytes have been read.
@[AlwaysInline]
def checkCrc : UInt32
ret = @crc.crc
#STDERR << ret << " == " << @blockCRC << '\n'
raise Error.new("BZip2 block CRC error") if @blockCRC != ret
ret
end
# Read and decode the block's Huffman tables.
private def readHuffmanTables : HuffmanStageDecoder
j : Int32 = 0
k : Int32 = 0
tableCodeLengths : Array(Array(UInt8)) = Array(Array(UInt8)).new(MAXIMUM_TABLES) do |_|
Array(UInt8).new(MAX_ALPHABET_SIZE, 0u8)
end
# Read Huffman symbol to output byte map.
huffmanUsedRanges : UInt32 = @input.read(16).to_u32!
huffmanSymbolCount : Int32 = 0
16.times do |i|
next if (huffmanUsedRanges & ((1 << 15) >> i)) == 0
j = 0
k = i << 4
while j < 16
if @input.read(1) != 0
@huffmanSymbolMap[huffmanSymbolCount] = k.to_u8!
huffmanSymbolCount += 1
end
j += 1
k += 1
end
end
endOfBlockSymbol : Int32 = huffmanSymbolCount + 1
@huffmanEndOfBlockSymbol = endOfBlockSymbol
# Read total number of tables and selectors.
totalTables = @input.read(3).to_u32!
totalSelectors = @input.read(15).to_u32!
if totalTables < MINIMUM_TABLES ||
totalTables > MAXIMUM_TABLES ||
totalSelectors < 1 ||
totalSelectors > MAXIMUM_SELECTORS
raise Error.new("BZip2 block's Huffman tables are invalid")
end
# Read and decode MTFed Huffman selector list.
tableMTF = MoveToFront.new
selectors = Bytes.new(totalSelectors)
totalSelectors.times do |selector|
selectors[selector] = tableMTF.indexToFront(@input.countOnes(discardFirstZero: true))
end
# Read the Canonical Huffman code lengths for each table.
currentLength : Int32 = 0
totalTables.times do |table|
currentLength = @input.read(5).to_i32!
0.upto(endOfBlockSymbol) do |i|
until @input.read(1) == 0
currentLength += (@input.read(1) != 0 ? -1 : 1)
end
tableCodeLengths[table][i] = currentLength.to_u8!
end
end
HuffmanStageDecoder.new(@input, endOfBlockSymbol + 1, tableCodeLengths, selectors)
end
# Reads the Huffman encoded data from the input stream, performs Run-Length
#Decoding and . applies the Move To Front transform to reconstruct the
#Burrows-Wheeler Transform array.
private def decodeHuffmanData(decoder : HuffmanStageDecoder) : Nil
symbolMTF = MoveToFront.new
blockLen : Int32 = 0
repeatCount : Int32 = 0
repeatInc : Int32 = 1
mtfValue : UInt8 = 0
nextSymbol : Int32 = 0
nextByte : UInt8 = 0
loop do
nextSymbol = decoder.nextSymbol
case nextSymbol
when RLE_SYMBOL_RUNA
repeatCount += repeatInc
repeatInc <<= 1
when RLE_SYMBOL_RUNB
repeatCount += repeatInc << 1
repeatInc <<= 1
else
nextByte = 0
if repeatCount > 0
if blockLen + repeatCount > @bwtBlock.size
raise Error.new("BZip2 block size exceeds declared block size")
end
nextByte = @huffmanSymbolMap[mtfValue]
@bwtByteCounts[nextByte] += repeatCount
while (repeatCount -= 1) >= 0
# Remi: Can be guaranteed because of the check above.
@bwtBlock.unsafe_put(blockLen, nextByte)
blockLen += 1
end
repeatCount = 0
repeatInc = 1
end
break if nextSymbol == @huffmanEndOfBlockSymbol
if blockLen >= @bwtBlock.size
raise Error.new("BZip2 block size exceeds declared block size")
end
mtfValue = symbolMTF.indexToFront((nextSymbol - 1) & 0xFF)
nextByte = @huffmanSymbolMap[mtfValue]
@bwtByteCounts[nextByte] += 1
@bwtBlock.unsafe_put(blockLen, nextByte) # Can be guaranteed because of the check above.
blockLen += 1
end
end
@bwtBlockLength = blockLen
end
# Sets up the Inverse Burrows-Wheeler Transform merged pointer array.
private def initializeInverseBWT(bwtStartPointer : UInt32) : Nil
mergedPointers = Array(Int32).new(@bwtBlockLength, 0i32)
charBase = Array(Int32).new(256, 0i32)
if bwtStartPointer < 0 || bwtStartPointer >= @bwtBlockLength
raise Error.new("BZip2 start pointer is invalid")
end
# Cumulatise character counts
charBase[1, 255] = @bwtByteCounts[0, 255]
(2..255).each do |i|
charBase.unsafe_put(i, charBase.unsafe_fetch(i) + charBase.unsafe_fetch(i - 1))
end
# Merged-Array Inverse Burrows-Wheeler Transform. Combining the output
# characters and forward pointers into a single array here, where we have
# already read both of the corresponding values, cuts down on memory
# accesses in the final walk through the array.
value : UInt8 = 0
@bwtBlockLength.times do |i|
value = @bwtBlock[i]
mergedPointers[charBase[value]] = (i << 8) + value
charBase[value] += 1
end
@bwtBlock = Bytes.new(0)
@bwtMergedPointers = mergedPointers
@bwtCurrentMergedPointer = mergedPointers[bwtStartPointer]
end
# Decodes a byte from the Burrows-Wheeler Transform stage. If the block has
# randomization applied, reverses the randomization.
@[AlwaysInline]
private def decodeNextBWTByte : UInt8
nextDecodedByte : UInt8 = (@bwtCurrentMergedPointer & 0xFF).to_u8!
@bwtCurrentMergedPointer = @bwtMergedPointers[@bwtCurrentMergedPointer >> 8]
if @blockRandomized
@randomCount -= 1
if @randomCount == 0
nextDecodedByte ^= 1
@randomIndex = (@randomIndex + 1) % 512
@randomCount = RNUMS[@randomIndex]
end
end
@bwtBytesDecoded += 1
nextDecodedByte
end
end
end
|