#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (c) 2022 drone1400
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Modified by drone1400
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "./bitwriter"
module RemiLib::Compression::BZip2
private class HuffmanStageEncoder
# The `IO` to which the Huffman tables and data are written.
@output : BitWriter
# The output of the Move To Front Transform and Run Length Encoding[2]
# stages.
@mtfBlock : Array(UInt16)
# The actual number of values contained in the mtfBlock array.
@mtfLength : Int32
# The number of unique values in the `@mtfBlock` array.
@mtfAlphabetSize : Int32
# The global frequencies of values within the `@mtfBlock` array.
@mtfSymbolFrequencies : Array(Int32)
# The canonical Huffman code lengths for each table.
@huffmanCodeLengths : Array(Array(Int32))
# Merged code symbols for each table. The value at each position is
# `((code length << 24) | code)`.
@huffmanMergedCodeSymbols : Array(Array(Int32))
# The selectors for each segment.
@selectors : Bytes
# Creates a new `HuffmanStageEncoder` instance.
#
# * `output`: The `BitWriter` to write the results to.
# * `mtfBlock`: The MTF block data.
# * `mtfLength`: The actual length of the MTF block.
# * `mtfAlphabetSize`: The size of the MTF block's alphabet.
# * `mtfSymbolFrequencies`: The frequencies of the MTF block's symbols.
def initialize(@output : BitWriter, @mtfBlock : Array(UInt16), @mtfLength : Int32, @mtfAlphabetSize : Int32,
@mtfSymbolFrequencies : Array(Int32), @int32Pool : ArrayPool(Int32))
totalTables = selectTableCount(@mtfLength)
@huffmanCodeLengths = Array(Array(Int32)).new(totalTables) { |_| Array(Int32).new(@mtfAlphabetSize, 0i32) }
@huffmanMergedCodeSymbols = Array(Array(Int32)).new(totalTables) { |_| Array(Int32).new(@mtfAlphabetSize, 0i32) }
@selectors = Bytes.new((@mtfLength + GROUP_RUN_LENGTH - 1).tdiv(GROUP_RUN_LENGTH))
end
# Encodes and writes the block data.
def encode : Nil
# Create optimised selector list and Huffman tables.
generateHuffmanOptimisationSeeds
3.downto(0) do |i|
optimizeSelectorsAndHuffmanTables(i == 0)
end
assignHuffmanCodeSymbols
# Write out the tables and the block data encoded with them.
writeSelectorsAndHuffmanTables
writeBlockData
end
# Selects an appropriate table count for a given MTF length.
@[AlwaysInline]
protected def selectTableCount(len : Int32) : Int32
case
when len >= 2400 then 6
when len >= 1200 then 5
when len >= 600 then 4
when len >= 200 then 3
else 2
end
end
# Generate a Huffman code length table for a given list of symbol
# frequencies.
protected def generateHuffmanCodeLengths(alphabetSize : Int32,
symbolFrequencies : Array(Array(Int32)),
codeLengths : Array(Array(Int32)),
index : Int32) : Nil
@int32Pool.withRentedArray(alphabetSize) do |mergedFrequenciesAndIndices|
@int32Pool.withRentedArray(alphabetSize) do |sortedFrequencies|
mergedFrequenciesAndIndices.fill(0)
sortedFrequencies.fill(0)
#: Array(Int32) = Array(Int32).new(alphabetSize, 0i32)
#sortedFrequencies : Array(Int32) = Array(Int32).new(alphabetSize, 0i32)
# The Huffman allocator needs its input symbol frequencies to be sorted,
# but we need to return code lengths in the same order as the
# corresponding frequencies are passed in.
# The symbol frequency and index are merged into a single array of
# integers - frequency in the high 23 bits, index in the low 9 bits.
#
# * 2^23 = 8,388,608 which is higher than the maximum possible frequency
# for one symbol in a block.
# * 2^9 = 512 which is higher than the maximum possible alphabet size
# (== 258).
#
# Sorting this array simultaneously sorts the frequencies and leaves a
# lookup that can be used to cheaply invert the sort.
alphabetSize.times do |i|
mergedFrequenciesAndIndices[i] = (symbolFrequencies[index][i] << 9) | i
end
mergedFrequenciesAndIndices.sort!
alphabetSize.times do |i|
sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >> 9
end
# Allocate code lengths - the allocation is in place, so the code lengths
# will be in the sortedFrequencies array afterwards.
HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, ENCODE_MAX_CODE_LENGTH)
# Reverse the sort to place the code lengths in the same order as the
# symbols whose frequencies were passed in.
alphabetSize.times do |i|
codeLengths[index][mergedFrequenciesAndIndices[i] & 0x1FF] = sortedFrequencies[i]
end
end
end
end
# Generate initial Huffman code length tables, giving each table a different
# low cost section of the alphabet that is roughly equal in overall
# cumulative frequency. Note that the initial tables are invalid for actual
# Huffman code generation, and only serve as the seed for later iterative
# optimization in `#optimizeSelectorsAndHuffmanTables`.
private def generateHuffmanOptimisationSeeds : Nil
totalTables : Int32 = @huffmanCodeLengths.size
remainingLength : Int32 = @mtfLength
lowCostEnd : Int32 = -1
targetCumulativeFrequency : Int32 = 0
lowCostStart : Int32 = 0
actualCumulativeFrequency : Int32 = 0
totalTables.times do |i|
targetCumulativeFrequency = remainingLength.tdiv(totalTables - i)
lowCostStart = lowCostEnd + 1
actualCumulativeFrequency = 0
while actualCumulativeFrequency < targetCumulativeFrequency && lowCostEnd < (@mtfAlphabetSize - 1)
actualCumulativeFrequency += @mtfSymbolFrequencies[lowCostEnd += 1]
end
if lowCostEnd > lowCostStart && i != 0 && i != (totalTables - 1) && ((totalTables - i) & 1) == 0
actualCumulativeFrequency -= @mtfSymbolFrequencies[lowCostEnd]
lowCostEnd -= 1
end
@mtfAlphabetSize.times do |j|
if j < lowCostStart || j > lowCostEnd
@huffmanCodeLengths[i][j] = HIGH_SYMBOL_COST
end
end
remainingLength -= actualCumulativeFrequency
end
end
# Co-optimise the selector list and the alternative Huffman table code
# lengths. This method is called repeatedly in the hope that the total
# encoded size of the selectors, the Huffman code lengths and the block data
# encoded with them will converge towards a minimum.
#
# If the data is highly incompressible, it is possible that the total
# encoded size will instead diverge (increase) slightly.
#
# if `storeSelectors?` is `true`, then this will write out the (final)
# chosen selectors.
private def optimizeSelectorsAndHuffmanTables(storeSelectors? : Bool) : Nil
totalTables : Int32 = @huffmanCodeLengths.size
tableFrequencies : Array(Array(Int32)) = Array(Array(Int32)).new(totalTables) do |_|
Array(Int32).new(@mtfAlphabetSize, 0i32)
end
selectorIndex : Int32 = 0
groupStart : Int32 = 0
groupEnd : Int32 = 0
bestTable : UInt8 = 0
bestCost : Int32 = 0
tableCost : Int32 = 0
value : Int32 = 0
@int32Pool.withRentedArray(totalTables) do |cost|
cost.fill(0)
# Find the best table for each group of 50 block bytes based on the
# current Huffman code lengths.
while groupStart < @mtfLength
groupEnd = Math.min(groupStart + GROUP_RUN_LENGTH, @mtfLength) - 1
# Calculate the cost of this group when encoded by each table.
cost.fill(0)
groupStart.upto(groupEnd) do |i|
value = @mtfBlock.unsafe_fetch(i).to_i32!
totalTables.times do |j|
# We can guarantee the j subscript
cost.unsafe_put(j, cost.unsafe_fetch(j) + @huffmanCodeLengths.unsafe_fetch(j).unsafe_fetch(value))
end
end
# Find the table with the least cost for this group.
bestTable = 0
bestCost = cost[0]
(1...totalTables).each do |i|
tableCost = cost.unsafe_fetch(i)
if tableCost < bestCost
bestCost = tableCost
bestTable = i.to_u8!
end
end
# Accumulate symbol frequencies for the table chosen for this block.
x : Int32 = 0
groupStart.upto(groupEnd) do |i|
x = tableFrequencies.unsafe_fetch(bestTable).unsafe_fetch(@mtfBlock.unsafe_fetch(i)) + 1
tableFrequencies.unsafe_fetch(bestTable).unsafe_put(@mtfBlock.unsafe_fetch(i), x)
end
# Store a selector indicating the table chosen for this block.
if storeSelectors?
@selectors.unsafe_put(selectorIndex, bestTable)
selectorIndex += 1
end
groupStart = groupEnd + 1
end
# Generate new Huffman code lengths based on the frequencies for each
# table accumulated in this iteration.
totalTables.times do |i|
generateHuffmanCodeLengths(@mtfAlphabetSize, tableFrequencies, @huffmanCodeLengths, i)
end
end
end
# Assigns Canonical Huffman codes based on the calculated lengths.
private def assignHuffmanCodeSymbols : Nil
totalTables = @huffmanCodeLengths.size
maxLen : Int32 = 0
minLen : Int32 = 0
len : Int32 = 0
code : Int32 = 0
totalTables.times do |i|
maxLen = 0
minLen = 32
@mtfAlphabetSize.times do |j|
len = @huffmanCodeLengths[i][j]
maxLen = len if len > maxLen
minLen = len if len < minLen
end
code = 0
(minLen..maxLen).each do |j|
@mtfAlphabetSize.times do |k|
if (@huffmanCodeLengths[i][k] & 0xFF) == j
@huffmanMergedCodeSymbols[i][k] = (j << 24) | code
code += 1
end
end
code <<= 1
end
end
end
# Write out the selector list and Huffman tables.
private def writeSelectorsAndHuffmanTables : Nil
totalSelectors : Int32 = @selectors.size
totalTables : Int32 = @huffmanCodeLengths.size
@output.writeBits(3, totalTables)
@output.writeBits(15, totalSelectors)
# Write the selectors.
selectorMTF : MoveToFront = MoveToFront.new
totalSelectors.times do |i|
@output.writeUnary(selectorMTF.valueToFront(@selectors[i]))
end
# Write the Huffman tables.
currentLength : Int32 = 0
codeLength : Int32 = 0
value : UInt32 = 0
delta : Int32 = 0
totalTables.times do |i|
currentLength = @huffmanCodeLengths[i][0]
@output.writeBits(5, currentLength)
@mtfAlphabetSize.times do |j|
codeLength = @huffmanCodeLengths[i][j]
value = (currentLength < codeLength) ? 2u32 : 3u32
delta = (codeLength - currentLength).abs
@output.io.flush
while delta > 0
@output.writeBits(2, value)
delta -= 1
end
@output.writeBoolean(false)
currentLength = codeLength
end
end
end
# Writes out the encoded block data.
private def writeBlockData : Nil
selectorIndex : Int32 = 0
groupEnd : Int32 = 0
index : Int32 = 0
mergedCodeSymbol : Int32 = 0
mtfIndex : Int32 = 0
while mtfIndex < @mtfLength
groupEnd = Math.min(mtfIndex + GROUP_RUN_LENGTH, @mtfLength) - 1
index = @selectors[selectorIndex].to_i32!
selectorIndex += 1
while mtfIndex <= groupEnd
mergedCodeSymbol = @huffmanMergedCodeSymbols[index][@mtfBlock[mtfIndex]]
mtfIndex += 1
@output.writeBits(mergedCodeSymbol >> 24, mergedCodeSymbol.to_u32!)
end
end
end
end
end