1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
| #### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (c) 2022 drone1400
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Modified by drone1400
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "./bitwriter"
module RemiLib::Compression::BZip2
private class HuffmanStageEncoder
# The `IO` to which the Huffman tables and data are written.
@output : BitWriter
# The output of the Move To Front Transform and Run Length Encoding[2]
# stages.
@mtfBlock : Array(UInt16)
# The actual number of values contained in the mtfBlock array.
@mtfLength : Int32
# The number of unique values in the `@mtfBlock` array.
@mtfAlphabetSize : Int32
# The global frequencies of values within the `@mtfBlock` array.
@mtfSymbolFrequencies : Array(Int32)
# The canonical Huffman code lengths for each table.
@huffmanCodeLengths : Array(Array(Int32))
# Merged code symbols for each table. The value at each position is
# `((code length << 24) | code)`.
@huffmanMergedCodeSymbols : Array(Array(Int32))
# The selectors for each segment.
@selectors : Bytes
# Creates a new `HuffmanStageEncoder` instance.
#
# * `output`: The `BitWriter` to write the results to.
# * `mtfBlock`: The MTF block data.
# * `mtfLength`: The actual length of the MTF block.
# * `mtfAlphabetSize`: The size of the MTF block's alphabet.
# * `mtfSymbolFrequencies`: The frequencies of the MTF block's symbols.
def initialize(@output : BitWriter, @mtfBlock : Array(UInt16), @mtfLength : Int32, @mtfAlphabetSize : Int32,
@mtfSymbolFrequencies : Array(Int32), @int32Pool : ArrayPool(Int32))
totalTables = selectTableCount(@mtfLength)
@huffmanCodeLengths = Array(Array(Int32)).new(totalTables) { |_| Array(Int32).new(@mtfAlphabetSize, 0i32) }
@huffmanMergedCodeSymbols = Array(Array(Int32)).new(totalTables) { |_| Array(Int32).new(@mtfAlphabetSize, 0i32) }
@selectors = Bytes.new((@mtfLength + GROUP_RUN_LENGTH - 1).tdiv(GROUP_RUN_LENGTH))
end
# Encodes and writes the block data.
def encode : Nil
# Create optimised selector list and Huffman tables.
generateHuffmanOptimisationSeeds
3.downto(0) do |i|
optimizeSelectorsAndHuffmanTables(i == 0)
end
assignHuffmanCodeSymbols
# Write out the tables and the block data encoded with them.
writeSelectorsAndHuffmanTables
writeBlockData
end
# Selects an appropriate table count for a given MTF length.
@[AlwaysInline]
protected def selectTableCount(len : Int32) : Int32
case
when len >= 2400 then 6
when len >= 1200 then 5
when len >= 600 then 4
when len >= 200 then 3
else 2
end
end
# Generate a Huffman code length table for a given list of symbol
# frequencies.
protected def generateHuffmanCodeLengths(alphabetSize : Int32,
symbolFrequencies : Array(Array(Int32)),
codeLengths : Array(Array(Int32)),
index : Int32) : Nil
@int32Pool.withRentedArray(alphabetSize) do |mergedFrequenciesAndIndices|
@int32Pool.withRentedArray(alphabetSize) do |sortedFrequencies|
mergedFrequenciesAndIndices.fill(0)
sortedFrequencies.fill(0)
#: Array(Int32) = Array(Int32).new(alphabetSize, 0i32)
#sortedFrequencies : Array(Int32) = Array(Int32).new(alphabetSize, 0i32)
# The Huffman allocator needs its input symbol frequencies to be sorted,
# but we need to return code lengths in the same order as the
# corresponding frequencies are passed in.
# The symbol frequency and index are merged into a single array of
# integers - frequency in the high 23 bits, index in the low 9 bits.
#
# * 2^23 = 8,388,608 which is higher than the maximum possible frequency
# for one symbol in a block.
# * 2^9 = 512 which is higher than the maximum possible alphabet size
# (== 258).
#
# Sorting this array simultaneously sorts the frequencies and leaves a
# lookup that can be used to cheaply invert the sort.
alphabetSize.times do |i|
mergedFrequenciesAndIndices[i] = (symbolFrequencies[index][i] << 9) | i
end
mergedFrequenciesAndIndices.sort!
alphabetSize.times do |i|
sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >> 9
end
# Allocate code lengths - the allocation is in place, so the code lengths
# will be in the sortedFrequencies array afterwards.
HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, ENCODE_MAX_CODE_LENGTH)
# Reverse the sort to place the code lengths in the same order as the
# symbols whose frequencies were passed in.
alphabetSize.times do |i|
codeLengths[index][mergedFrequenciesAndIndices[i] & 0x1FF] = sortedFrequencies[i]
end
end
end
end
# Generate initial Huffman code length tables, giving each table a different
# low cost section of the alphabet that is roughly equal in overall
# cumulative frequency. Note that the initial tables are invalid for actual
# Huffman code generation, and only serve as the seed for later iterative
# optimization in `#optimizeSelectorsAndHuffmanTables`.
private def generateHuffmanOptimisationSeeds : Nil
totalTables : Int32 = @huffmanCodeLengths.size
remainingLength : Int32 = @mtfLength
lowCostEnd : Int32 = -1
targetCumulativeFrequency : Int32 = 0
lowCostStart : Int32 = 0
actualCumulativeFrequency : Int32 = 0
totalTables.times do |i|
targetCumulativeFrequency = remainingLength.tdiv(totalTables - i)
lowCostStart = lowCostEnd + 1
actualCumulativeFrequency = 0
while actualCumulativeFrequency < targetCumulativeFrequency && lowCostEnd < (@mtfAlphabetSize - 1)
actualCumulativeFrequency += @mtfSymbolFrequencies[lowCostEnd += 1]
end
if lowCostEnd > lowCostStart && i != 0 && i != (totalTables - 1) && ((totalTables - i) & 1) == 0
actualCumulativeFrequency -= @mtfSymbolFrequencies[lowCostEnd]
lowCostEnd -= 1
end
@mtfAlphabetSize.times do |j|
if j < lowCostStart || j > lowCostEnd
@huffmanCodeLengths[i][j] = HIGH_SYMBOL_COST
end
end
remainingLength -= actualCumulativeFrequency
end
end
# Co-optimise the selector list and the alternative Huffman table code
# lengths. This method is called repeatedly in the hope that the total
# encoded size of the selectors, the Huffman code lengths and the block data
# encoded with them will converge towards a minimum.
#
# If the data is highly incompressible, it is possible that the total
# encoded size will instead diverge (increase) slightly.
#
# if `storeSelectors?` is `true`, then this will write out the (final)
# chosen selectors.
private def optimizeSelectorsAndHuffmanTables(storeSelectors? : Bool) : Nil
totalTables : Int32 = @huffmanCodeLengths.size
tableFrequencies : Array(Array(Int32)) = Array(Array(Int32)).new(totalTables) do |_|
Array(Int32).new(@mtfAlphabetSize, 0i32)
end
selectorIndex : Int32 = 0
groupStart : Int32 = 0
groupEnd : Int32 = 0
bestTable : UInt8 = 0
bestCost : Int32 = 0
tableCost : Int32 = 0
value : Int32 = 0
@int32Pool.withRentedArray(totalTables) do |cost|
cost.fill(0)
# Find the best table for each group of 50 block bytes based on the
# current Huffman code lengths.
while groupStart < @mtfLength
groupEnd = Math.min(groupStart + GROUP_RUN_LENGTH, @mtfLength) - 1
# Calculate the cost of this group when encoded by each table.
cost.fill(0)
groupStart.upto(groupEnd) do |i|
value = @mtfBlock.unsafe_fetch(i).to_i32!
totalTables.times do |j|
# We can guarantee the j subscript
cost.unsafe_put(j, cost.unsafe_fetch(j) + @huffmanCodeLengths.unsafe_fetch(j).unsafe_fetch(value))
end
end
# Find the table with the least cost for this group.
bestTable = 0
bestCost = cost[0]
(1...totalTables).each do |i|
tableCost = cost.unsafe_fetch(i)
if tableCost < bestCost
bestCost = tableCost
bestTable = i.to_u8!
end
end
# Accumulate symbol frequencies for the table chosen for this block.
x : Int32 = 0
groupStart.upto(groupEnd) do |i|
x = tableFrequencies.unsafe_fetch(bestTable).unsafe_fetch(@mtfBlock.unsafe_fetch(i)) + 1
tableFrequencies.unsafe_fetch(bestTable).unsafe_put(@mtfBlock.unsafe_fetch(i), x)
end
# Store a selector indicating the table chosen for this block.
if storeSelectors?
@selectors.unsafe_put(selectorIndex, bestTable)
selectorIndex += 1
end
groupStart = groupEnd + 1
end
# Generate new Huffman code lengths based on the frequencies for each
# table accumulated in this iteration.
totalTables.times do |i|
generateHuffmanCodeLengths(@mtfAlphabetSize, tableFrequencies, @huffmanCodeLengths, i)
end
end
end
# Assigns Canonical Huffman codes based on the calculated lengths.
private def assignHuffmanCodeSymbols : Nil
totalTables = @huffmanCodeLengths.size
maxLen : Int32 = 0
minLen : Int32 = 0
len : Int32 = 0
code : Int32 = 0
totalTables.times do |i|
maxLen = 0
minLen = 32
@mtfAlphabetSize.times do |j|
len = @huffmanCodeLengths[i][j]
maxLen = len if len > maxLen
minLen = len if len < minLen
end
code = 0
(minLen..maxLen).each do |j|
@mtfAlphabetSize.times do |k|
if (@huffmanCodeLengths[i][k] & 0xFF) == j
@huffmanMergedCodeSymbols[i][k] = (j << 24) | code
code += 1
end
end
code <<= 1
end
end
end
# Write out the selector list and Huffman tables.
private def writeSelectorsAndHuffmanTables : Nil
totalSelectors : Int32 = @selectors.size
totalTables : Int32 = @huffmanCodeLengths.size
@output.writeBits(3, totalTables)
@output.writeBits(15, totalSelectors)
# Write the selectors.
selectorMTF : MoveToFront = MoveToFront.new
totalSelectors.times do |i|
@output.writeUnary(selectorMTF.valueToFront(@selectors[i]))
end
# Write the Huffman tables.
currentLength : Int32 = 0
codeLength : Int32 = 0
value : UInt32 = 0
delta : Int32 = 0
totalTables.times do |i|
currentLength = @huffmanCodeLengths[i][0]
@output.writeBits(5, currentLength)
@mtfAlphabetSize.times do |j|
codeLength = @huffmanCodeLengths[i][j]
value = (currentLength < codeLength) ? 2u32 : 3u32
delta = (codeLength - currentLength).abs
@output.io.flush
while delta > 0
@output.writeBits(2, value)
delta -= 1
end
@output.writeBoolean(false)
currentLength = codeLength
end
end
end
# Writes out the encoded block data.
private def writeBlockData : Nil
selectorIndex : Int32 = 0
groupEnd : Int32 = 0
index : Int32 = 0
mergedCodeSymbol : Int32 = 0
mtfIndex : Int32 = 0
while mtfIndex < @mtfLength
groupEnd = Math.min(mtfIndex + GROUP_RUN_LENGTH, @mtfLength) - 1
index = @selectors[selectorIndex].to_i32!
selectorIndex += 1
while mtfIndex <= groupEnd
mergedCodeSymbol = @huffmanMergedCodeSymbols[index][@mtfBlock[mtfIndex]]
mtfIndex += 1
@output.writeBits(mergedCodeSymbol >> 24, mergedCodeSymbol.to_u32!)
end
end
end
end
end
|