#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (c) 2022 drone1400
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Modified by drone1400
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "./bitwriter"
require "./crc32"
module RemiLib::Compression::BZip2
# Compresses and writes a single BZip2 block.
#
# Block encoding consists of the following stages:
#
# 1. Run-Length Encoding[1] - `#write`
# 2. Burrows Wheeler Transform - `close` (through `DivSufSort`)
# 3. Write block header - `#close`
# 4. Move To Front Transform - `#close` (through `HuffmanStageEncoder`)
# 5. Run-Length Encoding[2] - `#close` (through `HuffmanStageEncoder`)
# 6. Create and write Huffman tables - `#close` (through `HuffmanStageEncoder`)
# 7. Huffman encode and write data - `#close` (through `HuffmanStageEncoder`)
private class BlockCompressor
# The stream to which compressed BZip2 data is written.
@output : BitWriter
# CRC builder for the block.
@crc : CRC32 = CRC32.new
# The RLE'd block data.
@block : Bytes
# Current length of the data within the block array.
@blockLength : Int32 = 0
# A limit beyond which new data will not be accepted into the block.
@blockLengthLimit : Int32
# The values that are present within the RLE'd block data. For each index,
# true if that value is present within the data, otherwise false
@blockValuesPresent : Array(Bool) = Array(Bool).new(256, false)
# The Burrows Wheeler Transformed block data.
@bwtBlock : Array(Int32)
# The current RLE value being accumulated (undefined when rleLength is 0).
@rleCurrentValue : Int32 = -1
# The repeat count of the current RLE value.
@rleLength : Int32 = 0
@closed : Bool = false
# Creates a new `BlockCompressor` instance that will write compressed data
# to `output`.
#
# The `blockSize` parameter is the declared block size in bytes. Up to this
# many bytes will be accepted into the block after run-length encoding is
# applied.
def initialize(@output : BitWriter, blockSize : Int, @int32Pool : ArrayPool(Int32))
# One extra byte is added to allow for the block wrap applied in #close
@block = Bytes.new(blockSize + 1)
@bwtBlock = @int32Pool.rent(blockSize + 1)
@bwtBlock.fill(0)
# 5 bytes for one RLE run plus one byte - see `#write(UInt8)`.
@blockLengthLimit = blockSize - 6
end
# Returns `true` if any bytes have been written to the block, or `false`
# otherwise.
def empty? : Bool
@blockLength == 0 && @rleLength == 0
end
# Returns the CRC of the completed block. Only valid after calling `#close`.
@[AlwaysInline]
def crc : UInt32
@crc.crc
end
# Writes a byte to the block, accumulating to an RLE run where possible.
def write(value : UInt8) : Bool
return false if @blockLength > @blockLengthLimit
if @rleLength == 0
@rleCurrentValue = value.to_i32!
@rleLength = 1
elsif @rleCurrentValue != value
# This path commits us to write 6 bytes - one RLE run (5 bytes) plus one
# extra.
writeRun(@rleCurrentValue.to_u8!, @rleLength)
@rleCurrentValue = value.to_i32!
@rleLength = 1
else
if @rleLength == 254
writeRun(@rleCurrentValue.to_u8!, 255)
@rleLength = 0
else
@rleLength += 1
end
end
true
end
# Writes `slice` to the block. This returns the first index of `slice` that
# was not updated, which may be zero if the block is already full.
@[AlwaysInline]
def write(slice : Bytes) : Int32
pos : Int32 = 0
while pos < slice.size
break unless write(slice[pos])
pos += 1
end
pos
end
# Compresses and writes out the block.
def closeBlock : Nil
raise "Attempted to close a block twice" if @closed
# If an RLE run is in progress, write it out.
if @rleLength > 0
writeRun(@rleCurrentValue.to_u8!, @rleLength)
end
# Apply a one byte block wrap required by the BWT implementation.
@block[@blockLength] = @block[0]
# Perform the Burrows-Wheeler Transform.
transformer : DivSufSort = DivSufSort.new(@block, @bwtBlock, @blockLength, @int32Pool)
bwtStartPointer : Int32 = transformer.bwt
# Write out the block header.
@output.writeBits(24, BLOCK_HEADER_MARKER_1)
@output.writeBits(24, BLOCK_HEADER_MARKER_2)
@output.writeInt32(@crc.crc)
@output.writeBoolean(false) # Randomised block flag. We never create randomised blocks.
@output.writeBits(24, bwtStartPointer.to_u32!)
# Write out the symbol map.
writeSymbolMap
# Perform the Move-To-Front Transform and run-length encoding[2] stages.
mtf = MtfAndRle2StageEncoder.new(@bwtBlock, @blockLength, @blockValuesPresent)
mtf.encode
# Perform the Huffman Encoding stage and write out the encoded data.
huffman = HuffmanStageEncoder.new(@output, mtf.mtfBlock, mtf.mtfLength, mtf.mtfAlphabetSize,
mtf.mtfSymbolFrequencies, @int32Pool)
huffman.encode
@int32Pool.return(@bwtBlock)
end
# Write the Huffman symbol to output byte map.
private def writeSymbolMap : Nil
condensedInUse = Slice(Bool).new(16, false)
j : Int32 = 0
k : Int32 = 0
16.times do |i|
j = 0
k = i << 4
while j < 16
condensedInUse[i] = true if @blockValuesPresent[k]
j += 1
k += 1
end
end
16.times { |i| @output.writeBoolean(condensedInUse[i]) }
16.times do |i|
if condensedInUse[i]
j = 0
k = i * 16
while j < 16
@output.writeBoolean(@blockValuesPresent[k])
j += 1
k += 1
end
end
end
end
# Writes an RLE run to the block array, updating the block CRC and present
# values array as required.
@[AlwaysInline]
private def writeRun(value : UInt8, runLength : Int) : Nil
@blockValuesPresent[value] = true
@crc.update(value.to_u8!, runLength)
case runLength
when 1
@block[@blockLength] = value
@blockLength += 1
when 2
@block[@blockLength] = value
@block[@blockLength + 1] = value
@blockLength += 2
when 3
@block[@blockLength] = value
@block[@blockLength + 1] = value
@block[@blockLength + 2] = value
@blockLength += 3
else
runLength -= 4
@blockValuesPresent[runLength] = true
@block[@blockLength] = value
@block[@blockLength + 1] = value
@block[@blockLength + 2] = value
@block[@blockLength + 3] = value
@block[@blockLength + 4] = runLength.to_u8!
@blockLength += 5
end
end
end
end
|