Login
Artifact [2691ff9b2c]
Login

Artifact 2691ff9b2ca5a9f55f5bd292b392d7f5d7a37acb8fd2f6ec9b6a3d67b5dc95d5:


#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (c) 2022 drone1400
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Modified by drone1400
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "./bitwriter"
require "./crc32"

module RemiLib::Compression::BZip2
  # Compresses and writes a single BZip2 block.
  #
  # Block encoding consists of the following stages:
  #
  # 1. Run-Length Encoding[1] - `#write`
  # 2. Burrows Wheeler Transform - `close` (through `DivSufSort`)
  # 3. Write block header - `#close`
  # 4. Move To Front Transform - `#close` (through `HuffmanStageEncoder`)
  # 5. Run-Length Encoding[2] - `#close`  (through `HuffmanStageEncoder`)
  # 6. Create and write Huffman tables - `#close` (through `HuffmanStageEncoder`)
  # 7. Huffman encode and write data - `#close` (through `HuffmanStageEncoder`)
  private class BlockCompressor
    # The stream to which compressed BZip2 data is written.
    @output : BitWriter

    # CRC builder for the block.
    @crc : CRC32 = CRC32.new

    # The RLE'd block data.
    @block : Bytes

    # Current length of the data within the block array.
    @blockLength : Int32 = 0

    # A limit beyond which new data will not be accepted into the block.
    @blockLengthLimit : Int32

    # The values that are present within the RLE'd block data. For each index,
    # true if that value is present within the data, otherwise false
    @blockValuesPresent : Array(Bool) = Array(Bool).new(256, false)

    # The Burrows Wheeler Transformed block data.
    @bwtBlock : Array(Int32)

    # The current RLE value being accumulated (undefined when rleLength is 0).
    @rleCurrentValue : Int32 = -1

    # The repeat count of the current RLE value.
    @rleLength : Int32 = 0

    @closed : Bool = false

    # Creates a new `BlockCompressor` instance that will write compressed data
    # to `output`.
    #
    # The `blockSize` parameter is the declared block size in bytes. Up to this
    # many bytes will be accepted into the block after run-length encoding is
    # applied.
    def initialize(@output : BitWriter, blockSize : Int, @int32Pool : ArrayPool(Int32))
      # One extra byte is added to allow for the block wrap applied in #close
      @block = Bytes.new(blockSize + 1)
      @bwtBlock = @int32Pool.rent(blockSize + 1)
      @bwtBlock.fill(0)

      # 5 bytes for one RLE run plus one byte - see `#write(UInt8)`.
      @blockLengthLimit = blockSize - 6
    end

    # Returns `true` if any bytes have been written to the block, or `false`
    # otherwise.
    def empty? : Bool
      @blockLength == 0 && @rleLength == 0
    end

    # Returns the CRC of the completed block. Only valid after calling `#close`.
    @[AlwaysInline]
    def crc : UInt32
      @crc.crc
    end

    # Writes a byte to the block, accumulating to an RLE run where possible.
    def write(value : UInt8) : Bool
      return false if @blockLength > @blockLengthLimit

      if @rleLength == 0
        @rleCurrentValue = value.to_i32!
        @rleLength = 1
      elsif @rleCurrentValue != value
        # This path commits us to write 6 bytes - one RLE run (5 bytes) plus one
        # extra.
        writeRun(@rleCurrentValue.to_u8!, @rleLength)
        @rleCurrentValue = value.to_i32!
        @rleLength = 1
      else
        if @rleLength == 254
          writeRun(@rleCurrentValue.to_u8!, 255)
          @rleLength = 0
        else
          @rleLength += 1
        end
      end

      true
    end

    # Writes `slice` to the block.  This returns the first index of `slice` that
    # was not updated, which may be zero if the block is already full.
    @[AlwaysInline]
    def write(slice : Bytes) : Int32
      pos : Int32 = 0
      while pos < slice.size
        break unless write(slice[pos])
        pos += 1
      end
      pos
    end

    # Compresses and writes out the block.
    def closeBlock : Nil
      raise "Attempted to close a block twice" if @closed
      # If an RLE run is in progress, write it out.
      if @rleLength > 0
        writeRun(@rleCurrentValue.to_u8!, @rleLength)
      end

      # Apply a one byte block wrap required by the BWT implementation.
      @block[@blockLength] = @block[0]

      # Perform the Burrows-Wheeler Transform.
      transformer : DivSufSort = DivSufSort.new(@block, @bwtBlock, @blockLength, @int32Pool)
      bwtStartPointer : Int32 = transformer.bwt

      # Write out the block header.
      @output.writeBits(24, BLOCK_HEADER_MARKER_1)
      @output.writeBits(24, BLOCK_HEADER_MARKER_2)
      @output.writeInt32(@crc.crc)
      @output.writeBoolean(false) # Randomised block flag. We never create randomised blocks.
      @output.writeBits(24, bwtStartPointer.to_u32!)

      # Write out the symbol map.
      writeSymbolMap

      # Perform the Move-To-Front Transform and run-length encoding[2] stages.
      mtf = MtfAndRle2StageEncoder.new(@bwtBlock, @blockLength, @blockValuesPresent)
      mtf.encode

      # Perform the Huffman Encoding stage and write out the encoded data.
      huffman = HuffmanStageEncoder.new(@output, mtf.mtfBlock, mtf.mtfLength, mtf.mtfAlphabetSize,
                                        mtf.mtfSymbolFrequencies, @int32Pool)
      huffman.encode

      @int32Pool.return(@bwtBlock)
    end

    # Write the Huffman symbol to output byte map.
    private def writeSymbolMap : Nil
      condensedInUse = Slice(Bool).new(16, false)
      j : Int32 = 0
      k : Int32 = 0

      16.times do |i|
        j = 0
        k = i << 4
        while j < 16
          condensedInUse[i] = true if @blockValuesPresent[k]
          j += 1
          k += 1
        end
      end

      16.times { |i| @output.writeBoolean(condensedInUse[i]) }

      16.times do |i|
        if condensedInUse[i]
          j = 0
          k = i * 16
          while j < 16
            @output.writeBoolean(@blockValuesPresent[k])
            j += 1
            k += 1
          end
        end
      end
    end

    # Writes an RLE run to the block array, updating the block CRC and present
    # values array as required.
    @[AlwaysInline]
    private def writeRun(value : UInt8, runLength : Int) : Nil
      @blockValuesPresent[value] = true
      @crc.update(value.to_u8!, runLength)

      case runLength
      when 1
        @block[@blockLength] = value
        @blockLength += 1

      when 2
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @blockLength += 2

      when 3
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @block[@blockLength + 2] = value
        @blockLength += 3

      else
        runLength -= 4
        @blockValuesPresent[runLength] = true
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @block[@blockLength + 2] = value
        @block[@blockLength + 3] = value
        @block[@blockLength + 4] = runLength.to_u8!
        @blockLength += 5
      end
    end
  end
end