libremiliacr: blockdecompressor.cr at tip

File src/remilib/compression/bzip/blockdecompressor.cr from the latest check-in

#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "../../bitreader"
require "./crc32"

module RemiLib::Compression::BZip2
  # Reads and decompresses a single BZip2 block.
  #
  # Block decoding consists of the following stages:
  # 1. Read block header
  # 2. Read Huffman tables
  # 3. Read and decode Huffman encoded data
  # 4. Run-Length Decoding[2]
  # 5. Inverse Move To Front Transform
  # 6. Inverse Burrows Wheeler Transform
  # 7. Run-Length Decoding[1]
  # 8. Optional Block De-Randomization
  private class BlockDecompressor
    @input : BitReader

    # Used to calculate this block's CRC for the fully decoded data.
    @crc : CRC32 = CRC32.new

    # The CRC of the current block as read from the block header.
    @blockCRC : UInt32

    # `true` if the current block is randomised, or `false` otherwise.
    @blockRandomized : Bool

    # The end-of-block Huffman symbol. Decoding of the block ends when this is
    # encountered.
    @huffmanEndOfBlockSymbol : Int32 = 0

    # A map from Huffman symbol index to output character.  Some types of data
    # (e.g. ASCII text) may contain only a limited number of byte values;
    # Huffman symbols are only allocated to those values that actually occur in
    # the uncompressed data.
    @huffmanSymbolMap : Bytes = Bytes.new(256)

    # Counts of each byte value within the BWT-transformed array data. Collected
    # at the Move To Front stage, consumed by the Inverse Burrows Wheeler
    # Transform stage.
    @bwtByteCounts : Array(Int32) = Array(Int32).new(256, 0i32)

    # The Burrows-Wheeler Transform processed data.  Read at the Move To Front
    # stage, consumed by the Inverse Burrows Wheeler Transform stage.
    @bwtBlock : Bytes

    # At each position contains the union of :
    #
    # * An output character (8 bits).
    # * A pointer from each position to its successor (24 bits, left shifted 8 bits).
    #
    # As the pointer cannot exceed the maximum block size of 900k, 24 bits is
    # more than enough to hold it; Folding the character data into the spare
    # bits while performing the inverse BWT, when both pieces of information are
    # available, saves a large number of memory accesses in the final decoding
    # stages.
    @bwtMergedPointers : Array(Int32) = [] of Int32

    # The current merged pointer into the Burrow-Wheeler Transform array.
    @bwtCurrentMergedPointer : Int32 = 0

    # The actual length in bytes of the current block at the Inverse Burrows
    # Wheeler Transform stage (before final Run-Length Decoding).
    @bwtBlockLength : Int32 = 0

    # The number of output bytes that have been decoded up to the Inverse
    # Burrows Wheeler Transform stage.
    @bwtBytesDecoded : Int32 = 0

    # The most recently RLE decoded byte.
    @rleLastDecodedByte : Int16 = -1i16

    # The number of previous identical output bytes decoded. After 4 identical
    # bytes, the next byte decoded is an RLE repeat count.
    @rleAccumulator : Int32 = 0

    # The RLE repeat count of the current decoded byte. When this reaches zero,
    # a new byte is decoded.
    @rleRepeat : UInt16 = 0

    # If the current block is randomised, the position within the RNUMS
    # randomisation array.
    @randomIndex : Int32 = 0

    # If the current block is randomised, the remaining count at the current
    # RNUMS position.
    @randomCount : Int32 = RNUMS[0] - 1

    # Creates a new `BlockDecompressor` instance that will read bits from
    # *input*.
    def initialize(@input : BitReader, blockSize : Int)
      @bwtBlock = Bytes.new(blockSize.to_u32!)

      # Read block header
      @blockCRC = @input.read(32).to_u32!
      @blockRandomized = @input.read(1) != 0
      bwtStartPointer : UInt32 = @input.read(24).to_u32!

      # Read block data and decode through to the Inverse Burrows Wheeler Transform stage.
      huffmanDecoder = readHuffmanTables
      decodeHuffmanData(huffmanDecoder)
      initializeInverseBWT(bwtStartPointer)
    end

    # Decodes a byte from the final Run-Length Encoding stage, pulling a new
    # byte from the Burrows-Wheeler Transform stage when required.  Returns -1
    # when there are no more bytes to decode.
    def read : Int16
      nextByte : UInt8 = 0

      while @rleRepeat < 1
        return -1i16 if @bwtBytesDecoded == @bwtBlockLength

        nextByte = decodeNextBWTByte
        if nextByte != @rleLastDecodedByte
          # New byte, restart accumulation
          @rleLastDecodedByte = nextByte.to_i16!
          @rleRepeat = 1
          @rleAccumulator = 1
          @crc.update(nextByte)
        else
          @rleAccumulator += 1
          if @rleAccumulator == 4
            # Accumulation complete, start repetition
            @rleRepeat = decodeNextBWTByte.to_u16! + 1
            @rleAccumulator = 0
            @crc.update(nextByte, @rleRepeat)
          else
            @rleRepeat = 1
            @crc.update(nextByte)
          end
        end
      end

      @rleRepeat -= 1
      @rleLastDecodedByte
    end

    # Decodes multiple bytes from the final Run-Length Encoding stage, pulling
    # new bytes from the Burrows-Wheeler Transform stage when required.  This
    # stores the results in *buf* and returns the index of the first element of
    # *buf* that was unmodified.
    @[AlwaysInline]
    def read(buf : Bytes) : Int32
      decoded : Int16 = -1i16
      buf.size.times do |i|
        decoded = read
        return i if decoded == -1
        buf.unsafe_put(i, decoded.to_u8!)
      end
      buf.size
    end

    # Verifies and returns the block's CRC.  This method must only be called
    # after all of the block's bytes have been read.
    @[AlwaysInline]
    def checkCrc : UInt32
      ret = @crc.crc
      #STDERR << ret << " == " << @blockCRC << '\n'
      raise Error.new("BZip2 block CRC error") if @blockCRC != ret
      ret
    end

    # Read and decode the block's Huffman tables.
    private def readHuffmanTables : HuffmanStageDecoder
      j : Int32 = 0
      k : Int32 = 0

      tableCodeLengths : Array(Array(UInt8)) = Array(Array(UInt8)).new(MAXIMUM_TABLES) do |_|
        Array(UInt8).new(MAX_ALPHABET_SIZE, 0u8)
      end

      # Read Huffman symbol to output byte map.
      huffmanUsedRanges : UInt32 = @input.read(16).to_u32!
      huffmanSymbolCount : Int32 = 0

      16.times do |i|
        next if (huffmanUsedRanges & ((1 << 15) >> i)) == 0
        j = 0
        k = i << 4
        while j < 16
          if @input.read(1) != 0
            @huffmanSymbolMap[huffmanSymbolCount] = k.to_u8!
            huffmanSymbolCount += 1
          end
          j += 1
          k += 1
        end
      end

      endOfBlockSymbol : Int32 = huffmanSymbolCount + 1
      @huffmanEndOfBlockSymbol = endOfBlockSymbol

      # Read total number of tables and selectors.
      totalTables = @input.read(3).to_u32!
      totalSelectors = @input.read(15).to_u32!
      if totalTables < MINIMUM_TABLES ||
         totalTables > MAXIMUM_TABLES ||
         totalSelectors < 1 ||
         totalSelectors > MAXIMUM_SELECTORS
        raise Error.new("BZip2 block's Huffman tables are invalid")
      end

      # Read and decode MTFed Huffman selector list.
      tableMTF = MoveToFront.new
      selectors = Bytes.new(totalSelectors)
      totalSelectors.times do |selector|
        selectors[selector] = tableMTF.indexToFront(@input.countOnes(discardFirstZero: true))
      end

      # Read the Canonical Huffman code lengths for each table.
      currentLength : Int32 = 0
      totalTables.times do |table|
        currentLength = @input.read(5).to_i32!

        0.upto(endOfBlockSymbol) do |i|
          until @input.read(1) == 0
            currentLength += (@input.read(1) != 0 ? -1 : 1)
          end
          tableCodeLengths[table][i] = currentLength.to_u8!
        end
      end

      HuffmanStageDecoder.new(@input, endOfBlockSymbol + 1, tableCodeLengths, selectors)
    end

    # Reads the Huffman encoded data from the input stream, performs Run-Length
    #Decoding and . applies the Move To Front transform to reconstruct the
    #Burrows-Wheeler Transform array.
    private def decodeHuffmanData(decoder : HuffmanStageDecoder) : Nil
      symbolMTF = MoveToFront.new
      blockLen : Int32 = 0
      repeatCount : Int32 = 0
      repeatInc : Int32 = 1
      mtfValue : UInt8 = 0
      nextSymbol : Int32 = 0
      nextByte : UInt8 = 0

      loop do
        nextSymbol = decoder.nextSymbol

        case nextSymbol
        when RLE_SYMBOL_RUNA
          repeatCount += repeatInc
          repeatInc <<= 1

        when RLE_SYMBOL_RUNB
          repeatCount += repeatInc << 1
          repeatInc <<= 1

        else
          nextByte = 0
          if repeatCount > 0
            if blockLen + repeatCount > @bwtBlock.size
              raise Error.new("BZip2 block size exceeds declared block size")
            end

            nextByte = @huffmanSymbolMap[mtfValue]
            @bwtByteCounts[nextByte] += repeatCount

            while (repeatCount -= 1) >= 0
              # Remi: Can be guaranteed because of the check above.
              @bwtBlock.unsafe_put(blockLen, nextByte)
              blockLen += 1
            end

            repeatCount = 0
            repeatInc = 1
          end

          break if nextSymbol == @huffmanEndOfBlockSymbol

          if blockLen >= @bwtBlock.size
            raise Error.new("BZip2 block size exceeds declared block size")
          end

          mtfValue = symbolMTF.indexToFront((nextSymbol - 1) & 0xFF)

          nextByte = @huffmanSymbolMap[mtfValue]
          @bwtByteCounts[nextByte] += 1
          @bwtBlock.unsafe_put(blockLen, nextByte) # Can be guaranteed because of the check above.
          blockLen += 1
        end
      end

      @bwtBlockLength = blockLen
    end

    # Sets up the Inverse Burrows-Wheeler Transform merged pointer array.
    private def initializeInverseBWT(bwtStartPointer : UInt32) : Nil
      mergedPointers = Array(Int32).new(@bwtBlockLength, 0i32)
      charBase = Array(Int32).new(256, 0i32)

      if bwtStartPointer < 0 || bwtStartPointer >= @bwtBlockLength
        raise Error.new("BZip2 start pointer is invalid")
      end

      # Cumulatise character counts
      charBase[1, 255] = @bwtByteCounts[0, 255]
      (2..255).each do |i|
        charBase.unsafe_put(i, charBase.unsafe_fetch(i) + charBase.unsafe_fetch(i - 1))
      end

      # Merged-Array Inverse Burrows-Wheeler Transform.  Combining the output
      # characters and forward pointers into a single array here, where we have
      # already read both of the corresponding values, cuts down on memory
      # accesses in the final walk through the array.
      value : UInt8 = 0
      @bwtBlockLength.times do |i|
        value = @bwtBlock[i]
        mergedPointers[charBase[value]] = (i << 8) + value
        charBase[value] += 1
      end

      @bwtBlock = Bytes.new(0)
      @bwtMergedPointers = mergedPointers
      @bwtCurrentMergedPointer = mergedPointers[bwtStartPointer]
    end

    # Decodes a byte from the Burrows-Wheeler Transform stage. If the block has
    # randomization applied, reverses the randomization.
    @[AlwaysInline]
    private def decodeNextBWTByte : UInt8
      nextDecodedByte : UInt8 = (@bwtCurrentMergedPointer & 0xFF).to_u8!
      @bwtCurrentMergedPointer = @bwtMergedPointers[@bwtCurrentMergedPointer >> 8]

      if @blockRandomized
        @randomCount -= 1
        if @randomCount == 0
          nextDecodedByte ^= 1
          @randomIndex = (@randomIndex + 1) % 512
          @randomCount = RNUMS[@randomIndex]
        end
      end

      @bwtBytesDecoded += 1
      nextDecodedByte
    end
  end
end