Login
Artifact [6317d09f8b]
Login

Artifact 6317d09f8b2ec087e5f56d92a78cad1b660184ca0ccc0c72229ae8de7a202584:


#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2

module RemiLib::Compression::BZip2
  # An encoder for the BZip2 Move To Front Transform and Run-Length Encoding[2]
  # stages.
  #
  # Although conceptually these two stages are separate, it is computationally
  # efficient to perform them in one pass.
  private class MtfAndRle2StageEncoder
    # The Burrows-Wheeler transformed block.
    @bwtBlock : Array(Int32)

    # Actual length of the data in the bwtBlock array.
    @bwtLength : Int32

    # At each position, true if the byte value with that index is present within
    # the block, otherwise false.
    @bwtValuesInUse : Array(Bool)

    # The output of the Move To Front Transform and Run-Length Encoding[2]
    # stages.
    getter mtfBlock : Array(UInt16)

    # The global frequencies of values within the mtfBlock array.
    getter mtfSymbolFrequencies : Array(Int32) = Array(Int32).new(MAX_ALPHABET_SIZE, 0i32)

    # Gets The actual length of the MTF block.
    getter mtfLength : Int32 = 0

    # Gets the size of the MTF block's alphabet.
    getter mtfAlphabetSize : Int32 = 0

    # Creates a new `MtfAndRle2StageEncoder` instance.
    #
    # * `bwtBlock` is the Burrows-Wheeler Transformed block data.
    # * `bwtLength` is the actual length of the BWT data.
    # * `bwtValuesInUse` is the values that are present within the BWT data. For
    #   each index, `true` if that value is present within the data, otherwise
    #   `false`.
    def initialize(@bwtBlock : Array(Int32), @bwtLength : Int32, @bwtValuesInUse : Array(Bool))
      @mtfBlock = Array(UInt16).new(@bwtLength + 1, 0u16)
    end

    # Performs the Move To Front transform and Run Length Encoding[1] stages.
    def encode : Nil
      huffmanSymbolMap = StaticArray(UInt8, 256).new(0u8)
      symbolMTF : MoveToFront = MoveToFront.new

      totalUniqueValues : Int32 = 0
      256.times do |i|
        if @bwtValuesInUse[i]
          huffmanSymbolMap[i] = totalUniqueValues.to_u8!
          totalUniqueValues += 1
        end
      end

      endOfBlockSymbol : Int32 = totalUniqueValues + 1
      mtfIndex : Int32 = 0
      repeatCount : Int32 = 0
      totalRunAs : Int32 = 0
      totalRunBs : Int32 = 0
      mtfPosition : Int32 = 0

      @bwtLength.times do |i|
        # Move-To-Front
        mtfPosition = symbolMTF.valueToFront(huffmanSymbolMap[@bwtBlock.unsafe_fetch(i) & 0xFF])

        # Run-length Encoding
        if mtfPosition == 0
          repeatCount += 1
        else
          if repeatCount > 0
            repeatCount -= 1

            loop do
              if (repeatCount & 1) == 0
                @mtfBlock.unsafe_put(mtfIndex, RLE_SYMBOL_RUNA.to_u16!)
                mtfIndex += 1
                totalRunAs += 1
              else
                @mtfBlock.unsafe_put(mtfIndex, RLE_SYMBOL_RUNB.to_u16!)
                mtfIndex += 1
                totalRunBs += 1
              end

              break if repeatCount <= 1
              repeatCount = (repeatCount - 2) >> 1
            end

            repeatCount = 0
          end
          @mtfBlock.unsafe_put(mtfIndex, (mtfPosition + 1).to_u16!)
          mtfIndex += 1
          @mtfSymbolFrequencies.unsafe_put(mtfPosition + 1,
                                           @mtfSymbolFrequencies.unsafe_fetch(mtfPosition + 1) + 1)
        end
      end

      if repeatCount > 0
        repeatCount -= 1
        loop do
          if (repeatCount & 1) == 0
            @mtfBlock.unsafe_put(mtfIndex, RLE_SYMBOL_RUNA.to_u16!)
            mtfIndex += 1
            totalRunAs += 1
          else
            @mtfBlock.unsafe_put(mtfIndex, RLE_SYMBOL_RUNB.to_u16!)
            mtfIndex += 1
            totalRunBs += 1
          end

          break if repeatCount <= 1
          repeatCount = (repeatCount - 2) >> 1
        end
      end

      @mtfBlock.unsafe_put(mtfIndex, endOfBlockSymbol.to_u16!)

      @mtfSymbolFrequencies.unsafe_put(endOfBlockSymbol,
                                       @mtfSymbolFrequencies.unsafe_fetch(endOfBlockSymbol) + 1)
      @mtfSymbolFrequencies.unsafe_put(RLE_SYMBOL_RUNA,
                                       @mtfSymbolFrequencies.unsafe_fetch(RLE_SYMBOL_RUNA) + totalRunAs)
      @mtfSymbolFrequencies.unsafe_put(RLE_SYMBOL_RUNB,
                                       @mtfSymbolFrequencies.unsafe_fetch(RLE_SYMBOL_RUNB) + totalRunBs)

      @mtfLength = mtfIndex + 1
      @mtfAlphabetSize = endOfBlockSymbol + 1
    end
  end
end