Login
huffmanstagedecoder.cr at tip
Login

File src/remilib/compression/bzip/huffmanstagedecoder.cr from the latest check-in


     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
   100
   101
   102
   103
   104
   105
   106
   107
   108
   109
   110
   111
   112
   113
   114
   115
   116
   117
   118
   119
   120
   121
   122
   123
   124
   125
   126
   127
   128
   129
   130
   131
   132
   133
   134
   135
   136
   137
   138
   139
   140
   141
   142
   143
   144
   145
   146
   147
   148
   149
   150
   151
   152
   153
   154
   155
   156
   157
   158
   159
#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "../../bitreader"

module RemiLib::Compression::BZip2
  private class HuffmanStageDecoder
    @reader : BitReader

    # The Huffman table number to use for each group of 50 symbols.
    @selectors : Bytes

    # The minimum code length for each Huffman table.
    @minimumLengths : Array(Int32) = Array(Int32).new(MAXIMUM_TABLES , 0i32)

    # An array of values for each Huffman table that must be subtracted from the
    # numerical value of a Huffman code of a given bit length to give its
    # canonical code index.
    @codeBases : Array(Array(Int32))

    # An array of values for each Huffman table that gives the highest numerical
    # value of a Huffman code of a given bit length.
    @codeLimits : Array(Array(Int32))

    # A mapping for each Huffman table from canonical code index to output
    # symbol.
    @codeSymbols : Array(Array(Int32))

    # The Huffman table for the current group.
    @currentTable : UInt8

    # The index of the current group within the selectors array.
    @groupIndex : Int32 = -1

    # The byte position within the current group. A new group is selected every
    # 50 decoded bytes.
    @groupPosition : Int32 = -1

    # Creates a new `HuffmanStageDecoder` instance.
    def initialize(@reader : BitReader, alphabetSize : Int32, tableCodeLengths : Array(Array(UInt8)),
                   @selectors : Bytes)
      @codeBases = Array(Array(Int32)).new(MAXIMUM_TABLES) do |_|
        Array(Int32).new(MAX_CODE_LENGTH + 2, 0i32)
      end

      @codeLimits = Array(Array(Int32)).new(MAXIMUM_TABLES) do |_|
        Array(Int32).new(MAX_CODE_LENGTH + 1, 0i32)
      end

      @codeSymbols = Array(Array(Int32)).new(MAXIMUM_TABLES) do |_|
        Array(Int32).new(MAX_ALPHABET_SIZE, 0i32)
      end

      @currentTable = @selectors[0]
      createHuffmanDecodingTable(alphabetSize, tableCodeLengths)
    end

    # Decodes and returns the next symbol.  This will raise a `BZip2::Error` if
    # the end of the input stream is reached while decoding.
    def nextSymbol : Int32
      # Move to next group selector if required.
      @groupPosition += 1
      if @groupPosition % GROUP_RUN_LENGTH == 0
        @groupIndex += 1
        raise Error.new("Error decoding BZip2 block") if @groupIndex >= @selectors.size
        @currentTable = @selectors.unsafe_fetch(@groupIndex)
      end

      codeLength : Int32 = @minimumLengths[@currentTable]
      tableLimits = @codeLimits[@currentTable]

      # Starting with the minimum bit length for the table, read additional bits
      # one at a time until a complete code is recognised.
      codeBits : UInt32 = @reader.read(codeLength).to_u32!
      while codeLength <= MAX_CODE_LENGTH
        if codeBits <= tableLimits[codeLength]
          # Convert the code to a symbol index and return.
          return @codeSymbols[@currentTable][codeBits - @codeBases[@currentTable][codeLength]]
        end

        codeBits = (codeBits << 1) | @reader.read(1)
        codeLength += 1
      end

      # A valid code was not recognized.
      raise Error.new("Error decoding BZip2 block")
    end

    # Constructs Huffman decoding tables from lists of Canonical Huffman code
    # lengths.  `alphabetSize` is the total number of codes (uniform for each
    # table).  `tableCodeLengths` is the Canonical Huffman code lengths for each
    # table.
    private def createHuffmanDecodingTable(alphabetSize : Int32, tableCodeLengths : Array(Array(UInt8))) : Nil
      minLength : Int32 = 0
      maxLength : Int32 = 0
      code : Int32 = 0
      base1 : Int32 = 0
      codeIndex : Int32 = 0

      tableCodeLengths.size.times do |table|
        minLength = Int32::MAX
        maxLength = Int32::MIN

        tableBases = @codeBases[table]
        codeLengths = tableCodeLengths.unsafe_fetch(table)
        tableLimits = @codeLimits[table]
        tableSymbols = @codeSymbols[table]


        # Find the minimum and maximum code length for the table.
        alphabetSize.times do |i|
          maxLength = Math.max(codeLengths[i].to_i32!, maxLength)
          minLength = Math.min(codeLengths[i].to_i32!, minLength)
        end

        @minimumLengths[table] = minLength

        # Calculate the first output symbol for each code length.
        alphabetSize.times do |i|
          tableBases[codeLengths.unsafe_fetch(i) + 1] += 1
        end

        (1...(MAX_CODE_LENGTH + 2)).each do |i|
          tableBases[i] += tableBases[i - 1]
        end

        # Calculate the first and last Huffman code for each code length (codes
        # at a given length are sequential in value).
        code = 0
        (minLength..maxLength).each do |i|
          base1 = code
          code += tableBases[i + 1] - tableBases[i]
          tableBases[i] = base1 - tableBases[i]
          tableLimits[i] = code - 1
          code <<= 1
        end

        # Populate the mapping from canonical code index to output symbol.
        codeIndex = 0
        (minLength..maxLength).each do |bitLength|
          alphabetSize.times do |symbol|
            if codeLengths[symbol] == bitLength
              tableSymbols[codeIndex] = symbol
              codeIndex += 1
            end
          end
        end
      end
    end
  end
end