Login
blockcompressor.cr at tip
Login

File src/remilib/compression/bzip/blockcompressor.cr from the latest check-in


     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
   100
   101
   102
   103
   104
   105
   106
   107
   108
   109
   110
   111
   112
   113
   114
   115
   116
   117
   118
   119
   120
   121
   122
   123
   124
   125
   126
   127
   128
   129
   130
   131
   132
   133
   134
   135
   136
   137
   138
   139
   140
   141
   142
   143
   144
   145
   146
   147
   148
   149
   150
   151
   152
   153
   154
   155
   156
   157
   158
   159
   160
   161
   162
   163
   164
   165
   166
   167
   168
   169
   170
   171
   172
   173
   174
   175
   176
   177
   178
   179
   180
   181
   182
   183
   184
   185
   186
   187
   188
   189
   190
   191
   192
   193
   194
   195
   196
   197
   198
   199
   200
   201
   202
   203
   204
   205
   206
   207
   208
   209
   210
   211
   212
   213
   214
   215
   216
   217
   218
   219
   220
   221
   222
   223
   224
   225
   226
   227
   228
   229
#### Bzip2 Implementation
#### Copyright (C) 2023-2024 Remilia Scarlet
#### Copyright (c) 2022 drone1400
#### Copyright (C) 2015 Jaime Olivares
#### Copyright (c) 2011 Matthew Francis
#### MIT License
####
#### Ported from the Java implementation by Matthew Francis:
#### https://github.com/MateuszBartosiewicz/bzip2.
####
#### Modified by drone1400
####
#### Ported by Remilia Scarlet from the C# implementation by Jamie Olivares:
#### http://github.com/jaime-olivares/bzip2
require "./bitwriter"
require "./crc32"

module RemiLib::Compression::BZip2
  # Compresses and writes a single BZip2 block.
  #
  # Block encoding consists of the following stages:
  #
  # 1. Run-Length Encoding[1] - `#write`
  # 2. Burrows Wheeler Transform - `close` (through `DivSufSort`)
  # 3. Write block header - `#close`
  # 4. Move To Front Transform - `#close` (through `HuffmanStageEncoder`)
  # 5. Run-Length Encoding[2] - `#close`  (through `HuffmanStageEncoder`)
  # 6. Create and write Huffman tables - `#close` (through `HuffmanStageEncoder`)
  # 7. Huffman encode and write data - `#close` (through `HuffmanStageEncoder`)
  private class BlockCompressor
    # The stream to which compressed BZip2 data is written.
    @output : BitWriter

    # CRC builder for the block.
    @crc : CRC32 = CRC32.new

    # The RLE'd block data.
    @block : Bytes

    # Current length of the data within the block array.
    @blockLength : Int32 = 0

    # A limit beyond which new data will not be accepted into the block.
    @blockLengthLimit : Int32

    # The values that are present within the RLE'd block data. For each index,
    # true if that value is present within the data, otherwise false
    @blockValuesPresent : Array(Bool) = Array(Bool).new(256, false)

    # The Burrows Wheeler Transformed block data.
    @bwtBlock : Array(Int32)

    # The current RLE value being accumulated (undefined when rleLength is 0).
    @rleCurrentValue : Int32 = -1

    # The repeat count of the current RLE value.
    @rleLength : Int32 = 0

    @closed : Bool = false

    # Creates a new `BlockCompressor` instance that will write compressed data
    # to `output`.
    #
    # The `blockSize` parameter is the declared block size in bytes. Up to this
    # many bytes will be accepted into the block after run-length encoding is
    # applied.
    def initialize(@output : BitWriter, blockSize : Int, @int32Pool : ArrayPool(Int32))
      # One extra byte is added to allow for the block wrap applied in #close
      @block = Bytes.new(blockSize + 1)
      @bwtBlock = @int32Pool.rent(blockSize + 1)
      @bwtBlock.fill(0)

      # 5 bytes for one RLE run plus one byte - see `#write(UInt8)`.
      @blockLengthLimit = blockSize - 6
    end

    # Returns `true` if any bytes have been written to the block, or `false`
    # otherwise.
    def empty? : Bool
      @blockLength == 0 && @rleLength == 0
    end

    # Returns the CRC of the completed block. Only valid after calling `#close`.
    @[AlwaysInline]
    def crc : UInt32
      @crc.crc
    end

    # Writes a byte to the block, accumulating to an RLE run where possible.
    def write(value : UInt8) : Bool
      return false if @blockLength > @blockLengthLimit

      if @rleLength == 0
        @rleCurrentValue = value.to_i32!
        @rleLength = 1
      elsif @rleCurrentValue != value
        # This path commits us to write 6 bytes - one RLE run (5 bytes) plus one
        # extra.
        writeRun(@rleCurrentValue.to_u8!, @rleLength)
        @rleCurrentValue = value.to_i32!
        @rleLength = 1
      else
        if @rleLength == 254
          writeRun(@rleCurrentValue.to_u8!, 255)
          @rleLength = 0
        else
          @rleLength += 1
        end
      end

      true
    end

    # Writes `slice` to the block.  This returns the first index of `slice` that
    # was not updated, which may be zero if the block is already full.
    @[AlwaysInline]
    def write(slice : Bytes) : Int32
      pos : Int32 = 0
      while pos < slice.size
        break unless write(slice[pos])
        pos += 1
      end
      pos
    end

    # Compresses and writes out the block.
    def closeBlock : Nil
      raise "Attempted to close a block twice" if @closed
      # If an RLE run is in progress, write it out.
      if @rleLength > 0
        writeRun(@rleCurrentValue.to_u8!, @rleLength)
      end

      # Apply a one byte block wrap required by the BWT implementation.
      @block[@blockLength] = @block[0]

      # Perform the Burrows-Wheeler Transform.
      transformer : DivSufSort = DivSufSort.new(@block, @bwtBlock, @blockLength, @int32Pool)
      bwtStartPointer : Int32 = transformer.bwt

      # Write out the block header.
      @output.writeBits(24, BLOCK_HEADER_MARKER_1)
      @output.writeBits(24, BLOCK_HEADER_MARKER_2)
      @output.writeInt32(@crc.crc)
      @output.writeBoolean(false) # Randomised block flag. We never create randomised blocks.
      @output.writeBits(24, bwtStartPointer.to_u32!)

      # Write out the symbol map.
      writeSymbolMap

      # Perform the Move-To-Front Transform and run-length encoding[2] stages.
      mtf = MtfAndRle2StageEncoder.new(@bwtBlock, @blockLength, @blockValuesPresent)
      mtf.encode

      # Perform the Huffman Encoding stage and write out the encoded data.
      huffman = HuffmanStageEncoder.new(@output, mtf.mtfBlock, mtf.mtfLength, mtf.mtfAlphabetSize,
                                        mtf.mtfSymbolFrequencies, @int32Pool)
      huffman.encode

      @int32Pool.return(@bwtBlock)
    end

    # Write the Huffman symbol to output byte map.
    private def writeSymbolMap : Nil
      condensedInUse = Slice(Bool).new(16, false)
      j : Int32 = 0
      k : Int32 = 0

      16.times do |i|
        j = 0
        k = i << 4
        while j < 16
          condensedInUse[i] = true if @blockValuesPresent[k]
          j += 1
          k += 1
        end
      end

      16.times { |i| @output.writeBoolean(condensedInUse[i]) }

      16.times do |i|
        if condensedInUse[i]
          j = 0
          k = i * 16
          while j < 16
            @output.writeBoolean(@blockValuesPresent[k])
            j += 1
            k += 1
          end
        end
      end
    end

    # Writes an RLE run to the block array, updating the block CRC and present
    # values array as required.
    @[AlwaysInline]
    private def writeRun(value : UInt8, runLength : Int) : Nil
      @blockValuesPresent[value] = true
      @crc.update(value.to_u8!, runLength)

      case runLength
      when 1
        @block[@blockLength] = value
        @blockLength += 1

      when 2
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @blockLength += 2

      when 3
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @block[@blockLength + 2] = value
        @blockLength += 3

      else
        runLength -= 4
        @blockValuesPresent[runLength] = true
        @block[@blockLength] = value
        @block[@blockLength + 1] = value
        @block[@blockLength + 2] = value
        @block[@blockLength + 3] = value
        @block[@blockLength + 4] = runLength.to_u8!
        @blockLength += 5
      end
    end
  end
end