libremiliacr: xxhash128-internal.cr at tip

File src/remilib/digest/xxhash/xxhash128-internal.cr from the latest check-in

#### MIT License
####
#### Copyright (c) 2023-2024 Remilia Scarlet
#### Copyright (c) 2018 Melnik Alexander
####
#### Permission is hereby granted, free of charge, to any person obtaining a
#### copy of this software and associated documentation files (the "Software"),
#### to deal in the Software without restriction, including without limitation
#### the rights to use, copy, modify, merge, publish, distribute, sublicense,
#### and/or sell copies of the Software, and to permit persons to whom the
#### Software is furnished to do so, subject to the following conditions:
####
#### The above copyright notice and this permission notice shall be included in
#### all copies or substantial portions of the Software.
####
#### THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#### IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#### FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#### AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#### LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#### FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
#### DEALINGS IN THE SOFTWARE.
require "./xxhash-common"

####
#### Ported from C#:
#### https://github.com/uranium62/xxHash/tree/6b20e7f7b32dfc29e5019d3d35f5b7270f1656f3
####
#### Remi: TODO The original code was written before a native UInt128 type was
#### added to .NET, and so it used its own internal `uint128` type.  This was
#### somewhat retained in the code below.  The messy low/high uint64 code should
#### ideally be converted to native uint128 code.
####

module RemiLib::Digest
  class XXHash128 < XXHashInternal
    private XXH3_ACC_SIZE = 64

    @[AlwaysInline]
    private def self.xxh3Internal128Bit(input : Bytes, seed : UInt64, secret : Bytes) : UInt128
      len = input.size
      case
      when len <= 16 then len0To16(input, secret, seed)
      when len <= 128 then len17To128(input, secret, seed)
      when len <= XXH3_MIDSIZE_MAX then len129To240(input, secret, seed)
      else hashLongWithSeed(input, secret, seed)
      end
    end

    # Mixes all bits to finalize *hash*.
    #
    # The final mix ensures that all input bits have a chance to impact any bit
    # in the output digest, resulting in an unbiased distribution.
    @[AlwaysInline]
    private def self.xxh64Avalanche(hash : UInt64) : UInt64
      hash ^= hash.unsafe_shr(33)
      hash = hash &* XXH_PRIME64_2
      hash ^= hash.unsafe_shr(29)
      hash = hash &* XXH_PRIME64_3
      hash ^ hash.unsafe_shr(32)
    end

    # :ditto:
    @[AlwaysInline]
    private def self.xxh3Avalanche(h64 : UInt64) : UInt64
      h64 = xorShift64(h64, 37)
      h64 = h64 &* 0x165667919E3779F9_u64
      xorShift64(h64, 32)
    end

    @[AlwaysInline]
    private def self.len0To16(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len = input.size
      case
      when len > 8
        len9To16(input, secret, seed)
      when len >= 4
        len4To8(input, secret, seed)
      when len != 0
        len1To3(input, secret, seed)
      else
        secretPtr : PUInt8 = secret.to_unsafe
        bitflipL : UInt64 = readLE64(secretPtr + 64) ^ readLE64(secretPtr + 72)
        bitflipH : UInt64 = readLE64(secretPtr + 80) ^ readLE64(secretPtr + 88)
        (xxh64Avalanche(seed ^ bitflipH).to_u128! << 64) | xxh64Avalanche(seed ^ bitflipL)
      end
    end

    @[AlwaysInline]
    private def self.len17To128(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len : Int32 = input.size
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe
      acc : UInt128 = (len.to_u64! &* XXH_PRIME64_1).to_u128!

      if len > 32
        if len > 64
          if len > 96
            acc = mix32B(acc, inputPtr + 48, inputPtr + (len &- 64), secretPtr + 96, seed)
          end
          acc = mix32B(acc, inputPtr + 32, inputPtr + (len &- 48), secretPtr + 64, seed)
        end
        acc = mix32B(acc, inputPtr + 16, inputPtr + (len &- 32), secretPtr + 32, seed)
      end
      acc = mix32B(acc, inputPtr, inputPtr + (len &- 16), secretPtr, seed)

      hashL : UInt64 = low64(acc) &+ high64(acc)
      hashH : UInt64 = (low64(acc) &* XXH_PRIME64_1) &+
                       (high64(acc) &* XXH_PRIME64_4) &+
                       ((len.to_u64! &- seed) &* XXH_PRIME64_2)

      hashL = xxh3Avalanche(hashL)
      hashH = 0u64 &- xxh3Avalanche(hashH)
      makeU128(hashL, hashH)
    end

    @[AlwaysInline]
    private def self.len9To16(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len : Int32 = input.size
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe

      bitflipL : UInt64 = (readLE64(secretPtr + 32) ^ readLE64(secretPtr + 40)) &- seed
      bitflipH : UInt64 = (readLE64(secretPtr + 48) ^ readLE64(secretPtr + 56)) &+ seed
      inputL : UInt64 = readLE64(inputPtr)
      inputH : UInt64 = readLE64(inputPtr + (len &- 8))

      m128 : UInt128 = mult64To128(inputL ^ inputH ^ bitflipL, XXH_PRIME64_1)
      m128L : UInt64 = low64(m128)
      m128H : UInt64 = high64(m128)

      m128L = m128L &+ (len.to_u64! &- 1).unsafe_shl(54)
      inputH ^= bitflipH

      m128H = m128H &+ inputH &+ mult32To64(inputH.to_u64!, XXH_PRIME32_2.to_u64! &- 1)
      m128L ^= swap64(m128H)

      hash : UInt128 = mult64To128(m128L, XXH_PRIME64_2)
      hashL : UInt64 = low64(hash)
      hashH : UInt64 = high64(hash)

      hashH = hashH &+ (m128H &* XXH_PRIME64_2)
      hashL = xxh3Avalanche(hashL)
      hashH = xxh3Avalanche(hashH)
      makeU128(hashL, hashH)
    end

    @[AlwaysInline]
    private def self.len1To3(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len : Int32 = input.size
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe

      c1 : UInt8 = inputPtr[0]
      c2 : UInt8 = inputPtr[len.unsafe_shr(1)]
      c3 : UInt8 = inputPtr[len &- 1]

      combinedL : UInt32 = c1.to_u32!.unsafe_shl(16) |
                           c2.to_u32!.unsafe_shl(24) |
                           c3.to_u32! |
                           len.to_u32!.unsafe_shl(8)
      combinedH : UInt32 = rotl32(swap32(combinedL), 13)

      bitflipL : UInt64 = (readLE32(secretPtr) ^ readLE32(secretPtr + 4)).to_u64! &+ seed
      bitflipH : UInt64 = (readLE32(secretPtr + 8) ^ readLE32(secretPtr + 12)).to_u64! &- seed
      keyedL : UInt64 = combinedL.to_u64! ^ bitflipL
      keyedH : UInt64 = combinedH.to_u64! ^ bitflipH

      hashL : UInt64 = xxh64Avalanche(keyedL)
      hashH : UInt64 = xxh64Avalanche(keyedH)
      makeU128(hashL, hashH)
    end

    @[AlwaysInline]
    private def self.len4To8(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len : Int32 = input.size
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe

      seed ^= swap32(seed.to_u32!).to_u64!.unsafe_shl(32)

      inputL : UInt32 = readLE32(inputPtr)
      inputH : UInt32 = readLE32(inputPtr + (len &- 4))
      input64 : UInt64 = inputL.to_u64! &+ inputH.to_u64!.unsafe_shl(32)
      bitflip : UInt64 = (readLE64(secretPtr + 16) ^ readLE64(secretPtr + 24)) &+ seed
      keyed : UInt64 = input64 ^ bitflip

      m128 : UInt128 = mult64To128(keyed, XXH_PRIME64_1 &+ len.to_u64!.unsafe_shl(2))
      m128L : UInt64 = low64(m128)
      m128H : UInt64 = high64(m128)

      m128H = m128H &+ m128L.unsafe_shl(1)
      m128L ^= m128H.unsafe_shr(3)

      m128L = xorShift64(m128L, 35)
      m128L = m128L &* 0x9FB21C651E98DF25_u64
      m128L = xorShift64(m128L, 28)
      m128H = xxh3Avalanche(m128H)
      makeU128(m128L, m128H)
    end

    @[AlwaysInline]
    private def self.len129To240(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      len : Int32 = input.size
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe

      numRounds : Int32 = len.tdiv(32)
      accL : UInt64 = len.to_u64! &* XXH_PRIME64_1
      accH : UInt64 = 0
      acc : UInt128 = makeU128(accL, accH)
      4.times do |i|
        acc = mix32B(acc,
                     inputPtr + (32 &* i),
                     inputPtr + (32 &* i) + 16,
                     secretPtr + (32 &* i),
                     seed)
      end

      accL = xxh3Avalanche(low64(acc))
      accH = xxh3Avalanche(high64(acc))
      acc = makeU128(accL, accH)

      4.upto(numRounds &- 1) do |i|
        acc = mix32B(acc,
                     inputPtr + (32 &* i),
                     inputPtr + (32 &* i) + 16,
                     secretPtr + (XXH3_MIDSIZE_STARTOFFSET &+ (32 &* (i &- 4))),
                     seed)
      end

      acc = mix32B(acc,
                   inputPtr + (len &- 16),
                   inputPtr + (len &- 32),
                   secretPtr + (XXH3_SECRET_SIZE_MIN &- XXH3_MIDSIZE_LASTOFFSET &- 16),
                   0u64 &- seed)

      accL = low64(acc)
      accH = high64(acc)
      hashL : UInt64 = accL &+ accH
      hashH : UInt64 = (accL &* XXH_PRIME64_1) &+
                       (accH &* XXH_PRIME64_4) &+
                       ((len.to_u64! &- seed) &* XXH_PRIME64_2)
      hashL = xxh3Avalanche(hashL)
      hashH = 0u64 &- xxh3Avalanche(hashH)
      makeU128(hashL, hashH)
    end

    # A bit slower than XXH3_mix16B, but handles multiply by zero better.
    @[AlwaysInline]
    private def self.mix32B(acc : UInt128, input1 : PUInt8, input2 : PUInt8, secret : PUInt8, seed : UInt64) : UInt128
      accL : UInt64 = low64(acc) &+ mix16B(input1, secret, seed)
      accL ^= readLE64(input2) &+ readLE64(input2 + 8)

      accH : UInt64 = high64(acc) &+ mix16B(input2, secret + 16, seed)
      accH ^= readLE64(input1) &+ readLE64(input1 + 8)
      makeU128(accL, accH)
    end

    @[AlwaysInline]
    private def self.hashLongWithSeed(input : Bytes, secret : Bytes, seed : UInt64) : UInt128
      if seed == 0
        hashLongInternal(input, secret)
      else
        customSecret = Slice(UInt8).new(XXH3_SECRET_DEFAULT_SIZE)
        xxh3InitCustomSecret(customSecret.to_unsafe, seed)
        hashLongInternal(input, customSecret)
      end
    end

    @[AlwaysInline]
    private def self.hashLongInternal(input : Bytes, secret : Bytes) : UInt128
      len = input.size.to_u64!
      src = XXH3_INIT_ACC.to_unsafe
      acc = StaticArray(UInt64, 8).new { |i| src[i].to_u64! }
      hashLongInternalLoop(acc.to_unsafe, input, secret)

      secretPtr : PUInt8 = secret.to_unsafe
      accPtr : PUInt64 = acc.to_unsafe
      hashL : UInt64 = mergeAccs(accPtr,
                                 secretPtr + XXH_SECRET_MERGEACCS_START,
                                 len &* XXH_PRIME64_1)
      hashH : UInt64 = mergeAccs(accPtr,
                                 secretPtr + (secret.size.to_u64! &- XXH3_ACC_SIZE &- XXH_SECRET_MERGEACCS_START),
                                 ~(len &* XXH_PRIME64_2))
      makeU128(hashL, hashH)
    end

    @[AlwaysInline]
    private def self.hashLongInternalLoop(acc : PUInt64, input : Bytes, secret : Bytes) : Nil
      inputPtr : PUInt8 = input.to_unsafe
      secretPtr : PUInt8 = secret.to_unsafe
      len : Int32 = input.size
      secretLen : Int32 = secret.size

      numStripesPerBlock : Int32 = (secretLen &- XXH_STRIPE_LEN).tdiv(XXH_SECRET_CONSUME_RATE)
      blockLen : Int32 = XXH_STRIPE_LEN * numStripesPerBlock
      numBlocks : Int32 = (len &- 1).tdiv(blockLen)

      numBlocks.times do |n|
        xxh3Accumulate(acc, inputPtr + n &* blockLen, secretPtr, numStripesPerBlock)
        xxh3ScrambleAcc(acc, secretPtr + (secretLen &- XXH_STRIPE_LEN))
      end

      numStripes : Int32 = ((len &- 1) &- (blockLen &* numBlocks)).tdiv(XXH_STRIPE_LEN)
      xxh3Accumulate(acc, inputPtr + numBlocks &* blockLen, secretPtr, numStripes)

      b : PUInt8 = inputPtr + (len &- XXH_STRIPE_LEN)
      xxh3Accumulate512(acc, b, secretPtr + (secretLen &- XXH_STRIPE_LEN &- XXH_SECRET_LASTACC_START))
    end

    @[AlwaysInline]
    private def self.mergeAccs(acc : PUInt64, secret : PUInt8, start : UInt64) : UInt64
      result : UInt64 = start
      4.times do |i|
        result = result &+ mix2Accs(acc + 2 &* i, secret + 16 &* i)
      end
      xxh3Avalanche(result)
    end

    @[AlwaysInline]
    private def self.mix2Accs(acc : PUInt64, secret : PUInt8) : UInt64
      mult128Fold64(acc[0] ^ readLE64(secret),
                    acc[1] ^ readLE64(secret + 8))
    end

    @[AlwaysInline]
    private def self.xxh3Accumulate(acc : PUInt64, input : PUInt8, secret : PUInt8, numStripes : Int) : Nil
      numStripes.times do |n|
        inp : PUInt8 = input + n &* XXH_STRIPE_LEN
        xxh3Accumulate512(acc, inp, secret + n &* XXH_SECRET_CONSUME_RATE)
      end
    end

    # xxh3Accumulate512 is the tightest loop for long inputs, and it is the most
    # optimized.
    #
    # It is a hardened version of UMAC, based off of FARSH's implementation.
    #
    # This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
    # implementations, and it is ridiculously fast.
    #
    # We harden it by mixing the original input to the accumulators as well as
    # the product.
    #
    # This means that in the (relatively likely) case of a multiply by zero, the
    # original input is preserved.
    #
    # On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
    # cross-pollination, as otherwise the upper and lower halves would be
    # essentially independent.
    #
    # This doesn't matter on 64-bit hashes since they all get merged together in
    # the end, so we skip the extra step.
    #
    # Both XXH3 64-bit and XXH3 128-bit use this subroutine.
    @[AlwaysInline]
    private def self.xxh3Accumulate512(acc : PUInt64, input : PUInt8, secret : PUInt8) : Nil
      xxh3Accumulate512Scalar(acc, input, secret)
    end

    # Processes a 64 byte block of data using the scalar path.
    @[AlwaysInline]
    private def self.xxh3Accumulate512Scalar(acc : PUInt64, input : PUInt8, secret : PUInt8) : Nil
      XXH_ACC_NB.times { |i| xxh3ScalarRound(acc, input, secret, i) }
    end

    # Scalar round for `#xxh3Accumulate512Scalar`.
    @[AlwaysInline]
    private def self.xxh3ScalarRound(acc : PUInt64, input : PUInt8, secret : PUInt8, lane : Int) : Nil
      dataVal : UInt64 = readLE64(input + lane &* 8)
      dataKey : UInt64 = dataVal ^ readLE64(secret + lane &* 8)
      acc[lane ^ 1] = acc[lane ^ 1] &+ dataVal
      acc[lane] = acc[lane] &+ mult32To64(dataKey, dataKey.unsafe_shr(32))
    end

    # Scrambles the accumulators after a large chunk has been read.
    @[AlwaysInline]
    private def self.xxh3ScrambleAcc(acc : PUInt64, secret : PUInt8) : Nil
      xxh3ScrambleAccScalar(acc, secret)
    end

    # :ditto:
    @[AlwaysInline]
    private def self.xxh3ScrambleAccScalar(acc : PUInt64, secret : PUInt8) : Nil
      XXH_ACC_NB.times { |i| xxh3ScalarScrambleRound(acc, secret, i) }
    end

    # Scalar scramble step for `xxh3ScrambleAccScalar`.
    @[AlwaysInline]
    private def self.xxh3ScalarScrambleRound(acc : PUInt64, secret : PUInt8, lane : Int) : Nil
      xacc : PUInt64 = acc
      xsecret : PUInt8 = secret
      key64 : UInt64 = readLE64(xsecret + lane &* 8)
      acc64 : UInt64 = xacc[lane]
      acc64 = xorShift64(acc64, 47)
      acc64 ^= key64
      acc64 = acc64 &* XXH_PRIME32_1
      xacc[lane] = acc64
    end


    @[AlwaysInline]
    private def self.xxh3InitCustomSecret(customSecret : PUInt8, seed : UInt64) : Nil
      xxh3InitCustomSecretScalar(customSecret, seed)
    end

    @[AlwaysInline]
    private def self.xxh3InitCustomSecretScalar(customSecret : PUInt8, seed : UInt64) : Nil
      secretPtr : PUInt8 = XXH3_SECRET.to_unsafe
      numRounds : Int32 = XXH_SECRET_DEFAULT_SIZE.tdiv(16)
      lo : UInt64 = 0
      hi : UInt64 = 0
      numRounds.times do |i|
        lo = readLE64(secretPtr + 16 &* i) &+ seed
        hi = readLE64(secretPtr + 16 &* i + 8) &- seed
        writeLE64(customSecret + 16 &* i, lo)
        writeLE64(customSecret + 16 &* i + 8, hi)
      end
    end
  end
end