5693 lines
270 KiB
Lua
5693 lines
270 KiB
Lua
--------------------------------------------------------------------------------------------------------------------------
|
|
-- sha2.lua
|
|
--------------------------------------------------------------------------------------------------------------------------
|
|
-- VERSION: 12 (2022-02-23)
|
|
-- AUTHOR: Egor Skriptunoff
|
|
-- LICENSE: MIT (the same license as Lua itself)
|
|
-- URL: https://github.com/Egor-Skriptunoff/pure_lua_SHA
|
|
--
|
|
-- DESCRIPTION:
|
|
-- This module contains functions to calculate SHA digest:
|
|
-- MD5, SHA-1,
|
|
-- SHA-224, SHA-256, SHA-512/224, SHA-512/256, SHA-384, SHA-512,
|
|
-- SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256,
|
|
-- HMAC,
|
|
-- BLAKE2b, BLAKE2s, BLAKE2bp, BLAKE2sp, BLAKE2Xb, BLAKE2Xs,
|
|
-- BLAKE3, BLAKE3_KDF
|
|
-- Written in pure Lua.
|
|
-- Compatible with:
|
|
-- Lua 5.1, Lua 5.2, Lua 5.3, Lua 5.4, Fengari, LuaJIT 2.0/2.1 (any CPU endianness).
|
|
-- Main feature of this module: it was heavily optimized for speed.
|
|
-- For every Lua version the module contains particular implementation branch to get benefits from version-specific features.
|
|
-- - branch for Lua 5.1 (emulating bitwise operators using look-up table)
|
|
-- - branch for Lua 5.2 (using bit32/bit library), suitable for both Lua 5.2 with native "bit32" and Lua 5.1 with external library "bit"
|
|
-- - branch for Lua 5.3/5.4 (using native 64-bit bitwise operators)
|
|
-- - branch for Lua 5.3/5.4 (using native 32-bit bitwise operators) for Lua built with LUA_INT_TYPE=LUA_INT_INT
|
|
-- - branch for LuaJIT without FFI library (useful in a sandboxed environment)
|
|
-- - branch for LuaJIT x86 without FFI library (LuaJIT x86 has oddity because of lack of CPU registers)
|
|
-- - branch for LuaJIT 2.0 with FFI library (bit.* functions work only with Lua numbers)
|
|
-- - branch for LuaJIT 2.1 with FFI library (bit.* functions can work with "int64_t" arguments)
|
|
--
|
|
--
|
|
-- USAGE:
|
|
-- Input data should be provided as a binary string: either as a whole string or as a sequence of substrings (chunk-by-chunk loading, total length < 9*10^15 bytes).
|
|
-- Result (SHA digest) is returned in hexadecimal representation as a string of lowercase hex digits.
|
|
-- Simplest usage example:
|
|
-- local sha = require("sha2")
|
|
-- local your_hash = sha.sha256("your string")
|
|
-- See file "sha2_test.lua" for more examples.
|
|
--
|
|
--
|
|
-- CHANGELOG:
|
|
-- version date description
|
|
-- ------- ---------- -----------
|
|
-- 12 2022-02-23 Now works in Luau (but NOT optimized for speed)
|
|
-- 11 2022-01-09 BLAKE3 added
|
|
-- 10 2022-01-02 BLAKE2 functions added
|
|
-- 9 2020-05-10 Now works in OpenWrt's Lua (dialect of Lua 5.1 with "double" + "invisible int32")
|
|
-- 8 2019-09-03 SHA-3 functions added
|
|
-- 7 2019-03-17 Added functions to convert to/from base64
|
|
-- 6 2018-11-12 HMAC added
|
|
-- 5 2018-11-10 SHA-1 added
|
|
-- 4 2018-11-03 MD5 added
|
|
-- 3 2018-11-02 Bug fixed: incorrect hashing of long (2 GByte) data streams on Lua 5.3/5.4 built with "int32" integers
|
|
-- 2 2018-10-07 Decreased module loading time in Lua 5.1 implementation branch (thanks to Peter Melnichenko for giving a hint)
|
|
-- 1 2018-10-06 First release (only SHA-2 functions)
|
|
-----------------------------------------------------------------------------
|
|
|
|
|
|
local print_debug_messages = false -- set to true to view some messages about your system's abilities and implementation branch chosen for your system
|
|
|
|
local unpack, table_concat, byte, char, string_rep, sub, gsub, gmatch, string_format, floor, ceil, math_min, math_max, tonumber, type, math_huge =
|
|
table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.gsub, string.gmatch, string.format, math.floor, math.ceil, math.min, math.max, tonumber, type, math.huge
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- EXAMINING YOUR SYSTEM
|
|
--------------------------------------------------------------------------------
|
|
|
|
local function get_precision(one)
|
|
-- "one" must be either float 1.0 or integer 1
|
|
-- returns bits_precision, is_integer
|
|
-- This function works correctly with all floating point datatypes (including non-IEEE-754)
|
|
local k, n, m, prev_n = 0, one, one
|
|
while true do
|
|
k, prev_n, n, m = k + 1, n, n + n + 1, m + m + k % 2
|
|
if k > 256 or n - (n - 1) ~= 1 or m - (m - 1) ~= 1 or n == m then
|
|
return k, false -- floating point datatype
|
|
elseif n == prev_n then
|
|
return k, true -- integer datatype
|
|
end
|
|
end
|
|
end
|
|
|
|
-- Make sure Lua has "double" numbers
|
|
local x = 2/3
|
|
local Lua_has_double = x * 5 > 3 and x * 4 < 3 and get_precision(1.0) >= 53
|
|
assert(Lua_has_double, "at least 53-bit floating point numbers are required")
|
|
|
|
-- Q:
|
|
-- SHA2 was designed for FPU-less machines.
|
|
-- So, why floating point numbers are needed for this module?
|
|
-- A:
|
|
-- 53-bit "double" numbers are useful to calculate "magic numbers" used in SHA.
|
|
-- I prefer to write 50 LOC "magic numbers calculator" instead of storing more than 200 constants explicitly in this source file.
|
|
|
|
local int_prec, Lua_has_integers = get_precision(1)
|
|
local Lua_has_int64 = Lua_has_integers and int_prec == 64
|
|
local Lua_has_int32 = Lua_has_integers and int_prec == 32
|
|
assert(Lua_has_int64 or Lua_has_int32 or not Lua_has_integers, "Lua integers must be either 32-bit or 64-bit")
|
|
|
|
-- Q:
|
|
-- Does it mean that almost all non-standard configurations are not supported?
|
|
-- A:
|
|
-- Yes. Sorry, too many problems to support all possible Lua numbers configurations.
|
|
-- Lua 5.1/5.2 with "int32" will not work.
|
|
-- Lua 5.1/5.2 with "int64" will not work.
|
|
-- Lua 5.1/5.2 with "int128" will not work.
|
|
-- Lua 5.1/5.2 with "float" will not work.
|
|
-- Lua 5.1/5.2 with "double" is OK. (default config for Lua 5.1, Lua 5.2, LuaJIT)
|
|
-- Lua 5.3/5.4 with "int32" + "float" will not work.
|
|
-- Lua 5.3/5.4 with "int64" + "float" will not work.
|
|
-- Lua 5.3/5.4 with "int128" + "float" will not work.
|
|
-- Lua 5.3/5.4 with "int32" + "double" is OK. (config used by Fengari)
|
|
-- Lua 5.3/5.4 with "int64" + "double" is OK. (default config for Lua 5.3, Lua 5.4)
|
|
-- Lua 5.3/5.4 with "int128" + "double" will not work.
|
|
-- Using floating point numbers better than "double" instead of "double" is OK (non-IEEE-754 floating point implementation are allowed).
|
|
-- Using "int128" instead of "int64" is not OK: "int128" would require different branch of implementation for optimized SHA512.
|
|
|
|
-- Check for LuaJIT and 32-bit bitwise libraries
|
|
local is_LuaJIT = ({false, [1] = true})[1] and _VERSION ~= "Luau" and (type(jit) ~= "table" or jit.version_num >= 20000) -- LuaJIT 1.x.x and Luau are treated as vanilla Lua 5.1/5.2
|
|
local is_LuaJIT_21 -- LuaJIT 2.1+
|
|
local LuaJIT_arch
|
|
local ffi -- LuaJIT FFI library (as a table)
|
|
local b -- 32-bit bitwise library (as a table)
|
|
local library_name
|
|
|
|
if is_LuaJIT then
|
|
-- Assuming "bit" library is always available on LuaJIT
|
|
b = require"bit"
|
|
library_name = "bit"
|
|
-- "ffi" is intentionally disabled on some systems for safety reason
|
|
local LuaJIT_has_FFI, result = pcall(require, "ffi")
|
|
if LuaJIT_has_FFI then
|
|
ffi = result
|
|
end
|
|
is_LuaJIT_21 = not not loadstring"b=0b0"
|
|
LuaJIT_arch = type(jit) == "table" and jit.arch or ffi and ffi.arch or nil
|
|
else
|
|
-- For vanilla Lua, "bit"/"bit32" libraries are searched in global namespace only. No attempt is made to load a library if it's not loaded yet.
|
|
for _, libname in ipairs(_VERSION == "Lua 5.2" and {"bit32", "bit"} or {"bit", "bit32"}) do
|
|
if type(_G[libname]) == "table" and _G[libname].bxor then
|
|
b = _G[libname]
|
|
library_name = libname
|
|
break
|
|
end
|
|
end
|
|
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- You can disable here some of your system's abilities (for testing purposes)
|
|
--------------------------------------------------------------------------------
|
|
-- is_LuaJIT = nil
|
|
-- is_LuaJIT_21 = nil
|
|
-- ffi = nil
|
|
-- Lua_has_int32 = nil
|
|
-- Lua_has_int64 = nil
|
|
-- b, library_name = nil
|
|
--------------------------------------------------------------------------------
|
|
|
|
if print_debug_messages then
|
|
-- Printing list of abilities of your system
|
|
print("Abilities:")
|
|
print(" Lua version: "..(is_LuaJIT and "LuaJIT "..(is_LuaJIT_21 and "2.1 " or "2.0 ")..(LuaJIT_arch or "")..(ffi and " with FFI" or " without FFI") or _VERSION))
|
|
print(" Integer bitwise operators: "..(Lua_has_int64 and "int64" or Lua_has_int32 and "int32" or "no"))
|
|
print(" 32-bit bitwise library: "..(library_name or "not found"))
|
|
end
|
|
|
|
-- Selecting the most suitable implementation for given set of abilities
|
|
local method, branch
|
|
if is_LuaJIT and ffi then
|
|
method = "Using 'ffi' library of LuaJIT"
|
|
branch = "FFI"
|
|
elseif is_LuaJIT then
|
|
method = "Using special code for sandboxed LuaJIT (no FFI)"
|
|
branch = "LJ"
|
|
elseif Lua_has_int64 then
|
|
method = "Using native int64 bitwise operators"
|
|
branch = "INT64"
|
|
elseif Lua_has_int32 then
|
|
method = "Using native int32 bitwise operators"
|
|
branch = "INT32"
|
|
elseif library_name then -- when bitwise library is available (Lua 5.2 with native library "bit32" or Lua 5.1 with external library "bit")
|
|
method = "Using '"..library_name.."' library"
|
|
branch = "LIB32"
|
|
else
|
|
method = "Emulating bitwise operators using look-up table"
|
|
branch = "EMUL"
|
|
end
|
|
|
|
if print_debug_messages then
|
|
-- Printing the implementation selected to be used on your system
|
|
print("Implementation selected:")
|
|
print(" "..method)
|
|
end
|
|
|
|
-- For computercraft, use emulated mode for the sake of compatibility
|
|
branch = "EMUL"
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- BASIC 32-BIT BITWISE FUNCTIONS
|
|
--------------------------------------------------------------------------------
|
|
|
|
local AND, OR, XOR, SHL, SHR, ROL, ROR, NOT, NORM, HEX, XOR_BYTE
|
|
-- Only low 32 bits of function arguments matter, high bits are ignored
|
|
-- The result of all functions (except HEX) is an integer inside "correct range":
|
|
-- for "bit" library: (-2^31)..(2^31-1)
|
|
-- for "bit32" library: 0..(2^32-1)
|
|
|
|
if branch == "FFI" or branch == "LJ" or branch == "LIB32" then
|
|
|
|
-- Your system has 32-bit bitwise library (either "bit" or "bit32")
|
|
|
|
AND = b.band -- 2 arguments
|
|
OR = b.bor -- 2 arguments
|
|
XOR = b.bxor -- 2..5 arguments
|
|
-- SHL = b.lshift -- second argument is integer 0..31
|
|
SHL = b.blshift
|
|
-- SHR = b.rshift -- second argument is integer 0..31
|
|
SHR = b.brshift
|
|
-- ROL = b.rol or b.lrotate -- second argument is integer 0..31
|
|
-- ROR = b.ror or b.rrotate -- second argument is integer 0..31
|
|
--function ROL(x, n)
|
|
-- x = x % 2^32 * 2^n
|
|
-- local r = x % 2^32
|
|
-- return r + (x - r) / 2^32
|
|
--end
|
|
|
|
--function ROR(x, n)
|
|
-- x = x % 2^32 / 2^n
|
|
-- local r = x % 1
|
|
-- return r * 2^32 + (x - r)
|
|
--end
|
|
|
|
NOT = b.bnot -- only for LuaJIT
|
|
NORM = b.tobit -- only for LuaJIT
|
|
HEX = b.tohex -- returns string of 8 lowercase hexadecimal digits
|
|
assert(AND and OR and XOR and SHL and SHR and ROL and ROR and NOT, "Library '"..library_name.."' is incomplete")
|
|
XOR_BYTE = XOR -- XOR of two bytes (0..255)
|
|
|
|
elseif branch == "EMUL" then
|
|
|
|
-- Emulating 32-bit bitwise operations using 53-bit floating point arithmetic
|
|
|
|
function SHL(x, n)
|
|
return (x * 2^n) % 2^32
|
|
end
|
|
|
|
function SHR(x, n)
|
|
x = x % 2^32 / 2^n
|
|
return x - x % 1
|
|
end
|
|
|
|
function ROL(x, n)
|
|
x = x % 2^32 * 2^n
|
|
local r = x % 2^32
|
|
return r + (x - r) / 2^32
|
|
end
|
|
|
|
function ROR(x, n)
|
|
x = x % 2^32 / 2^n
|
|
local r = x % 1
|
|
return r * 2^32 + (x - r)
|
|
end
|
|
|
|
local AND_of_two_bytes = {[0] = 0} -- look-up table (256*256 entries)
|
|
local idx = 0
|
|
for y = 0, 127 * 256, 256 do
|
|
for x = y, y + 127 do
|
|
x = AND_of_two_bytes[x] * 2
|
|
AND_of_two_bytes[idx] = x
|
|
AND_of_two_bytes[idx + 1] = x
|
|
AND_of_two_bytes[idx + 256] = x
|
|
AND_of_two_bytes[idx + 257] = x + 1
|
|
idx = idx + 2
|
|
end
|
|
idx = idx + 256
|
|
end
|
|
|
|
local function and_or_xor(x, y, operation)
|
|
-- operation: nil = AND, 1 = OR, 2 = XOR
|
|
local x0 = x % 2^32
|
|
local y0 = y % 2^32
|
|
local rx = x0 % 256
|
|
local ry = y0 % 256
|
|
local res = AND_of_two_bytes[rx + ry * 256]
|
|
x = x0 - rx
|
|
y = (y0 - ry) / 256
|
|
rx = x % 65536
|
|
ry = y % 256
|
|
res = res + AND_of_two_bytes[rx + ry] * 256
|
|
x = (x - rx) / 256
|
|
y = (y - ry) / 256
|
|
rx = x % 65536 + y % 256
|
|
res = res + AND_of_two_bytes[rx] * 65536
|
|
res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216
|
|
if operation then
|
|
res = x0 + y0 - operation * res
|
|
end
|
|
return res
|
|
end
|
|
|
|
function AND(x, y)
|
|
return and_or_xor(x, y)
|
|
end
|
|
|
|
function OR(x, y)
|
|
return and_or_xor(x, y, 1)
|
|
end
|
|
|
|
function XOR(x, y, z, t, u) -- 2..5 arguments
|
|
if z then
|
|
if t then
|
|
if u then
|
|
t = and_or_xor(t, u, 2)
|
|
end
|
|
z = and_or_xor(z, t, 2)
|
|
end
|
|
y = and_or_xor(y, z, 2)
|
|
end
|
|
return and_or_xor(x, y, 2)
|
|
end
|
|
|
|
function XOR_BYTE(x, y)
|
|
return x + y - 2 * AND_of_two_bytes[x + y * 256]
|
|
end
|
|
|
|
end
|
|
|
|
HEX = HEX
|
|
or
|
|
pcall(string_format, "%x", 2^31) and
|
|
function (x) -- returns string of 8 lowercase hexadecimal digits
|
|
return string_format("%08x", x % 4294967296)
|
|
end
|
|
or
|
|
function (x) -- for OpenWrt's dialect of Lua
|
|
return string_format("%08x", (x + 2^31) % 2^32 - 2^31)
|
|
end
|
|
|
|
local function XORA5(x, y)
|
|
return XOR(x, y or 0xA5A5A5A5) % 4294967296
|
|
end
|
|
|
|
local function create_array_of_lanes()
|
|
return {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
|
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- CREATING OPTIMIZED INNER LOOP
|
|
--------------------------------------------------------------------------------
|
|
|
|
-- Inner loop functions
|
|
local sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
|
|
|
|
-- Arrays of SHA-2 "magic numbers" (in "INT64" and "FFI" branches "*_lo" arrays contain 64-bit values)
|
|
local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi, sha3_RC_lo, sha3_RC_hi = {}, {}, {}, {}, {}, {}
|
|
local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi}
|
|
local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi}
|
|
local md5_K, md5_sha1_H = {}, {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
|
|
local md5_next_shift = {0, 0, 0, 0, 0, 0, 0, 0, 28, 25, 26, 27, 0, 0, 10, 9, 11, 12, 0, 15, 16, 17, 18, 0, 20, 22, 23, 21}
|
|
local HEX64, lanes_index_base -- defined only for branches that internally use 64-bit integers: "INT64" and "FFI"
|
|
local common_W = {} -- temporary table shared between all calculations (to avoid creating new temporary table every time)
|
|
local common_W_blake2b, common_W_blake2s, v_for_blake2s_feed_64 = common_W, common_W, {}
|
|
local K_lo_modulo, hi_factor, hi_factor_keccak = 4294967296, 0, 0
|
|
local sigma = {
|
|
{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
|
|
{ 15, 11, 5, 9, 10, 16, 14, 7, 2, 13, 1, 3, 12, 8, 6, 4 },
|
|
{ 12, 9, 13, 1, 6, 3, 16, 14, 11, 15, 4, 7, 8, 2, 10, 5 },
|
|
{ 8, 10, 4, 2, 14, 13, 12, 15, 3, 7, 6, 11, 5, 1, 16, 9 },
|
|
{ 10, 1, 6, 8, 3, 5, 11, 16, 15, 2, 12, 13, 7, 9, 4, 14 },
|
|
{ 3, 13, 7, 11, 1, 12, 9, 4, 5, 14, 8, 6, 16, 15, 2, 10 },
|
|
{ 13, 6, 2, 16, 15, 14, 5, 11, 1, 8, 7, 4, 10, 3, 9, 12 },
|
|
{ 14, 12, 8, 15, 13, 2, 4, 10, 6, 1, 16, 5, 9, 7, 3, 11 },
|
|
{ 7, 16, 15, 10, 12, 4, 1, 9, 13, 3, 14, 8, 2, 5, 11, 6 },
|
|
{ 11, 3, 9, 5, 8, 7, 2, 6, 16, 12, 10, 15, 4, 13, 14, 1 },
|
|
}; sigma[11], sigma[12] = sigma[1], sigma[2]
|
|
local perm_blake3 = {
|
|
1, 3, 4, 11, 13, 10, 12, 6,
|
|
1, 3, 4, 11, 13, 10,
|
|
2, 7, 5, 8, 14, 15, 16, 9,
|
|
2, 7, 5, 8, 14, 15,
|
|
}
|
|
|
|
local function build_keccak_format(elem)
|
|
local keccak_format = {}
|
|
for _, size in ipairs{1, 9, 13, 17, 18, 21} do
|
|
keccak_format[size] = "<"..string_rep(elem, size)
|
|
end
|
|
return keccak_format
|
|
end
|
|
|
|
|
|
if branch == "FFI" then
|
|
|
|
local common_W_FFI_int32 = ffi.new("int32_t[?]", 80) -- 64 is enough for SHA256, but 80 is needed for SHA-1
|
|
common_W_blake2s = common_W_FFI_int32
|
|
v_for_blake2s_feed_64 = ffi.new("int32_t[?]", 16)
|
|
perm_blake3 = ffi.new("uint8_t[?]", #perm_blake3 + 1, 0, unpack(perm_blake3))
|
|
for j = 1, 10 do
|
|
sigma[j] = ffi.new("uint8_t[?]", #sigma[j] + 1, 0, unpack(sigma[j]))
|
|
end; sigma[11], sigma[12] = sigma[1], sigma[2]
|
|
|
|
|
|
-- SHA256 implementation for "LuaJIT with FFI" branch
|
|
|
|
function sha256_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W_FFI_int32, sha2_K_hi
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 0, 15 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for j = 16, 63 do
|
|
local a, b = W[j-15], W[j-2]
|
|
W[j] = NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] )
|
|
end
|
|
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for j = 0, 63, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
|
|
local z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j] + K[j+1] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+1] + K[j+2] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+2] + K[j+3] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+3] + K[j+4] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+4] + K[j+5] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+5] + K[j+6] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+6] + K[j+7] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+7] + K[j+8] + h) )
|
|
h, g, f, e = g, f, e, NORM( d + z )
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
end
|
|
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
|
|
H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
|
|
end
|
|
end
|
|
|
|
|
|
local common_W_FFI_int64 = ffi.new("int64_t[?]", 80)
|
|
common_W_blake2b = common_W_FFI_int64
|
|
local int64 = ffi.typeof"int64_t"
|
|
local int32 = ffi.typeof"int32_t"
|
|
local uint32 = ffi.typeof"uint32_t"
|
|
hi_factor = int64(2^32)
|
|
|
|
if is_LuaJIT_21 then -- LuaJIT 2.1 supports bitwise 64-bit operations
|
|
|
|
local AND64, OR64, XOR64, NOT64, SHL64, SHR64, ROL64, ROR64 -- introducing synonyms for better code readability
|
|
= AND, OR, XOR, NOT, SHL, SHR, ROL, ROR
|
|
HEX64 = HEX
|
|
|
|
|
|
-- BLAKE2b implementation for "LuaJIT 2.1 + FFI" branch
|
|
|
|
do
|
|
local v = ffi.new("int64_t[?]", 16)
|
|
local W = common_W_blake2b
|
|
|
|
local function G(a, b, c, d, k1, k2)
|
|
local va, vb, vc, vd = v[a], v[b], v[c], v[d]
|
|
va = W[k1] + (va + vb)
|
|
vd = ROR64(XOR64(vd, va), 32)
|
|
vc = vc + vd
|
|
vb = ROR64(XOR64(vb, vc), 24)
|
|
va = W[k2] + (va + vb)
|
|
vd = ROR64(XOR64(vd, va), 16)
|
|
vc = vc + vd
|
|
vb = ROL64(XOR64(vb, vc), 1)
|
|
v[a], v[b], v[c], v[d] = va, vb, vc, vd
|
|
end
|
|
|
|
function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs, offs + size - 1, 128 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 8
|
|
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)
|
|
W[j] = XOR64(OR(SHL(h, 24), SHL(g, 16), SHL(f, 8), e) * int64(2^32), uint32(int32(OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))))
|
|
end
|
|
end
|
|
v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
v[0x8], v[0x9], v[0xA], v[0xB], v[0xD], v[0xE], v[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
v[0xC] = XOR64(sha2_H_lo[5], bytes_compressed) -- t0 = low_8_bytes(bytes_compressed)
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
v[0xE] = NOT64(v[0xE])
|
|
end
|
|
if is_last_node then -- flag f1
|
|
v[0xF] = NOT64(v[0xF])
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
G(0, 4, 8, 12, row[ 1], row[ 2])
|
|
G(1, 5, 9, 13, row[ 3], row[ 4])
|
|
G(2, 6, 10, 14, row[ 5], row[ 6])
|
|
G(3, 7, 11, 15, row[ 7], row[ 8])
|
|
G(0, 5, 10, 15, row[ 9], row[10])
|
|
G(1, 6, 11, 12, row[11], row[12])
|
|
G(2, 7, 8, 13, row[13], row[14])
|
|
G(3, 4, 9, 14, row[15], row[16])
|
|
end
|
|
h1 = XOR64(h1, v[0x0], v[0x8])
|
|
h2 = XOR64(h2, v[0x1], v[0x9])
|
|
h3 = XOR64(h3, v[0x2], v[0xA])
|
|
h4 = XOR64(h4, v[0x3], v[0xB])
|
|
h5 = XOR64(h5, v[0x4], v[0xC])
|
|
h6 = XOR64(h6, v[0x5], v[0xD])
|
|
h7 = XOR64(h7, v[0x6], v[0xE])
|
|
h8 = XOR64(h8, v[0x7], v[0xF])
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
end
|
|
|
|
|
|
-- SHA-3 implementation for "LuaJIT 2.1 + FFI" branch
|
|
|
|
local arr64_t = ffi.typeof"int64_t[?]"
|
|
-- lanes array is indexed from 0
|
|
lanes_index_base = 0
|
|
hi_factor_keccak = int64(2^32)
|
|
|
|
function create_array_of_lanes()
|
|
return arr64_t(30) -- 25 + 5 for temporary usage
|
|
end
|
|
|
|
function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
|
|
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
|
|
local RC = sha3_RC_lo
|
|
local qwords_qty = SHR(block_size_in_bytes, 3)
|
|
for pos = offs, offs + size - 1, block_size_in_bytes do
|
|
for j = 0, qwords_qty - 1 do
|
|
pos = pos + 8
|
|
local h, g, f, e, d, c, b, a = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
|
|
lanes[j] = XOR64(lanes[j], OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))))
|
|
end
|
|
for round_idx = 1, 24 do
|
|
for j = 0, 4 do
|
|
lanes[25 + j] = XOR64(lanes[j], lanes[j+5], lanes[j+10], lanes[j+15], lanes[j+20])
|
|
end
|
|
local D = XOR64(lanes[25], ROL64(lanes[27], 1))
|
|
lanes[1], lanes[6], lanes[11], lanes[16] = ROL64(XOR64(D, lanes[6]), 44), ROL64(XOR64(D, lanes[16]), 45), ROL64(XOR64(D, lanes[1]), 1), ROL64(XOR64(D, lanes[11]), 10)
|
|
lanes[21] = ROL64(XOR64(D, lanes[21]), 2)
|
|
D = XOR64(lanes[26], ROL64(lanes[28], 1))
|
|
lanes[2], lanes[7], lanes[12], lanes[22] = ROL64(XOR64(D, lanes[12]), 43), ROL64(XOR64(D, lanes[22]), 61), ROL64(XOR64(D, lanes[7]), 6), ROL64(XOR64(D, lanes[2]), 62)
|
|
lanes[17] = ROL64(XOR64(D, lanes[17]), 15)
|
|
D = XOR64(lanes[27], ROL64(lanes[29], 1))
|
|
lanes[3], lanes[8], lanes[18], lanes[23] = ROL64(XOR64(D, lanes[18]), 21), ROL64(XOR64(D, lanes[3]), 28), ROL64(XOR64(D, lanes[23]), 56), ROL64(XOR64(D, lanes[8]), 55)
|
|
lanes[13] = ROL64(XOR64(D, lanes[13]), 25)
|
|
D = XOR64(lanes[28], ROL64(lanes[25], 1))
|
|
lanes[4], lanes[14], lanes[19], lanes[24] = ROL64(XOR64(D, lanes[24]), 14), ROL64(XOR64(D, lanes[19]), 8), ROL64(XOR64(D, lanes[4]), 27), ROL64(XOR64(D, lanes[14]), 39)
|
|
lanes[9] = ROL64(XOR64(D, lanes[9]), 20)
|
|
D = XOR64(lanes[29], ROL64(lanes[26], 1))
|
|
lanes[5], lanes[10], lanes[15], lanes[20] = ROL64(XOR64(D, lanes[10]), 3), ROL64(XOR64(D, lanes[20]), 18), ROL64(XOR64(D, lanes[5]), 36), ROL64(XOR64(D, lanes[15]), 41)
|
|
lanes[0] = XOR64(D, lanes[0])
|
|
lanes[0], lanes[1], lanes[2], lanes[3], lanes[4] = XOR64(lanes[0], AND64(NOT64(lanes[1]), lanes[2]), RC[round_idx]), XOR64(lanes[1], AND64(NOT64(lanes[2]), lanes[3])), XOR64(lanes[2], AND64(NOT64(lanes[3]), lanes[4])), XOR64(lanes[3], AND64(NOT64(lanes[4]), lanes[0])), XOR64(lanes[4], AND64(NOT64(lanes[0]), lanes[1]))
|
|
lanes[5], lanes[6], lanes[7], lanes[8], lanes[9] = XOR64(lanes[8], AND64(NOT64(lanes[9]), lanes[5])), XOR64(lanes[9], AND64(NOT64(lanes[5]), lanes[6])), XOR64(lanes[5], AND64(NOT64(lanes[6]), lanes[7])), XOR64(lanes[6], AND64(NOT64(lanes[7]), lanes[8])), XOR64(lanes[7], AND64(NOT64(lanes[8]), lanes[9]))
|
|
lanes[10], lanes[11], lanes[12], lanes[13], lanes[14] = XOR64(lanes[11], AND64(NOT64(lanes[12]), lanes[13])), XOR64(lanes[12], AND64(NOT64(lanes[13]), lanes[14])), XOR64(lanes[13], AND64(NOT64(lanes[14]), lanes[10])), XOR64(lanes[14], AND64(NOT64(lanes[10]), lanes[11])), XOR64(lanes[10], AND64(NOT64(lanes[11]), lanes[12]))
|
|
lanes[15], lanes[16], lanes[17], lanes[18], lanes[19] = XOR64(lanes[19], AND64(NOT64(lanes[15]), lanes[16])), XOR64(lanes[15], AND64(NOT64(lanes[16]), lanes[17])), XOR64(lanes[16], AND64(NOT64(lanes[17]), lanes[18])), XOR64(lanes[17], AND64(NOT64(lanes[18]), lanes[19])), XOR64(lanes[18], AND64(NOT64(lanes[19]), lanes[15]))
|
|
lanes[20], lanes[21], lanes[22], lanes[23], lanes[24] = XOR64(lanes[22], AND64(NOT64(lanes[23]), lanes[24])), XOR64(lanes[23], AND64(NOT64(lanes[24]), lanes[20])), XOR64(lanes[24], AND64(NOT64(lanes[20]), lanes[21])), XOR64(lanes[20], AND64(NOT64(lanes[21]), lanes[22])), XOR64(lanes[21], AND64(NOT64(lanes[22]), lanes[23]))
|
|
end
|
|
end
|
|
end
|
|
|
|
|
|
local A5_long = 0xA5A5A5A5 * int64(2^32 + 1) -- It's impossible to use constant 0xA5A5A5A5A5A5A5A5LL because it will raise syntax error on other Lua versions
|
|
|
|
function XORA5(long, long2)
|
|
return XOR64(long, long2 or A5_long)
|
|
end
|
|
|
|
|
|
-- SHA512 implementation for "LuaJIT 2.1 + FFI" branch
|
|
|
|
function sha512_feed_128(H, _, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W, K = common_W_FFI_int64, sha2_K_lo
|
|
for pos = offs, offs + size - 1, 128 do
|
|
for j = 0, 15 do
|
|
pos = pos + 8
|
|
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
|
|
W[j] = OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))))
|
|
end
|
|
for j = 16, 79 do
|
|
local a, b = W[j-15], W[j-2]
|
|
W[j] = XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + W[j-7] + W[j-16]
|
|
end
|
|
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for j = 0, 79, 8 do
|
|
local z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+1] + W[j]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+2] + W[j+1]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+3] + W[j+2]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+4] + W[j+3]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+5] + W[j+4]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+6] + W[j+5]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+7] + W[j+6]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+8] + W[j+7]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
|
|
end
|
|
H[1] = a + H[1]
|
|
H[2] = b + H[2]
|
|
H[3] = c + H[3]
|
|
H[4] = d + H[4]
|
|
H[5] = e + H[5]
|
|
H[6] = f + H[6]
|
|
H[7] = g + H[7]
|
|
H[8] = h + H[8]
|
|
end
|
|
end
|
|
|
|
else -- LuaJIT 2.0 doesn't support 64-bit bitwise operations
|
|
|
|
local U = ffi.new("union{int64_t i64; struct{int32_t "..(ffi.abi("le") and "lo, hi" or "hi, lo")..";} i32;}[3]")
|
|
-- this array of unions is used for fast splitting int64 into int32_high and int32_low
|
|
|
|
-- "xorrific" 64-bit functions :-)
|
|
-- int64 input is splitted into two int32 parts, some bitwise 32-bit operations are performed, finally the result is converted to int64
|
|
-- these functions are needed because bit.* functions in LuaJIT 2.0 don't work with int64_t
|
|
|
|
local function XORROR64_1(a)
|
|
-- return XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7))
|
|
U[0].i64 = a
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local t_lo = XOR(SHR(a_lo, 1), SHL(a_hi, 31), SHR(a_lo, 8), SHL(a_hi, 24), SHR(a_lo, 7), SHL(a_hi, 25))
|
|
local t_hi = XOR(SHR(a_hi, 1), SHL(a_lo, 31), SHR(a_hi, 8), SHL(a_lo, 24), SHR(a_hi, 7))
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
local function XORROR64_2(b)
|
|
-- return XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6))
|
|
U[0].i64 = b
|
|
local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi
|
|
local u_lo = XOR(SHR(b_lo, 19), SHL(b_hi, 13), SHL(b_lo, 3), SHR(b_hi, 29), SHR(b_lo, 6), SHL(b_hi, 26))
|
|
local u_hi = XOR(SHR(b_hi, 19), SHL(b_lo, 13), SHL(b_hi, 3), SHR(b_lo, 29), SHR(b_hi, 6))
|
|
return u_hi * int64(2^32) + uint32(int32(u_lo))
|
|
end
|
|
|
|
local function XORROR64_3(e)
|
|
-- return XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23))
|
|
U[0].i64 = e
|
|
local e_lo, e_hi = U[0].i32.lo, U[0].i32.hi
|
|
local u_lo = XOR(SHR(e_lo, 14), SHL(e_hi, 18), SHR(e_lo, 18), SHL(e_hi, 14), SHL(e_lo, 23), SHR(e_hi, 9))
|
|
local u_hi = XOR(SHR(e_hi, 14), SHL(e_lo, 18), SHR(e_hi, 18), SHL(e_lo, 14), SHL(e_hi, 23), SHR(e_lo, 9))
|
|
return u_hi * int64(2^32) + uint32(int32(u_lo))
|
|
end
|
|
|
|
local function XORROR64_6(a)
|
|
-- return XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30))
|
|
U[0].i64 = a
|
|
local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi
|
|
local u_lo = XOR(SHR(b_lo, 28), SHL(b_hi, 4), SHL(b_lo, 30), SHR(b_hi, 2), SHL(b_lo, 25), SHR(b_hi, 7))
|
|
local u_hi = XOR(SHR(b_hi, 28), SHL(b_lo, 4), SHL(b_hi, 30), SHR(b_lo, 2), SHL(b_hi, 25), SHR(b_lo, 7))
|
|
return u_hi * int64(2^32) + uint32(int32(u_lo))
|
|
end
|
|
|
|
local function XORROR64_4(e, f, g)
|
|
-- return XOR64(g, AND64(e, XOR64(f, g)))
|
|
U[0].i64 = f
|
|
U[1].i64 = g
|
|
U[2].i64 = e
|
|
local f_lo, f_hi = U[0].i32.lo, U[0].i32.hi
|
|
local g_lo, g_hi = U[1].i32.lo, U[1].i32.hi
|
|
local e_lo, e_hi = U[2].i32.lo, U[2].i32.hi
|
|
local result_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
|
|
local result_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
|
|
return result_hi * int64(2^32) + uint32(int32(result_lo))
|
|
end
|
|
|
|
local function XORROR64_5(a, b, c)
|
|
-- return XOR64(AND64(XOR64(a, b), c), AND64(a, b))
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
U[2].i64 = c
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local c_lo, c_hi = U[2].i32.lo, U[2].i32.hi
|
|
local result_lo = XOR(AND(XOR(a_lo, b_lo), c_lo), AND(a_lo, b_lo))
|
|
local result_hi = XOR(AND(XOR(a_hi, b_hi), c_hi), AND(a_hi, b_hi))
|
|
return result_hi * int64(2^32) + uint32(int32(result_lo))
|
|
end
|
|
|
|
local function XORROR64_7(a, b, m)
|
|
-- return ROR64(XOR64(a, b), m), m = 1..31
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local c_lo, c_hi = XOR(a_lo, b_lo), XOR(a_hi, b_hi)
|
|
local t_lo = XOR(SHR(c_lo, m), SHL(c_hi, -m))
|
|
local t_hi = XOR(SHR(c_hi, m), SHL(c_lo, -m))
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
local function XORROR64_8(a, b)
|
|
-- return ROL64(XOR64(a, b), 1)
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local c_lo, c_hi = XOR(a_lo, b_lo), XOR(a_hi, b_hi)
|
|
local t_lo = XOR(SHL(c_lo, 1), SHR(c_hi, 31))
|
|
local t_hi = XOR(SHL(c_hi, 1), SHR(c_lo, 31))
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
local function XORROR64_9(a, b)
|
|
-- return ROR64(XOR64(a, b), 32)
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local t_hi, t_lo = XOR(a_lo, b_lo), XOR(a_hi, b_hi)
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
local function XOR64(a, b)
|
|
-- return XOR64(a, b)
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local t_lo, t_hi = XOR(a_lo, b_lo), XOR(a_hi, b_hi)
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
local function XORROR64_11(a, b, c)
|
|
-- return XOR64(a, b, c)
|
|
U[0].i64 = a
|
|
U[1].i64 = b
|
|
U[2].i64 = c
|
|
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
|
|
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
|
|
local c_lo, c_hi = U[2].i32.lo, U[2].i32.hi
|
|
local t_lo, t_hi = XOR(a_lo, b_lo, c_lo), XOR(a_hi, b_hi, c_hi)
|
|
return t_hi * int64(2^32) + uint32(int32(t_lo))
|
|
end
|
|
|
|
function XORA5(long, long2)
|
|
-- return XOR64(long, long2 or 0xA5A5A5A5A5A5A5A5)
|
|
U[0].i64 = long
|
|
local lo32, hi32 = U[0].i32.lo, U[0].i32.hi
|
|
local long2_lo, long2_hi = 0xA5A5A5A5, 0xA5A5A5A5
|
|
if long2 then
|
|
U[1].i64 = long2
|
|
long2_lo, long2_hi = U[1].i32.lo, U[1].i32.hi
|
|
end
|
|
lo32 = XOR(lo32, long2_lo)
|
|
hi32 = XOR(hi32, long2_hi)
|
|
return hi32 * int64(2^32) + uint32(int32(lo32))
|
|
end
|
|
|
|
function HEX64(long)
|
|
U[0].i64 = long
|
|
return HEX(U[0].i32.hi)..HEX(U[0].i32.lo)
|
|
end
|
|
|
|
|
|
-- SHA512 implementation for "LuaJIT 2.0 + FFI" branch
|
|
|
|
function sha512_feed_128(H, _, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W, K = common_W_FFI_int64, sha2_K_lo
|
|
for pos = offs, offs + size - 1, 128 do
|
|
for j = 0, 15 do
|
|
pos = pos + 8
|
|
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32) + uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))
|
|
end
|
|
for j = 16, 79 do
|
|
W[j] = XORROR64_1(W[j-15]) + XORROR64_2(W[j-2]) + W[j-7] + W[j-16]
|
|
end
|
|
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for j = 0, 79, 8 do
|
|
local z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+1] + W[j]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+2] + W[j+1]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+3] + W[j+2]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+4] + W[j+3]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+5] + W[j+4]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+6] + W[j+5]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+7] + W[j+6]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+8] + W[j+7]
|
|
h, g, f, e = g, f, e, z + d
|
|
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
|
|
end
|
|
H[1] = a + H[1]
|
|
H[2] = b + H[2]
|
|
H[3] = c + H[3]
|
|
H[4] = d + H[4]
|
|
H[5] = e + H[5]
|
|
H[6] = f + H[6]
|
|
H[7] = g + H[7]
|
|
H[8] = h + H[8]
|
|
end
|
|
end
|
|
|
|
|
|
-- BLAKE2b implementation for "LuaJIT 2.0 + FFI" branch
|
|
|
|
do
|
|
local v = ffi.new("int64_t[?]", 16)
|
|
local W = common_W_blake2b
|
|
|
|
local function G(a, b, c, d, k1, k2)
|
|
local va, vb, vc, vd = v[a], v[b], v[c], v[d]
|
|
va = W[k1] + (va + vb)
|
|
vd = XORROR64_9(vd, va)
|
|
vc = vc + vd
|
|
vb = XORROR64_7(vb, vc, 24)
|
|
va = W[k2] + (va + vb)
|
|
vd = XORROR64_7(vd, va, 16)
|
|
vc = vc + vd
|
|
vb = XORROR64_8(vb, vc)
|
|
v[a], v[b], v[c], v[d] = va, vb, vc, vd
|
|
end
|
|
|
|
function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs, offs + size - 1, 128 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 8
|
|
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)
|
|
W[j] = XOR64(OR(SHL(h, 24), SHL(g, 16), SHL(f, 8), e) * int64(2^32), uint32(int32(OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))))
|
|
end
|
|
end
|
|
v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
v[0x8], v[0x9], v[0xA], v[0xB], v[0xD], v[0xE], v[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
v[0xC] = XOR64(sha2_H_lo[5], bytes_compressed) -- t0 = low_8_bytes(bytes_compressed)
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
v[0xE] = -1 - v[0xE]
|
|
end
|
|
if is_last_node then -- flag f1
|
|
v[0xF] = -1 - v[0xF]
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
G(0, 4, 8, 12, row[ 1], row[ 2])
|
|
G(1, 5, 9, 13, row[ 3], row[ 4])
|
|
G(2, 6, 10, 14, row[ 5], row[ 6])
|
|
G(3, 7, 11, 15, row[ 7], row[ 8])
|
|
G(0, 5, 10, 15, row[ 9], row[10])
|
|
G(1, 6, 11, 12, row[11], row[12])
|
|
G(2, 7, 8, 13, row[13], row[14])
|
|
G(3, 4, 9, 14, row[15], row[16])
|
|
end
|
|
h1 = XORROR64_11(h1, v[0x0], v[0x8])
|
|
h2 = XORROR64_11(h2, v[0x1], v[0x9])
|
|
h3 = XORROR64_11(h3, v[0x2], v[0xA])
|
|
h4 = XORROR64_11(h4, v[0x3], v[0xB])
|
|
h5 = XORROR64_11(h5, v[0x4], v[0xC])
|
|
h6 = XORROR64_11(h6, v[0x5], v[0xD])
|
|
h7 = XORROR64_11(h7, v[0x6], v[0xE])
|
|
h8 = XORROR64_11(h8, v[0x7], v[0xF])
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
-- MD5 implementation for "LuaJIT with FFI" branch
|
|
|
|
function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W_FFI_int32, md5_K
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 0, 15 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
|
|
W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
local a, b, c, d = H[1], H[2], H[3], H[4]
|
|
for j = 0, 15, 4 do
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j ] + a), 7) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+1] + a), 12) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+2] + a), 17) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+4] + W[j+3] + a), 22) + b)
|
|
end
|
|
for j = 16, 31, 4 do
|
|
local g = 5*j
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 1, 15)] + a), 5) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 6, 15)] + a), 9) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 5, 15)] + a), 14) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+4] + W[AND(g , 15)] + a), 20) + b)
|
|
end
|
|
for j = 32, 47, 4 do
|
|
local g = 3*j
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 5, 15)] + a), 4) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 8, 15)] + a), 11) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 5, 15)] + a), 16) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+4] + W[AND(g - 2, 15)] + a), 23) + b)
|
|
end
|
|
for j = 48, 63, 4 do
|
|
local g = 7*j
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g , 15)] + a), 6) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15)] + a), 10) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15)] + a), 15) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+4] + W[AND(g + 5, 15)] + a), 21) + b)
|
|
end
|
|
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
|
|
end
|
|
end
|
|
|
|
|
|
-- SHA-1 implementation for "LuaJIT with FFI" branch
|
|
|
|
function sha1_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W_FFI_int32
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 0, 15 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for j = 16, 79 do
|
|
W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
|
|
end
|
|
local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
|
|
for j = 0, 19, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)) -- constant = floor(2^30 * sqrt(2))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e))
|
|
end
|
|
for j = 20, 39, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)) -- 2^30 * sqrt(3)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e))
|
|
end
|
|
for j = 40, 59, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)) -- 2^30 * sqrt(5)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e))
|
|
end
|
|
for j = 60, 79, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)) -- 2^30 * sqrt(10)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e))
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5])
|
|
end
|
|
end
|
|
|
|
end
|
|
|
|
|
|
if branch == "FFI" and not is_LuaJIT_21 or branch == "LJ" then
|
|
|
|
if branch == "FFI" then
|
|
local arr32_t = ffi.typeof"int32_t[?]"
|
|
|
|
function create_array_of_lanes()
|
|
return arr32_t(31) -- 25 + 5 + 1 (due to 1-based indexing)
|
|
end
|
|
|
|
end
|
|
|
|
|
|
-- SHA-3 implementation for "LuaJIT 2.0 + FFI" and "LuaJIT without FFI" branches
|
|
|
|
function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
|
|
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
|
|
local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
|
|
local qwords_qty = SHR(block_size_in_bytes, 3)
|
|
for pos = offs, offs + size - 1, block_size_in_bytes do
|
|
for j = 1, qwords_qty do
|
|
local a, b, c, d = byte(str, pos + 1, pos + 4)
|
|
lanes_lo[j] = XOR(lanes_lo[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
|
|
pos = pos + 8
|
|
a, b, c, d = byte(str, pos - 3, pos)
|
|
lanes_hi[j] = XOR(lanes_hi[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
|
|
end
|
|
for round_idx = 1, 24 do
|
|
for j = 1, 5 do
|
|
lanes_lo[25 + j] = XOR(lanes_lo[j], lanes_lo[j + 5], lanes_lo[j + 10], lanes_lo[j + 15], lanes_lo[j + 20])
|
|
end
|
|
for j = 1, 5 do
|
|
lanes_hi[25 + j] = XOR(lanes_hi[j], lanes_hi[j + 5], lanes_hi[j + 10], lanes_hi[j + 15], lanes_hi[j + 20])
|
|
end
|
|
local D_lo = XOR(lanes_lo[26], SHL(lanes_lo[28], 1), SHR(lanes_hi[28], 31))
|
|
local D_hi = XOR(lanes_hi[26], SHL(lanes_hi[28], 1), SHR(lanes_lo[28], 31))
|
|
lanes_lo[2], lanes_hi[2], lanes_lo[7], lanes_hi[7], lanes_lo[12], lanes_hi[12], lanes_lo[17], lanes_hi[17] = XOR(SHR(XOR(D_lo, lanes_lo[7]), 20), SHL(XOR(D_hi, lanes_hi[7]), 12)), XOR(SHR(XOR(D_hi, lanes_hi[7]), 20), SHL(XOR(D_lo, lanes_lo[7]), 12)), XOR(SHR(XOR(D_lo, lanes_lo[17]), 19), SHL(XOR(D_hi, lanes_hi[17]), 13)), XOR(SHR(XOR(D_hi, lanes_hi[17]), 19), SHL(XOR(D_lo, lanes_lo[17]), 13)), XOR(SHL(XOR(D_lo, lanes_lo[2]), 1), SHR(XOR(D_hi, lanes_hi[2]), 31)), XOR(SHL(XOR(D_hi, lanes_hi[2]), 1), SHR(XOR(D_lo, lanes_lo[2]), 31)), XOR(SHL(XOR(D_lo, lanes_lo[12]), 10), SHR(XOR(D_hi, lanes_hi[12]), 22)), XOR(SHL(XOR(D_hi, lanes_hi[12]), 10), SHR(XOR(D_lo, lanes_lo[12]), 22))
|
|
local L, H = XOR(D_lo, lanes_lo[22]), XOR(D_hi, lanes_hi[22])
|
|
lanes_lo[22], lanes_hi[22] = XOR(SHL(L, 2), SHR(H, 30)), XOR(SHL(H, 2), SHR(L, 30))
|
|
D_lo = XOR(lanes_lo[27], SHL(lanes_lo[29], 1), SHR(lanes_hi[29], 31))
|
|
D_hi = XOR(lanes_hi[27], SHL(lanes_hi[29], 1), SHR(lanes_lo[29], 31))
|
|
lanes_lo[3], lanes_hi[3], lanes_lo[8], lanes_hi[8], lanes_lo[13], lanes_hi[13], lanes_lo[23], lanes_hi[23] = XOR(SHR(XOR(D_lo, lanes_lo[13]), 21), SHL(XOR(D_hi, lanes_hi[13]), 11)), XOR(SHR(XOR(D_hi, lanes_hi[13]), 21), SHL(XOR(D_lo, lanes_lo[13]), 11)), XOR(SHR(XOR(D_lo, lanes_lo[23]), 3), SHL(XOR(D_hi, lanes_hi[23]), 29)), XOR(SHR(XOR(D_hi, lanes_hi[23]), 3), SHL(XOR(D_lo, lanes_lo[23]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[8]), 6), SHR(XOR(D_hi, lanes_hi[8]), 26)), XOR(SHL(XOR(D_hi, lanes_hi[8]), 6), SHR(XOR(D_lo, lanes_lo[8]), 26)), XOR(SHR(XOR(D_lo, lanes_lo[3]), 2), SHL(XOR(D_hi, lanes_hi[3]), 30)), XOR(SHR(XOR(D_hi, lanes_hi[3]), 2), SHL(XOR(D_lo, lanes_lo[3]), 30))
|
|
L, H = XOR(D_lo, lanes_lo[18]), XOR(D_hi, lanes_hi[18])
|
|
lanes_lo[18], lanes_hi[18] = XOR(SHL(L, 15), SHR(H, 17)), XOR(SHL(H, 15), SHR(L, 17))
|
|
D_lo = XOR(lanes_lo[28], SHL(lanes_lo[30], 1), SHR(lanes_hi[30], 31))
|
|
D_hi = XOR(lanes_hi[28], SHL(lanes_hi[30], 1), SHR(lanes_lo[30], 31))
|
|
lanes_lo[4], lanes_hi[4], lanes_lo[9], lanes_hi[9], lanes_lo[19], lanes_hi[19], lanes_lo[24], lanes_hi[24] = XOR(SHL(XOR(D_lo, lanes_lo[19]), 21), SHR(XOR(D_hi, lanes_hi[19]), 11)), XOR(SHL(XOR(D_hi, lanes_hi[19]), 21), SHR(XOR(D_lo, lanes_lo[19]), 11)), XOR(SHL(XOR(D_lo, lanes_lo[4]), 28), SHR(XOR(D_hi, lanes_hi[4]), 4)), XOR(SHL(XOR(D_hi, lanes_hi[4]), 28), SHR(XOR(D_lo, lanes_lo[4]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[24]), 8), SHL(XOR(D_hi, lanes_hi[24]), 24)), XOR(SHR(XOR(D_hi, lanes_hi[24]), 8), SHL(XOR(D_lo, lanes_lo[24]), 24)), XOR(SHR(XOR(D_lo, lanes_lo[9]), 9), SHL(XOR(D_hi, lanes_hi[9]), 23)), XOR(SHR(XOR(D_hi, lanes_hi[9]), 9), SHL(XOR(D_lo, lanes_lo[9]), 23))
|
|
L, H = XOR(D_lo, lanes_lo[14]), XOR(D_hi, lanes_hi[14])
|
|
lanes_lo[14], lanes_hi[14] = XOR(SHL(L, 25), SHR(H, 7)), XOR(SHL(H, 25), SHR(L, 7))
|
|
D_lo = XOR(lanes_lo[29], SHL(lanes_lo[26], 1), SHR(lanes_hi[26], 31))
|
|
D_hi = XOR(lanes_hi[29], SHL(lanes_hi[26], 1), SHR(lanes_lo[26], 31))
|
|
lanes_lo[5], lanes_hi[5], lanes_lo[15], lanes_hi[15], lanes_lo[20], lanes_hi[20], lanes_lo[25], lanes_hi[25] = XOR(SHL(XOR(D_lo, lanes_lo[25]), 14), SHR(XOR(D_hi, lanes_hi[25]), 18)), XOR(SHL(XOR(D_hi, lanes_hi[25]), 14), SHR(XOR(D_lo, lanes_lo[25]), 18)), XOR(SHL(XOR(D_lo, lanes_lo[20]), 8), SHR(XOR(D_hi, lanes_hi[20]), 24)), XOR(SHL(XOR(D_hi, lanes_hi[20]), 8), SHR(XOR(D_lo, lanes_lo[20]), 24)), XOR(SHL(XOR(D_lo, lanes_lo[5]), 27), SHR(XOR(D_hi, lanes_hi[5]), 5)), XOR(SHL(XOR(D_hi, lanes_hi[5]), 27), SHR(XOR(D_lo, lanes_lo[5]), 5)), XOR(SHR(XOR(D_lo, lanes_lo[15]), 25), SHL(XOR(D_hi, lanes_hi[15]), 7)), XOR(SHR(XOR(D_hi, lanes_hi[15]), 25), SHL(XOR(D_lo, lanes_lo[15]), 7))
|
|
L, H = XOR(D_lo, lanes_lo[10]), XOR(D_hi, lanes_hi[10])
|
|
lanes_lo[10], lanes_hi[10] = XOR(SHL(L, 20), SHR(H, 12)), XOR(SHL(H, 20), SHR(L, 12))
|
|
D_lo = XOR(lanes_lo[30], SHL(lanes_lo[27], 1), SHR(lanes_hi[27], 31))
|
|
D_hi = XOR(lanes_hi[30], SHL(lanes_hi[27], 1), SHR(lanes_lo[27], 31))
|
|
lanes_lo[6], lanes_hi[6], lanes_lo[11], lanes_hi[11], lanes_lo[16], lanes_hi[16], lanes_lo[21], lanes_hi[21] = XOR(SHL(XOR(D_lo, lanes_lo[11]), 3), SHR(XOR(D_hi, lanes_hi[11]), 29)), XOR(SHL(XOR(D_hi, lanes_hi[11]), 3), SHR(XOR(D_lo, lanes_lo[11]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[21]), 18), SHR(XOR(D_hi, lanes_hi[21]), 14)), XOR(SHL(XOR(D_hi, lanes_hi[21]), 18), SHR(XOR(D_lo, lanes_lo[21]), 14)), XOR(SHR(XOR(D_lo, lanes_lo[6]), 28), SHL(XOR(D_hi, lanes_hi[6]), 4)), XOR(SHR(XOR(D_hi, lanes_hi[6]), 28), SHL(XOR(D_lo, lanes_lo[6]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[16]), 23), SHL(XOR(D_hi, lanes_hi[16]), 9)), XOR(SHR(XOR(D_hi, lanes_hi[16]), 23), SHL(XOR(D_lo, lanes_lo[16]), 9))
|
|
lanes_lo[1], lanes_hi[1] = XOR(D_lo, lanes_lo[1]), XOR(D_hi, lanes_hi[1])
|
|
lanes_lo[1], lanes_lo[2], lanes_lo[3], lanes_lo[4], lanes_lo[5] = XOR(lanes_lo[1], AND(NOT(lanes_lo[2]), lanes_lo[3]), RC_lo[round_idx]), XOR(lanes_lo[2], AND(NOT(lanes_lo[3]), lanes_lo[4])), XOR(lanes_lo[3], AND(NOT(lanes_lo[4]), lanes_lo[5])), XOR(lanes_lo[4], AND(NOT(lanes_lo[5]), lanes_lo[1])), XOR(lanes_lo[5], AND(NOT(lanes_lo[1]), lanes_lo[2]))
|
|
lanes_lo[6], lanes_lo[7], lanes_lo[8], lanes_lo[9], lanes_lo[10] = XOR(lanes_lo[9], AND(NOT(lanes_lo[10]), lanes_lo[6])), XOR(lanes_lo[10], AND(NOT(lanes_lo[6]), lanes_lo[7])), XOR(lanes_lo[6], AND(NOT(lanes_lo[7]), lanes_lo[8])), XOR(lanes_lo[7], AND(NOT(lanes_lo[8]), lanes_lo[9])), XOR(lanes_lo[8], AND(NOT(lanes_lo[9]), lanes_lo[10]))
|
|
lanes_lo[11], lanes_lo[12], lanes_lo[13], lanes_lo[14], lanes_lo[15] = XOR(lanes_lo[12], AND(NOT(lanes_lo[13]), lanes_lo[14])), XOR(lanes_lo[13], AND(NOT(lanes_lo[14]), lanes_lo[15])), XOR(lanes_lo[14], AND(NOT(lanes_lo[15]), lanes_lo[11])), XOR(lanes_lo[15], AND(NOT(lanes_lo[11]), lanes_lo[12])), XOR(lanes_lo[11], AND(NOT(lanes_lo[12]), lanes_lo[13]))
|
|
lanes_lo[16], lanes_lo[17], lanes_lo[18], lanes_lo[19], lanes_lo[20] = XOR(lanes_lo[20], AND(NOT(lanes_lo[16]), lanes_lo[17])), XOR(lanes_lo[16], AND(NOT(lanes_lo[17]), lanes_lo[18])), XOR(lanes_lo[17], AND(NOT(lanes_lo[18]), lanes_lo[19])), XOR(lanes_lo[18], AND(NOT(lanes_lo[19]), lanes_lo[20])), XOR(lanes_lo[19], AND(NOT(lanes_lo[20]), lanes_lo[16]))
|
|
lanes_lo[21], lanes_lo[22], lanes_lo[23], lanes_lo[24], lanes_lo[25] = XOR(lanes_lo[23], AND(NOT(lanes_lo[24]), lanes_lo[25])), XOR(lanes_lo[24], AND(NOT(lanes_lo[25]), lanes_lo[21])), XOR(lanes_lo[25], AND(NOT(lanes_lo[21]), lanes_lo[22])), XOR(lanes_lo[21], AND(NOT(lanes_lo[22]), lanes_lo[23])), XOR(lanes_lo[22], AND(NOT(lanes_lo[23]), lanes_lo[24]))
|
|
lanes_hi[1], lanes_hi[2], lanes_hi[3], lanes_hi[4], lanes_hi[5] = XOR(lanes_hi[1], AND(NOT(lanes_hi[2]), lanes_hi[3]), RC_hi[round_idx]), XOR(lanes_hi[2], AND(NOT(lanes_hi[3]), lanes_hi[4])), XOR(lanes_hi[3], AND(NOT(lanes_hi[4]), lanes_hi[5])), XOR(lanes_hi[4], AND(NOT(lanes_hi[5]), lanes_hi[1])), XOR(lanes_hi[5], AND(NOT(lanes_hi[1]), lanes_hi[2]))
|
|
lanes_hi[6], lanes_hi[7], lanes_hi[8], lanes_hi[9], lanes_hi[10] = XOR(lanes_hi[9], AND(NOT(lanes_hi[10]), lanes_hi[6])), XOR(lanes_hi[10], AND(NOT(lanes_hi[6]), lanes_hi[7])), XOR(lanes_hi[6], AND(NOT(lanes_hi[7]), lanes_hi[8])), XOR(lanes_hi[7], AND(NOT(lanes_hi[8]), lanes_hi[9])), XOR(lanes_hi[8], AND(NOT(lanes_hi[9]), lanes_hi[10]))
|
|
lanes_hi[11], lanes_hi[12], lanes_hi[13], lanes_hi[14], lanes_hi[15] = XOR(lanes_hi[12], AND(NOT(lanes_hi[13]), lanes_hi[14])), XOR(lanes_hi[13], AND(NOT(lanes_hi[14]), lanes_hi[15])), XOR(lanes_hi[14], AND(NOT(lanes_hi[15]), lanes_hi[11])), XOR(lanes_hi[15], AND(NOT(lanes_hi[11]), lanes_hi[12])), XOR(lanes_hi[11], AND(NOT(lanes_hi[12]), lanes_hi[13]))
|
|
lanes_hi[16], lanes_hi[17], lanes_hi[18], lanes_hi[19], lanes_hi[20] = XOR(lanes_hi[20], AND(NOT(lanes_hi[16]), lanes_hi[17])), XOR(lanes_hi[16], AND(NOT(lanes_hi[17]), lanes_hi[18])), XOR(lanes_hi[17], AND(NOT(lanes_hi[18]), lanes_hi[19])), XOR(lanes_hi[18], AND(NOT(lanes_hi[19]), lanes_hi[20])), XOR(lanes_hi[19], AND(NOT(lanes_hi[20]), lanes_hi[16]))
|
|
lanes_hi[21], lanes_hi[22], lanes_hi[23], lanes_hi[24], lanes_hi[25] = XOR(lanes_hi[23], AND(NOT(lanes_hi[24]), lanes_hi[25])), XOR(lanes_hi[24], AND(NOT(lanes_hi[25]), lanes_hi[21])), XOR(lanes_hi[25], AND(NOT(lanes_hi[21]), lanes_hi[22])), XOR(lanes_hi[21], AND(NOT(lanes_hi[22]), lanes_hi[23])), XOR(lanes_hi[22], AND(NOT(lanes_hi[23]), lanes_hi[24]))
|
|
end
|
|
end
|
|
end
|
|
|
|
end
|
|
|
|
|
|
if branch == "LJ" then
|
|
|
|
|
|
-- SHA256 implementation for "LuaJIT without FFI" branch
|
|
|
|
function sha256_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W, sha2_K_hi
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for j = 17, 64 do
|
|
local a, b = W[j-15], W[j-2]
|
|
W[j] = NORM( NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) ) + NORM( W[j-7] + W[j-16] ) )
|
|
end
|
|
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for j = 1, 64, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
|
|
local z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j] + W[j] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+1] + W[j+1] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+2] + W[j+2] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+3] + W[j+3] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+4] + W[j+4] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+5] + W[j+5] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+6] + W[j+6] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+7] + W[j+7] + h) )
|
|
h, g, f, e = g, f, e, NORM(d + z)
|
|
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
|
|
end
|
|
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
|
|
H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
|
|
end
|
|
end
|
|
|
|
local function ADD64_4(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi)
|
|
local sum_lo = a_lo % 2^32 + b_lo % 2^32 + c_lo % 2^32 + d_lo % 2^32
|
|
local sum_hi = a_hi + b_hi + c_hi + d_hi
|
|
local result_lo = NORM( sum_lo )
|
|
local result_hi = NORM( sum_hi + floor(sum_lo / 2^32) )
|
|
return result_lo, result_hi
|
|
end
|
|
|
|
if LuaJIT_arch == "x86" then -- Special trick is required to avoid "PHI shuffling too complex" on x86 platform
|
|
|
|
|
|
-- SHA512 implementation for "LuaJIT x86 without FFI" branch
|
|
|
|
function sha512_feed_128(H_lo, H_hi, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
|
|
local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
|
|
for pos = offs, offs + size - 1, 128 do
|
|
for j = 1, 16*2 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for jj = 17*2, 80*2, 2 do
|
|
local a_lo, a_hi = W[jj-30], W[jj-31]
|
|
local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
|
|
local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
|
|
local b_lo, b_hi = W[jj-4], W[jj-5]
|
|
local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
|
|
local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
|
|
W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
|
|
end
|
|
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
local zero = 0
|
|
for j = 1, 80 do
|
|
local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
|
|
local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
|
|
local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
|
|
local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
|
|
local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
|
|
local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
|
|
zero = zero + zero -- this thick is needed to avoid "PHI shuffling too complex" due to PHIs overlap
|
|
h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = OR(zero, g_lo), OR(zero, g_hi), OR(zero, f_lo), OR(zero, f_hi), OR(zero, e_lo), OR(zero, e_hi)
|
|
local sum_lo = z_lo % 2^32 + d_lo % 2^32
|
|
e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
|
|
d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = OR(zero, c_lo), OR(zero, c_hi), OR(zero, b_lo), OR(zero, b_hi), OR(zero, a_lo), OR(zero, a_hi)
|
|
u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
|
|
u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
|
|
t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
|
|
t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
|
|
local sum_lo = z_lo % 2^32 + t_lo % 2^32 + u_lo % 2^32
|
|
a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + t_hi + u_hi + floor(sum_lo / 2^32) )
|
|
end
|
|
H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
|
|
H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
|
|
H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
|
|
H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
|
|
H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
|
|
H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
|
|
H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
|
|
H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
|
|
end
|
|
end
|
|
|
|
else -- all platforms except x86
|
|
|
|
|
|
-- SHA512 implementation for "LuaJIT non-x86 without FFI" branch
|
|
|
|
function sha512_feed_128(H_lo, H_hi, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
|
|
local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
|
|
for pos = offs, offs + size - 1, 128 do
|
|
for j = 1, 16*2 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for jj = 17*2, 80*2, 2 do
|
|
local a_lo, a_hi = W[jj-30], W[jj-31]
|
|
local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
|
|
local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
|
|
local b_lo, b_hi = W[jj-4], W[jj-5]
|
|
local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
|
|
local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
|
|
W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
|
|
end
|
|
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for j = 1, 80 do
|
|
local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
|
|
local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
|
|
local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
|
|
local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
|
|
local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
|
|
local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
|
|
h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = g_lo, g_hi, f_lo, f_hi, e_lo, e_hi
|
|
local sum_lo = z_lo % 2^32 + d_lo % 2^32
|
|
e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
|
|
d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = c_lo, c_hi, b_lo, b_hi, a_lo, a_hi
|
|
u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
|
|
u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
|
|
t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
|
|
t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
|
|
local sum_lo = z_lo % 2^32 + u_lo % 2^32 + t_lo % 2^32
|
|
a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + u_hi + t_hi + floor(sum_lo / 2^32) )
|
|
end
|
|
H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
|
|
H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
|
|
H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
|
|
H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
|
|
H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
|
|
H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
|
|
H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
|
|
H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
|
|
end
|
|
end
|
|
|
|
end
|
|
|
|
|
|
-- MD5 implementation for "LuaJIT without FFI" branch
|
|
|
|
function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W, md5_K
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
local a, b, c, d = H[1], H[2], H[3], H[4]
|
|
for j = 1, 16, 4 do
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j ] + W[j ] + a), 7) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j+1] + a), 12) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+2] + a), 17) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+3] + a), 22) + b)
|
|
end
|
|
for j = 17, 32, 4 do
|
|
local g = 5*j-4
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j ] + W[AND(g , 15) + 1] + a), 5) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 5, 15) + 1] + a), 9) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 10, 15) + 1] + a), 14) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 1, 15) + 1] + a), 20) + b)
|
|
end
|
|
for j = 33, 48, 4 do
|
|
local g = 3*j+2
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j ] + W[AND(g , 15) + 1] + a), 4) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 3, 15) + 1] + a), 11) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 6, 15) + 1] + a), 16) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 7, 15) + 1] + a), 23) + b)
|
|
end
|
|
for j = 49, 64, 4 do
|
|
local g = j*7
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j ] + W[AND(g - 7, 15) + 1] + a), 6) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g , 15) + 1] + a), 10) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15) + 1] + a), 15) + b)
|
|
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15) + 1] + a), 21) + b)
|
|
end
|
|
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
|
|
end
|
|
end
|
|
|
|
|
|
-- SHA-1 implementation for "LuaJIT without FFI" branch
|
|
|
|
function sha1_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
|
|
end
|
|
for j = 17, 80 do
|
|
W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
|
|
end
|
|
local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
|
|
for j = 1, 20, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)) -- constant = floor(2^30 * sqrt(2))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e))
|
|
end
|
|
for j = 21, 40, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)) -- 2^30 * sqrt(3)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e))
|
|
end
|
|
for j = 41, 60, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)) -- 2^30 * sqrt(5)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e))
|
|
end
|
|
for j = 61, 80, 5 do
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)) -- 2^30 * sqrt(10)
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e))
|
|
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e))
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5])
|
|
end
|
|
end
|
|
|
|
|
|
-- BLAKE2b implementation for "LuaJIT without FFI" branch
|
|
|
|
do
|
|
local v_lo, v_hi = {}, {}
|
|
|
|
local function G(a, b, c, d, k1, k2)
|
|
local W = common_W
|
|
local va_lo, vb_lo, vc_lo, vd_lo = v_lo[a], v_lo[b], v_lo[c], v_lo[d]
|
|
local va_hi, vb_hi, vc_hi, vd_hi = v_hi[a], v_hi[b], v_hi[c], v_hi[d]
|
|
local z = W[2*k1-1] + (va_lo % 2^32 + vb_lo % 2^32)
|
|
va_lo = NORM(z)
|
|
va_hi = NORM(W[2*k1] + (va_hi + vb_hi + floor(z / 2^32)))
|
|
vd_lo, vd_hi = XOR(vd_hi, va_hi), XOR(vd_lo, va_lo)
|
|
z = vc_lo % 2^32 + vd_lo % 2^32
|
|
vc_lo = NORM(z)
|
|
vc_hi = NORM(vc_hi + vd_hi + floor(z / 2^32))
|
|
vb_lo, vb_hi = XOR(vb_lo, vc_lo), XOR(vb_hi, vc_hi)
|
|
vb_lo, vb_hi = XOR(SHR(vb_lo, 24), SHL(vb_hi, 8)), XOR(SHR(vb_hi, 24), SHL(vb_lo, 8))
|
|
z = W[2*k2-1] + (va_lo % 2^32 + vb_lo % 2^32)
|
|
va_lo = NORM(z)
|
|
va_hi = NORM(W[2*k2] + (va_hi + vb_hi + floor(z / 2^32)))
|
|
vd_lo, vd_hi = XOR(vd_lo, va_lo), XOR(vd_hi, va_hi)
|
|
vd_lo, vd_hi = XOR(SHR(vd_lo, 16), SHL(vd_hi, 16)), XOR(SHR(vd_hi, 16), SHL(vd_lo, 16))
|
|
z = vc_lo % 2^32 + vd_lo % 2^32
|
|
vc_lo = NORM(z)
|
|
vc_hi = NORM(vc_hi + vd_hi + floor(z / 2^32))
|
|
vb_lo, vb_hi = XOR(vb_lo, vc_lo), XOR(vb_hi, vc_hi)
|
|
vb_lo, vb_hi = XOR(SHL(vb_lo, 1), SHR(vb_hi, 31)), XOR(SHL(vb_hi, 1), SHR(vb_lo, 31))
|
|
v_lo[a], v_lo[b], v_lo[c], v_lo[d] = va_lo, vb_lo, vc_lo, vd_lo
|
|
v_hi[a], v_hi[b], v_hi[c], v_hi[d] = va_hi, vb_hi, vc_hi, vd_hi
|
|
end
|
|
|
|
function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W = common_W
|
|
local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for pos = offs, offs + size - 1, 128 do
|
|
if str then
|
|
for j = 1, 32 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = d * 2^24 + OR(SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
end
|
|
v_lo[0x0], v_lo[0x1], v_lo[0x2], v_lo[0x3], v_lo[0x4], v_lo[0x5], v_lo[0x6], v_lo[0x7] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
v_lo[0x8], v_lo[0x9], v_lo[0xA], v_lo[0xB], v_lo[0xC], v_lo[0xD], v_lo[0xE], v_lo[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
v_hi[0x0], v_hi[0x1], v_hi[0x2], v_hi[0x3], v_hi[0x4], v_hi[0x5], v_hi[0x6], v_hi[0x7] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
v_hi[0x8], v_hi[0x9], v_hi[0xA], v_hi[0xB], v_hi[0xC], v_hi[0xD], v_hi[0xE], v_hi[0xF] = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
local t0_lo = bytes_compressed % 2^32
|
|
local t0_hi = floor(bytes_compressed / 2^32)
|
|
v_lo[0xC] = XOR(v_lo[0xC], t0_lo) -- t0 = low_8_bytes(bytes_compressed)
|
|
v_hi[0xC] = XOR(v_hi[0xC], t0_hi)
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
v_lo[0xE] = NOT(v_lo[0xE])
|
|
v_hi[0xE] = NOT(v_hi[0xE])
|
|
end
|
|
if is_last_node then -- flag f1
|
|
v_lo[0xF] = NOT(v_lo[0xF])
|
|
v_hi[0xF] = NOT(v_hi[0xF])
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
G(0, 4, 8, 12, row[ 1], row[ 2])
|
|
G(1, 5, 9, 13, row[ 3], row[ 4])
|
|
G(2, 6, 10, 14, row[ 5], row[ 6])
|
|
G(3, 7, 11, 15, row[ 7], row[ 8])
|
|
G(0, 5, 10, 15, row[ 9], row[10])
|
|
G(1, 6, 11, 12, row[11], row[12])
|
|
G(2, 7, 8, 13, row[13], row[14])
|
|
G(3, 4, 9, 14, row[15], row[16])
|
|
end
|
|
h1_lo = XOR(h1_lo, v_lo[0x0], v_lo[0x8])
|
|
h2_lo = XOR(h2_lo, v_lo[0x1], v_lo[0x9])
|
|
h3_lo = XOR(h3_lo, v_lo[0x2], v_lo[0xA])
|
|
h4_lo = XOR(h4_lo, v_lo[0x3], v_lo[0xB])
|
|
h5_lo = XOR(h5_lo, v_lo[0x4], v_lo[0xC])
|
|
h6_lo = XOR(h6_lo, v_lo[0x5], v_lo[0xD])
|
|
h7_lo = XOR(h7_lo, v_lo[0x6], v_lo[0xE])
|
|
h8_lo = XOR(h8_lo, v_lo[0x7], v_lo[0xF])
|
|
h1_hi = XOR(h1_hi, v_hi[0x0], v_hi[0x8])
|
|
h2_hi = XOR(h2_hi, v_hi[0x1], v_hi[0x9])
|
|
h3_hi = XOR(h3_hi, v_hi[0x2], v_hi[0xA])
|
|
h4_hi = XOR(h4_hi, v_hi[0x3], v_hi[0xB])
|
|
h5_hi = XOR(h5_hi, v_hi[0x4], v_hi[0xC])
|
|
h6_hi = XOR(h6_hi, v_hi[0x5], v_hi[0xD])
|
|
h7_hi = XOR(h7_hi, v_hi[0x6], v_hi[0xE])
|
|
h8_hi = XOR(h8_hi, v_hi[0x7], v_hi[0xF])
|
|
end
|
|
H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo % 2^32, h2_lo % 2^32, h3_lo % 2^32, h4_lo % 2^32, h5_lo % 2^32, h6_lo % 2^32, h7_lo % 2^32, h8_lo % 2^32
|
|
H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi % 2^32, h2_hi % 2^32, h3_hi % 2^32, h4_hi % 2^32, h5_hi % 2^32, h6_hi % 2^32, h7_hi % 2^32, h8_hi % 2^32
|
|
return bytes_compressed
|
|
end
|
|
|
|
end
|
|
end
|
|
|
|
|
|
if branch == "FFI" or branch == "LJ" then
|
|
|
|
|
|
-- BLAKE2s and BLAKE3 implementations for "LuaJIT with FFI" and "LuaJIT without FFI" branches
|
|
|
|
do
|
|
local W = common_W_blake2s
|
|
local v = v_for_blake2s_feed_64
|
|
|
|
local function G(a, b, c, d, k1, k2)
|
|
local va, vb, vc, vd = v[a], v[b], v[c], v[d]
|
|
va = NORM(W[k1] + (va + vb))
|
|
vd = ROR(XOR(vd, va), 16)
|
|
vc = NORM(vc + vd)
|
|
vb = ROR(XOR(vb, vc), 12)
|
|
va = NORM(W[k2] + (va + vb))
|
|
vd = ROR(XOR(vd, va), 8)
|
|
vc = NORM(vc + vd)
|
|
vb = ROR(XOR(vb, vc), 7)
|
|
v[a], v[b], v[c], v[d] = va, vb, vc, vd
|
|
end
|
|
|
|
function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = NORM(H[1]), NORM(H[2]), NORM(H[3]), NORM(H[4]), NORM(H[5]), NORM(H[6]), NORM(H[7]), NORM(H[8])
|
|
for pos = offs, offs + size - 1, 64 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
end
|
|
v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
v[0x8], v[0x9], v[0xA], v[0xB], v[0xE], v[0xF] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2]), NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4]), NORM(sha2_H_hi[7]), NORM(sha2_H_hi[8])
|
|
bytes_compressed = bytes_compressed + (last_block_size or 64)
|
|
local t0 = bytes_compressed % 2^32
|
|
local t1 = floor(bytes_compressed / 2^32)
|
|
v[0xC] = XOR(sha2_H_hi[5], t0) -- t0 = low_4_bytes(bytes_compressed)
|
|
v[0xD] = XOR(sha2_H_hi[6], t1) -- t1 = high_4_bytes(bytes_compressed
|
|
if last_block_size then -- flag f0
|
|
v[0xE] = NOT(v[0xE])
|
|
end
|
|
if is_last_node then -- flag f1
|
|
v[0xF] = NOT(v[0xF])
|
|
end
|
|
for j = 1, 10 do
|
|
local row = sigma[j]
|
|
G(0, 4, 8, 12, row[ 1], row[ 2])
|
|
G(1, 5, 9, 13, row[ 3], row[ 4])
|
|
G(2, 6, 10, 14, row[ 5], row[ 6])
|
|
G(3, 7, 11, 15, row[ 7], row[ 8])
|
|
G(0, 5, 10, 15, row[ 9], row[10])
|
|
G(1, 6, 11, 12, row[11], row[12])
|
|
G(2, 7, 8, 13, row[13], row[14])
|
|
G(3, 4, 9, 14, row[15], row[16])
|
|
end
|
|
h1 = XOR(h1, v[0x0], v[0x8])
|
|
h2 = XOR(h2, v[0x1], v[0x9])
|
|
h3 = XOR(h3, v[0x2], v[0xA])
|
|
h4 = XOR(h4, v[0x3], v[0xB])
|
|
h5 = XOR(h5, v[0x4], v[0xC])
|
|
h6 = XOR(h6, v[0x5], v[0xD])
|
|
h7 = XOR(h7, v[0x6], v[0xE])
|
|
h8 = XOR(h8, v[0x7], v[0xF])
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
block_length = block_length or 64
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = NORM(H_in[1]), NORM(H_in[2]), NORM(H_in[3]), NORM(H_in[4]), NORM(H_in[5]), NORM(H_in[6]), NORM(H_in[7]), NORM(H_in[8])
|
|
H_out = H_out or H_in
|
|
for pos = offs, offs + size - 1, 64 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
end
|
|
v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
v[0x8], v[0x9], v[0xA], v[0xB] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2]), NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4])
|
|
v[0xC] = NORM(chunk_index % 2^32) -- t0 = low_4_bytes(chunk_index)
|
|
v[0xD] = floor(chunk_index / 2^32) -- t1 = high_4_bytes(chunk_index)
|
|
v[0xE], v[0xF] = block_length, flags
|
|
for j = 1, 7 do
|
|
G(0, 4, 8, 12, perm_blake3[j], perm_blake3[j + 14])
|
|
G(1, 5, 9, 13, perm_blake3[j + 1], perm_blake3[j + 2])
|
|
G(2, 6, 10, 14, perm_blake3[j + 16], perm_blake3[j + 7])
|
|
G(3, 7, 11, 15, perm_blake3[j + 15], perm_blake3[j + 17])
|
|
G(0, 5, 10, 15, perm_blake3[j + 21], perm_blake3[j + 5])
|
|
G(1, 6, 11, 12, perm_blake3[j + 3], perm_blake3[j + 6])
|
|
G(2, 7, 8, 13, perm_blake3[j + 4], perm_blake3[j + 18])
|
|
G(3, 4, 9, 14, perm_blake3[j + 19], perm_blake3[j + 20])
|
|
end
|
|
if wide_output then
|
|
H_out[ 9] = XOR(h1, v[0x8])
|
|
H_out[10] = XOR(h2, v[0x9])
|
|
H_out[11] = XOR(h3, v[0xA])
|
|
H_out[12] = XOR(h4, v[0xB])
|
|
H_out[13] = XOR(h5, v[0xC])
|
|
H_out[14] = XOR(h6, v[0xD])
|
|
H_out[15] = XOR(h7, v[0xE])
|
|
H_out[16] = XOR(h8, v[0xF])
|
|
end
|
|
h1 = XOR(v[0x0], v[0x8])
|
|
h2 = XOR(v[0x1], v[0x9])
|
|
h3 = XOR(v[0x2], v[0xA])
|
|
h4 = XOR(v[0x3], v[0xB])
|
|
h5 = XOR(v[0x4], v[0xC])
|
|
h6 = XOR(v[0x5], v[0xD])
|
|
h7 = XOR(v[0x6], v[0xE])
|
|
h8 = XOR(v[0x7], v[0xF])
|
|
end
|
|
H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
if branch == "INT64" then
|
|
|
|
|
|
-- implementation for Lua 5.3/5.4
|
|
|
|
hi_factor = 4294967296
|
|
hi_factor_keccak = 4294967296
|
|
lanes_index_base = 1
|
|
|
|
HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT64"
|
|
local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...
|
|
local string_format, string_unpack = string.format, string.unpack
|
|
|
|
local function HEX64(x)
|
|
return string_format("%016x", x)
|
|
end
|
|
|
|
local function XORA5(x, y)
|
|
return x ~ (y or 0xa5a5a5a5a5a5a5a5)
|
|
end
|
|
|
|
local function XOR_BYTE(x, y)
|
|
return x ~ y
|
|
end
|
|
|
|
local function sha256_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W, sha2_K_hi
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
|
|
for j = 17, 64 do
|
|
local a = W[j-15]
|
|
a = a<<32 | a
|
|
local b = W[j-2]
|
|
b = b<<32 | b
|
|
W[j] = (a>>7 ~ a>>18 ~ a>>35) + (b>>17 ~ b>>19 ~ b>>42) + W[j-7] + W[j-16] & (1<<32)-1
|
|
end
|
|
local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
|
|
for j = 1, 64 do
|
|
e = e<<32 | e & (1<<32)-1
|
|
local z = (e>>6 ~ e>>11 ~ e>>25) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
|
|
h = g
|
|
g = f
|
|
f = e
|
|
e = z + d
|
|
d = c
|
|
c = b
|
|
b = a
|
|
a = a<<32 | a & (1<<32)-1
|
|
a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a>>13 ~ a>>22)
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
h5 = e + h5
|
|
h6 = f + h6
|
|
h7 = g + h7
|
|
h8 = h + h8
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
local function sha512_feed_128(H, _, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W, K = common_W, sha2_K_lo
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 128 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack(">i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
|
|
for j = 17, 80 do
|
|
local a = W[j-15]
|
|
local b = W[j-2]
|
|
W[j] = (a >> 1 ~ a >> 7 ~ a >> 8 ~ a << 56 ~ a << 63) + (b >> 6 ~ b >> 19 ~ b >> 61 ~ b << 3 ~ b << 45) + W[j-7] + W[j-16]
|
|
end
|
|
local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
|
|
for j = 1, 80 do
|
|
local z = (e >> 14 ~ e >> 18 ~ e >> 41 ~ e << 23 ~ e << 46 ~ e << 50) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
|
|
h = g
|
|
g = f
|
|
f = e
|
|
e = z + d
|
|
d = c
|
|
c = b
|
|
b = a
|
|
a = z + ((a ~ c) & d ~ a & c) + (a >> 28 ~ a >> 34 ~ a >> 39 ~ a << 25 ~ a << 30 ~ a << 36)
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
h5 = e + h5
|
|
h6 = f + h6
|
|
h7 = g + h7
|
|
h8 = h + h8
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
local function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
|
|
local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
|
|
local a, b, c, d = h1, h2, h3, h4
|
|
local s = 32-7
|
|
for j = 1, 16 do
|
|
local F = (d ~ b & (c ~ d)) + a + K[j] + W[j]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = ((F<<32 | F & (1<<32)-1) >> s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-5
|
|
for j = 17, 32 do
|
|
local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = ((F<<32 | F & (1<<32)-1) >> s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-4
|
|
for j = 33, 48 do
|
|
local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = ((F<<32 | F & (1<<32)-1) >> s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-6
|
|
for j = 49, 64 do
|
|
local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = ((F<<32 | F & (1<<32)-1) >> s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
end
|
|
H[1], H[2], H[3], H[4] = h1, h2, h3, h4
|
|
end
|
|
|
|
local function sha1_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
|
|
for j = 17, 80 do
|
|
local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
|
|
W[j] = (a<<32 | a) << 1 >> 32
|
|
end
|
|
local a, b, c, d, e = h1, h2, h3, h4, h5
|
|
for j = 1, 20 do
|
|
local z = ((a<<32 | a & (1<<32)-1) >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2))
|
|
e = d
|
|
d = c
|
|
c = (b<<32 | b & (1<<32)-1) >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 21, 40 do
|
|
local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3)
|
|
e = d
|
|
d = c
|
|
c = (b<<32 | b & (1<<32)-1) >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 41, 60 do
|
|
local z = ((a<<32 | a & (1<<32)-1) >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5)
|
|
e = d
|
|
d = c
|
|
c = (b<<32 | b & (1<<32)-1) >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 61, 80 do
|
|
local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10)
|
|
e = d
|
|
d = c
|
|
c = (b<<32 | b & (1<<32)-1) >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
h5 = e + h5
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
|
|
end
|
|
|
|
local keccak_format_i8 = build_keccak_format("i8")
|
|
|
|
local function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
|
|
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
|
|
local RC = sha3_RC_lo
|
|
local qwords_qty = block_size_in_bytes / 8
|
|
local keccak_format = keccak_format_i8[qwords_qty]
|
|
for pos = offs + 1, offs + size, block_size_in_bytes do
|
|
local qwords_from_message = {string_unpack(keccak_format, str, pos)}
|
|
for j = 1, qwords_qty do
|
|
lanes[j] = lanes[j] ~ qwords_from_message[j]
|
|
end
|
|
local L01, L02, L03, L04, L05, L06, L07, L08, L09, L10, L11, L12, L13, L14, L15, L16, L17, L18, L19, L20, L21, L22, L23, L24, L25 =
|
|
lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7], lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13],
|
|
lanes[14], lanes[15], lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23], lanes[24], lanes[25]
|
|
for round_idx = 1, 24 do
|
|
local C1 = L01 ~ L06 ~ L11 ~ L16 ~ L21
|
|
local C2 = L02 ~ L07 ~ L12 ~ L17 ~ L22
|
|
local C3 = L03 ~ L08 ~ L13 ~ L18 ~ L23
|
|
local C4 = L04 ~ L09 ~ L14 ~ L19 ~ L24
|
|
local C5 = L05 ~ L10 ~ L15 ~ L20 ~ L25
|
|
local D = C1 ~ C3<<1 ~ C3>>63
|
|
local T0 = D ~ L02
|
|
local T1 = D ~ L07
|
|
local T2 = D ~ L12
|
|
local T3 = D ~ L17
|
|
local T4 = D ~ L22
|
|
L02 = T1<<44 ~ T1>>20
|
|
L07 = T3<<45 ~ T3>>19
|
|
L12 = T0<<1 ~ T0>>63
|
|
L17 = T2<<10 ~ T2>>54
|
|
L22 = T4<<2 ~ T4>>62
|
|
D = C2 ~ C4<<1 ~ C4>>63
|
|
T0 = D ~ L03
|
|
T1 = D ~ L08
|
|
T2 = D ~ L13
|
|
T3 = D ~ L18
|
|
T4 = D ~ L23
|
|
L03 = T2<<43 ~ T2>>21
|
|
L08 = T4<<61 ~ T4>>3
|
|
L13 = T1<<6 ~ T1>>58
|
|
L18 = T3<<15 ~ T3>>49
|
|
L23 = T0<<62 ~ T0>>2
|
|
D = C3 ~ C5<<1 ~ C5>>63
|
|
T0 = D ~ L04
|
|
T1 = D ~ L09
|
|
T2 = D ~ L14
|
|
T3 = D ~ L19
|
|
T4 = D ~ L24
|
|
L04 = T3<<21 ~ T3>>43
|
|
L09 = T0<<28 ~ T0>>36
|
|
L14 = T2<<25 ~ T2>>39
|
|
L19 = T4<<56 ~ T4>>8
|
|
L24 = T1<<55 ~ T1>>9
|
|
D = C4 ~ C1<<1 ~ C1>>63
|
|
T0 = D ~ L05
|
|
T1 = D ~ L10
|
|
T2 = D ~ L15
|
|
T3 = D ~ L20
|
|
T4 = D ~ L25
|
|
L05 = T4<<14 ~ T4>>50
|
|
L10 = T1<<20 ~ T1>>44
|
|
L15 = T3<<8 ~ T3>>56
|
|
L20 = T0<<27 ~ T0>>37
|
|
L25 = T2<<39 ~ T2>>25
|
|
D = C5 ~ C2<<1 ~ C2>>63
|
|
T1 = D ~ L06
|
|
T2 = D ~ L11
|
|
T3 = D ~ L16
|
|
T4 = D ~ L21
|
|
L06 = T2<<3 ~ T2>>61
|
|
L11 = T4<<18 ~ T4>>46
|
|
L16 = T1<<36 ~ T1>>28
|
|
L21 = T3<<41 ~ T3>>23
|
|
L01 = D ~ L01
|
|
L01, L02, L03, L04, L05 = L01 ~ ~L02 & L03, L02 ~ ~L03 & L04, L03 ~ ~L04 & L05, L04 ~ ~L05 & L01, L05 ~ ~L01 & L02
|
|
L06, L07, L08, L09, L10 = L09 ~ ~L10 & L06, L10 ~ ~L06 & L07, L06 ~ ~L07 & L08, L07 ~ ~L08 & L09, L08 ~ ~L09 & L10
|
|
L11, L12, L13, L14, L15 = L12 ~ ~L13 & L14, L13 ~ ~L14 & L15, L14 ~ ~L15 & L11, L15 ~ ~L11 & L12, L11 ~ ~L12 & L13
|
|
L16, L17, L18, L19, L20 = L20 ~ ~L16 & L17, L16 ~ ~L17 & L18, L17 ~ ~L18 & L19, L18 ~ ~L19 & L20, L19 ~ ~L20 & L16
|
|
L21, L22, L23, L24, L25 = L23 ~ ~L24 & L25, L24 ~ ~L25 & L21, L25 ~ ~L21 & L22, L21 ~ ~L22 & L23, L22 ~ ~L23 & L24
|
|
L01 = L01 ~ RC[round_idx]
|
|
end
|
|
lanes[1] = L01
|
|
lanes[2] = L02
|
|
lanes[3] = L03
|
|
lanes[4] = L04
|
|
lanes[5] = L05
|
|
lanes[6] = L06
|
|
lanes[7] = L07
|
|
lanes[8] = L08
|
|
lanes[9] = L09
|
|
lanes[10] = L10
|
|
lanes[11] = L11
|
|
lanes[12] = L12
|
|
lanes[13] = L13
|
|
lanes[14] = L14
|
|
lanes[15] = L15
|
|
lanes[16] = L16
|
|
lanes[17] = L17
|
|
lanes[18] = L18
|
|
lanes[19] = L19
|
|
lanes[20] = L20
|
|
lanes[21] = L21
|
|
lanes[22] = L22
|
|
lanes[23] = L23
|
|
lanes[24] = L24
|
|
lanes[25] = L25
|
|
end
|
|
end
|
|
|
|
local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 64)
|
|
vC = vC ~ bytes_compressed -- t0 = low_4_bytes(bytes_compressed)
|
|
vD = vD ~ bytes_compressed >> 32 -- t1 = high_4_bytes(bytes_compressed)
|
|
if last_block_size then -- flag f0
|
|
vE = ~vE
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF = ~vF
|
|
end
|
|
for j = 1, 10 do
|
|
local row = sigma[j]
|
|
v0 = v0 + v4 + W[row[1]]
|
|
vC = vC ~ v0
|
|
vC = (vC & (1<<32)-1) >> 16 | vC << 16
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
|
|
v0 = v0 + v4 + W[row[2]]
|
|
vC = vC ~ v0
|
|
vC = (vC & (1<<32)-1) >> 8 | vC << 24
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
|
|
v1 = v1 + v5 + W[row[3]]
|
|
vD = vD ~ v1
|
|
vD = (vD & (1<<32)-1) >> 16 | vD << 16
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
|
|
v1 = v1 + v5 + W[row[4]]
|
|
vD = vD ~ v1
|
|
vD = (vD & (1<<32)-1) >> 8 | vD << 24
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
|
|
v2 = v2 + v6 + W[row[5]]
|
|
vE = vE ~ v2
|
|
vE = (vE & (1<<32)-1) >> 16 | vE << 16
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
|
|
v2 = v2 + v6 + W[row[6]]
|
|
vE = vE ~ v2
|
|
vE = (vE & (1<<32)-1) >> 8 | vE << 24
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
|
|
v3 = v3 + v7 + W[row[7]]
|
|
vF = vF ~ v3
|
|
vF = (vF & (1<<32)-1) >> 16 | vF << 16
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
|
|
v3 = v3 + v7 + W[row[8]]
|
|
vF = vF ~ v3
|
|
vF = (vF & (1<<32)-1) >> 8 | vF << 24
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
|
|
v0 = v0 + v5 + W[row[9]]
|
|
vF = vF ~ v0
|
|
vF = (vF & (1<<32)-1) >> 16 | vF << 16
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
|
|
v0 = v0 + v5 + W[row[10]]
|
|
vF = vF ~ v0
|
|
vF = (vF & (1<<32)-1) >> 8 | vF << 24
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
|
|
v1 = v1 + v6 + W[row[11]]
|
|
vC = vC ~ v1
|
|
vC = (vC & (1<<32)-1) >> 16 | vC << 16
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
|
|
v1 = v1 + v6 + W[row[12]]
|
|
vC = vC ~ v1
|
|
vC = (vC & (1<<32)-1) >> 8 | vC << 24
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
|
|
v2 = v2 + v7 + W[row[13]]
|
|
vD = vD ~ v2
|
|
vD = (vD & (1<<32)-1) >> 16 | vD << 16
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
|
|
v2 = v2 + v7 + W[row[14]]
|
|
vD = vD ~ v2
|
|
vD = (vD & (1<<32)-1) >> 8 | vD << 24
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
|
|
v3 = v3 + v4 + W[row[15]]
|
|
vE = vE ~ v3
|
|
vE = (vE & (1<<32)-1) >> 16 | vE << 16
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
|
|
v3 = v3 + v4 + W[row[16]]
|
|
vE = vE ~ v3
|
|
vE = (vE & (1<<32)-1) >> 8 | vE << 24
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
|
|
end
|
|
h1 = h1 ~ v0 ~ v8
|
|
h2 = h2 ~ v1 ~ v9
|
|
h3 = h3 ~ v2 ~ vA
|
|
h4 = h4 ~ v3 ~ vB
|
|
h5 = h5 ~ v4 ~ vC
|
|
h6 = h6 ~ v5 ~ vD
|
|
h7 = h7 ~ v6 ~ vE
|
|
h8 = h8 ~ v7 ~ vF
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
local function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 128 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
vC = vC ~ bytes_compressed -- t0 = low_8_bytes(bytes_compressed)
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
vE = ~vE
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF = ~vF
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
v0 = v0 + v4 + W[row[1]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 32 | vC << 32
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 24 | v4 << 40
|
|
v0 = v0 + v4 + W[row[2]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 16 | vC << 48
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 63 | v4 << 1
|
|
v1 = v1 + v5 + W[row[3]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 32 | vD << 32
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 24 | v5 << 40
|
|
v1 = v1 + v5 + W[row[4]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 16 | vD << 48
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 63 | v5 << 1
|
|
v2 = v2 + v6 + W[row[5]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 32 | vE << 32
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 24 | v6 << 40
|
|
v2 = v2 + v6 + W[row[6]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 16 | vE << 48
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 63 | v6 << 1
|
|
v3 = v3 + v7 + W[row[7]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 32 | vF << 32
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 24 | v7 << 40
|
|
v3 = v3 + v7 + W[row[8]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 16 | vF << 48
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 63 | v7 << 1
|
|
v0 = v0 + v5 + W[row[9]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 32 | vF << 32
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 24 | v5 << 40
|
|
v0 = v0 + v5 + W[row[10]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 16 | vF << 48
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 63 | v5 << 1
|
|
v1 = v1 + v6 + W[row[11]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 32 | vC << 32
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 24 | v6 << 40
|
|
v1 = v1 + v6 + W[row[12]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 16 | vC << 48
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 63 | v6 << 1
|
|
v2 = v2 + v7 + W[row[13]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 32 | vD << 32
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 24 | v7 << 40
|
|
v2 = v2 + v7 + W[row[14]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 16 | vD << 48
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 63 | v7 << 1
|
|
v3 = v3 + v4 + W[row[15]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 32 | vE << 32
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 24 | v4 << 40
|
|
v3 = v3 + v4 + W[row[16]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 16 | vE << 48
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 63 | v4 << 1
|
|
end
|
|
h1 = h1 ~ v0 ~ v8
|
|
h2 = h2 ~ v1 ~ v9
|
|
h3 = h3 ~ v2 ~ vA
|
|
h4 = h4 ~ v3 ~ vB
|
|
h5 = h5 ~ v4 ~ vC
|
|
h6 = h6 ~ v5 ~ vD
|
|
h7 = h7 ~ v6 ~ vE
|
|
h8 = h8 ~ v7 ~ vF
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
block_length = block_length or 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
|
|
H_out = H_out or H_in
|
|
for pos = offs + 1, offs + size, 64 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
|
|
local t0 = chunk_index % 2^32 -- t0 = low_4_bytes(chunk_index)
|
|
local t1 = (chunk_index - t0) / 2^32 -- t1 = high_4_bytes(chunk_index)
|
|
local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags
|
|
for j = 1, 7 do
|
|
v0 = v0 + v4 + W[perm_blake3[j]]
|
|
vC = vC ~ v0
|
|
vC = (vC & (1<<32)-1) >> 16 | vC << 16
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
|
|
v0 = v0 + v4 + W[perm_blake3[j + 14]]
|
|
vC = vC ~ v0
|
|
vC = (vC & (1<<32)-1) >> 8 | vC << 24
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
|
|
v1 = v1 + v5 + W[perm_blake3[j + 1]]
|
|
vD = vD ~ v1
|
|
vD = (vD & (1<<32)-1) >> 16 | vD << 16
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
|
|
v1 = v1 + v5 + W[perm_blake3[j + 2]]
|
|
vD = vD ~ v1
|
|
vD = (vD & (1<<32)-1) >> 8 | vD << 24
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
|
|
v2 = v2 + v6 + W[perm_blake3[j + 16]]
|
|
vE = vE ~ v2
|
|
vE = (vE & (1<<32)-1) >> 16 | vE << 16
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
|
|
v2 = v2 + v6 + W[perm_blake3[j + 7]]
|
|
vE = vE ~ v2
|
|
vE = (vE & (1<<32)-1) >> 8 | vE << 24
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
|
|
v3 = v3 + v7 + W[perm_blake3[j + 15]]
|
|
vF = vF ~ v3
|
|
vF = (vF & (1<<32)-1) >> 16 | vF << 16
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
|
|
v3 = v3 + v7 + W[perm_blake3[j + 17]]
|
|
vF = vF ~ v3
|
|
vF = (vF & (1<<32)-1) >> 8 | vF << 24
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
|
|
v0 = v0 + v5 + W[perm_blake3[j + 21]]
|
|
vF = vF ~ v0
|
|
vF = (vF & (1<<32)-1) >> 16 | vF << 16
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
|
|
v0 = v0 + v5 + W[perm_blake3[j + 5]]
|
|
vF = vF ~ v0
|
|
vF = (vF & (1<<32)-1) >> 8 | vF << 24
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
|
|
v1 = v1 + v6 + W[perm_blake3[j + 3]]
|
|
vC = vC ~ v1
|
|
vC = (vC & (1<<32)-1) >> 16 | vC << 16
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
|
|
v1 = v1 + v6 + W[perm_blake3[j + 6]]
|
|
vC = vC ~ v1
|
|
vC = (vC & (1<<32)-1) >> 8 | vC << 24
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
|
|
v2 = v2 + v7 + W[perm_blake3[j + 4]]
|
|
vD = vD ~ v2
|
|
vD = (vD & (1<<32)-1) >> 16 | vD << 16
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
|
|
v2 = v2 + v7 + W[perm_blake3[j + 18]]
|
|
vD = vD ~ v2
|
|
vD = (vD & (1<<32)-1) >> 8 | vD << 24
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
|
|
v3 = v3 + v4 + W[perm_blake3[j + 19]]
|
|
vE = vE ~ v3
|
|
vE = (vE & (1<<32)-1) >> 16 | vE << 16
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
|
|
v3 = v3 + v4 + W[perm_blake3[j + 20]]
|
|
vE = vE ~ v3
|
|
vE = (vE & (1<<32)-1) >> 8 | vE << 24
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
|
|
end
|
|
if wide_output then
|
|
H_out[ 9] = h1 ~ v8
|
|
H_out[10] = h2 ~ v9
|
|
H_out[11] = h3 ~ vA
|
|
H_out[12] = h4 ~ vB
|
|
H_out[13] = h5 ~ vC
|
|
H_out[14] = h6 ~ vD
|
|
H_out[15] = h7 ~ vE
|
|
H_out[16] = h8 ~ vF
|
|
end
|
|
h1 = v0 ~ v8
|
|
h2 = v1 ~ v9
|
|
h3 = v2 ~ vA
|
|
h4 = v3 ~ vB
|
|
h5 = v4 ~ vC
|
|
h6 = v5 ~ vD
|
|
h7 = v6 ~ vE
|
|
h8 = v7 ~ vF
|
|
end
|
|
H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
return HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
|
|
]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
|
|
|
|
end
|
|
|
|
|
|
if branch == "INT32" then
|
|
|
|
|
|
-- implementation for Lua 5.3/5.4 having non-standard numbers config "int32"+"double" (built with LUA_INT_TYPE=LUA_INT_INT)
|
|
|
|
K_lo_modulo = 2^32
|
|
|
|
function HEX(x) -- returns string of 8 lowercase hexadecimal digits
|
|
return string_format("%08x", x)
|
|
end
|
|
|
|
XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT32"
|
|
local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...
|
|
local string_unpack, floor = string.unpack, math.floor
|
|
|
|
local function XORA5(x, y)
|
|
return x ~ (y and (y + 2^31) % 2^32 - 2^31 or 0xA5A5A5A5)
|
|
end
|
|
|
|
local function XOR_BYTE(x, y)
|
|
return x ~ y
|
|
end
|
|
|
|
local function sha256_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W, sha2_K_hi
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
for j = 17, 64 do
|
|
local a, b = W[j-15], W[j-2]
|
|
W[j] = (a>>7 ~ a<<25 ~ a<<14 ~ a>>18 ~ a>>3) + (b<<15 ~ b>>17 ~ b<<13 ~ b>>19 ~ b>>10) + W[j-7] + W[j-16]
|
|
end
|
|
local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
|
|
for j = 1, 64 do
|
|
local z = (e>>6 ~ e<<26 ~ e>>11 ~ e<<21 ~ e>>25 ~ e<<7) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
|
|
h = g
|
|
g = f
|
|
f = e
|
|
e = z + d
|
|
d = c
|
|
c = b
|
|
b = a
|
|
a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a<<30 ~ a>>13 ~ a<<19 ~ a<<10 ~ a>>22)
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
h5 = e + h5
|
|
h6 = f + h6
|
|
h7 = g + h7
|
|
h8 = h + h8
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
local function sha512_feed_128(H_lo, H_hi, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
|
|
local floor, W, K_lo, K_hi = floor, common_W, sha2_K_lo, sha2_K_hi
|
|
local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for pos = offs + 1, offs + size, 128 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
|
|
W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
|
|
string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
for jj = 17*2, 80*2, 2 do
|
|
local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5]
|
|
local tmp =
|
|
(a_lo>>1 ~ a_hi<<31 ~ a_lo>>8 ~ a_hi<<24 ~ a_lo>>7 ~ a_hi<<25) % 2^32
|
|
+ (b_lo>>19 ~ b_hi<<13 ~ b_lo<<3 ~ b_hi>>29 ~ b_lo>>6 ~ b_hi<<26) % 2^32
|
|
+ W[jj-14] % 2^32 + W[jj-32] % 2^32
|
|
W[jj-1] =
|
|
(a_hi>>1 ~ a_lo<<31 ~ a_hi>>8 ~ a_lo<<24 ~ a_hi>>7)
|
|
+ (b_hi>>19 ~ b_lo<<13 ~ b_hi<<3 ~ b_lo>>29 ~ b_hi>>6)
|
|
+ W[jj-15] + W[jj-33] + floor(tmp / 2^32)
|
|
W[jj] = 0|((tmp + 2^31) % 2^32 - 2^31)
|
|
end
|
|
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
for j = 1, 80 do
|
|
local jj = 2*j
|
|
local z_lo = (e_lo>>14 ~ e_hi<<18 ~ e_lo>>18 ~ e_hi<<14 ~ e_lo<<23 ~ e_hi>>9) % 2^32 + (g_lo ~ e_lo & (f_lo ~ g_lo)) % 2^32 + h_lo % 2^32 + K_lo[j] + W[jj] % 2^32
|
|
local z_hi = (e_hi>>14 ~ e_lo<<18 ~ e_hi>>18 ~ e_lo<<14 ~ e_hi<<23 ~ e_lo>>9) + (g_hi ~ e_hi & (f_hi ~ g_hi)) + h_hi + K_hi[j] + W[jj-1] + floor(z_lo / 2^32)
|
|
z_lo = z_lo % 2^32
|
|
h_lo = g_lo; h_hi = g_hi
|
|
g_lo = f_lo; g_hi = f_hi
|
|
f_lo = e_lo; f_hi = e_hi
|
|
e_lo = z_lo + d_lo % 2^32
|
|
e_hi = z_hi + d_hi + floor(e_lo / 2^32)
|
|
e_lo = 0|((e_lo + 2^31) % 2^32 - 2^31)
|
|
d_lo = c_lo; d_hi = c_hi
|
|
c_lo = b_lo; c_hi = b_hi
|
|
b_lo = a_lo; b_hi = a_hi
|
|
z_lo = z_lo + (d_lo & c_lo ~ b_lo & (d_lo ~ c_lo)) % 2^32 + (b_lo>>28 ~ b_hi<<4 ~ b_lo<<30 ~ b_hi>>2 ~ b_lo<<25 ~ b_hi>>7) % 2^32
|
|
a_hi = z_hi + (d_hi & c_hi ~ b_hi & (d_hi ~ c_hi)) + (b_hi>>28 ~ b_lo<<4 ~ b_hi<<30 ~ b_lo>>2 ~ b_hi<<25 ~ b_lo>>7) + floor(z_lo / 2^32)
|
|
a_lo = 0|((z_lo + 2^31) % 2^32 - 2^31)
|
|
end
|
|
a_lo = h1_lo % 2^32 + a_lo % 2^32
|
|
h1_hi = h1_hi + a_hi + floor(a_lo / 2^32)
|
|
h1_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h2_lo % 2^32 + b_lo % 2^32
|
|
h2_hi = h2_hi + b_hi + floor(a_lo / 2^32)
|
|
h2_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h3_lo % 2^32 + c_lo % 2^32
|
|
h3_hi = h3_hi + c_hi + floor(a_lo / 2^32)
|
|
h3_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h4_lo % 2^32 + d_lo % 2^32
|
|
h4_hi = h4_hi + d_hi + floor(a_lo / 2^32)
|
|
h4_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h5_lo % 2^32 + e_lo % 2^32
|
|
h5_hi = h5_hi + e_hi + floor(a_lo / 2^32)
|
|
h5_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h6_lo % 2^32 + f_lo % 2^32
|
|
h6_hi = h6_hi + f_hi + floor(a_lo / 2^32)
|
|
h6_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h7_lo % 2^32 + g_lo % 2^32
|
|
h7_hi = h7_hi + g_hi + floor(a_lo / 2^32)
|
|
h7_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
a_lo = h8_lo % 2^32 + h_lo % 2^32
|
|
h8_hi = h8_hi + h_hi + floor(a_lo / 2^32)
|
|
h8_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
|
|
end
|
|
H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
end
|
|
|
|
local function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
|
|
local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
local a, b, c, d = h1, h2, h3, h4
|
|
local s = 32-7
|
|
for j = 1, 16 do
|
|
local F = (d ~ b & (c ~ d)) + a + K[j] + W[j]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = (F << 32-s | F>>s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-5
|
|
for j = 17, 32 do
|
|
local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = (F << 32-s | F>>s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-4
|
|
for j = 33, 48 do
|
|
local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = (F << 32-s | F>>s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
s = 32-6
|
|
for j = 49, 64 do
|
|
local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = (F << 32-s | F>>s) + b
|
|
s = md5_next_shift[s]
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
end
|
|
H[1], H[2], H[3], H[4] = h1, h2, h3, h4
|
|
end
|
|
|
|
local function sha1_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
for j = 17, 80 do
|
|
local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
|
|
W[j] = a << 1 ~ a >> 31
|
|
end
|
|
local a, b, c, d, e = h1, h2, h3, h4, h5
|
|
for j = 1, 20 do
|
|
local z = (a << 5 ~ a >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2))
|
|
e = d
|
|
d = c
|
|
c = b << 30 ~ b >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 21, 40 do
|
|
local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3)
|
|
e = d
|
|
d = c
|
|
c = b << 30 ~ b >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 41, 60 do
|
|
local z = (a << 5 ~ a >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5)
|
|
e = d
|
|
d = c
|
|
c = b << 30 ~ b >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
for j = 61, 80 do
|
|
local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10)
|
|
e = d
|
|
d = c
|
|
c = b << 30 ~ b >> 2
|
|
b = a
|
|
a = z
|
|
end
|
|
h1 = a + h1
|
|
h2 = b + h2
|
|
h3 = c + h3
|
|
h4 = d + h4
|
|
h5 = e + h5
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
|
|
end
|
|
|
|
local keccak_format_i4i4 = build_keccak_format("i4i4")
|
|
|
|
local function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
|
|
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
|
|
local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
|
|
local qwords_qty = block_size_in_bytes / 8
|
|
local keccak_format = keccak_format_i4i4[qwords_qty]
|
|
for pos = offs + 1, offs + size, block_size_in_bytes do
|
|
local dwords_from_message = {string_unpack(keccak_format, str, pos)}
|
|
for j = 1, qwords_qty do
|
|
lanes_lo[j] = lanes_lo[j] ~ dwords_from_message[2*j-1]
|
|
lanes_hi[j] = lanes_hi[j] ~ dwords_from_message[2*j]
|
|
end
|
|
local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,
|
|
L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
|
|
L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
|
|
lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
|
|
lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
|
|
lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
|
|
lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
|
|
lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
|
|
for round_idx = 1, 24 do
|
|
local C1_lo = L01_lo ~ L06_lo ~ L11_lo ~ L16_lo ~ L21_lo
|
|
local C1_hi = L01_hi ~ L06_hi ~ L11_hi ~ L16_hi ~ L21_hi
|
|
local C2_lo = L02_lo ~ L07_lo ~ L12_lo ~ L17_lo ~ L22_lo
|
|
local C2_hi = L02_hi ~ L07_hi ~ L12_hi ~ L17_hi ~ L22_hi
|
|
local C3_lo = L03_lo ~ L08_lo ~ L13_lo ~ L18_lo ~ L23_lo
|
|
local C3_hi = L03_hi ~ L08_hi ~ L13_hi ~ L18_hi ~ L23_hi
|
|
local C4_lo = L04_lo ~ L09_lo ~ L14_lo ~ L19_lo ~ L24_lo
|
|
local C4_hi = L04_hi ~ L09_hi ~ L14_hi ~ L19_hi ~ L24_hi
|
|
local C5_lo = L05_lo ~ L10_lo ~ L15_lo ~ L20_lo ~ L25_lo
|
|
local C5_hi = L05_hi ~ L10_hi ~ L15_hi ~ L20_hi ~ L25_hi
|
|
local D_lo = C1_lo ~ C3_lo<<1 ~ C3_hi>>31
|
|
local D_hi = C1_hi ~ C3_hi<<1 ~ C3_lo>>31
|
|
local T0_lo = D_lo ~ L02_lo
|
|
local T0_hi = D_hi ~ L02_hi
|
|
local T1_lo = D_lo ~ L07_lo
|
|
local T1_hi = D_hi ~ L07_hi
|
|
local T2_lo = D_lo ~ L12_lo
|
|
local T2_hi = D_hi ~ L12_hi
|
|
local T3_lo = D_lo ~ L17_lo
|
|
local T3_hi = D_hi ~ L17_hi
|
|
local T4_lo = D_lo ~ L22_lo
|
|
local T4_hi = D_hi ~ L22_hi
|
|
L02_lo = T1_lo>>20 ~ T1_hi<<12
|
|
L02_hi = T1_hi>>20 ~ T1_lo<<12
|
|
L07_lo = T3_lo>>19 ~ T3_hi<<13
|
|
L07_hi = T3_hi>>19 ~ T3_lo<<13
|
|
L12_lo = T0_lo<<1 ~ T0_hi>>31
|
|
L12_hi = T0_hi<<1 ~ T0_lo>>31
|
|
L17_lo = T2_lo<<10 ~ T2_hi>>22
|
|
L17_hi = T2_hi<<10 ~ T2_lo>>22
|
|
L22_lo = T4_lo<<2 ~ T4_hi>>30
|
|
L22_hi = T4_hi<<2 ~ T4_lo>>30
|
|
D_lo = C2_lo ~ C4_lo<<1 ~ C4_hi>>31
|
|
D_hi = C2_hi ~ C4_hi<<1 ~ C4_lo>>31
|
|
T0_lo = D_lo ~ L03_lo
|
|
T0_hi = D_hi ~ L03_hi
|
|
T1_lo = D_lo ~ L08_lo
|
|
T1_hi = D_hi ~ L08_hi
|
|
T2_lo = D_lo ~ L13_lo
|
|
T2_hi = D_hi ~ L13_hi
|
|
T3_lo = D_lo ~ L18_lo
|
|
T3_hi = D_hi ~ L18_hi
|
|
T4_lo = D_lo ~ L23_lo
|
|
T4_hi = D_hi ~ L23_hi
|
|
L03_lo = T2_lo>>21 ~ T2_hi<<11
|
|
L03_hi = T2_hi>>21 ~ T2_lo<<11
|
|
L08_lo = T4_lo>>3 ~ T4_hi<<29
|
|
L08_hi = T4_hi>>3 ~ T4_lo<<29
|
|
L13_lo = T1_lo<<6 ~ T1_hi>>26
|
|
L13_hi = T1_hi<<6 ~ T1_lo>>26
|
|
L18_lo = T3_lo<<15 ~ T3_hi>>17
|
|
L18_hi = T3_hi<<15 ~ T3_lo>>17
|
|
L23_lo = T0_lo>>2 ~ T0_hi<<30
|
|
L23_hi = T0_hi>>2 ~ T0_lo<<30
|
|
D_lo = C3_lo ~ C5_lo<<1 ~ C5_hi>>31
|
|
D_hi = C3_hi ~ C5_hi<<1 ~ C5_lo>>31
|
|
T0_lo = D_lo ~ L04_lo
|
|
T0_hi = D_hi ~ L04_hi
|
|
T1_lo = D_lo ~ L09_lo
|
|
T1_hi = D_hi ~ L09_hi
|
|
T2_lo = D_lo ~ L14_lo
|
|
T2_hi = D_hi ~ L14_hi
|
|
T3_lo = D_lo ~ L19_lo
|
|
T3_hi = D_hi ~ L19_hi
|
|
T4_lo = D_lo ~ L24_lo
|
|
T4_hi = D_hi ~ L24_hi
|
|
L04_lo = T3_lo<<21 ~ T3_hi>>11
|
|
L04_hi = T3_hi<<21 ~ T3_lo>>11
|
|
L09_lo = T0_lo<<28 ~ T0_hi>>4
|
|
L09_hi = T0_hi<<28 ~ T0_lo>>4
|
|
L14_lo = T2_lo<<25 ~ T2_hi>>7
|
|
L14_hi = T2_hi<<25 ~ T2_lo>>7
|
|
L19_lo = T4_lo>>8 ~ T4_hi<<24
|
|
L19_hi = T4_hi>>8 ~ T4_lo<<24
|
|
L24_lo = T1_lo>>9 ~ T1_hi<<23
|
|
L24_hi = T1_hi>>9 ~ T1_lo<<23
|
|
D_lo = C4_lo ~ C1_lo<<1 ~ C1_hi>>31
|
|
D_hi = C4_hi ~ C1_hi<<1 ~ C1_lo>>31
|
|
T0_lo = D_lo ~ L05_lo
|
|
T0_hi = D_hi ~ L05_hi
|
|
T1_lo = D_lo ~ L10_lo
|
|
T1_hi = D_hi ~ L10_hi
|
|
T2_lo = D_lo ~ L15_lo
|
|
T2_hi = D_hi ~ L15_hi
|
|
T3_lo = D_lo ~ L20_lo
|
|
T3_hi = D_hi ~ L20_hi
|
|
T4_lo = D_lo ~ L25_lo
|
|
T4_hi = D_hi ~ L25_hi
|
|
L05_lo = T4_lo<<14 ~ T4_hi>>18
|
|
L05_hi = T4_hi<<14 ~ T4_lo>>18
|
|
L10_lo = T1_lo<<20 ~ T1_hi>>12
|
|
L10_hi = T1_hi<<20 ~ T1_lo>>12
|
|
L15_lo = T3_lo<<8 ~ T3_hi>>24
|
|
L15_hi = T3_hi<<8 ~ T3_lo>>24
|
|
L20_lo = T0_lo<<27 ~ T0_hi>>5
|
|
L20_hi = T0_hi<<27 ~ T0_lo>>5
|
|
L25_lo = T2_lo>>25 ~ T2_hi<<7
|
|
L25_hi = T2_hi>>25 ~ T2_lo<<7
|
|
D_lo = C5_lo ~ C2_lo<<1 ~ C2_hi>>31
|
|
D_hi = C5_hi ~ C2_hi<<1 ~ C2_lo>>31
|
|
T1_lo = D_lo ~ L06_lo
|
|
T1_hi = D_hi ~ L06_hi
|
|
T2_lo = D_lo ~ L11_lo
|
|
T2_hi = D_hi ~ L11_hi
|
|
T3_lo = D_lo ~ L16_lo
|
|
T3_hi = D_hi ~ L16_hi
|
|
T4_lo = D_lo ~ L21_lo
|
|
T4_hi = D_hi ~ L21_hi
|
|
L06_lo = T2_lo<<3 ~ T2_hi>>29
|
|
L06_hi = T2_hi<<3 ~ T2_lo>>29
|
|
L11_lo = T4_lo<<18 ~ T4_hi>>14
|
|
L11_hi = T4_hi<<18 ~ T4_lo>>14
|
|
L16_lo = T1_lo>>28 ~ T1_hi<<4
|
|
L16_hi = T1_hi>>28 ~ T1_lo<<4
|
|
L21_lo = T3_lo>>23 ~ T3_hi<<9
|
|
L21_hi = T3_hi>>23 ~ T3_lo<<9
|
|
L01_lo = D_lo ~ L01_lo
|
|
L01_hi = D_hi ~ L01_hi
|
|
L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = L01_lo ~ ~L02_lo & L03_lo, L02_lo ~ ~L03_lo & L04_lo, L03_lo ~ ~L04_lo & L05_lo, L04_lo ~ ~L05_lo & L01_lo, L05_lo ~ ~L01_lo & L02_lo
|
|
L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = L01_hi ~ ~L02_hi & L03_hi, L02_hi ~ ~L03_hi & L04_hi, L03_hi ~ ~L04_hi & L05_hi, L04_hi ~ ~L05_hi & L01_hi, L05_hi ~ ~L01_hi & L02_hi
|
|
L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = L09_lo ~ ~L10_lo & L06_lo, L10_lo ~ ~L06_lo & L07_lo, L06_lo ~ ~L07_lo & L08_lo, L07_lo ~ ~L08_lo & L09_lo, L08_lo ~ ~L09_lo & L10_lo
|
|
L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = L09_hi ~ ~L10_hi & L06_hi, L10_hi ~ ~L06_hi & L07_hi, L06_hi ~ ~L07_hi & L08_hi, L07_hi ~ ~L08_hi & L09_hi, L08_hi ~ ~L09_hi & L10_hi
|
|
L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = L12_lo ~ ~L13_lo & L14_lo, L13_lo ~ ~L14_lo & L15_lo, L14_lo ~ ~L15_lo & L11_lo, L15_lo ~ ~L11_lo & L12_lo, L11_lo ~ ~L12_lo & L13_lo
|
|
L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = L12_hi ~ ~L13_hi & L14_hi, L13_hi ~ ~L14_hi & L15_hi, L14_hi ~ ~L15_hi & L11_hi, L15_hi ~ ~L11_hi & L12_hi, L11_hi ~ ~L12_hi & L13_hi
|
|
L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = L20_lo ~ ~L16_lo & L17_lo, L16_lo ~ ~L17_lo & L18_lo, L17_lo ~ ~L18_lo & L19_lo, L18_lo ~ ~L19_lo & L20_lo, L19_lo ~ ~L20_lo & L16_lo
|
|
L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = L20_hi ~ ~L16_hi & L17_hi, L16_hi ~ ~L17_hi & L18_hi, L17_hi ~ ~L18_hi & L19_hi, L18_hi ~ ~L19_hi & L20_hi, L19_hi ~ ~L20_hi & L16_hi
|
|
L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = L23_lo ~ ~L24_lo & L25_lo, L24_lo ~ ~L25_lo & L21_lo, L25_lo ~ ~L21_lo & L22_lo, L21_lo ~ ~L22_lo & L23_lo, L22_lo ~ ~L23_lo & L24_lo
|
|
L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = L23_hi ~ ~L24_hi & L25_hi, L24_hi ~ ~L25_hi & L21_hi, L25_hi ~ ~L21_hi & L22_hi, L21_hi ~ ~L22_hi & L23_hi, L22_hi ~ ~L23_hi & L24_hi
|
|
L01_lo = L01_lo ~ RC_lo[round_idx]
|
|
L01_hi = L01_hi ~ RC_hi[round_idx]
|
|
end
|
|
lanes_lo[1] = L01_lo; lanes_hi[1] = L01_hi
|
|
lanes_lo[2] = L02_lo; lanes_hi[2] = L02_hi
|
|
lanes_lo[3] = L03_lo; lanes_hi[3] = L03_hi
|
|
lanes_lo[4] = L04_lo; lanes_hi[4] = L04_hi
|
|
lanes_lo[5] = L05_lo; lanes_hi[5] = L05_hi
|
|
lanes_lo[6] = L06_lo; lanes_hi[6] = L06_hi
|
|
lanes_lo[7] = L07_lo; lanes_hi[7] = L07_hi
|
|
lanes_lo[8] = L08_lo; lanes_hi[8] = L08_hi
|
|
lanes_lo[9] = L09_lo; lanes_hi[9] = L09_hi
|
|
lanes_lo[10] = L10_lo; lanes_hi[10] = L10_hi
|
|
lanes_lo[11] = L11_lo; lanes_hi[11] = L11_hi
|
|
lanes_lo[12] = L12_lo; lanes_hi[12] = L12_hi
|
|
lanes_lo[13] = L13_lo; lanes_hi[13] = L13_hi
|
|
lanes_lo[14] = L14_lo; lanes_hi[14] = L14_hi
|
|
lanes_lo[15] = L15_lo; lanes_hi[15] = L15_hi
|
|
lanes_lo[16] = L16_lo; lanes_hi[16] = L16_hi
|
|
lanes_lo[17] = L17_lo; lanes_hi[17] = L17_hi
|
|
lanes_lo[18] = L18_lo; lanes_hi[18] = L18_hi
|
|
lanes_lo[19] = L19_lo; lanes_hi[19] = L19_hi
|
|
lanes_lo[20] = L20_lo; lanes_hi[20] = L20_hi
|
|
lanes_lo[21] = L21_lo; lanes_hi[21] = L21_hi
|
|
lanes_lo[22] = L22_lo; lanes_hi[22] = L22_hi
|
|
lanes_lo[23] = L23_lo; lanes_hi[23] = L23_hi
|
|
lanes_lo[24] = L24_lo; lanes_hi[24] = L24_hi
|
|
lanes_lo[25] = L25_lo; lanes_hi[25] = L25_hi
|
|
end
|
|
end
|
|
|
|
local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs + 1, offs + size, 64 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 64)
|
|
local t0 = bytes_compressed % 2^32
|
|
local t1 = (bytes_compressed - t0) / 2^32
|
|
t0 = (t0 + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
|
|
vC = vC ~ t0 -- t0 = low_4_bytes(bytes_compressed)
|
|
vD = vD ~ t1 -- t1 = high_4_bytes(bytes_compressed)
|
|
if last_block_size then -- flag f0
|
|
vE = ~vE
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF = ~vF
|
|
end
|
|
for j = 1, 10 do
|
|
local row = sigma[j]
|
|
v0 = v0 + v4 + W[row[1]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 16 | vC << 16
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 12 | v4 << 20
|
|
v0 = v0 + v4 + W[row[2]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 8 | vC << 24
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 7 | v4 << 25
|
|
v1 = v1 + v5 + W[row[3]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 16 | vD << 16
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 12 | v5 << 20
|
|
v1 = v1 + v5 + W[row[4]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 8 | vD << 24
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 7 | v5 << 25
|
|
v2 = v2 + v6 + W[row[5]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 16 | vE << 16
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 12 | v6 << 20
|
|
v2 = v2 + v6 + W[row[6]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 8 | vE << 24
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 7 | v6 << 25
|
|
v3 = v3 + v7 + W[row[7]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 16 | vF << 16
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 12 | v7 << 20
|
|
v3 = v3 + v7 + W[row[8]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 8 | vF << 24
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 7 | v7 << 25
|
|
v0 = v0 + v5 + W[row[9]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 16 | vF << 16
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 12 | v5 << 20
|
|
v0 = v0 + v5 + W[row[10]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 8 | vF << 24
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 7 | v5 << 25
|
|
v1 = v1 + v6 + W[row[11]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 16 | vC << 16
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 12 | v6 << 20
|
|
v1 = v1 + v6 + W[row[12]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 8 | vC << 24
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 7 | v6 << 25
|
|
v2 = v2 + v7 + W[row[13]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 16 | vD << 16
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 12 | v7 << 20
|
|
v2 = v2 + v7 + W[row[14]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 8 | vD << 24
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 7 | v7 << 25
|
|
v3 = v3 + v4 + W[row[15]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 16 | vE << 16
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 12 | v4 << 20
|
|
v3 = v3 + v4 + W[row[16]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 8 | vE << 24
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 7 | v4 << 25
|
|
end
|
|
h1 = h1 ~ v0 ~ v8
|
|
h2 = h2 ~ v1 ~ v9
|
|
h3 = h3 ~ v2 ~ vA
|
|
h4 = h4 ~ v3 ~ vB
|
|
h5 = h5 ~ v4 ~ vC
|
|
h6 = h6 ~ v5 ~ vD
|
|
h7 = h7 ~ v6 ~ vE
|
|
h8 = h8 ~ v7 ~ vF
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
local function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W = common_W
|
|
local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for pos = offs + 1, offs + size, 128 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
|
|
W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
|
|
string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
end
|
|
local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
local t0_lo = bytes_compressed % 2^32
|
|
local t0_hi = (bytes_compressed - t0_lo) / 2^32
|
|
t0_lo = (t0_lo + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
|
|
vC_lo = vC_lo ~ t0_lo -- t0 = low_8_bytes(bytes_compressed)
|
|
vC_hi = vC_hi ~ t0_hi
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
vE_lo = ~vE_lo
|
|
vE_hi = ~vE_hi
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF_lo = ~vF_lo
|
|
vF_hi = ~vF_hi
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
local k = row[1] * 2
|
|
v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
|
|
v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]
|
|
v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
|
|
vC_lo, vC_hi = vC_hi ~ v0_hi, vC_lo ~ v0_lo
|
|
v8_lo = v8_lo % 2^32 + vC_lo % 2^32
|
|
v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
|
|
v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
|
|
v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
|
|
v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8
|
|
k = row[2] * 2
|
|
v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
|
|
v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]
|
|
v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
|
|
vC_lo, vC_hi = vC_lo ~ v0_lo, vC_hi ~ v0_hi
|
|
vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16
|
|
v8_lo = v8_lo % 2^32 + vC_lo % 2^32
|
|
v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
|
|
v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
|
|
v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
|
|
v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31
|
|
k = row[3] * 2
|
|
v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
|
|
v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
|
|
v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
|
|
vD_lo, vD_hi = vD_hi ~ v1_hi, vD_lo ~ v1_lo
|
|
v9_lo = v9_lo % 2^32 + vD_lo % 2^32
|
|
v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
|
|
v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
|
|
v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
|
|
v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
|
|
k = row[4] * 2
|
|
v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
|
|
v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
|
|
v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
|
|
vD_lo, vD_hi = vD_lo ~ v1_lo, vD_hi ~ v1_hi
|
|
vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
|
|
v9_lo = v9_lo % 2^32 + vD_lo % 2^32
|
|
v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
|
|
v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
|
|
v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
|
|
v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
|
|
k = row[5] * 2
|
|
v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
|
|
v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
|
|
v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
|
|
vE_lo, vE_hi = vE_hi ~ v2_hi, vE_lo ~ v2_lo
|
|
vA_lo = vA_lo % 2^32 + vE_lo % 2^32
|
|
vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
|
|
vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
|
|
v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
|
|
v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
|
|
k = row[6] * 2
|
|
v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
|
|
v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
|
|
v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
|
|
vE_lo, vE_hi = vE_lo ~ v2_lo, vE_hi ~ v2_hi
|
|
vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
|
|
vA_lo = vA_lo % 2^32 + vE_lo % 2^32
|
|
vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
|
|
vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
|
|
v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
|
|
v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
|
|
k = row[7] * 2
|
|
v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
|
|
v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
|
|
v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
|
|
vF_lo, vF_hi = vF_hi ~ v3_hi, vF_lo ~ v3_lo
|
|
vB_lo = vB_lo % 2^32 + vF_lo % 2^32
|
|
vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
|
|
vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
|
|
v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
|
|
v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
|
|
k = row[8] * 2
|
|
v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
|
|
v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
|
|
v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
|
|
vF_lo, vF_hi = vF_lo ~ v3_lo, vF_hi ~ v3_hi
|
|
vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
|
|
vB_lo = vB_lo % 2^32 + vF_lo % 2^32
|
|
vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
|
|
vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
|
|
v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
|
|
v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
|
|
k = row[9] * 2
|
|
v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
|
|
v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
|
|
v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
|
|
vF_lo, vF_hi = vF_hi ~ v0_hi, vF_lo ~ v0_lo
|
|
vA_lo = vA_lo % 2^32 + vF_lo % 2^32
|
|
vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
|
|
vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
|
|
v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
|
|
v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
|
|
k = row[10] * 2
|
|
v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
|
|
v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
|
|
v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
|
|
vF_lo, vF_hi = vF_lo ~ v0_lo, vF_hi ~ v0_hi
|
|
vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
|
|
vA_lo = vA_lo % 2^32 + vF_lo % 2^32
|
|
vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
|
|
vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
|
|
v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
|
|
v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
|
|
k = row[11] * 2
|
|
v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
|
|
v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
|
|
v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
|
|
vC_lo, vC_hi = vC_hi ~ v1_hi, vC_lo ~ v1_lo
|
|
vB_lo = vB_lo % 2^32 + vC_lo % 2^32
|
|
vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
|
|
vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
|
|
v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
|
|
v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
|
|
k = row[12] * 2
|
|
v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
|
|
v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
|
|
v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
|
|
vC_lo, vC_hi = vC_lo ~ v1_lo, vC_hi ~ v1_hi
|
|
vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16
|
|
vB_lo = vB_lo % 2^32 + vC_lo % 2^32
|
|
vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
|
|
vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
|
|
v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
|
|
v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
|
|
k = row[13] * 2
|
|
v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
|
|
v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
|
|
v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
|
|
vD_lo, vD_hi = vD_hi ~ v2_hi, vD_lo ~ v2_lo
|
|
v8_lo = v8_lo % 2^32 + vD_lo % 2^32
|
|
v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
|
|
v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
|
|
v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
|
|
v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
|
|
k = row[14] * 2
|
|
v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
|
|
v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
|
|
v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
|
|
vD_lo, vD_hi = vD_lo ~ v2_lo, vD_hi ~ v2_hi
|
|
vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
|
|
v8_lo = v8_lo % 2^32 + vD_lo % 2^32
|
|
v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
|
|
v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
|
|
v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
|
|
v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
|
|
k = row[15] * 2
|
|
v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
|
|
v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
|
|
v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
|
|
vE_lo, vE_hi = vE_hi ~ v3_hi, vE_lo ~ v3_lo
|
|
v9_lo = v9_lo % 2^32 + vE_lo % 2^32
|
|
v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
|
|
v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
|
|
v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
|
|
v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8
|
|
k = row[16] * 2
|
|
v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
|
|
v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
|
|
v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
|
|
vE_lo, vE_hi = vE_lo ~ v3_lo, vE_hi ~ v3_hi
|
|
vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
|
|
v9_lo = v9_lo % 2^32 + vE_lo % 2^32
|
|
v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
|
|
v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
|
|
v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
|
|
v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31
|
|
end
|
|
h1_lo = h1_lo ~ v0_lo ~ v8_lo
|
|
h2_lo = h2_lo ~ v1_lo ~ v9_lo
|
|
h3_lo = h3_lo ~ v2_lo ~ vA_lo
|
|
h4_lo = h4_lo ~ v3_lo ~ vB_lo
|
|
h5_lo = h5_lo ~ v4_lo ~ vC_lo
|
|
h6_lo = h6_lo ~ v5_lo ~ vD_lo
|
|
h7_lo = h7_lo ~ v6_lo ~ vE_lo
|
|
h8_lo = h8_lo ~ v7_lo ~ vF_lo
|
|
h1_hi = h1_hi ~ v0_hi ~ v8_hi
|
|
h2_hi = h2_hi ~ v1_hi ~ v9_hi
|
|
h3_hi = h3_hi ~ v2_hi ~ vA_hi
|
|
h4_hi = h4_hi ~ v3_hi ~ vB_hi
|
|
h5_hi = h5_hi ~ v4_hi ~ vC_hi
|
|
h6_hi = h6_hi ~ v5_hi ~ vD_hi
|
|
h7_hi = h7_hi ~ v6_hi ~ vE_hi
|
|
h8_hi = h8_hi ~ v7_hi ~ vF_hi
|
|
end
|
|
H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
return bytes_compressed
|
|
end
|
|
|
|
local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
block_length = block_length or 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
|
|
H_out = H_out or H_in
|
|
for pos = offs + 1, offs + size, 64 do
|
|
if str then
|
|
W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
|
|
string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
|
|
local t0 = chunk_index % 2^32 -- t0 = low_4_bytes(chunk_index)
|
|
local t1 = (chunk_index - t0) / 2^32 -- t1 = high_4_bytes(chunk_index)
|
|
t0 = (t0 + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while ORing
|
|
local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags
|
|
for j = 1, 7 do
|
|
v0 = v0 + v4 + W[perm_blake3[j]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 16 | vC << 16
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 12 | v4 << 20
|
|
v0 = v0 + v4 + W[perm_blake3[j + 14]]
|
|
vC = vC ~ v0
|
|
vC = vC >> 8 | vC << 24
|
|
v8 = v8 + vC
|
|
v4 = v4 ~ v8
|
|
v4 = v4 >> 7 | v4 << 25
|
|
v1 = v1 + v5 + W[perm_blake3[j + 1]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 16 | vD << 16
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 12 | v5 << 20
|
|
v1 = v1 + v5 + W[perm_blake3[j + 2]]
|
|
vD = vD ~ v1
|
|
vD = vD >> 8 | vD << 24
|
|
v9 = v9 + vD
|
|
v5 = v5 ~ v9
|
|
v5 = v5 >> 7 | v5 << 25
|
|
v2 = v2 + v6 + W[perm_blake3[j + 16]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 16 | vE << 16
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 12 | v6 << 20
|
|
v2 = v2 + v6 + W[perm_blake3[j + 7]]
|
|
vE = vE ~ v2
|
|
vE = vE >> 8 | vE << 24
|
|
vA = vA + vE
|
|
v6 = v6 ~ vA
|
|
v6 = v6 >> 7 | v6 << 25
|
|
v3 = v3 + v7 + W[perm_blake3[j + 15]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 16 | vF << 16
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 12 | v7 << 20
|
|
v3 = v3 + v7 + W[perm_blake3[j + 17]]
|
|
vF = vF ~ v3
|
|
vF = vF >> 8 | vF << 24
|
|
vB = vB + vF
|
|
v7 = v7 ~ vB
|
|
v7 = v7 >> 7 | v7 << 25
|
|
v0 = v0 + v5 + W[perm_blake3[j + 21]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 16 | vF << 16
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 12 | v5 << 20
|
|
v0 = v0 + v5 + W[perm_blake3[j + 5]]
|
|
vF = vF ~ v0
|
|
vF = vF >> 8 | vF << 24
|
|
vA = vA + vF
|
|
v5 = v5 ~ vA
|
|
v5 = v5 >> 7 | v5 << 25
|
|
v1 = v1 + v6 + W[perm_blake3[j + 3]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 16 | vC << 16
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 12 | v6 << 20
|
|
v1 = v1 + v6 + W[perm_blake3[j + 6]]
|
|
vC = vC ~ v1
|
|
vC = vC >> 8 | vC << 24
|
|
vB = vB + vC
|
|
v6 = v6 ~ vB
|
|
v6 = v6 >> 7 | v6 << 25
|
|
v2 = v2 + v7 + W[perm_blake3[j + 4]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 16 | vD << 16
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 12 | v7 << 20
|
|
v2 = v2 + v7 + W[perm_blake3[j + 18]]
|
|
vD = vD ~ v2
|
|
vD = vD >> 8 | vD << 24
|
|
v8 = v8 + vD
|
|
v7 = v7 ~ v8
|
|
v7 = v7 >> 7 | v7 << 25
|
|
v3 = v3 + v4 + W[perm_blake3[j + 19]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 16 | vE << 16
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 12 | v4 << 20
|
|
v3 = v3 + v4 + W[perm_blake3[j + 20]]
|
|
vE = vE ~ v3
|
|
vE = vE >> 8 | vE << 24
|
|
v9 = v9 + vE
|
|
v4 = v4 ~ v9
|
|
v4 = v4 >> 7 | v4 << 25
|
|
end
|
|
if wide_output then
|
|
H_out[ 9] = h1 ~ v8
|
|
H_out[10] = h2 ~ v9
|
|
H_out[11] = h3 ~ vA
|
|
H_out[12] = h4 ~ vB
|
|
H_out[13] = h5 ~ vC
|
|
H_out[14] = h6 ~ vD
|
|
H_out[15] = h7 ~ vE
|
|
H_out[16] = h8 ~ vF
|
|
end
|
|
h1 = v0 ~ v8
|
|
h2 = v1 ~ v9
|
|
h3 = v2 ~ vA
|
|
h4 = v3 ~ vB
|
|
h5 = v4 ~ vC
|
|
h6 = v5 ~ vD
|
|
h7 = v6 ~ vE
|
|
h8 = v7 ~ vF
|
|
end
|
|
H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
return XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
|
|
]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
|
|
|
|
end
|
|
|
|
XOR = XOR or XORA5
|
|
|
|
if branch == "LIB32" or branch == "EMUL" then
|
|
|
|
|
|
-- implementation for Lua 5.1/5.2 (with or without bitwise library available)
|
|
|
|
function sha256_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K = common_W, sha2_K_hi
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((a * 256 + b) * 256 + c) * 256 + d
|
|
end
|
|
for j = 17, 64 do
|
|
local a, b = W[j-15], W[j-2]
|
|
local a7, a18, b17, b19 = a / 2^7, a / 2^18, b / 2^17, b / 2^19
|
|
W[j] = (XOR(a7 % 1 * (2^32 - 1) + a7, a18 % 1 * (2^32 - 1) + a18, (a - a % 2^3) / 2^3) + W[j-16] + W[j-7]
|
|
+ XOR(b17 % 1 * (2^32 - 1) + b17, b19 % 1 * (2^32 - 1) + b19, (b - b % 2^10) / 2^10)) % 2^32
|
|
end
|
|
local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
|
|
for j = 1, 64 do
|
|
e = e % 2^32
|
|
local e6, e11, e7 = e / 2^6, e / 2^11, e * 2^7
|
|
local e7_lo = e7 % 2^32
|
|
local z = AND(e, f) + AND(-1-e, g) + h + K[j] + W[j]
|
|
+ XOR(e6 % 1 * (2^32 - 1) + e6, e11 % 1 * (2^32 - 1) + e11, e7_lo + (e7 - e7_lo) / 2^32)
|
|
h = g
|
|
g = f
|
|
f = e
|
|
e = z + d
|
|
d = c
|
|
c = b
|
|
b = a % 2^32
|
|
local b2, b13, b10 = b / 2^2, b / 2^13, b * 2^10
|
|
local b10_lo = b10 % 2^32
|
|
a = z + AND(d, c) + AND(b, XOR(d, c)) +
|
|
XOR(b2 % 1 * (2^32 - 1) + b2, b13 % 1 * (2^32 - 1) + b13, b10_lo + (b10 - b10_lo) / 2^32)
|
|
end
|
|
h1, h2, h3, h4 = (a + h1) % 2^32, (b + h2) % 2^32, (c + h3) % 2^32, (d + h4) % 2^32
|
|
h5, h6, h7, h8 = (e + h5) % 2^32, (f + h6) % 2^32, (g + h7) % 2^32, (h + h8) % 2^32
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
|
|
function sha512_feed_128(H_lo, H_hi, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
|
|
local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
|
|
local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for pos = offs, offs + size - 1, 128 do
|
|
for j = 1, 16*2 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((a * 256 + b) * 256 + c) * 256 + d
|
|
end
|
|
for jj = 17*2, 80*2, 2 do
|
|
local a_hi, a_lo, b_hi, b_lo = W[jj-31], W[jj-30], W[jj-5], W[jj-4]
|
|
local b_hi_6, b_hi_19, b_hi_29, b_lo_19, b_lo_29, a_hi_1, a_hi_7, a_hi_8, a_lo_1, a_lo_8 =
|
|
b_hi % 2^6, b_hi % 2^19, b_hi % 2^29, b_lo % 2^19, b_lo % 2^29, a_hi % 2^1, a_hi % 2^7, a_hi % 2^8, a_lo % 2^1, a_lo % 2^8
|
|
local tmp1 = XOR((a_lo - a_lo_1) / 2^1 + a_hi_1 * 2^31, (a_lo - a_lo_8) / 2^8 + a_hi_8 * 2^24, (a_lo - a_lo % 2^7) / 2^7 + a_hi_7 * 2^25) % 2^32
|
|
+ XOR((b_lo - b_lo_19) / 2^19 + b_hi_19 * 2^13, b_lo_29 * 2^3 + (b_hi - b_hi_29) / 2^29, (b_lo - b_lo % 2^6) / 2^6 + b_hi_6 * 2^26) % 2^32
|
|
+ W[jj-14] + W[jj-32]
|
|
local tmp2 = tmp1 % 2^32
|
|
W[jj-1] = (XOR((a_hi - a_hi_1) / 2^1 + a_lo_1 * 2^31, (a_hi - a_hi_8) / 2^8 + a_lo_8 * 2^24, (a_hi - a_hi_7) / 2^7)
|
|
+ XOR((b_hi - b_hi_19) / 2^19 + b_lo_19 * 2^13, b_hi_29 * 2^3 + (b_lo - b_lo_29) / 2^29, (b_hi - b_hi_6) / 2^6)
|
|
+ W[jj-15] + W[jj-33] + (tmp1 - tmp2) / 2^32) % 2^32
|
|
W[jj] = tmp2
|
|
end
|
|
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
for j = 1, 80 do
|
|
local jj = 2*j
|
|
local e_lo_9, e_lo_14, e_lo_18, e_hi_9, e_hi_14, e_hi_18 = e_lo % 2^9, e_lo % 2^14, e_lo % 2^18, e_hi % 2^9, e_hi % 2^14, e_hi % 2^18
|
|
local tmp1 = (AND(e_lo, f_lo) + AND(-1-e_lo, g_lo)) % 2^32 + h_lo + K_lo[j] + W[jj]
|
|
+ XOR((e_lo - e_lo_14) / 2^14 + e_hi_14 * 2^18, (e_lo - e_lo_18) / 2^18 + e_hi_18 * 2^14, e_lo_9 * 2^23 + (e_hi - e_hi_9) / 2^9) % 2^32
|
|
local z_lo = tmp1 % 2^32
|
|
local z_hi = AND(e_hi, f_hi) + AND(-1-e_hi, g_hi) + h_hi + K_hi[j] + W[jj-1] + (tmp1 - z_lo) / 2^32
|
|
+ XOR((e_hi - e_hi_14) / 2^14 + e_lo_14 * 2^18, (e_hi - e_hi_18) / 2^18 + e_lo_18 * 2^14, e_hi_9 * 2^23 + (e_lo - e_lo_9) / 2^9)
|
|
h_lo = g_lo; h_hi = g_hi
|
|
g_lo = f_lo; g_hi = f_hi
|
|
f_lo = e_lo; f_hi = e_hi
|
|
tmp1 = z_lo + d_lo
|
|
e_lo = tmp1 % 2^32
|
|
e_hi = (z_hi + d_hi + (tmp1 - e_lo) / 2^32) % 2^32
|
|
d_lo = c_lo; d_hi = c_hi
|
|
c_lo = b_lo; c_hi = b_hi
|
|
b_lo = a_lo; b_hi = a_hi
|
|
local b_lo_2, b_lo_7, b_lo_28, b_hi_2, b_hi_7, b_hi_28 = b_lo % 2^2, b_lo % 2^7, b_lo % 2^28, b_hi % 2^2, b_hi % 2^7, b_hi % 2^28
|
|
tmp1 = z_lo + (AND(d_lo, c_lo) + AND(b_lo, XOR(d_lo, c_lo))) % 2^32
|
|
+ XOR((b_lo - b_lo_28) / 2^28 + b_hi_28 * 2^4, b_lo_2 * 2^30 + (b_hi - b_hi_2) / 2^2, b_lo_7 * 2^25 + (b_hi - b_hi_7) / 2^7) % 2^32
|
|
a_lo = tmp1 % 2^32
|
|
a_hi = (z_hi + AND(d_hi, c_hi) + AND(b_hi, XOR(d_hi, c_hi)) + (tmp1 - a_lo) / 2^32
|
|
+ XOR((b_hi - b_hi_28) / 2^28 + b_lo_28 * 2^4, b_hi_2 * 2^30 + (b_lo - b_lo_2) / 2^2, b_hi_7 * 2^25 + (b_lo - b_lo_7) / 2^7)) % 2^32
|
|
end
|
|
a_lo = h1_lo + a_lo
|
|
h1_lo = a_lo % 2^32
|
|
h1_hi = (h1_hi + a_hi + (a_lo - h1_lo) / 2^32) % 2^32
|
|
a_lo = h2_lo + b_lo
|
|
h2_lo = a_lo % 2^32
|
|
h2_hi = (h2_hi + b_hi + (a_lo - h2_lo) / 2^32) % 2^32
|
|
a_lo = h3_lo + c_lo
|
|
h3_lo = a_lo % 2^32
|
|
h3_hi = (h3_hi + c_hi + (a_lo - h3_lo) / 2^32) % 2^32
|
|
a_lo = h4_lo + d_lo
|
|
h4_lo = a_lo % 2^32
|
|
h4_hi = (h4_hi + d_hi + (a_lo - h4_lo) / 2^32) % 2^32
|
|
a_lo = h5_lo + e_lo
|
|
h5_lo = a_lo % 2^32
|
|
h5_hi = (h5_hi + e_hi + (a_lo - h5_lo) / 2^32) % 2^32
|
|
a_lo = h6_lo + f_lo
|
|
h6_lo = a_lo % 2^32
|
|
h6_hi = (h6_hi + f_hi + (a_lo - h6_lo) / 2^32) % 2^32
|
|
a_lo = h7_lo + g_lo
|
|
h7_lo = a_lo % 2^32
|
|
h7_hi = (h7_hi + g_hi + (a_lo - h7_lo) / 2^32) % 2^32
|
|
a_lo = h8_lo + h_lo
|
|
h8_lo = a_lo % 2^32
|
|
h8_hi = (h8_hi + h_hi + (a_lo - h8_lo) / 2^32) % 2^32
|
|
end
|
|
H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
end
|
|
|
|
|
|
if branch == "LIB32" then
|
|
|
|
function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
|
|
local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
local a, b, c, d = h1, h2, h3, h4
|
|
local s = 25
|
|
for j = 1, 16 do
|
|
local F = ROR(AND(b, c) + AND(-1-b, d) + a + K[j] + W[j], s) + b
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = F
|
|
end
|
|
s = 27
|
|
for j = 17, 32 do
|
|
local F = ROR(AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1], s) + b
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = F
|
|
end
|
|
s = 28
|
|
for j = 33, 48 do
|
|
local F = ROR(XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1], s) + b
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = F
|
|
end
|
|
s = 26
|
|
for j = 49, 64 do
|
|
local F = ROR(XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1], s) + b
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = F
|
|
end
|
|
h1 = (a + h1) % 2^32
|
|
h2 = (b + h2) % 2^32
|
|
h3 = (c + h3) % 2^32
|
|
h4 = (d + h4) % 2^32
|
|
end
|
|
H[1], H[2], H[3], H[4] = h1, h2, h3, h4
|
|
end
|
|
|
|
elseif branch == "EMUL" then
|
|
|
|
function md5_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
|
|
local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
local a, b, c, d = h1, h2, h3, h4
|
|
local s = 25
|
|
for j = 1, 16 do
|
|
local z = (AND(b, c) + AND(-1-b, d) + a + K[j] + W[j]) % 2^32 / 2^s
|
|
local y = z % 1
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = y * 2^32 + (z - y) + b
|
|
end
|
|
s = 27
|
|
for j = 17, 32 do
|
|
local z = (AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1]) % 2^32 / 2^s
|
|
local y = z % 1
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = y * 2^32 + (z - y) + b
|
|
end
|
|
s = 28
|
|
for j = 33, 48 do
|
|
local z = (XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1]) % 2^32 / 2^s
|
|
local y = z % 1
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = y * 2^32 + (z - y) + b
|
|
end
|
|
s = 26
|
|
for j = 49, 64 do
|
|
local z = (XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1]) % 2^32 / 2^s
|
|
local y = z % 1
|
|
s = md5_next_shift[s]
|
|
a = d
|
|
d = c
|
|
c = b
|
|
b = y * 2^32 + (z - y) + b
|
|
end
|
|
h1 = (a + h1) % 2^32
|
|
h2 = (b + h2) % 2^32
|
|
h3 = (c + h3) % 2^32
|
|
h4 = (d + h4) % 2^32
|
|
end
|
|
H[1], H[2], H[3], H[4] = h1, h2, h3, h4
|
|
end
|
|
|
|
end
|
|
|
|
|
|
function sha1_feed_64(H, str, offs, size)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
|
|
for pos = offs, offs + size - 1, 64 do
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((a * 256 + b) * 256 + c) * 256 + d
|
|
end
|
|
for j = 17, 80 do
|
|
local a = XOR(W[j-3], W[j-8], W[j-14], W[j-16]) % 2^32 * 2
|
|
local b = a % 2^32
|
|
W[j] = b + (a - b) / 2^32
|
|
end
|
|
local a, b, c, d, e = h1, h2, h3, h4, h5
|
|
for j = 1, 20 do
|
|
local a5 = a * 2^5
|
|
local z = a5 % 2^32
|
|
z = z + (a5 - z) / 2^32 + AND(b, c) + AND(-1-b, d) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2))
|
|
e = d
|
|
d = c
|
|
c = b / 2^2
|
|
c = c % 1 * (2^32 - 1) + c
|
|
b = a
|
|
a = z % 2^32
|
|
end
|
|
for j = 21, 40 do
|
|
local a5 = a * 2^5
|
|
local z = a5 % 2^32
|
|
z = z + (a5 - z) / 2^32 + XOR(b, c, d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3)
|
|
e = d
|
|
d = c
|
|
c = b / 2^2
|
|
c = c % 1 * (2^32 - 1) + c
|
|
b = a
|
|
a = z % 2^32
|
|
end
|
|
for j = 41, 60 do
|
|
local a5 = a * 2^5
|
|
local z = a5 % 2^32
|
|
z = z + (a5 - z) / 2^32 + AND(d, c) + AND(b, XOR(d, c)) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5)
|
|
e = d
|
|
d = c
|
|
c = b / 2^2
|
|
c = c % 1 * (2^32 - 1) + c
|
|
b = a
|
|
a = z % 2^32
|
|
end
|
|
for j = 61, 80 do
|
|
local a5 = a * 2^5
|
|
local z = a5 % 2^32
|
|
z = z + (a5 - z) / 2^32 + XOR(b, c, d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10)
|
|
e = d
|
|
d = c
|
|
c = b / 2^2
|
|
c = c % 1 * (2^32 - 1) + c
|
|
b = a
|
|
a = z % 2^32
|
|
end
|
|
h1 = (a + h1) % 2^32
|
|
h2 = (b + h2) % 2^32
|
|
h3 = (c + h3) % 2^32
|
|
h4 = (d + h4) % 2^32
|
|
h5 = (e + h5) % 2^32
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
|
|
end
|
|
|
|
|
|
function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
|
|
-- This is an example of a Lua function having 79 local variables :-)
|
|
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
|
|
local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
|
|
local qwords_qty = block_size_in_bytes / 8
|
|
for pos = offs, offs + size - 1, block_size_in_bytes do
|
|
for j = 1, qwords_qty do
|
|
local a, b, c, d = byte(str, pos + 1, pos + 4)
|
|
lanes_lo[j] = XOR(lanes_lo[j], ((d * 256 + c) * 256 + b) * 256 + a)
|
|
pos = pos + 8
|
|
a, b, c, d = byte(str, pos - 3, pos)
|
|
lanes_hi[j] = XOR(lanes_hi[j], ((d * 256 + c) * 256 + b) * 256 + a)
|
|
end
|
|
local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,
|
|
L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
|
|
L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
|
|
lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
|
|
lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
|
|
lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
|
|
lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
|
|
lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
|
|
for round_idx = 1, 24 do
|
|
local C1_lo = XOR(L01_lo, L06_lo, L11_lo, L16_lo, L21_lo)
|
|
local C1_hi = XOR(L01_hi, L06_hi, L11_hi, L16_hi, L21_hi)
|
|
local C2_lo = XOR(L02_lo, L07_lo, L12_lo, L17_lo, L22_lo)
|
|
local C2_hi = XOR(L02_hi, L07_hi, L12_hi, L17_hi, L22_hi)
|
|
local C3_lo = XOR(L03_lo, L08_lo, L13_lo, L18_lo, L23_lo)
|
|
local C3_hi = XOR(L03_hi, L08_hi, L13_hi, L18_hi, L23_hi)
|
|
local C4_lo = XOR(L04_lo, L09_lo, L14_lo, L19_lo, L24_lo)
|
|
local C4_hi = XOR(L04_hi, L09_hi, L14_hi, L19_hi, L24_hi)
|
|
local C5_lo = XOR(L05_lo, L10_lo, L15_lo, L20_lo, L25_lo)
|
|
local C5_hi = XOR(L05_hi, L10_hi, L15_hi, L20_hi, L25_hi)
|
|
local D_lo = XOR(C1_lo, C3_lo * 2 + (C3_hi % 2^32 - C3_hi % 2^31) / 2^31)
|
|
local D_hi = XOR(C1_hi, C3_hi * 2 + (C3_lo % 2^32 - C3_lo % 2^31) / 2^31)
|
|
local T0_lo = XOR(D_lo, L02_lo)
|
|
local T0_hi = XOR(D_hi, L02_hi)
|
|
local T1_lo = XOR(D_lo, L07_lo)
|
|
local T1_hi = XOR(D_hi, L07_hi)
|
|
local T2_lo = XOR(D_lo, L12_lo)
|
|
local T2_hi = XOR(D_hi, L12_hi)
|
|
local T3_lo = XOR(D_lo, L17_lo)
|
|
local T3_hi = XOR(D_hi, L17_hi)
|
|
local T4_lo = XOR(D_lo, L22_lo)
|
|
local T4_hi = XOR(D_hi, L22_hi)
|
|
L02_lo = (T1_lo % 2^32 - T1_lo % 2^20) / 2^20 + T1_hi * 2^12
|
|
L02_hi = (T1_hi % 2^32 - T1_hi % 2^20) / 2^20 + T1_lo * 2^12
|
|
L07_lo = (T3_lo % 2^32 - T3_lo % 2^19) / 2^19 + T3_hi * 2^13
|
|
L07_hi = (T3_hi % 2^32 - T3_hi % 2^19) / 2^19 + T3_lo * 2^13
|
|
L12_lo = T0_lo * 2 + (T0_hi % 2^32 - T0_hi % 2^31) / 2^31
|
|
L12_hi = T0_hi * 2 + (T0_lo % 2^32 - T0_lo % 2^31) / 2^31
|
|
L17_lo = T2_lo * 2^10 + (T2_hi % 2^32 - T2_hi % 2^22) / 2^22
|
|
L17_hi = T2_hi * 2^10 + (T2_lo % 2^32 - T2_lo % 2^22) / 2^22
|
|
L22_lo = T4_lo * 2^2 + (T4_hi % 2^32 - T4_hi % 2^30) / 2^30
|
|
L22_hi = T4_hi * 2^2 + (T4_lo % 2^32 - T4_lo % 2^30) / 2^30
|
|
D_lo = XOR(C2_lo, C4_lo * 2 + (C4_hi % 2^32 - C4_hi % 2^31) / 2^31)
|
|
D_hi = XOR(C2_hi, C4_hi * 2 + (C4_lo % 2^32 - C4_lo % 2^31) / 2^31)
|
|
T0_lo = XOR(D_lo, L03_lo)
|
|
T0_hi = XOR(D_hi, L03_hi)
|
|
T1_lo = XOR(D_lo, L08_lo)
|
|
T1_hi = XOR(D_hi, L08_hi)
|
|
T2_lo = XOR(D_lo, L13_lo)
|
|
T2_hi = XOR(D_hi, L13_hi)
|
|
T3_lo = XOR(D_lo, L18_lo)
|
|
T3_hi = XOR(D_hi, L18_hi)
|
|
T4_lo = XOR(D_lo, L23_lo)
|
|
T4_hi = XOR(D_hi, L23_hi)
|
|
L03_lo = (T2_lo % 2^32 - T2_lo % 2^21) / 2^21 + T2_hi * 2^11
|
|
L03_hi = (T2_hi % 2^32 - T2_hi % 2^21) / 2^21 + T2_lo * 2^11
|
|
L08_lo = (T4_lo % 2^32 - T4_lo % 2^3) / 2^3 + T4_hi * 2^29 % 2^32
|
|
L08_hi = (T4_hi % 2^32 - T4_hi % 2^3) / 2^3 + T4_lo * 2^29 % 2^32
|
|
L13_lo = T1_lo * 2^6 + (T1_hi % 2^32 - T1_hi % 2^26) / 2^26
|
|
L13_hi = T1_hi * 2^6 + (T1_lo % 2^32 - T1_lo % 2^26) / 2^26
|
|
L18_lo = T3_lo * 2^15 + (T3_hi % 2^32 - T3_hi % 2^17) / 2^17
|
|
L18_hi = T3_hi * 2^15 + (T3_lo % 2^32 - T3_lo % 2^17) / 2^17
|
|
L23_lo = (T0_lo % 2^32 - T0_lo % 2^2) / 2^2 + T0_hi * 2^30 % 2^32
|
|
L23_hi = (T0_hi % 2^32 - T0_hi % 2^2) / 2^2 + T0_lo * 2^30 % 2^32
|
|
D_lo = XOR(C3_lo, C5_lo * 2 + (C5_hi % 2^32 - C5_hi % 2^31) / 2^31)
|
|
D_hi = XOR(C3_hi, C5_hi * 2 + (C5_lo % 2^32 - C5_lo % 2^31) / 2^31)
|
|
T0_lo = XOR(D_lo, L04_lo)
|
|
T0_hi = XOR(D_hi, L04_hi)
|
|
T1_lo = XOR(D_lo, L09_lo)
|
|
T1_hi = XOR(D_hi, L09_hi)
|
|
T2_lo = XOR(D_lo, L14_lo)
|
|
T2_hi = XOR(D_hi, L14_hi)
|
|
T3_lo = XOR(D_lo, L19_lo)
|
|
T3_hi = XOR(D_hi, L19_hi)
|
|
T4_lo = XOR(D_lo, L24_lo)
|
|
T4_hi = XOR(D_hi, L24_hi)
|
|
L04_lo = T3_lo * 2^21 % 2^32 + (T3_hi % 2^32 - T3_hi % 2^11) / 2^11
|
|
L04_hi = T3_hi * 2^21 % 2^32 + (T3_lo % 2^32 - T3_lo % 2^11) / 2^11
|
|
L09_lo = T0_lo * 2^28 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^4) / 2^4
|
|
L09_hi = T0_hi * 2^28 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^4) / 2^4
|
|
L14_lo = T2_lo * 2^25 % 2^32 + (T2_hi % 2^32 - T2_hi % 2^7) / 2^7
|
|
L14_hi = T2_hi * 2^25 % 2^32 + (T2_lo % 2^32 - T2_lo % 2^7) / 2^7
|
|
L19_lo = (T4_lo % 2^32 - T4_lo % 2^8) / 2^8 + T4_hi * 2^24 % 2^32
|
|
L19_hi = (T4_hi % 2^32 - T4_hi % 2^8) / 2^8 + T4_lo * 2^24 % 2^32
|
|
L24_lo = (T1_lo % 2^32 - T1_lo % 2^9) / 2^9 + T1_hi * 2^23 % 2^32
|
|
L24_hi = (T1_hi % 2^32 - T1_hi % 2^9) / 2^9 + T1_lo * 2^23 % 2^32
|
|
D_lo = XOR(C4_lo, C1_lo * 2 + (C1_hi % 2^32 - C1_hi % 2^31) / 2^31)
|
|
D_hi = XOR(C4_hi, C1_hi * 2 + (C1_lo % 2^32 - C1_lo % 2^31) / 2^31)
|
|
T0_lo = XOR(D_lo, L05_lo)
|
|
T0_hi = XOR(D_hi, L05_hi)
|
|
T1_lo = XOR(D_lo, L10_lo)
|
|
T1_hi = XOR(D_hi, L10_hi)
|
|
T2_lo = XOR(D_lo, L15_lo)
|
|
T2_hi = XOR(D_hi, L15_hi)
|
|
T3_lo = XOR(D_lo, L20_lo)
|
|
T3_hi = XOR(D_hi, L20_hi)
|
|
T4_lo = XOR(D_lo, L25_lo)
|
|
T4_hi = XOR(D_hi, L25_hi)
|
|
L05_lo = T4_lo * 2^14 + (T4_hi % 2^32 - T4_hi % 2^18) / 2^18
|
|
L05_hi = T4_hi * 2^14 + (T4_lo % 2^32 - T4_lo % 2^18) / 2^18
|
|
L10_lo = T1_lo * 2^20 % 2^32 + (T1_hi % 2^32 - T1_hi % 2^12) / 2^12
|
|
L10_hi = T1_hi * 2^20 % 2^32 + (T1_lo % 2^32 - T1_lo % 2^12) / 2^12
|
|
L15_lo = T3_lo * 2^8 + (T3_hi % 2^32 - T3_hi % 2^24) / 2^24
|
|
L15_hi = T3_hi * 2^8 + (T3_lo % 2^32 - T3_lo % 2^24) / 2^24
|
|
L20_lo = T0_lo * 2^27 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^5) / 2^5
|
|
L20_hi = T0_hi * 2^27 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^5) / 2^5
|
|
L25_lo = (T2_lo % 2^32 - T2_lo % 2^25) / 2^25 + T2_hi * 2^7
|
|
L25_hi = (T2_hi % 2^32 - T2_hi % 2^25) / 2^25 + T2_lo * 2^7
|
|
D_lo = XOR(C5_lo, C2_lo * 2 + (C2_hi % 2^32 - C2_hi % 2^31) / 2^31)
|
|
D_hi = XOR(C5_hi, C2_hi * 2 + (C2_lo % 2^32 - C2_lo % 2^31) / 2^31)
|
|
T1_lo = XOR(D_lo, L06_lo)
|
|
T1_hi = XOR(D_hi, L06_hi)
|
|
T2_lo = XOR(D_lo, L11_lo)
|
|
T2_hi = XOR(D_hi, L11_hi)
|
|
T3_lo = XOR(D_lo, L16_lo)
|
|
T3_hi = XOR(D_hi, L16_hi)
|
|
T4_lo = XOR(D_lo, L21_lo)
|
|
T4_hi = XOR(D_hi, L21_hi)
|
|
L06_lo = T2_lo * 2^3 + (T2_hi % 2^32 - T2_hi % 2^29) / 2^29
|
|
L06_hi = T2_hi * 2^3 + (T2_lo % 2^32 - T2_lo % 2^29) / 2^29
|
|
L11_lo = T4_lo * 2^18 + (T4_hi % 2^32 - T4_hi % 2^14) / 2^14
|
|
L11_hi = T4_hi * 2^18 + (T4_lo % 2^32 - T4_lo % 2^14) / 2^14
|
|
L16_lo = (T1_lo % 2^32 - T1_lo % 2^28) / 2^28 + T1_hi * 2^4
|
|
L16_hi = (T1_hi % 2^32 - T1_hi % 2^28) / 2^28 + T1_lo * 2^4
|
|
L21_lo = (T3_lo % 2^32 - T3_lo % 2^23) / 2^23 + T3_hi * 2^9
|
|
L21_hi = (T3_hi % 2^32 - T3_hi % 2^23) / 2^23 + T3_lo * 2^9
|
|
L01_lo = XOR(D_lo, L01_lo)
|
|
L01_hi = XOR(D_hi, L01_hi)
|
|
L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = XOR(L01_lo, AND(-1-L02_lo, L03_lo)), XOR(L02_lo, AND(-1-L03_lo, L04_lo)), XOR(L03_lo, AND(-1-L04_lo, L05_lo)), XOR(L04_lo, AND(-1-L05_lo, L01_lo)), XOR(L05_lo, AND(-1-L01_lo, L02_lo))
|
|
L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = XOR(L01_hi, AND(-1-L02_hi, L03_hi)), XOR(L02_hi, AND(-1-L03_hi, L04_hi)), XOR(L03_hi, AND(-1-L04_hi, L05_hi)), XOR(L04_hi, AND(-1-L05_hi, L01_hi)), XOR(L05_hi, AND(-1-L01_hi, L02_hi))
|
|
L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = XOR(L09_lo, AND(-1-L10_lo, L06_lo)), XOR(L10_lo, AND(-1-L06_lo, L07_lo)), XOR(L06_lo, AND(-1-L07_lo, L08_lo)), XOR(L07_lo, AND(-1-L08_lo, L09_lo)), XOR(L08_lo, AND(-1-L09_lo, L10_lo))
|
|
L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = XOR(L09_hi, AND(-1-L10_hi, L06_hi)), XOR(L10_hi, AND(-1-L06_hi, L07_hi)), XOR(L06_hi, AND(-1-L07_hi, L08_hi)), XOR(L07_hi, AND(-1-L08_hi, L09_hi)), XOR(L08_hi, AND(-1-L09_hi, L10_hi))
|
|
L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = XOR(L12_lo, AND(-1-L13_lo, L14_lo)), XOR(L13_lo, AND(-1-L14_lo, L15_lo)), XOR(L14_lo, AND(-1-L15_lo, L11_lo)), XOR(L15_lo, AND(-1-L11_lo, L12_lo)), XOR(L11_lo, AND(-1-L12_lo, L13_lo))
|
|
L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = XOR(L12_hi, AND(-1-L13_hi, L14_hi)), XOR(L13_hi, AND(-1-L14_hi, L15_hi)), XOR(L14_hi, AND(-1-L15_hi, L11_hi)), XOR(L15_hi, AND(-1-L11_hi, L12_hi)), XOR(L11_hi, AND(-1-L12_hi, L13_hi))
|
|
L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = XOR(L20_lo, AND(-1-L16_lo, L17_lo)), XOR(L16_lo, AND(-1-L17_lo, L18_lo)), XOR(L17_lo, AND(-1-L18_lo, L19_lo)), XOR(L18_lo, AND(-1-L19_lo, L20_lo)), XOR(L19_lo, AND(-1-L20_lo, L16_lo))
|
|
L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = XOR(L20_hi, AND(-1-L16_hi, L17_hi)), XOR(L16_hi, AND(-1-L17_hi, L18_hi)), XOR(L17_hi, AND(-1-L18_hi, L19_hi)), XOR(L18_hi, AND(-1-L19_hi, L20_hi)), XOR(L19_hi, AND(-1-L20_hi, L16_hi))
|
|
L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = XOR(L23_lo, AND(-1-L24_lo, L25_lo)), XOR(L24_lo, AND(-1-L25_lo, L21_lo)), XOR(L25_lo, AND(-1-L21_lo, L22_lo)), XOR(L21_lo, AND(-1-L22_lo, L23_lo)), XOR(L22_lo, AND(-1-L23_lo, L24_lo))
|
|
L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = XOR(L23_hi, AND(-1-L24_hi, L25_hi)), XOR(L24_hi, AND(-1-L25_hi, L21_hi)), XOR(L25_hi, AND(-1-L21_hi, L22_hi)), XOR(L21_hi, AND(-1-L22_hi, L23_hi)), XOR(L22_hi, AND(-1-L23_hi, L24_hi))
|
|
L01_lo = XOR(L01_lo, RC_lo[round_idx])
|
|
L01_hi = L01_hi + RC_hi[round_idx] -- RC_hi[] is either 0 or 0x80000000, so we could use fast addition instead of slow XOR
|
|
end
|
|
lanes_lo[1] = L01_lo; lanes_hi[1] = L01_hi
|
|
lanes_lo[2] = L02_lo; lanes_hi[2] = L02_hi
|
|
lanes_lo[3] = L03_lo; lanes_hi[3] = L03_hi
|
|
lanes_lo[4] = L04_lo; lanes_hi[4] = L04_hi
|
|
lanes_lo[5] = L05_lo; lanes_hi[5] = L05_hi
|
|
lanes_lo[6] = L06_lo; lanes_hi[6] = L06_hi
|
|
lanes_lo[7] = L07_lo; lanes_hi[7] = L07_hi
|
|
lanes_lo[8] = L08_lo; lanes_hi[8] = L08_hi
|
|
lanes_lo[9] = L09_lo; lanes_hi[9] = L09_hi
|
|
lanes_lo[10] = L10_lo; lanes_hi[10] = L10_hi
|
|
lanes_lo[11] = L11_lo; lanes_hi[11] = L11_hi
|
|
lanes_lo[12] = L12_lo; lanes_hi[12] = L12_hi
|
|
lanes_lo[13] = L13_lo; lanes_hi[13] = L13_hi
|
|
lanes_lo[14] = L14_lo; lanes_hi[14] = L14_hi
|
|
lanes_lo[15] = L15_lo; lanes_hi[15] = L15_hi
|
|
lanes_lo[16] = L16_lo; lanes_hi[16] = L16_hi
|
|
lanes_lo[17] = L17_lo; lanes_hi[17] = L17_hi
|
|
lanes_lo[18] = L18_lo; lanes_hi[18] = L18_hi
|
|
lanes_lo[19] = L19_lo; lanes_hi[19] = L19_hi
|
|
lanes_lo[20] = L20_lo; lanes_hi[20] = L20_hi
|
|
lanes_lo[21] = L21_lo; lanes_hi[21] = L21_hi
|
|
lanes_lo[22] = L22_lo; lanes_hi[22] = L22_hi
|
|
lanes_lo[23] = L23_lo; lanes_hi[23] = L23_hi
|
|
lanes_lo[24] = L24_lo; lanes_hi[24] = L24_hi
|
|
lanes_lo[25] = L25_lo; lanes_hi[25] = L25_hi
|
|
end
|
|
end
|
|
|
|
|
|
function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
|
|
for pos = offs, offs + size - 1, 64 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 64)
|
|
local t0 = bytes_compressed % 2^32
|
|
local t1 = (bytes_compressed - t0) / 2^32
|
|
vC = XOR(vC, t0) -- t0 = low_4_bytes(bytes_compressed)
|
|
vD = XOR(vD, t1) -- t1 = high_4_bytes(bytes_compressed)
|
|
if last_block_size then -- flag f0
|
|
vE = -1 - vE
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF = -1 - vF
|
|
end
|
|
for j = 1, 10 do
|
|
local row = sigma[j]
|
|
v0 = v0 + v4 + W[row[1]]
|
|
vC = XOR(vC, v0) % 2^32 / 2^16
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
v8 = v8 + vC
|
|
v4 = XOR(v4, v8) % 2^32 / 2^12
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v0 = v0 + v4 + W[row[2]]
|
|
vC = XOR(vC, v0) % 2^32 / 2^8
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
v8 = v8 + vC
|
|
v4 = XOR(v4, v8) % 2^32 / 2^7
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v1 = v1 + v5 + W[row[3]]
|
|
vD = XOR(vD, v1) % 2^32 / 2^16
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v9 = v9 + vD
|
|
v5 = XOR(v5, v9) % 2^32 / 2^12
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v1 = v1 + v5 + W[row[4]]
|
|
vD = XOR(vD, v1) % 2^32 / 2^8
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v9 = v9 + vD
|
|
v5 = XOR(v5, v9) % 2^32 / 2^7
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v2 = v2 + v6 + W[row[5]]
|
|
vE = XOR(vE, v2) % 2^32 / 2^16
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
vA = vA + vE
|
|
v6 = XOR(v6, vA) % 2^32 / 2^12
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v2 = v2 + v6 + W[row[6]]
|
|
vE = XOR(vE, v2) % 2^32 / 2^8
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
vA = vA + vE
|
|
v6 = XOR(v6, vA) % 2^32 / 2^7
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v3 = v3 + v7 + W[row[7]]
|
|
vF = XOR(vF, v3) % 2^32 / 2^16
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vB = vB + vF
|
|
v7 = XOR(v7, vB) % 2^32 / 2^12
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v3 = v3 + v7 + W[row[8]]
|
|
vF = XOR(vF, v3) % 2^32 / 2^8
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vB = vB + vF
|
|
v7 = XOR(v7, vB) % 2^32 / 2^7
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v0 = v0 + v5 + W[row[9]]
|
|
vF = XOR(vF, v0) % 2^32 / 2^16
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vA = vA + vF
|
|
v5 = XOR(v5, vA) % 2^32 / 2^12
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v0 = v0 + v5 + W[row[10]]
|
|
vF = XOR(vF, v0) % 2^32 / 2^8
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vA = vA + vF
|
|
v5 = XOR(v5, vA) % 2^32 / 2^7
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v1 = v1 + v6 + W[row[11]]
|
|
vC = XOR(vC, v1) % 2^32 / 2^16
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
vB = vB + vC
|
|
v6 = XOR(v6, vB) % 2^32 / 2^12
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v1 = v1 + v6 + W[row[12]]
|
|
vC = XOR(vC, v1) % 2^32 / 2^8
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
vB = vB + vC
|
|
v6 = XOR(v6, vB) % 2^32 / 2^7
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v2 = v2 + v7 + W[row[13]]
|
|
vD = XOR(vD, v2) % 2^32 / 2^16
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v8 = v8 + vD
|
|
v7 = XOR(v7, v8) % 2^32 / 2^12
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v2 = v2 + v7 + W[row[14]]
|
|
vD = XOR(vD, v2) % 2^32 / 2^8
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v8 = v8 + vD
|
|
v7 = XOR(v7, v8) % 2^32 / 2^7
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v3 = v3 + v4 + W[row[15]]
|
|
vE = XOR(vE, v3) % 2^32 / 2^16
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
v9 = v9 + vE
|
|
v4 = XOR(v4, v9) % 2^32 / 2^12
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v3 = v3 + v4 + W[row[16]]
|
|
vE = XOR(vE, v3) % 2^32 / 2^8
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
v9 = v9 + vE
|
|
v4 = XOR(v4, v9) % 2^32 / 2^7
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
end
|
|
h1 = XOR(h1, v0, v8)
|
|
h2 = XOR(h2, v1, v9)
|
|
h3 = XOR(h3, v2, vA)
|
|
h4 = XOR(h4, v3, vB)
|
|
h5 = XOR(h5, v4, vC)
|
|
h6 = XOR(h6, v5, vD)
|
|
h7 = XOR(h7, v6, vE)
|
|
h8 = XOR(h8, v7, vF)
|
|
end
|
|
H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
return bytes_compressed
|
|
end
|
|
|
|
|
|
function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
|
|
-- offs >= 0, size >= 0, size is multiple of 128
|
|
local W = common_W
|
|
local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
|
|
local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
|
|
for pos = offs, offs + size - 1, 128 do
|
|
if str then
|
|
for j = 1, 32 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
end
|
|
local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
|
|
local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
|
|
bytes_compressed = bytes_compressed + (last_block_size or 128)
|
|
local t0_lo = bytes_compressed % 2^32
|
|
local t0_hi = (bytes_compressed - t0_lo) / 2^32
|
|
vC_lo = XOR(vC_lo, t0_lo) -- t0 = low_8_bytes(bytes_compressed)
|
|
vC_hi = XOR(vC_hi, t0_hi)
|
|
-- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
|
|
if last_block_size then -- flag f0
|
|
vE_lo = -1 - vE_lo
|
|
vE_hi = -1 - vE_hi
|
|
end
|
|
if is_last_node then -- flag f1
|
|
vF_lo = -1 - vF_lo
|
|
vF_hi = -1 - vF_hi
|
|
end
|
|
for j = 1, 12 do
|
|
local row = sigma[j]
|
|
local k = row[1] * 2
|
|
local z = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1]
|
|
v0_lo = z % 2^32
|
|
v0_hi = v0_hi + v4_hi + (z - v0_lo) / 2^32 + W[k]
|
|
vC_lo, vC_hi = XOR(vC_hi, v0_hi), XOR(vC_lo, v0_lo)
|
|
z = v8_lo % 2^32 + vC_lo % 2^32
|
|
v8_lo = z % 2^32
|
|
v8_hi = v8_hi + vC_hi + (z - v8_lo) / 2^32
|
|
v4_lo, v4_hi = XOR(v4_lo, v8_lo), XOR(v4_hi, v8_hi)
|
|
local z_lo, z_hi = v4_lo % 2^24, v4_hi % 2^24
|
|
v4_lo, v4_hi = (v4_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v4_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[2] * 2
|
|
z = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1]
|
|
v0_lo = z % 2^32
|
|
v0_hi = v0_hi + v4_hi + (z - v0_lo) / 2^32 + W[k]
|
|
vC_lo, vC_hi = XOR(vC_lo, v0_lo), XOR(vC_hi, v0_hi)
|
|
z_lo, z_hi = vC_lo % 2^16, vC_hi % 2^16
|
|
vC_lo, vC_hi = (vC_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vC_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = v8_lo % 2^32 + vC_lo % 2^32
|
|
v8_lo = z % 2^32
|
|
v8_hi = v8_hi + vC_hi + (z - v8_lo) / 2^32
|
|
v4_lo, v4_hi = XOR(v4_lo, v8_lo), XOR(v4_hi, v8_hi)
|
|
z_lo, z_hi = v4_lo % 2^31, v4_hi % 2^31
|
|
v4_lo, v4_hi = z_lo * 2^1 + (v4_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v4_lo - z_lo) / 2^31 % 2^1
|
|
k = row[3] * 2
|
|
z = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1]
|
|
v1_lo = z % 2^32
|
|
v1_hi = v1_hi + v5_hi + (z - v1_lo) / 2^32 + W[k]
|
|
vD_lo, vD_hi = XOR(vD_hi, v1_hi), XOR(vD_lo, v1_lo)
|
|
z = v9_lo % 2^32 + vD_lo % 2^32
|
|
v9_lo = z % 2^32
|
|
v9_hi = v9_hi + vD_hi + (z - v9_lo) / 2^32
|
|
v5_lo, v5_hi = XOR(v5_lo, v9_lo), XOR(v5_hi, v9_hi)
|
|
z_lo, z_hi = v5_lo % 2^24, v5_hi % 2^24
|
|
v5_lo, v5_hi = (v5_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v5_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[4] * 2
|
|
z = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1]
|
|
v1_lo = z % 2^32
|
|
v1_hi = v1_hi + v5_hi + (z - v1_lo) / 2^32 + W[k]
|
|
vD_lo, vD_hi = XOR(vD_lo, v1_lo), XOR(vD_hi, v1_hi)
|
|
z_lo, z_hi = vD_lo % 2^16, vD_hi % 2^16
|
|
vD_lo, vD_hi = (vD_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vD_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = v9_lo % 2^32 + vD_lo % 2^32
|
|
v9_lo = z % 2^32
|
|
v9_hi = v9_hi + vD_hi + (z - v9_lo) / 2^32
|
|
v5_lo, v5_hi = XOR(v5_lo, v9_lo), XOR(v5_hi, v9_hi)
|
|
z_lo, z_hi = v5_lo % 2^31, v5_hi % 2^31
|
|
v5_lo, v5_hi = z_lo * 2^1 + (v5_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v5_lo - z_lo) / 2^31 % 2^1
|
|
k = row[5] * 2
|
|
z = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1]
|
|
v2_lo = z % 2^32
|
|
v2_hi = v2_hi + v6_hi + (z - v2_lo) / 2^32 + W[k]
|
|
vE_lo, vE_hi = XOR(vE_hi, v2_hi), XOR(vE_lo, v2_lo)
|
|
z = vA_lo % 2^32 + vE_lo % 2^32
|
|
vA_lo = z % 2^32
|
|
vA_hi = vA_hi + vE_hi + (z - vA_lo) / 2^32
|
|
v6_lo, v6_hi = XOR(v6_lo, vA_lo), XOR(v6_hi, vA_hi)
|
|
z_lo, z_hi = v6_lo % 2^24, v6_hi % 2^24
|
|
v6_lo, v6_hi = (v6_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v6_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[6] * 2
|
|
z = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1]
|
|
v2_lo = z % 2^32
|
|
v2_hi = v2_hi + v6_hi + (z - v2_lo) / 2^32 + W[k]
|
|
vE_lo, vE_hi = XOR(vE_lo, v2_lo), XOR(vE_hi, v2_hi)
|
|
z_lo, z_hi = vE_lo % 2^16, vE_hi % 2^16
|
|
vE_lo, vE_hi = (vE_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vE_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = vA_lo % 2^32 + vE_lo % 2^32
|
|
vA_lo = z % 2^32
|
|
vA_hi = vA_hi + vE_hi + (z - vA_lo) / 2^32
|
|
v6_lo, v6_hi = XOR(v6_lo, vA_lo), XOR(v6_hi, vA_hi)
|
|
z_lo, z_hi = v6_lo % 2^31, v6_hi % 2^31
|
|
v6_lo, v6_hi = z_lo * 2^1 + (v6_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v6_lo - z_lo) / 2^31 % 2^1
|
|
k = row[7] * 2
|
|
z = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1]
|
|
v3_lo = z % 2^32
|
|
v3_hi = v3_hi + v7_hi + (z - v3_lo) / 2^32 + W[k]
|
|
vF_lo, vF_hi = XOR(vF_hi, v3_hi), XOR(vF_lo, v3_lo)
|
|
z = vB_lo % 2^32 + vF_lo % 2^32
|
|
vB_lo = z % 2^32
|
|
vB_hi = vB_hi + vF_hi + (z - vB_lo) / 2^32
|
|
v7_lo, v7_hi = XOR(v7_lo, vB_lo), XOR(v7_hi, vB_hi)
|
|
z_lo, z_hi = v7_lo % 2^24, v7_hi % 2^24
|
|
v7_lo, v7_hi = (v7_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v7_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[8] * 2
|
|
z = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1]
|
|
v3_lo = z % 2^32
|
|
v3_hi = v3_hi + v7_hi + (z - v3_lo) / 2^32 + W[k]
|
|
vF_lo, vF_hi = XOR(vF_lo, v3_lo), XOR(vF_hi, v3_hi)
|
|
z_lo, z_hi = vF_lo % 2^16, vF_hi % 2^16
|
|
vF_lo, vF_hi = (vF_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vF_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = vB_lo % 2^32 + vF_lo % 2^32
|
|
vB_lo = z % 2^32
|
|
vB_hi = vB_hi + vF_hi + (z - vB_lo) / 2^32
|
|
v7_lo, v7_hi = XOR(v7_lo, vB_lo), XOR(v7_hi, vB_hi)
|
|
z_lo, z_hi = v7_lo % 2^31, v7_hi % 2^31
|
|
v7_lo, v7_hi = z_lo * 2^1 + (v7_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v7_lo - z_lo) / 2^31 % 2^1
|
|
k = row[9] * 2
|
|
z = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1]
|
|
v0_lo = z % 2^32
|
|
v0_hi = v0_hi + v5_hi + (z - v0_lo) / 2^32 + W[k]
|
|
vF_lo, vF_hi = XOR(vF_hi, v0_hi), XOR(vF_lo, v0_lo)
|
|
z = vA_lo % 2^32 + vF_lo % 2^32
|
|
vA_lo = z % 2^32
|
|
vA_hi = vA_hi + vF_hi + (z - vA_lo) / 2^32
|
|
v5_lo, v5_hi = XOR(v5_lo, vA_lo), XOR(v5_hi, vA_hi)
|
|
z_lo, z_hi = v5_lo % 2^24, v5_hi % 2^24
|
|
v5_lo, v5_hi = (v5_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v5_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[10] * 2
|
|
z = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1]
|
|
v0_lo = z % 2^32
|
|
v0_hi = v0_hi + v5_hi + (z - v0_lo) / 2^32 + W[k]
|
|
vF_lo, vF_hi = XOR(vF_lo, v0_lo), XOR(vF_hi, v0_hi)
|
|
z_lo, z_hi = vF_lo % 2^16, vF_hi % 2^16
|
|
vF_lo, vF_hi = (vF_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vF_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = vA_lo % 2^32 + vF_lo % 2^32
|
|
vA_lo = z % 2^32
|
|
vA_hi = vA_hi + vF_hi + (z - vA_lo) / 2^32
|
|
v5_lo, v5_hi = XOR(v5_lo, vA_lo), XOR(v5_hi, vA_hi)
|
|
z_lo, z_hi = v5_lo % 2^31, v5_hi % 2^31
|
|
v5_lo, v5_hi = z_lo * 2^1 + (v5_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v5_lo - z_lo) / 2^31 % 2^1
|
|
k = row[11] * 2
|
|
z = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1]
|
|
v1_lo = z % 2^32
|
|
v1_hi = v1_hi + v6_hi + (z - v1_lo) / 2^32 + W[k]
|
|
vC_lo, vC_hi = XOR(vC_hi, v1_hi), XOR(vC_lo, v1_lo)
|
|
z = vB_lo % 2^32 + vC_lo % 2^32
|
|
vB_lo = z % 2^32
|
|
vB_hi = vB_hi + vC_hi + (z - vB_lo) / 2^32
|
|
v6_lo, v6_hi = XOR(v6_lo, vB_lo), XOR(v6_hi, vB_hi)
|
|
z_lo, z_hi = v6_lo % 2^24, v6_hi % 2^24
|
|
v6_lo, v6_hi = (v6_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v6_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[12] * 2
|
|
z = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1]
|
|
v1_lo = z % 2^32
|
|
v1_hi = v1_hi + v6_hi + (z - v1_lo) / 2^32 + W[k]
|
|
vC_lo, vC_hi = XOR(vC_lo, v1_lo), XOR(vC_hi, v1_hi)
|
|
z_lo, z_hi = vC_lo % 2^16, vC_hi % 2^16
|
|
vC_lo, vC_hi = (vC_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vC_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = vB_lo % 2^32 + vC_lo % 2^32
|
|
vB_lo = z % 2^32
|
|
vB_hi = vB_hi + vC_hi + (z - vB_lo) / 2^32
|
|
v6_lo, v6_hi = XOR(v6_lo, vB_lo), XOR(v6_hi, vB_hi)
|
|
z_lo, z_hi = v6_lo % 2^31, v6_hi % 2^31
|
|
v6_lo, v6_hi = z_lo * 2^1 + (v6_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v6_lo - z_lo) / 2^31 % 2^1
|
|
k = row[13] * 2
|
|
z = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1]
|
|
v2_lo = z % 2^32
|
|
v2_hi = v2_hi + v7_hi + (z - v2_lo) / 2^32 + W[k]
|
|
vD_lo, vD_hi = XOR(vD_hi, v2_hi), XOR(vD_lo, v2_lo)
|
|
z = v8_lo % 2^32 + vD_lo % 2^32
|
|
v8_lo = z % 2^32
|
|
v8_hi = v8_hi + vD_hi + (z - v8_lo) / 2^32
|
|
v7_lo, v7_hi = XOR(v7_lo, v8_lo), XOR(v7_hi, v8_hi)
|
|
z_lo, z_hi = v7_lo % 2^24, v7_hi % 2^24
|
|
v7_lo, v7_hi = (v7_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v7_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[14] * 2
|
|
z = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1]
|
|
v2_lo = z % 2^32
|
|
v2_hi = v2_hi + v7_hi + (z - v2_lo) / 2^32 + W[k]
|
|
vD_lo, vD_hi = XOR(vD_lo, v2_lo), XOR(vD_hi, v2_hi)
|
|
z_lo, z_hi = vD_lo % 2^16, vD_hi % 2^16
|
|
vD_lo, vD_hi = (vD_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vD_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = v8_lo % 2^32 + vD_lo % 2^32
|
|
v8_lo = z % 2^32
|
|
v8_hi = v8_hi + vD_hi + (z - v8_lo) / 2^32
|
|
v7_lo, v7_hi = XOR(v7_lo, v8_lo), XOR(v7_hi, v8_hi)
|
|
z_lo, z_hi = v7_lo % 2^31, v7_hi % 2^31
|
|
v7_lo, v7_hi = z_lo * 2^1 + (v7_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v7_lo - z_lo) / 2^31 % 2^1
|
|
k = row[15] * 2
|
|
z = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1]
|
|
v3_lo = z % 2^32
|
|
v3_hi = v3_hi + v4_hi + (z - v3_lo) / 2^32 + W[k]
|
|
vE_lo, vE_hi = XOR(vE_hi, v3_hi), XOR(vE_lo, v3_lo)
|
|
z = v9_lo % 2^32 + vE_lo % 2^32
|
|
v9_lo = z % 2^32
|
|
v9_hi = v9_hi + vE_hi + (z - v9_lo) / 2^32
|
|
v4_lo, v4_hi = XOR(v4_lo, v9_lo), XOR(v4_hi, v9_hi)
|
|
z_lo, z_hi = v4_lo % 2^24, v4_hi % 2^24
|
|
v4_lo, v4_hi = (v4_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v4_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
|
|
k = row[16] * 2
|
|
z = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1]
|
|
v3_lo = z % 2^32
|
|
v3_hi = v3_hi + v4_hi + (z - v3_lo) / 2^32 + W[k]
|
|
vE_lo, vE_hi = XOR(vE_lo, v3_lo), XOR(vE_hi, v3_hi)
|
|
z_lo, z_hi = vE_lo % 2^16, vE_hi % 2^16
|
|
vE_lo, vE_hi = (vE_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vE_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
|
|
z = v9_lo % 2^32 + vE_lo % 2^32
|
|
v9_lo = z % 2^32
|
|
v9_hi = v9_hi + vE_hi + (z - v9_lo) / 2^32
|
|
v4_lo, v4_hi = XOR(v4_lo, v9_lo), XOR(v4_hi, v9_hi)
|
|
z_lo, z_hi = v4_lo % 2^31, v4_hi % 2^31
|
|
v4_lo, v4_hi = z_lo * 2^1 + (v4_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v4_lo - z_lo) / 2^31 % 2^1
|
|
end
|
|
h1_lo = XOR(h1_lo, v0_lo, v8_lo) % 2^32
|
|
h2_lo = XOR(h2_lo, v1_lo, v9_lo) % 2^32
|
|
h3_lo = XOR(h3_lo, v2_lo, vA_lo) % 2^32
|
|
h4_lo = XOR(h4_lo, v3_lo, vB_lo) % 2^32
|
|
h5_lo = XOR(h5_lo, v4_lo, vC_lo) % 2^32
|
|
h6_lo = XOR(h6_lo, v5_lo, vD_lo) % 2^32
|
|
h7_lo = XOR(h7_lo, v6_lo, vE_lo) % 2^32
|
|
h8_lo = XOR(h8_lo, v7_lo, vF_lo) % 2^32
|
|
h1_hi = XOR(h1_hi, v0_hi, v8_hi) % 2^32
|
|
h2_hi = XOR(h2_hi, v1_hi, v9_hi) % 2^32
|
|
h3_hi = XOR(h3_hi, v2_hi, vA_hi) % 2^32
|
|
h4_hi = XOR(h4_hi, v3_hi, vB_hi) % 2^32
|
|
h5_hi = XOR(h5_hi, v4_hi, vC_hi) % 2^32
|
|
h6_hi = XOR(h6_hi, v5_hi, vD_hi) % 2^32
|
|
h7_hi = XOR(h7_hi, v6_hi, vE_hi) % 2^32
|
|
h8_hi = XOR(h8_hi, v7_hi, vF_hi) % 2^32
|
|
end
|
|
H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
|
|
H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
|
|
return bytes_compressed
|
|
end
|
|
|
|
|
|
function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
|
|
-- offs >= 0, size >= 0, size is multiple of 64
|
|
block_length = block_length or 64
|
|
local W = common_W
|
|
local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
|
|
H_out = H_out or H_in
|
|
for pos = offs, offs + size - 1, 64 do
|
|
if str then
|
|
for j = 1, 16 do
|
|
pos = pos + 4
|
|
local a, b, c, d = byte(str, pos - 3, pos)
|
|
W[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
end
|
|
local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
|
|
local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
|
|
local vC = chunk_index % 2^32 -- t0 = low_4_bytes(chunk_index)
|
|
local vD = (chunk_index - vC) / 2^32 -- t1 = high_4_bytes(chunk_index)
|
|
local vE, vF = block_length, flags
|
|
for j = 1, 7 do
|
|
v0 = v0 + v4 + W[perm_blake3[j]]
|
|
vC = XOR(vC, v0) % 2^32 / 2^16
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
v8 = v8 + vC
|
|
v4 = XOR(v4, v8) % 2^32 / 2^12
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v0 = v0 + v4 + W[perm_blake3[j + 14]]
|
|
vC = XOR(vC, v0) % 2^32 / 2^8
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
v8 = v8 + vC
|
|
v4 = XOR(v4, v8) % 2^32 / 2^7
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v1 = v1 + v5 + W[perm_blake3[j + 1]]
|
|
vD = XOR(vD, v1) % 2^32 / 2^16
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v9 = v9 + vD
|
|
v5 = XOR(v5, v9) % 2^32 / 2^12
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v1 = v1 + v5 + W[perm_blake3[j + 2]]
|
|
vD = XOR(vD, v1) % 2^32 / 2^8
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v9 = v9 + vD
|
|
v5 = XOR(v5, v9) % 2^32 / 2^7
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v2 = v2 + v6 + W[perm_blake3[j + 16]]
|
|
vE = XOR(vE, v2) % 2^32 / 2^16
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
vA = vA + vE
|
|
v6 = XOR(v6, vA) % 2^32 / 2^12
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v2 = v2 + v6 + W[perm_blake3[j + 7]]
|
|
vE = XOR(vE, v2) % 2^32 / 2^8
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
vA = vA + vE
|
|
v6 = XOR(v6, vA) % 2^32 / 2^7
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v3 = v3 + v7 + W[perm_blake3[j + 15]]
|
|
vF = XOR(vF, v3) % 2^32 / 2^16
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vB = vB + vF
|
|
v7 = XOR(v7, vB) % 2^32 / 2^12
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v3 = v3 + v7 + W[perm_blake3[j + 17]]
|
|
vF = XOR(vF, v3) % 2^32 / 2^8
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vB = vB + vF
|
|
v7 = XOR(v7, vB) % 2^32 / 2^7
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v0 = v0 + v5 + W[perm_blake3[j + 21]]
|
|
vF = XOR(vF, v0) % 2^32 / 2^16
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vA = vA + vF
|
|
v5 = XOR(v5, vA) % 2^32 / 2^12
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v0 = v0 + v5 + W[perm_blake3[j + 5]]
|
|
vF = XOR(vF, v0) % 2^32 / 2^8
|
|
vF = vF % 1 * (2^32 - 1) + vF
|
|
vA = vA + vF
|
|
v5 = XOR(v5, vA) % 2^32 / 2^7
|
|
v5 = v5 % 1 * (2^32 - 1) + v5
|
|
v1 = v1 + v6 + W[perm_blake3[j + 3]]
|
|
vC = XOR(vC, v1) % 2^32 / 2^16
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
vB = vB + vC
|
|
v6 = XOR(v6, vB) % 2^32 / 2^12
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v1 = v1 + v6 + W[perm_blake3[j + 6]]
|
|
vC = XOR(vC, v1) % 2^32 / 2^8
|
|
vC = vC % 1 * (2^32 - 1) + vC
|
|
vB = vB + vC
|
|
v6 = XOR(v6, vB) % 2^32 / 2^7
|
|
v6 = v6 % 1 * (2^32 - 1) + v6
|
|
v2 = v2 + v7 + W[perm_blake3[j + 4]]
|
|
vD = XOR(vD, v2) % 2^32 / 2^16
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v8 = v8 + vD
|
|
v7 = XOR(v7, v8) % 2^32 / 2^12
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v2 = v2 + v7 + W[perm_blake3[j + 18]]
|
|
vD = XOR(vD, v2) % 2^32 / 2^8
|
|
vD = vD % 1 * (2^32 - 1) + vD
|
|
v8 = v8 + vD
|
|
v7 = XOR(v7, v8) % 2^32 / 2^7
|
|
v7 = v7 % 1 * (2^32 - 1) + v7
|
|
v3 = v3 + v4 + W[perm_blake3[j + 19]]
|
|
vE = XOR(vE, v3) % 2^32 / 2^16
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
v9 = v9 + vE
|
|
v4 = XOR(v4, v9) % 2^32 / 2^12
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
v3 = v3 + v4 + W[perm_blake3[j + 20]]
|
|
vE = XOR(vE, v3) % 2^32 / 2^8
|
|
vE = vE % 1 * (2^32 - 1) + vE
|
|
v9 = v9 + vE
|
|
v4 = XOR(v4, v9) % 2^32 / 2^7
|
|
v4 = v4 % 1 * (2^32 - 1) + v4
|
|
end
|
|
if wide_output then
|
|
H_out[ 9] = XOR(h1, v8)
|
|
H_out[10] = XOR(h2, v9)
|
|
H_out[11] = XOR(h3, vA)
|
|
H_out[12] = XOR(h4, vB)
|
|
H_out[13] = XOR(h5, vC)
|
|
H_out[14] = XOR(h6, vD)
|
|
H_out[15] = XOR(h7, vE)
|
|
H_out[16] = XOR(h8, vF)
|
|
end
|
|
h1 = XOR(v0, v8)
|
|
h2 = XOR(v1, v9)
|
|
h3 = XOR(v2, vA)
|
|
h4 = XOR(v3, vB)
|
|
h5 = XOR(v4, vC)
|
|
h6 = XOR(v5, vD)
|
|
h7 = XOR(v6, vE)
|
|
h8 = XOR(v7, vF)
|
|
end
|
|
H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
|
|
end
|
|
|
|
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- MAGIC NUMBERS CALCULATOR
|
|
--------------------------------------------------------------------------------
|
|
-- Q:
|
|
-- Is 53-bit "double" math enough to calculate square roots and cube roots of primes with 64 correct bits after decimal point?
|
|
-- A:
|
|
-- Yes, 53-bit "double" arithmetic is enough.
|
|
-- We could obtain first 40 bits by direct calculation of p^(1/3) and next 40 bits by one step of Newton's method.
|
|
|
|
do
|
|
local function mul(src1, src2, factor, result_length)
|
|
-- src1, src2 - long integers (arrays of digits in base 2^24)
|
|
-- factor - small integer
|
|
-- returns long integer result (src1 * src2 * factor) and its floating point approximation
|
|
local result, carry, value, weight = {}, 0.0, 0.0, 1.0
|
|
for j = 1, result_length do
|
|
for k = math_max(1, j + 1 - #src2), math_min(j, #src1) do
|
|
carry = carry + factor * src1[k] * src2[j + 1 - k] -- "int32" is not enough for multiplication result, that's why "factor" must be of type "double"
|
|
end
|
|
local digit = carry % 2^24
|
|
result[j] = floor(digit)
|
|
carry = (carry - digit) / 2^24
|
|
value = value + digit * weight
|
|
weight = weight * 2^24
|
|
end
|
|
return result, value
|
|
end
|
|
|
|
local idx, step, p, one, sqrt_hi, sqrt_lo = 0, {4, 1, 2, -2, 2}, 4, {1}, sha2_H_hi, sha2_H_lo
|
|
repeat
|
|
p = p + step[p % 6]
|
|
local d = 1
|
|
repeat
|
|
d = d + step[d % 6]
|
|
if d*d > p then -- next prime number is found
|
|
local root = p^(1/3)
|
|
local R = root * 2^40
|
|
R = mul({R - R % 1}, one, 1.0, 2)
|
|
local _, delta = mul(R, mul(R, R, 1.0, 4), -1.0, 4)
|
|
local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
|
|
local lo = R[1] % 256 * 16777216 + floor(delta * (2^-56 / 3) * root / p)
|
|
if idx < 16 then
|
|
root = p^(1/2)
|
|
R = root * 2^40
|
|
R = mul({R - R % 1}, one, 1.0, 2)
|
|
_, delta = mul(R, R, -1.0, 2)
|
|
local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
|
|
local lo = R[1] % 256 * 16777216 + floor(delta * 2^-17 / root)
|
|
local idx = idx % 8 + 1
|
|
sha2_H_ext256[224][idx] = lo
|
|
sqrt_hi[idx], sqrt_lo[idx] = hi, lo + hi * hi_factor
|
|
if idx > 7 then
|
|
sqrt_hi, sqrt_lo = sha2_H_ext512_hi[384], sha2_H_ext512_lo[384]
|
|
end
|
|
end
|
|
idx = idx + 1
|
|
sha2_K_hi[idx], sha2_K_lo[idx] = hi, lo % K_lo_modulo + hi * hi_factor
|
|
break
|
|
end
|
|
until p % d == 0
|
|
until idx > 79
|
|
end
|
|
|
|
-- Calculating IVs for SHA512/224 and SHA512/256
|
|
for width = 224, 256, 32 do
|
|
local H_lo, H_hi = {}
|
|
if HEX64 then
|
|
for j = 1, 8 do
|
|
H_lo[j] = XORA5(sha2_H_lo[j])
|
|
end
|
|
else
|
|
H_hi = {}
|
|
for j = 1, 8 do
|
|
H_lo[j] = XORA5(sha2_H_lo[j])
|
|
H_hi[j] = XORA5(sha2_H_hi[j])
|
|
end
|
|
end
|
|
sha512_feed_128(H_lo, H_hi, "SHA-512/"..tostring(width).."\128"..string_rep("\0", 115).."\88", 0, 128)
|
|
sha2_H_ext512_lo[width] = H_lo
|
|
sha2_H_ext512_hi[width] = H_hi
|
|
end
|
|
|
|
-- Constants for MD5
|
|
do
|
|
local sin, abs, modf = math.sin, math.abs, math.modf
|
|
for idx = 1, 64 do
|
|
-- we can't use formula floor(abs(sin(idx))*2^32) because its result may be beyond integer range on Lua built with 32-bit integers
|
|
local hi, lo = modf(abs(sin(idx)) * 2^16)
|
|
md5_K[idx] = hi * 65536 + floor(lo * 2^16)
|
|
end
|
|
end
|
|
|
|
-- Constants for SHA-3
|
|
do
|
|
local sh_reg = 29
|
|
|
|
local function next_bit()
|
|
local r = sh_reg % 2
|
|
sh_reg = XOR_BYTE((sh_reg - r) / 2, 142 * r)
|
|
return r
|
|
end
|
|
|
|
for idx = 1, 24 do
|
|
local lo, m = 0
|
|
for _ = 1, 6 do
|
|
m = m and m * m * 2 or 1
|
|
lo = lo + next_bit() * m
|
|
end
|
|
local hi = next_bit() * m
|
|
sha3_RC_hi[idx], sha3_RC_lo[idx] = hi, lo + hi * hi_factor_keccak
|
|
end
|
|
end
|
|
|
|
if branch == "FFI" then
|
|
sha2_K_hi = ffi.new("uint32_t[?]", #sha2_K_hi + 1, 0, unpack(sha2_K_hi))
|
|
sha2_K_lo = ffi.new("int64_t[?]", #sha2_K_lo + 1, 0, unpack(sha2_K_lo))
|
|
--md5_K = ffi.new("uint32_t[?]", #md5_K + 1, 0, unpack(md5_K))
|
|
if hi_factor_keccak == 0 then
|
|
sha3_RC_lo = ffi.new("uint32_t[?]", #sha3_RC_lo + 1, 0, unpack(sha3_RC_lo))
|
|
sha3_RC_hi = ffi.new("uint32_t[?]", #sha3_RC_hi + 1, 0, unpack(sha3_RC_hi))
|
|
else
|
|
sha3_RC_lo = ffi.new("int64_t[?]", #sha3_RC_lo + 1, 0, unpack(sha3_RC_lo))
|
|
end
|
|
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- MAIN FUNCTIONS
|
|
--------------------------------------------------------------------------------
|
|
|
|
local function sha256ext(width, message)
|
|
-- Create an instance (private objects for current calculation)
|
|
local H, length, tail = {unpack(sha2_H_ext256[width])}, 0.0, ""
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
length = length + #message_part
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part >= 64 then
|
|
offs = 64 - #tail
|
|
sha256_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size % 64
|
|
sha256_feed_64(H, message_part, offs, size - size_tail)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
|
|
tail = nil
|
|
-- Assuming user data length is shorter than (2^53)-9 bytes
|
|
-- Anyway, it looks very unrealistic that someone would spend more than a year of calculations to process 2^53 bytes of data by using this Lua script :-)
|
|
-- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
|
|
length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move decimal point to the left
|
|
for j = 4, 10 do
|
|
length = length % 1 * 256
|
|
final_blocks[j] = char(floor(length))
|
|
end
|
|
final_blocks = table_concat(final_blocks)
|
|
sha256_feed_64(H, final_blocks, 0, #final_blocks)
|
|
local max_reg = width / 32
|
|
for j = 1, max_reg do
|
|
H[j] = HEX(H[j])
|
|
end
|
|
H = table_concat(H, "", 1, max_reg)
|
|
end
|
|
return H
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the SHA256 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get SHA256 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local function sha512ext(width, message)
|
|
-- Create an instance (private objects for current calculation)
|
|
local length, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_ext512_lo[width])}, not HEX64 and {unpack(sha2_H_ext512_hi[width])}
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
length = length + #message_part
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part >= 128 then
|
|
offs = 128 - #tail
|
|
sha512_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size % 128
|
|
sha512_feed_128(H_lo, H_hi, message_part, offs, size - size_tail)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
local final_blocks = {tail, "\128", string_rep("\0", (-17-length) % 128 + 9)}
|
|
tail = nil
|
|
-- Assuming user data length is shorter than (2^53)-17 bytes
|
|
-- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
|
|
length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move floating point to the left
|
|
for j = 4, 10 do
|
|
length = length % 1 * 256
|
|
final_blocks[j] = char(floor(length))
|
|
end
|
|
final_blocks = table_concat(final_blocks)
|
|
sha512_feed_128(H_lo, H_hi, final_blocks, 0, #final_blocks)
|
|
local max_reg = ceil(width / 64)
|
|
if HEX64 then
|
|
for j = 1, max_reg do
|
|
H_lo[j] = HEX64(H_lo[j])
|
|
end
|
|
else
|
|
for j = 1, max_reg do
|
|
H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
|
|
end
|
|
H_hi = nil
|
|
end
|
|
H_lo = sub(table_concat(H_lo, "", 1, max_reg), 1, width / 4)
|
|
end
|
|
return H_lo
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the SHA512 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get SHA512 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local function md5(message)
|
|
-- Create an instance (private objects for current calculation)
|
|
local H, length, tail = {unpack(md5_sha1_H, 1, 4)}, 0.0, ""
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
length = length + #message_part
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part >= 64 then
|
|
offs = 64 - #tail
|
|
md5_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size % 64
|
|
md5_feed_64(H, message_part, offs, size - size_tail)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64)}
|
|
tail = nil
|
|
length = length * 8 -- convert "byte-counter" to "bit-counter"
|
|
for j = 4, 11 do
|
|
local low_byte = length % 256
|
|
final_blocks[j] = char(low_byte)
|
|
length = (length - low_byte) / 256
|
|
end
|
|
final_blocks = table_concat(final_blocks)
|
|
md5_feed_64(H, final_blocks, 0, #final_blocks)
|
|
for j = 1, 4 do
|
|
H[j] = HEX(H[j])
|
|
end
|
|
H = gsub(table_concat(H), "(..)(..)(..)(..)", "%4%3%2%1")
|
|
end
|
|
return H
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the MD5 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get MD5 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local function sha1(message)
|
|
-- Create an instance (private objects for current calculation)
|
|
local H, length, tail = {unpack(md5_sha1_H)}, 0.0, ""
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
length = length + #message_part
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part >= 64 then
|
|
offs = 64 - #tail
|
|
sha1_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size % 64
|
|
sha1_feed_64(H, message_part, offs, size - size_tail)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
|
|
tail = nil
|
|
-- Assuming user data length is shorter than (2^53)-9 bytes
|
|
-- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
|
|
length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move decimal point to the left
|
|
for j = 4, 10 do
|
|
length = length % 1 * 256
|
|
final_blocks[j] = char(floor(length))
|
|
end
|
|
final_blocks = table_concat(final_blocks)
|
|
sha1_feed_64(H, final_blocks, 0, #final_blocks)
|
|
for j = 1, 5 do
|
|
H[j] = HEX(H[j])
|
|
end
|
|
H = table_concat(H)
|
|
end
|
|
return H
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the SHA-1 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get SHA-1 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local function keccak(block_size_in_bytes, digest_size_in_bytes, is_SHAKE, message)
|
|
-- "block_size_in_bytes" is multiple of 8
|
|
if type(digest_size_in_bytes) ~= "number" then
|
|
-- arguments in SHAKE are swapped:
|
|
-- NIST FIPS 202 defines SHAKE(message,num_bits)
|
|
-- this module defines SHAKE(num_bytes,message)
|
|
-- it's easy to forget about this swap, hence the check
|
|
error("Argument 'digest_size_in_bytes' must be a number", 2)
|
|
end
|
|
-- Create an instance (private objects for current calculation)
|
|
local tail, lanes_lo, lanes_hi = "", create_array_of_lanes(), hi_factor_keccak == 0 and create_array_of_lanes()
|
|
local result
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part >= block_size_in_bytes then
|
|
offs = block_size_in_bytes - #tail
|
|
keccak_feed(lanes_lo, lanes_hi, tail..sub(message_part, 1, offs), 0, block_size_in_bytes, block_size_in_bytes)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size % block_size_in_bytes
|
|
keccak_feed(lanes_lo, lanes_hi, message_part, offs, size - size_tail, block_size_in_bytes)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
-- append the following bits to the message: for usual SHA-3: 011(0*)1, for SHAKE: 11111(0*)1
|
|
local gap_start = is_SHAKE and 31 or 6
|
|
tail = tail..(#tail + 1 == block_size_in_bytes and char(gap_start + 128) or char(gap_start)..string_rep("\0", (-2 - #tail) % block_size_in_bytes).."\128")
|
|
keccak_feed(lanes_lo, lanes_hi, tail, 0, #tail, block_size_in_bytes)
|
|
tail = nil
|
|
local lanes_used = 0
|
|
local total_lanes = floor(block_size_in_bytes / 8)
|
|
local qwords = {}
|
|
|
|
local function get_next_qwords_of_digest(qwords_qty)
|
|
-- returns not more than 'qwords_qty' qwords ('qwords_qty' might be non-integer)
|
|
-- doesn't go across keccak-buffer boundary
|
|
-- block_size_in_bytes is a multiple of 8, so, keccak-buffer contains integer number of qwords
|
|
if lanes_used >= total_lanes then
|
|
keccak_feed(lanes_lo, lanes_hi, "\0\0\0\0\0\0\0\0", 0, 8, 8)
|
|
lanes_used = 0
|
|
end
|
|
qwords_qty = floor(math_min(qwords_qty, total_lanes - lanes_used))
|
|
if hi_factor_keccak ~= 0 then
|
|
for j = 1, qwords_qty do
|
|
qwords[j] = HEX64(lanes_lo[lanes_used + j - 1 + lanes_index_base])
|
|
end
|
|
else
|
|
for j = 1, qwords_qty do
|
|
qwords[j] = HEX(lanes_hi[lanes_used + j])..HEX(lanes_lo[lanes_used + j])
|
|
end
|
|
end
|
|
lanes_used = lanes_used + qwords_qty
|
|
return
|
|
gsub(table_concat(qwords, "", 1, qwords_qty), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"),
|
|
qwords_qty * 8
|
|
end
|
|
|
|
local parts = {} -- digest parts
|
|
local last_part, last_part_size = "", 0
|
|
|
|
local function get_next_part_of_digest(bytes_needed)
|
|
-- returns 'bytes_needed' bytes, for arbitrary integer 'bytes_needed'
|
|
bytes_needed = bytes_needed or 1
|
|
if bytes_needed <= last_part_size then
|
|
last_part_size = last_part_size - bytes_needed
|
|
local part_size_in_nibbles = bytes_needed * 2
|
|
local result = sub(last_part, 1, part_size_in_nibbles)
|
|
last_part = sub(last_part, part_size_in_nibbles + 1)
|
|
return result
|
|
end
|
|
local parts_qty = 0
|
|
if last_part_size > 0 then
|
|
parts_qty = 1
|
|
parts[parts_qty] = last_part
|
|
bytes_needed = bytes_needed - last_part_size
|
|
end
|
|
-- repeats until the length is enough
|
|
while bytes_needed >= 8 do
|
|
local next_part, next_part_size = get_next_qwords_of_digest(bytes_needed / 8)
|
|
parts_qty = parts_qty + 1
|
|
parts[parts_qty] = next_part
|
|
bytes_needed = bytes_needed - next_part_size
|
|
end
|
|
if bytes_needed > 0 then
|
|
last_part, last_part_size = get_next_qwords_of_digest(1)
|
|
parts_qty = parts_qty + 1
|
|
parts[parts_qty] = get_next_part_of_digest(bytes_needed)
|
|
else
|
|
last_part, last_part_size = "", 0
|
|
end
|
|
return table_concat(parts, "", 1, parts_qty)
|
|
end
|
|
|
|
if digest_size_in_bytes < 0 then
|
|
result = get_next_part_of_digest
|
|
else
|
|
result = get_next_part_of_digest(digest_size_in_bytes)
|
|
end
|
|
end
|
|
return result
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the SHA-3 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get SHA-3 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local hex_to_bin, bin_to_hex, bin_to_base64, base64_to_bin
|
|
do
|
|
function hex_to_bin(hex_string)
|
|
return (gsub(hex_string, "%x%x",
|
|
function (hh)
|
|
return char(tonumber(hh, 16))
|
|
end
|
|
))
|
|
end
|
|
|
|
function bin_to_hex(binary_string)
|
|
return (gsub(binary_string, ".",
|
|
function (c)
|
|
return string_format("%02x", byte(c))
|
|
end
|
|
))
|
|
end
|
|
|
|
local base64_symbols = {
|
|
['+'] = 62, ['-'] = 62, [62] = '+',
|
|
['/'] = 63, ['_'] = 63, [63] = '/',
|
|
['='] = -1, ['.'] = -1, [-1] = '='
|
|
}
|
|
local symbol_index = 0
|
|
for j, pair in ipairs{'AZ', 'az', '09'} do
|
|
for ascii = byte(pair), byte(pair, 2) do
|
|
local ch = char(ascii)
|
|
base64_symbols[ch] = symbol_index
|
|
base64_symbols[symbol_index] = ch
|
|
symbol_index = symbol_index + 1
|
|
end
|
|
end
|
|
|
|
function bin_to_base64(binary_string)
|
|
local result = {}
|
|
for pos = 1, #binary_string, 3 do
|
|
local c1, c2, c3, c4 = byte(sub(binary_string, pos, pos + 2)..'\0', 1, -1)
|
|
result[#result + 1] =
|
|
base64_symbols[floor(c1 / 4)]
|
|
..base64_symbols[c1 % 4 * 16 + floor(c2 / 16)]
|
|
..base64_symbols[c3 and c2 % 16 * 4 + floor(c3 / 64) or -1]
|
|
..base64_symbols[c4 and c3 % 64 or -1]
|
|
end
|
|
return table_concat(result)
|
|
end
|
|
|
|
function base64_to_bin(base64_string)
|
|
local result, chars_qty = {}, 3
|
|
for pos, ch in gmatch(gsub(base64_string, '%s+', ''), '()(.)') do
|
|
local code = base64_symbols[ch]
|
|
if code < 0 then
|
|
chars_qty = chars_qty - 1
|
|
code = 0
|
|
end
|
|
local idx = pos % 4
|
|
if idx > 0 then
|
|
result[-idx] = code
|
|
else
|
|
local c1 = result[-1] * 4 + floor(result[-2] / 16)
|
|
local c2 = (result[-2] % 16) * 16 + floor(result[-3] / 4)
|
|
local c3 = (result[-3] % 4) * 64 + code
|
|
result[#result + 1] = sub(char(c1, c2, c3), 1, chars_qty)
|
|
end
|
|
end
|
|
return table_concat(result)
|
|
end
|
|
|
|
end
|
|
|
|
|
|
local block_size_for_HMAC -- this table will be initialized at the end of the module
|
|
|
|
local function pad_and_xor(str, result_length, byte_for_xor)
|
|
return gsub(str, ".",
|
|
function(c)
|
|
return char(XOR_BYTE(byte(c), byte_for_xor))
|
|
end
|
|
)..string_rep(char(byte_for_xor), result_length - #str)
|
|
end
|
|
|
|
local function hmac(hash_func, key, message)
|
|
-- Create an instance (private objects for current calculation)
|
|
local block_size = block_size_for_HMAC[hash_func]
|
|
if not block_size then
|
|
error("Unknown hash function", 2)
|
|
end
|
|
if #key > block_size then
|
|
key = hex_to_bin(hash_func(key))
|
|
end
|
|
local append = hash_func()(pad_and_xor(key, block_size, 0x36))
|
|
local result
|
|
|
|
local function partial(message_part)
|
|
if not message_part then
|
|
result = result or hash_func(pad_and_xor(key, block_size, 0x5C)..hex_to_bin(append()))
|
|
return result
|
|
elseif result then
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
else
|
|
append(message_part)
|
|
return partial
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the HMAC of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading of a message
|
|
-- User should feed every chunk of the message as single argument to this function and finally get HMAC by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
|
|
local function xor_blake2_salt(salt, letter, H_lo, H_hi)
|
|
-- salt: concatenation of "Salt"+"Personalization" fields
|
|
local max_size = letter == "s" and 16 or 32
|
|
local salt_size = #salt
|
|
if salt_size > max_size then
|
|
error(string_format("For BLAKE2%s/BLAKE2%sp/BLAKE2X%s the 'salt' parameter length must not exceed %d bytes", letter, letter, letter, max_size), 2)
|
|
end
|
|
if H_lo then
|
|
local offset, blake2_word_size, xor = 0, letter == "s" and 4 or 8, letter == "s" and XOR or XORA5
|
|
for j = 5, 4 + ceil(salt_size / blake2_word_size) do
|
|
local prev, last
|
|
for _ = 1, blake2_word_size, 4 do
|
|
offset = offset + 4
|
|
local a, b, c, d = byte(salt, offset - 3, offset)
|
|
local four_bytes = (((d or 0) * 256 + (c or 0)) * 256 + (b or 0)) * 256 + (a or 0)
|
|
prev, last = last, four_bytes
|
|
end
|
|
H_lo[j] = xor(H_lo[j], prev and last * hi_factor + prev or last)
|
|
if H_hi then
|
|
H_hi[j] = xor(H_hi[j], last)
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
local function blake2s(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 32 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 16 bytes, by default empty string
|
|
-- digest_size_in_bytes: (optional) integer from 1 to 32, by default 32
|
|
-- The last two parameters "XOF_length" and "B2_offset" are for internal use only, user must omit them (or pass nil)
|
|
digest_size_in_bytes = digest_size_in_bytes or 32
|
|
if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
|
|
error("BLAKE2s digest length must be from 1 to 32 bytes", 2)
|
|
end
|
|
key = key or ""
|
|
local key_length = #key
|
|
if key_length > 32 then
|
|
error("BLAKE2s key length must not exceed 32 bytes", 2)
|
|
end
|
|
salt = salt or ""
|
|
local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
|
|
if B2_offset then
|
|
H[1] = XOR(H[1], digest_size_in_bytes)
|
|
H[2] = XOR(H[2], 0x20)
|
|
H[3] = XOR(H[3], B2_offset)
|
|
H[4] = XOR(H[4], 0x20000000 + XOF_length)
|
|
else
|
|
H[1] = XOR(H[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
|
|
if XOF_length then
|
|
H[4] = XOR(H[4], XOF_length)
|
|
end
|
|
end
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "s", H)
|
|
end
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part > 64 then
|
|
offs = 64 - #tail
|
|
bytes_compressed = blake2s_feed_64(H, tail..sub(message_part, 1, offs), 0, 64, bytes_compressed)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size > 0 and (size - 1) % 64 + 1 or 0
|
|
bytes_compressed = blake2s_feed_64(H, message_part, offs, size - size_tail, bytes_compressed)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
if B2_offset then
|
|
blake2s_feed_64(H, nil, 0, 64, 0, 32)
|
|
else
|
|
blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail)
|
|
end
|
|
tail = nil
|
|
if not XOF_length or B2_offset then
|
|
local max_reg = ceil(digest_size_in_bytes / 4)
|
|
for j = 1, max_reg do
|
|
H[j] = HEX(H[j])
|
|
end
|
|
H = sub(gsub(table_concat(H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
|
|
end
|
|
end
|
|
return H
|
|
end
|
|
end
|
|
|
|
if key_length > 0 then
|
|
partial(key..string_rep("\0", 64 - key_length))
|
|
end
|
|
if B2_offset then
|
|
return partial()
|
|
elseif message then
|
|
-- Actually perform calculations and return the BLAKE2s digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE2s digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
local function blake2b(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 64 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 32 bytes, by default empty string
|
|
-- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
|
|
-- The last two parameters "XOF_length" and "B2_offset" are for internal use only, user must omit them (or pass nil)
|
|
digest_size_in_bytes = floor(digest_size_in_bytes or 64)
|
|
if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
|
|
error("BLAKE2b digest length must be from 1 to 64 bytes", 2)
|
|
end
|
|
key = key or ""
|
|
local key_length = #key
|
|
if key_length > 64 then
|
|
error("BLAKE2b key length must not exceed 64 bytes", 2)
|
|
end
|
|
salt = salt or ""
|
|
local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
|
|
if B2_offset then
|
|
if H_hi then
|
|
H_lo[1] = XORA5(H_lo[1], digest_size_in_bytes)
|
|
H_hi[1] = XORA5(H_hi[1], 0x40)
|
|
H_lo[2] = XORA5(H_lo[2], B2_offset)
|
|
H_hi[2] = XORA5(H_hi[2], XOF_length)
|
|
else
|
|
H_lo[1] = XORA5(H_lo[1], 0x40 * hi_factor + digest_size_in_bytes)
|
|
H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor + B2_offset)
|
|
end
|
|
H_lo[3] = XORA5(H_lo[3], 0x4000)
|
|
else
|
|
H_lo[1] = XORA5(H_lo[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
|
|
if XOF_length then
|
|
if H_hi then
|
|
H_hi[2] = XORA5(H_hi[2], XOF_length)
|
|
else
|
|
H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor)
|
|
end
|
|
end
|
|
end
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "b", H_lo, H_hi)
|
|
end
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part > 128 then
|
|
offs = 128 - #tail
|
|
bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128, bytes_compressed)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size > 0 and (size - 1) % 128 + 1 or 0
|
|
bytes_compressed = blake2b_feed_128(H_lo, H_hi, message_part, offs, size - size_tail, bytes_compressed)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
if B2_offset then
|
|
blake2b_feed_128(H_lo, H_hi, nil, 0, 128, 0, 64)
|
|
else
|
|
blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail)
|
|
end
|
|
tail = nil
|
|
if XOF_length and not B2_offset then
|
|
if H_hi then
|
|
for j = 8, 1, -1 do
|
|
H_lo[j*2] = H_hi[j]
|
|
H_lo[j*2-1] = H_lo[j]
|
|
end
|
|
return H_lo, 16
|
|
end
|
|
else
|
|
local max_reg = ceil(digest_size_in_bytes / 8)
|
|
if H_hi then
|
|
for j = 1, max_reg do
|
|
H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
|
|
end
|
|
else
|
|
for j = 1, max_reg do
|
|
H_lo[j] = HEX64(H_lo[j])
|
|
end
|
|
end
|
|
H_lo = sub(gsub(table_concat(H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
|
|
end
|
|
H_hi = nil
|
|
end
|
|
return H_lo
|
|
end
|
|
end
|
|
|
|
if key_length > 0 then
|
|
partial(key..string_rep("\0", 128 - key_length))
|
|
end
|
|
if B2_offset then
|
|
return partial()
|
|
elseif message then
|
|
-- Actually perform calculations and return the BLAKE2b digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE2b digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
local function blake2sp(message, key, salt, digest_size_in_bytes)
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 32 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 16 bytes, by default empty string
|
|
-- digest_size_in_bytes: (optional) integer from 1 to 32, by default 32
|
|
digest_size_in_bytes = digest_size_in_bytes or 32
|
|
if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
|
|
error("BLAKE2sp digest length must be from 1 to 32 bytes", 2)
|
|
end
|
|
key = key or ""
|
|
local key_length = #key
|
|
if key_length > 32 then
|
|
error("BLAKE2sp key length must not exceed 32 bytes", 2)
|
|
end
|
|
salt = salt or ""
|
|
local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02080000 + key_length * 256 + digest_size_in_bytes
|
|
for j = 1, 8 do
|
|
local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
|
|
instances[j] = {bytes_compressed, tail, H}
|
|
H[1] = XOR(H[1], first_dword_of_parameter_block)
|
|
H[3] = XOR(H[3], j-1)
|
|
H[4] = XOR(H[4], 0x20000000)
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "s", H)
|
|
end
|
|
end
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if instances then
|
|
local from = 0
|
|
while true do
|
|
local to = math_min(from + 64 - length % 64, #message_part)
|
|
if to > from then
|
|
local inst = instances[floor(length / 64) % 8 + 1]
|
|
local part = sub(message_part, from + 1, to)
|
|
length, from = length + to - from, to
|
|
local bytes_compressed, tail = inst[1], inst[2]
|
|
if #tail < 64 then
|
|
tail = tail..part
|
|
else
|
|
local H = inst[3]
|
|
bytes_compressed = blake2s_feed_64(H, tail, 0, 64, bytes_compressed)
|
|
tail = part
|
|
end
|
|
inst[1], inst[2] = bytes_compressed, tail
|
|
else
|
|
break
|
|
end
|
|
end
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if instances then
|
|
local root_H = {unpack(sha2_H_hi)}
|
|
root_H[1] = XOR(root_H[1], first_dword_of_parameter_block)
|
|
root_H[4] = XOR(root_H[4], 0x20010000)
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "s", root_H)
|
|
end
|
|
for j = 1, 8 do
|
|
local inst = instances[j]
|
|
local bytes_compressed, tail, H = inst[1], inst[2], inst[3]
|
|
blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail, j == 8)
|
|
if j % 2 == 0 then
|
|
local index = 0
|
|
for k = j - 1, j do
|
|
local inst = instances[k]
|
|
local H = inst[3]
|
|
for i = 1, 8 do
|
|
index = index + 1
|
|
common_W_blake2s[index] = H[i]
|
|
end
|
|
end
|
|
blake2s_feed_64(root_H, nil, 0, 64, 64 * (j/2 - 1), j == 8 and 64, j == 8)
|
|
end
|
|
end
|
|
instances = nil
|
|
local max_reg = ceil(digest_size_in_bytes / 4)
|
|
for j = 1, max_reg do
|
|
root_H[j] = HEX(root_H[j])
|
|
end
|
|
result = sub(gsub(table_concat(root_H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
|
|
end
|
|
return result
|
|
end
|
|
end
|
|
|
|
if key_length > 0 then
|
|
key = key..string_rep("\0", 64 - key_length)
|
|
for j = 1, 8 do
|
|
partial(key)
|
|
end
|
|
end
|
|
if message then
|
|
-- Actually perform calculations and return the BLAKE2sp digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE2sp digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
|
|
end
|
|
|
|
local function blake2bp(message, key, salt, digest_size_in_bytes)
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 64 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 32 bytes, by default empty string
|
|
-- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
|
|
digest_size_in_bytes = digest_size_in_bytes or 64
|
|
if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
|
|
error("BLAKE2bp digest length must be from 1 to 64 bytes", 2)
|
|
end
|
|
key = key or ""
|
|
local key_length = #key
|
|
if key_length > 64 then
|
|
error("BLAKE2bp key length must not exceed 64 bytes", 2)
|
|
end
|
|
salt = salt or ""
|
|
local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02040000 + key_length * 256 + digest_size_in_bytes
|
|
for j = 1, 4 do
|
|
local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
|
|
instances[j] = {bytes_compressed, tail, H_lo, H_hi}
|
|
H_lo[1] = XORA5(H_lo[1], first_dword_of_parameter_block)
|
|
H_lo[2] = XORA5(H_lo[2], j-1)
|
|
H_lo[3] = XORA5(H_lo[3], 0x4000)
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "b", H_lo, H_hi)
|
|
end
|
|
end
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if instances then
|
|
local from = 0
|
|
while true do
|
|
local to = math_min(from + 128 - length % 128, #message_part)
|
|
if to > from then
|
|
local inst = instances[floor(length / 128) % 4 + 1]
|
|
local part = sub(message_part, from + 1, to)
|
|
length, from = length + to - from, to
|
|
local bytes_compressed, tail = inst[1], inst[2]
|
|
if #tail < 128 then
|
|
tail = tail..part
|
|
else
|
|
local H_lo, H_hi = inst[3], inst[4]
|
|
bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail, 0, 128, bytes_compressed)
|
|
tail = part
|
|
end
|
|
inst[1], inst[2] = bytes_compressed, tail
|
|
else
|
|
break
|
|
end
|
|
end
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if instances then
|
|
local root_H_lo, root_H_hi = {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
|
|
root_H_lo[1] = XORA5(root_H_lo[1], first_dword_of_parameter_block)
|
|
root_H_lo[3] = XORA5(root_H_lo[3], 0x4001)
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, "b", root_H_lo, root_H_hi)
|
|
end
|
|
for j = 1, 4 do
|
|
local inst = instances[j]
|
|
local bytes_compressed, tail, H_lo, H_hi = inst[1], inst[2], inst[3], inst[4]
|
|
blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail, j == 4)
|
|
if j % 2 == 0 then
|
|
local index = 0
|
|
for k = j - 1, j do
|
|
local inst = instances[k]
|
|
local H_lo, H_hi = inst[3], inst[4]
|
|
for i = 1, 8 do
|
|
index = index + 1
|
|
common_W_blake2b[index] = H_lo[i]
|
|
if H_hi then
|
|
index = index + 1
|
|
common_W_blake2b[index] = H_hi[i]
|
|
end
|
|
end
|
|
end
|
|
blake2b_feed_128(root_H_lo, root_H_hi, nil, 0, 128, 128 * (j/2 - 1), j == 4 and 128, j == 4)
|
|
end
|
|
end
|
|
instances = nil
|
|
local max_reg = ceil(digest_size_in_bytes / 8)
|
|
if HEX64 then
|
|
for j = 1, max_reg do
|
|
root_H_lo[j] = HEX64(root_H_lo[j])
|
|
end
|
|
else
|
|
for j = 1, max_reg do
|
|
root_H_lo[j] = HEX(root_H_hi[j])..HEX(root_H_lo[j])
|
|
end
|
|
end
|
|
result = sub(gsub(table_concat(root_H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
|
|
end
|
|
return result
|
|
end
|
|
end
|
|
|
|
if key_length > 0 then
|
|
key = key..string_rep("\0", 128 - key_length)
|
|
for j = 1, 4 do
|
|
partial(key)
|
|
end
|
|
end
|
|
if message then
|
|
-- Actually perform calculations and return the BLAKE2bp digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE2bp digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
|
|
end
|
|
|
|
local function blake2x(inner_func, inner_func_letter, common_W_blake2, block_size, digest_size_in_bytes, message, key, salt)
|
|
local XOF_digest_length_limit, XOF_digest_length, chunk_by_chunk_output = 2^(block_size / 2) - 1
|
|
if digest_size_in_bytes == -1 then -- infinite digest
|
|
digest_size_in_bytes = math_huge
|
|
XOF_digest_length = floor(XOF_digest_length_limit)
|
|
chunk_by_chunk_output = true
|
|
else
|
|
if digest_size_in_bytes < 0 then
|
|
digest_size_in_bytes = -1.0 * digest_size_in_bytes
|
|
chunk_by_chunk_output = true
|
|
end
|
|
XOF_digest_length = floor(digest_size_in_bytes)
|
|
if XOF_digest_length >= XOF_digest_length_limit then
|
|
error("Requested digest is too long. BLAKE2X"..inner_func_letter.." finite digest is limited by (2^"..floor(block_size / 2)..")-2 bytes. Hint: you can generate infinite digest.", 2)
|
|
end
|
|
end
|
|
salt = salt or ""
|
|
if salt ~= "" then
|
|
xor_blake2_salt(salt, inner_func_letter) -- don't xor, only check the size of salt
|
|
end
|
|
local inner_partial = inner_func(nil, key, salt, nil, XOF_digest_length)
|
|
local result
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if inner_partial then
|
|
inner_partial(message_part)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if inner_partial then
|
|
local half_W, half_W_size = inner_partial()
|
|
half_W_size, inner_partial = half_W_size or 8
|
|
|
|
local function get_hash_block(block_no)
|
|
-- block_no = 0...(2^32-1)
|
|
local size = math_min(block_size, digest_size_in_bytes - block_no * block_size)
|
|
if size <= 0 then
|
|
return ""
|
|
end
|
|
for j = 1, half_W_size do
|
|
common_W_blake2[j] = half_W[j]
|
|
end
|
|
for j = half_W_size + 1, 2 * half_W_size do
|
|
common_W_blake2[j] = 0
|
|
end
|
|
return inner_func(nil, nil, salt, size, XOF_digest_length, floor(block_no))
|
|
end
|
|
|
|
local hash = {}
|
|
if chunk_by_chunk_output then
|
|
local pos, period, cached_block_no, cached_block = 0, block_size * 2^32
|
|
|
|
local function get_next_part_of_digest(arg1, arg2)
|
|
if arg1 == "seek" then
|
|
-- Usage #1: get_next_part_of_digest("seek", new_pos)
|
|
pos = arg2 % period
|
|
else
|
|
-- Usage #2: hex_string = get_next_part_of_digest(size)
|
|
local size, index = arg1 or 1, 0
|
|
while size > 0 do
|
|
local block_offset = pos % block_size
|
|
local block_no = (pos - block_offset) / block_size
|
|
local part_size = math_min(size, block_size - block_offset)
|
|
if cached_block_no ~= block_no then
|
|
cached_block_no = block_no
|
|
cached_block = get_hash_block(block_no)
|
|
end
|
|
index = index + 1
|
|
hash[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
|
|
size = size - part_size
|
|
pos = (pos + part_size) % period
|
|
end
|
|
return table_concat(hash, "", 1, index)
|
|
end
|
|
end
|
|
|
|
result = get_next_part_of_digest
|
|
else
|
|
for j = 1.0, ceil(digest_size_in_bytes / block_size) do
|
|
hash[j] = get_hash_block(j - 1.0)
|
|
end
|
|
result = table_concat(hash)
|
|
end
|
|
end
|
|
return result
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the BLAKE2X digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE2X digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
local function blake2xs(digest_size_in_bytes, message, key, salt)
|
|
-- digest_size_in_bytes:
|
|
-- 0..65534 = get finite digest as single Lua string
|
|
-- (-1) = get infinite digest in "chunk-by-chunk" output mode
|
|
-- (-2)..(-65534) = get finite digest in "chunk-by-chunk" output mode
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 32 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 16 bytes, by default empty string
|
|
return blake2x(blake2s, "s", common_W_blake2s, 32, digest_size_in_bytes, message, key, salt)
|
|
end
|
|
|
|
local function blake2xb(digest_size_in_bytes, message, key, salt)
|
|
-- digest_size_in_bytes:
|
|
-- 0..4294967294 = get finite digest as single Lua string
|
|
-- (-1) = get infinite digest in "chunk-by-chunk" output mode
|
|
-- (-2)..(-4294967294) = get finite digest in "chunk-by-chunk" output mode
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 64 bytes, by default empty string
|
|
-- salt: (optional) binary string up to 32 bytes, by default empty string
|
|
return blake2x(blake2b, "b", common_W_blake2b, 64, digest_size_in_bytes, message, key, salt)
|
|
end
|
|
|
|
|
|
local function blake3(message, key, digest_size_in_bytes, message_flags, K, return_array)
|
|
-- message: binary string to be hashed (or nil for "chunk-by-chunk" input mode)
|
|
-- key: (optional) binary string up to 32 bytes, by default empty string
|
|
-- digest_size_in_bytes: (optional) by default 32
|
|
-- 0,1,2,3,4,... = get finite digest as single Lua string
|
|
-- (-1) = get infinite digest in "chunk-by-chunk" output mode
|
|
-- -2,-3,-4,... = get finite digest in "chunk-by-chunk" output mode
|
|
-- The last three parameters "message_flags", "K" and "return_array" are for internal use only, user must omit them (or pass nil)
|
|
key = key or ""
|
|
digest_size_in_bytes = digest_size_in_bytes or 32
|
|
message_flags = message_flags or 0
|
|
if key == "" then
|
|
K = K or sha2_H_hi
|
|
else
|
|
local key_length = #key
|
|
if key_length > 32 then
|
|
error("BLAKE3 key length must not exceed 32 bytes", 2)
|
|
end
|
|
key = key..string_rep("\0", 32 - key_length)
|
|
K = {}
|
|
for j = 1, 8 do
|
|
local a, b, c, d = byte(key, 4*j-3, 4*j)
|
|
K[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
message_flags = message_flags + 16 -- flag:KEYED_HASH
|
|
end
|
|
local tail, H, chunk_index, blocks_in_chunk, stack_size, stack = "", {}, 0, 0, 0, {}
|
|
local final_H_in, final_block_length, chunk_by_chunk_output, result, wide_output = K
|
|
local final_compression_flags = 3 -- flags:CHUNK_START,CHUNK_END
|
|
|
|
local function feed_blocks(str, offs, size)
|
|
-- size >= 0, size is multiple of 64
|
|
while size > 0 do
|
|
local part_size_in_blocks, block_flags, H_in = 1, 0, H
|
|
if blocks_in_chunk == 0 then
|
|
block_flags = 1 -- flag:CHUNK_START
|
|
H_in, final_H_in = K, H
|
|
final_compression_flags = 2 -- flag:CHUNK_END
|
|
elseif blocks_in_chunk == 15 then
|
|
block_flags = 2 -- flag:CHUNK_END
|
|
final_compression_flags = 3 -- flags:CHUNK_START,CHUNK_END
|
|
final_H_in = K
|
|
else
|
|
part_size_in_blocks = math_min(size / 64, 15 - blocks_in_chunk)
|
|
end
|
|
local part_size = part_size_in_blocks * 64
|
|
blake3_feed_64(str, offs, part_size, message_flags + block_flags, chunk_index, H_in, H)
|
|
offs, size = offs + part_size, size - part_size
|
|
blocks_in_chunk = (blocks_in_chunk + part_size_in_blocks) % 16
|
|
if blocks_in_chunk == 0 then
|
|
-- completing the currect chunk
|
|
chunk_index = chunk_index + 1.0
|
|
local divider = 2.0
|
|
while chunk_index % divider == 0 do
|
|
divider = divider * 2.0
|
|
stack_size = stack_size - 8
|
|
for j = 1, 8 do
|
|
common_W_blake2s[j] = stack[stack_size + j]
|
|
end
|
|
for j = 1, 8 do
|
|
common_W_blake2s[j + 8] = H[j]
|
|
end
|
|
blake3_feed_64(nil, 0, 64, message_flags + 4, 0, K, H) -- flag:PARENT
|
|
end
|
|
for j = 1, 8 do
|
|
stack[stack_size + j] = H[j]
|
|
end
|
|
stack_size = stack_size + 8
|
|
end
|
|
end
|
|
end
|
|
|
|
local function get_hash_block(block_no)
|
|
local size = math_min(64, digest_size_in_bytes - block_no * 64)
|
|
if block_no < 0 or size <= 0 then
|
|
return ""
|
|
end
|
|
if chunk_by_chunk_output then
|
|
for j = 1, 16 do
|
|
common_W_blake2s[j] = stack[j + 16]
|
|
end
|
|
end
|
|
blake3_feed_64(nil, 0, 64, final_compression_flags, block_no, final_H_in, stack, wide_output, final_block_length)
|
|
if return_array then
|
|
return stack
|
|
end
|
|
local max_reg = ceil(size / 4)
|
|
for j = 1, max_reg do
|
|
stack[j] = HEX(stack[j])
|
|
end
|
|
return sub(gsub(table_concat(stack, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, size * 2)
|
|
end
|
|
|
|
local function partial(message_part)
|
|
if message_part then
|
|
if tail then
|
|
local offs = 0
|
|
if tail ~= "" and #tail + #message_part > 64 then
|
|
offs = 64 - #tail
|
|
feed_blocks(tail..sub(message_part, 1, offs), 0, 64)
|
|
tail = ""
|
|
end
|
|
local size = #message_part - offs
|
|
local size_tail = size > 0 and (size - 1) % 64 + 1 or 0
|
|
feed_blocks(message_part, offs, size - size_tail)
|
|
tail = tail..sub(message_part, #message_part + 1 - size_tail)
|
|
return partial
|
|
else
|
|
error("Adding more chunks is not allowed after receiving the result", 2)
|
|
end
|
|
else
|
|
if tail then
|
|
final_block_length = #tail
|
|
tail = tail..string_rep("\0", 64 - #tail)
|
|
if common_W_blake2s[0] then
|
|
for j = 1, 16 do
|
|
local a, b, c, d = byte(tail, 4*j-3, 4*j)
|
|
common_W_blake2s[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
|
|
end
|
|
else
|
|
for j = 1, 16 do
|
|
local a, b, c, d = byte(tail, 4*j-3, 4*j)
|
|
common_W_blake2s[j] = ((d * 256 + c) * 256 + b) * 256 + a
|
|
end
|
|
end
|
|
tail = nil
|
|
for stack_size = stack_size - 8, 0, -8 do
|
|
blake3_feed_64(nil, 0, 64, message_flags + final_compression_flags, chunk_index, final_H_in, H, nil, final_block_length)
|
|
chunk_index, final_block_length, final_H_in, final_compression_flags = 0, 64, K, 4 -- flag:PARENT
|
|
for j = 1, 8 do
|
|
common_W_blake2s[j] = stack[stack_size + j]
|
|
end
|
|
for j = 1, 8 do
|
|
common_W_blake2s[j + 8] = H[j]
|
|
end
|
|
end
|
|
final_compression_flags = message_flags + final_compression_flags + 8 -- flag:ROOT
|
|
if digest_size_in_bytes < 0 then
|
|
if digest_size_in_bytes == -1 then -- infinite digest
|
|
digest_size_in_bytes = math_huge
|
|
else
|
|
digest_size_in_bytes = -1.0 * digest_size_in_bytes
|
|
end
|
|
chunk_by_chunk_output = true
|
|
for j = 1, 16 do
|
|
stack[j + 16] = common_W_blake2s[j]
|
|
end
|
|
end
|
|
digest_size_in_bytes = math_min(2^53, digest_size_in_bytes)
|
|
wide_output = digest_size_in_bytes > 32
|
|
if chunk_by_chunk_output then
|
|
local pos, cached_block_no, cached_block = 0.0
|
|
|
|
local function get_next_part_of_digest(arg1, arg2)
|
|
if arg1 == "seek" then
|
|
-- Usage #1: get_next_part_of_digest("seek", new_pos)
|
|
pos = arg2 * 1.0
|
|
else
|
|
-- Usage #2: hex_string = get_next_part_of_digest(size)
|
|
local size, index = arg1 or 1, 32
|
|
while size > 0 do
|
|
local block_offset = pos % 64
|
|
local block_no = (pos - block_offset) / 64
|
|
local part_size = math_min(size, 64 - block_offset)
|
|
if cached_block_no ~= block_no then
|
|
cached_block_no = block_no
|
|
cached_block = get_hash_block(block_no)
|
|
end
|
|
index = index + 1
|
|
stack[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
|
|
size = size - part_size
|
|
pos = pos + part_size
|
|
end
|
|
return table_concat(stack, "", 33, index)
|
|
end
|
|
end
|
|
|
|
result = get_next_part_of_digest
|
|
elseif digest_size_in_bytes <= 64 then
|
|
result = get_hash_block(0)
|
|
else
|
|
local last_block_no = ceil(digest_size_in_bytes / 64) - 1
|
|
for block_no = 0.0, last_block_no do
|
|
stack[33 + block_no] = get_hash_block(block_no)
|
|
end
|
|
result = table_concat(stack, "", 33, 33 + last_block_no)
|
|
end
|
|
end
|
|
return result
|
|
end
|
|
end
|
|
|
|
if message then
|
|
-- Actually perform calculations and return the BLAKE3 digest of a message
|
|
return partial(message)()
|
|
else
|
|
-- Return function for chunk-by-chunk loading
|
|
-- User should feed every chunk of input data as single argument to this function and finally get BLAKE3 digest by invoking this function without an argument
|
|
return partial
|
|
end
|
|
end
|
|
|
|
local function blake3_derive_key(key_material, context_string, derived_key_size_in_bytes)
|
|
-- key_material: (string) your source of entropy to derive a key from (for example, it can be a master password)
|
|
-- set to nil for feeding the key material in "chunk-by-chunk" input mode
|
|
-- context_string: (string) unique description of the derived key
|
|
-- digest_size_in_bytes: (optional) by default 32
|
|
-- 0,1,2,3,4,... = get finite derived key as single Lua string
|
|
-- (-1) = get infinite derived key in "chunk-by-chunk" output mode
|
|
-- -2,-3,-4,... = get finite derived key in "chunk-by-chunk" output mode
|
|
if type(context_string) ~= "string" then
|
|
error("'context_string' parameter must be a Lua string", 2)
|
|
end
|
|
local K = blake3(context_string, nil, nil, 32, nil, true) -- flag:DERIVE_KEY_CONTEXT
|
|
return blake3(key_material, nil, derived_key_size_in_bytes, 64, K) -- flag:DERIVE_KEY_MATERIAL
|
|
end
|
|
|
|
|
|
|
|
local sha = {
|
|
md5 = md5, -- MD5
|
|
sha1 = sha1, -- SHA-1
|
|
-- SHA-2 hash functions:
|
|
sha224 = function (message) return sha256ext(224, message) end, -- SHA-224
|
|
sha256 = function (message) return sha256ext(256, message) end, -- SHA-256
|
|
sha512_224 = function (message) return sha512ext(224, message) end, -- SHA-512/224
|
|
sha512_256 = function (message) return sha512ext(256, message) end, -- SHA-512/256
|
|
sha384 = function (message) return sha512ext(384, message) end, -- SHA-384
|
|
sha512 = function (message) return sha512ext(512, message) end, -- SHA-512
|
|
-- SHA-3 hash functions:
|
|
sha3_224 = function (message) return keccak((1600 - 2 * 224) / 8, 224 / 8, false, message) end, -- SHA3-224
|
|
sha3_256 = function (message) return keccak((1600 - 2 * 256) / 8, 256 / 8, false, message) end, -- SHA3-256
|
|
sha3_384 = function (message) return keccak((1600 - 2 * 384) / 8, 384 / 8, false, message) end, -- SHA3-384
|
|
sha3_512 = function (message) return keccak((1600 - 2 * 512) / 8, 512 / 8, false, message) end, -- SHA3-512
|
|
shake128 = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 128) / 8, digest_size_in_bytes, true, message) end, -- SHAKE128
|
|
shake256 = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 256) / 8, digest_size_in_bytes, true, message) end, -- SHAKE256
|
|
-- HMAC:
|
|
hmac = hmac, -- HMAC(hash_func, key, message) is applicable to any hash function from this module except SHAKE* and BLAKE*
|
|
-- misc utilities:
|
|
hex_to_bin = hex_to_bin, -- converts hexadecimal representation to binary string
|
|
bin_to_hex = bin_to_hex, -- converts binary string to hexadecimal representation
|
|
base64_to_bin = base64_to_bin, -- converts base64 representation to binary string
|
|
bin_to_base64 = bin_to_base64, -- converts binary string to base64 representation
|
|
-- old style names for backward compatibility:
|
|
hex2bin = hex_to_bin,
|
|
bin2hex = bin_to_hex,
|
|
base642bin = base64_to_bin,
|
|
bin2base64 = bin_to_base64,
|
|
-- BLAKE2 hash functions:
|
|
blake2b = blake2b, -- BLAKE2b (message, key, salt, digest_size_in_bytes)
|
|
blake2s = blake2s, -- BLAKE2s (message, key, salt, digest_size_in_bytes)
|
|
blake2bp = blake2bp, -- BLAKE2bp(message, key, salt, digest_size_in_bytes)
|
|
blake2sp = blake2sp, -- BLAKE2sp(message, key, salt, digest_size_in_bytes)
|
|
blake2xb = blake2xb, -- BLAKE2Xb(digest_size_in_bytes, message, key, salt)
|
|
blake2xs = blake2xs, -- BLAKE2Xs(digest_size_in_bytes, message, key, salt)
|
|
-- BLAKE2 aliases:
|
|
blake2 = blake2b,
|
|
blake2b_160 = function (message, key, salt) return blake2b(message, key, salt, 20) end, -- BLAKE2b-160
|
|
blake2b_256 = function (message, key, salt) return blake2b(message, key, salt, 32) end, -- BLAKE2b-256
|
|
blake2b_384 = function (message, key, salt) return blake2b(message, key, salt, 48) end, -- BLAKE2b-384
|
|
blake2b_512 = blake2b, -- 64 -- BLAKE2b-512
|
|
blake2s_128 = function (message, key, salt) return blake2s(message, key, salt, 16) end, -- BLAKE2s-128
|
|
blake2s_160 = function (message, key, salt) return blake2s(message, key, salt, 20) end, -- BLAKE2s-160
|
|
blake2s_224 = function (message, key, salt) return blake2s(message, key, salt, 28) end, -- BLAKE2s-224
|
|
blake2s_256 = blake2s, -- 32 -- BLAKE2s-256
|
|
-- BLAKE3 hash function
|
|
blake3 = blake3, -- BLAKE3 (message, key, digest_size_in_bytes)
|
|
blake3_derive_key = blake3_derive_key, -- BLAKE3_KDF(key_material, context_string, derived_key_size_in_bytes)
|
|
}
|
|
|
|
|
|
block_size_for_HMAC = {
|
|
[sha.md5] = 64,
|
|
[sha.sha1] = 64,
|
|
[sha.sha224] = 64,
|
|
[sha.sha256] = 64,
|
|
[sha.sha512_224] = 128,
|
|
[sha.sha512_256] = 128,
|
|
[sha.sha384] = 128,
|
|
[sha.sha512] = 128,
|
|
[sha.sha3_224] = 144, -- (1600 - 2 * 224) / 8
|
|
[sha.sha3_256] = 136, -- (1600 - 2 * 256) / 8
|
|
[sha.sha3_384] = 104, -- (1600 - 2 * 384) / 8
|
|
[sha.sha3_512] = 72, -- (1600 - 2 * 512) / 8
|
|
}
|
|
|
|
|
|
return sha
|