/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
/*
 * SIMD implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#include <stddef.h>
|
|
#include <string.h>
|
|
#include <limits.h>
|
|
|
|
#include "sph_simd.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C"{
|
|
#endif
|
|
|
|
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
|
|
#define SPH_SMALL_FOOTPRINT_SIMD 1
|
|
#endif
|
|
|
|
#ifdef _MSC_VER
|
|
#pragma warning (disable: 4146)
|
|
#endif
|
|
|
|
typedef sph_u32 u32;
|
|
typedef sph_s32 s32;
|
|
#define C32 SPH_C32
|
|
#define T32 SPH_T32
|
|
#define ROL32 SPH_ROTL32
|
|
|
|
#define XCAT(x, y) XCAT_(x, y)
|
|
#define XCAT_(x, y) x ## y
|
|
|
|
/*
|
|
* The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
|
|
*/
|
|
static const s32 alpha_tab[] = {
|
|
1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
|
|
190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
|
|
120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
|
|
184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
|
|
8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
|
|
235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
|
|
189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
|
|
187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
|
|
64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
|
|
81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
|
|
227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
|
|
211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
|
|
255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
|
|
134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
|
|
17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
|
|
146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
|
|
241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
|
|
44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
|
|
136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
|
|
140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
|
|
129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
|
|
95, 40, 98, 163
|
|
};
|
|
|
|
/*
 * Partial reductions modulo 257. They rely on 2^8 = -1 and 2^16 = 1
 * modulo 257: the result is congruent to the input, but only brought
 * into a smaller range, not fully normalized.
 *
 * Ranges:
 *   REDS1: from -32768..98302 to -383..383
 *   REDS2: from -2^31..2^31-1 to -32768..98302
 *
 * Both macros evaluate their argument twice, and both right-shift
 * values that may be negative; an arithmetic (sign-propagating) right
 * shift is assumed.
 */
#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))

/*
 * One butterfly layer over q[rb .. rb + 2*hk - 1]: for each u in
 * 0..hk-1, the pair (q[rb + u], q[rb + u + hk]) is replaced with
 * (m + t, m - t), where t = REDS2(q[rb + u + hk] * alpha_tab[u * as]).
 *
 *   rb   base index in q[]
 *   hk   half size of the layer (distance between the paired elements)
 *   as   stride in alpha_tab[] between consecutive twiddle factors
 *   id   label name; must be unique within the enclosing function
 *
 * The loop body is unrolled four times. The very first butterfly uses
 * alpha_tab[0] = 1, so it is computed up front without multiplication
 * or reduction, and "goto id" then enters the loop body just after the
 * first unrolled butterfly.
 *
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 */
#define FFT_LOOP(rb, hk, as, id)   do { \
		size_t u, v; \
		s32 m = q[(rb)]; \
		s32 n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		u = v = 0; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			s32 t; \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)

/*
 * Small transform: reads the four bytes x[xb], x[xb + xs],
 * x[xb + 2*xs], x[xb + 3*xs] and writes eight outputs into the
 * variables d0..d7, where "d" is the name prefix given as third
 * argument (the caller declares those variables). Twiddle factors are
 * powers of 4, applied as shifts by 2, 4 and 6 bits; the two products
 * that can exceed the 8-bit range go through REDS1.
 *
 * Output ranges:
 *   d0:   min=    0   max= 1020
 *   d1:   min=  -67   max= 4587
 *   d2:   min=-4335   max= 4335
 *   d3:   min=-4147   max=  507
 *   d4:   min= -510   max=  510
 *   d5:   min= -252   max= 4402
 *   d6:   min=-4335   max= 4335
 *   d7:   min=-4332   max=  322
 */
#define FFT8(xb, xs, d)   do { \
		s32 x0 = x[(xb)]; \
		s32 x1 = x[(xb) + (xs)]; \
		s32 x2 = x[(xb) + 2 * (xs)]; \
		s32 x3 = x[(xb) + 3 * (xs)]; \
		s32 a0 = x0 + x2; \
		s32 a1 = x0 + (x2 << 4); \
		s32 a2 = x0 - x2; \
		s32 a3 = x0 - (x2 << 4); \
		s32 b0 = x1 + x3; \
		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
		s32 b2 = (x1 << 4) - (x3 << 4); \
		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
		d ## 0 = a0 + b0; \
		d ## 1 = a1 + b1; \
		d ## 2 = a2 + b2; \
		d ## 3 = a3 + b3; \
		d ## 4 = a0 - b0; \
		d ## 5 = a1 - b1; \
		d ## 6 = a2 - b2; \
		d ## 7 = a3 - b3; \
	} while (0)

/*
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to a multiplication by a power of two. The d2_* values may be
 * negative, and left-shifting a negative value is undefined behaviour
 * in C, so explicit multiplications are used instead of shifts (the
 * result is the same wherever the shift was defined).
 *
 * Two FFT8 are run on the even-indexed and odd-indexed inputs (with
 * respect to stride xs, starting at xb); the 16 outputs are written to
 * q[rb] .. q[rb + 15].
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb)   do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) +  0] = d1_0 + d2_0; \
		q[(rb) +  1] = d1_1 + (d2_1 * 2); \
		q[(rb) +  2] = d1_2 + (d2_2 * 4); \
		q[(rb) +  3] = d1_3 + (d2_3 * 8); \
		q[(rb) +  4] = d1_4 + (d2_4 * 16); \
		q[(rb) +  5] = d1_5 + (d2_5 * 32); \
		q[(rb) +  6] = d1_6 + (d2_6 * 64); \
		q[(rb) +  7] = d1_7 + (d2_7 * 128); \
		q[(rb) +  8] = d1_0 - d2_0; \
		q[(rb) +  9] = d1_1 - (d2_1 * 2); \
		q[(rb) + 10] = d1_2 - (d2_2 * 4); \
		q[(rb) + 11] = d1_3 - (d2_3 * 8); \
		q[(rb) + 12] = d1_4 - (d2_4 * 16); \
		q[(rb) + 13] = d1_5 - (d2_5 * 32); \
		q[(rb) + 14] = d1_6 - (d2_6 * 64); \
		q[(rb) + 15] = d1_7 - (d2_7 * 128); \
	} while (0)

/*
 * Two FFT16 on the even and odd inputs (relative to stride xs), written
 * to q[rb..rb+15] and q[rb+16..rb+31], then one butterfly layer of half
 * size 16 using every 8th entry of alpha_tab[]. "id" is the label name
 * handed to FFT_LOOP and must be unique in the enclosing function.
 *
 * Output range: |q| <= 1183446
 */
#define FFT32(xb, xs, rb, id)   do { \
		FFT16(xb, (xs) << 1, rb); \
		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
		FFT_LOOP(rb, 16, 8, id); \
	} while (0)

/*
 * Two FFT32 on the even and odd inputs (relative to stride xs), then
 * one butterfly layer of half size 32 using every 4th entry of
 * alpha_tab[]. The label names of the inner layers are derived from
 * "id" by appending "a" and "b".
 *
 * Output range: |q| <= 2366892
 */
#define FFT64(xb, xs, rb, id)   do { \
		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
		FFT_LOOP(rb, 32, 4, id); \
	} while (0)

#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Out-of-line 32-point transform used by the compact FFT128 below.
 * Reads input bytes from x[] with stride xs (only read, hence const)
 * and writes 32 values to q[0..31].
 */
static void
fft32(const unsigned char *x, size_t xs, s32 *q)
{
	size_t xd;

	xd = xs << 1;
	FFT16(0, xd, 0);
	FFT16(xs, xd, 16);
	FFT_LOOP(0, 16, 8, label_);
}

/*
 * Compact variant: four calls to fft32() on the inputs taken with
 * stride 4*xs at offsets 0, 2, 1 and 3 (times xs), combined with three
 * butterfly layers. Label names are derived from "id".
 */
#define FFT128(xb, xs, rb, id)   do { \
		fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
		fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
		FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
		fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
		fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
		FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
	} while (0)

#else

/*
 * Fully inlined variant: two FFT64 and one butterfly layer of half
 * size 64 using every 2nd entry of alpha_tab[].
 *
 * Output range: |q| <= 4733784
 */
#define FFT128(xb, xs, rb, id)   do { \
		FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
		FFT_LOOP(rb, 64, 2, id); \
	} while (0)

#endif

/*
|
|
* For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
|
|
* function which does not fit in the 32 kB L1 cache of a typical x86
|
|
* Intel. We therefore add a function call layer at the FFT64 level.
|
|
*/
|
|
|
|
static void
|
|
fft64(unsigned char *x, size_t xs, s32 *q)
|
|
{
|
|
size_t xd;
|
|
|
|
xd = xs << 1;
|
|
FFT32(0, xd, 0, label_a);
|
|
FFT32(xs, xd, 32, label_b);
|
|
FFT_LOOP(0, 32, 4, label_);
|
|
}
|
|
|
|
/*
 * 256-point transform: four calls to fft64() on the inputs taken with
 * stride 4*xs at offsets 0, 2, 1 and 3 (times xs), combined with three
 * butterfly layers (half sizes 64, 64 and 128). Label names are derived
 * from "id". Expects "x" and "q" to be in scope at the expansion site.
 *
 * Output range: |q| <= 9467568
 */
#define FFT256(xb, xs, rb, id)   do { \
		fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
		fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
		FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
		fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
		fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
		FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
	} while (0)

/*
 * alpha^(127*i) mod 257
 *
 * Offsets added to the 128 FFT outputs for a non-final block. The
 * entries equal 98^i mod 257 for i = 0..127.
 */
static const unsigned short yoff_s_n[] = {
	  1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,
	 15, 185, 140,  99, 193, 153,  88, 143, 136, 221,  70, 178,
	225, 205,  44, 200,  68, 239,  35,  89, 241, 231,  22, 100,
	 34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
	253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141,
	197,  31, 211, 118, 256, 159, 162, 199, 227, 144, 234,  59,
	128, 208,  81, 228, 242,  72, 117, 158,  64, 104, 169, 114,
	121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
	 16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207,
	240, 133, 184,  42,   4, 135, 123, 232, 120, 195,  92,  21,
	  2, 196, 190, 116,  60, 226,  46, 139
};

/*
 * alpha^(127*i) + alpha^(125*i) mod 257
 *
 * Offsets added to the 128 FFT outputs for the final block. The
 * entries equal (98^i + 58^i) mod 257 for i = 0..127.
 */
static const unsigned short yoff_s_f[] = {
	  2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,
	 49, 101, 151, 223, 189, 178, 253, 204,  76,  82, 232,  65,
	 96, 176, 161,  47, 189,  61, 248, 107,   0, 131, 133, 113,
	 17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
	189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,
	 77,  10,  21, 149, 255, 101, 139, 150, 212,  45, 146,  95,
	160,   8,  46, 254, 208, 156, 106,  34,  68,  79,   4,  53,
	181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
	  0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109,
	210, 192,   8, 114,  68, 249,  53,  27,  52, 106,  70,  30,
	 10, 146, 117, 251, 180, 247, 236, 108
};

/*
 * beta^(255*i) mod 257
 *
 * 256-entry offset table, non-final variant (the "_b_" counterpart of
 * yoff_s_n). The entries equal 163^i mod 257 for i = 0..255.
 */
static const unsigned short yoff_b_n[] = {
	  1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,
	 23, 151, 198, 149, 129, 210,  49,  20, 176, 161,  29, 101,
	 15, 132, 185,  86, 140, 204,  99, 203, 193, 105, 153,  10,
	 88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
	225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,
	 35,  51,  89, 115, 241, 219, 231, 131,  22, 245, 100, 109,
	 34, 145, 248,  75, 146, 154, 173, 186, 249, 238, 244, 194,
	 11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
	253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83,
	165, 167, 236, 175, 255, 188,  61, 177,  67, 127, 141, 110,
	197, 243,  31, 170, 211, 212, 118, 216, 256,  94, 159, 217,
	162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
	128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171,
	117,  53, 158,  54,  64, 152, 104, 247, 169,  48, 114,  78,
	121, 191,  36, 214, 187, 155,  79,  27,  32,  76,  52, 252,
	213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
	 16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182,
	111, 103,  84,  71,   8,  19,  13,  63, 246,   6, 207,  74,
	240,  56, 133,  91, 184, 180,  42, 164,   4, 138, 135, 160,
	123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
	  2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,
	 46,  45, 139,  41
};

/*
 * beta^(255*i) + beta^(253*i) mod 257
 *
 * 256-entry offset table, final variant (the "_b_" counterpart of
 * yoff_s_f). The entries equal (163^i + 40^i) mod 257 for i = 0..255.
 */
static const unsigned short yoff_b_f[] = {
	  2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20,
	111,  73, 162, 251,  97, 215, 249,  53, 211,  19,   3,  89,
	 49, 207, 101,  67, 151, 130, 223,  23, 189, 202, 178, 239,
	253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
	 96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226,
	248,  90, 107,  64,   0,  88, 131, 243, 133,  59, 113, 115,
	 17, 236,  33, 213,  12, 191, 111,  19, 251,  61, 103, 208,
	 57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
	189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45,
	187,  19, 227,  72, 247, 125, 111, 121, 140, 220,   6, 107,
	 77,  69,  10, 101,  21,  65, 149, 171, 255,  54, 101, 210,
	139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
	160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190,
	106, 127,  34, 234,  68,  55,  79,  18,   4, 130,  53, 208,
	181,  21, 175, 120,  25, 100, 192, 178, 161,  96,  81, 127,
	 96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
	  0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44,
	245,  66, 146, 238,   6, 196, 154,  49, 200, 222, 109,   9,
	210, 141, 192, 138,   8,  79, 114, 217,  68, 128, 249,  94,
	 53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
	 10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156,
	236, 192, 108,  86
};

/*
 * Build one 32-bit message-expansion word from two reduced FFT outputs:
 * the low 16 bits are those of l * mm, the high 16 bits those of
 * h * mm. Both products are converted to u32 first, so negative
 * products wrap modulo 2^32 before masking / shifting.
 */
#define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) \
                          + ((u32)((h) * (mm)) << 16))

/*
 * W_SMALL yields the four message words of one step, read from q[]
 * at base 8 * sb, with element offsets o1 (low half) and o2 (high
 * half) and multiplier mm.
 *
 * The expansion deliberately starts with an opening parenthesis that is
 * never closed: it is meant to be consumed by STEP_SMALL_(), which
 * emits "STEP_SMALL <w>, fun, r, s, pp4b)" so that the four words
 * become the first four arguments of STEP_SMALL.
 */
#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + (o1)], q[8 * (sb) + 2 * 0 + (o2)], mm), \
	 INNER(q[8 * (sb) + 2 * 1 + (o1)], q[8 * (sb) + 2 * 1 + (o2)], mm), \
	 INNER(q[8 * (sb) + 2 * 2 + (o1)], q[8 * (sb) + 2 * 2 + (o2)], mm), \
	 INNER(q[8 * (sb) + 2 * 3 + (o1)], q[8 * (sb) + 2 * 3 + (o2)], mm)

/* WS_r_s: message words for step s (0..7) of round r (0..3). */
#define WS_0_0   W_SMALL( 4,    0,    1, 185)
#define WS_0_1   W_SMALL( 6,    0,    1, 185)
#define WS_0_2   W_SMALL( 0,    0,    1, 185)
#define WS_0_3   W_SMALL( 2,    0,    1, 185)
#define WS_0_4   W_SMALL( 7,    0,    1, 185)
#define WS_0_5   W_SMALL( 5,    0,    1, 185)
#define WS_0_6   W_SMALL( 3,    0,    1, 185)
#define WS_0_7   W_SMALL( 1,    0,    1, 185)
#define WS_1_0   W_SMALL(15,    0,    1, 185)
#define WS_1_1   W_SMALL(11,    0,    1, 185)
#define WS_1_2   W_SMALL(12,    0,    1, 185)
#define WS_1_3   W_SMALL( 8,    0,    1, 185)
#define WS_1_4   W_SMALL( 9,    0,    1, 185)
#define WS_1_5   W_SMALL(13,    0,    1, 185)
#define WS_1_6   W_SMALL(10,    0,    1, 185)
#define WS_1_7   W_SMALL(14,    0,    1, 185)
#define WS_2_0   W_SMALL(17, -128,  -64, 233)
#define WS_2_1   W_SMALL(18, -128,  -64, 233)
#define WS_2_2   W_SMALL(23, -128,  -64, 233)
#define WS_2_3   W_SMALL(20, -128,  -64, 233)
#define WS_2_4   W_SMALL(22, -128,  -64, 233)
#define WS_2_5   W_SMALL(21, -128,  -64, 233)
#define WS_2_6   W_SMALL(16, -128,  -64, 233)
#define WS_2_7   W_SMALL(19, -128,  -64, 233)
#define WS_3_0   W_SMALL(30, -191, -127, 233)
#define WS_3_1   W_SMALL(24, -191, -127, 233)
#define WS_3_2   W_SMALL(25, -191, -127, 233)
#define WS_3_3   W_SMALL(31, -191, -127, 233)
#define WS_3_4   W_SMALL(27, -191, -127, 233)
#define WS_3_5   W_SMALL(29, -191, -127, 233)
#define WS_3_6   W_SMALL(28, -191, -127, 233)
#define WS_3_7   W_SMALL(26, -191, -127, 233)

/*
 * W_BIG yields the eight message words of one step, read from q[] at
 * base 16 * sb, with element offsets o1 (low half) and o2 (high half)
 * and multiplier mm.
 *
 * As with W_SMALL, the expansion starts with an opening parenthesis
 * that is deliberately left unclosed; STEP_BIG_() supplies the rest of
 * the STEP_BIG argument list and the closing parenthesis.
 */
#define W_BIG(sb, o1, o2, mm) \
	(INNER(q[16 * (sb) + 2 * 0 + (o1)], q[16 * (sb) + 2 * 0 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 1 + (o1)], q[16 * (sb) + 2 * 1 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 2 + (o1)], q[16 * (sb) + 2 * 2 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 3 + (o1)], q[16 * (sb) + 2 * 3 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 4 + (o1)], q[16 * (sb) + 2 * 4 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 5 + (o1)], q[16 * (sb) + 2 * 5 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 6 + (o1)], q[16 * (sb) + 2 * 6 + (o2)], mm), \
	 INNER(q[16 * (sb) + 2 * 7 + (o1)], q[16 * (sb) + 2 * 7 + (o2)], mm)

/* WB_r_s: message words for step s (0..7) of round r (0..3). */
#define WB_0_0   W_BIG( 4,    0,    1, 185)
#define WB_0_1   W_BIG( 6,    0,    1, 185)
#define WB_0_2   W_BIG( 0,    0,    1, 185)
#define WB_0_3   W_BIG( 2,    0,    1, 185)
#define WB_0_4   W_BIG( 7,    0,    1, 185)
#define WB_0_5   W_BIG( 5,    0,    1, 185)
#define WB_0_6   W_BIG( 3,    0,    1, 185)
#define WB_0_7   W_BIG( 1,    0,    1, 185)
#define WB_1_0   W_BIG(15,    0,    1, 185)
#define WB_1_1   W_BIG(11,    0,    1, 185)
#define WB_1_2   W_BIG(12,    0,    1, 185)
#define WB_1_3   W_BIG( 8,    0,    1, 185)
#define WB_1_4   W_BIG( 9,    0,    1, 185)
#define WB_1_5   W_BIG(13,    0,    1, 185)
#define WB_1_6   W_BIG(10,    0,    1, 185)
#define WB_1_7   W_BIG(14,    0,    1, 185)
#define WB_2_0   W_BIG(17, -256, -128, 233)
#define WB_2_1   W_BIG(18, -256, -128, 233)
#define WB_2_2   W_BIG(23, -256, -128, 233)
#define WB_2_3   W_BIG(20, -256, -128, 233)
#define WB_2_4   W_BIG(22, -256, -128, 233)
#define WB_2_5   W_BIG(21, -256, -128, 233)
#define WB_2_6   W_BIG(16, -256, -128, 233)
#define WB_2_7   W_BIG(19, -256, -128, 233)
#define WB_3_0   W_BIG(30, -383, -255, 233)
#define WB_3_1   W_BIG(24, -383, -255, 233)
#define WB_3_2   W_BIG(25, -383, -255, 233)
#define WB_3_3   W_BIG(31, -383, -255, 233)
#define WB_3_4   W_BIG(27, -383, -255, 233)
#define WB_3_5   W_BIG(29, -383, -255, 233)
#define WB_3_6   W_BIG(28, -383, -255, 233)
#define WB_3_7   W_BIG(26, -383, -255, 233)

/*
 * Bitwise Boolean functions used by the steps:
 *   IF:   for each bit, y where x is 1, z where x is 0
 *   MAJ:  for each bit, the majority value of x, y and z
 */
#define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))

/*
 * Lane permutations, as literal digits so that they can be pasted onto
 * a variable-name prefix (see STEP_ELT).
 *
 * PP4_k_n is the source lane for lane n under permutation k:
 * n XOR 1, n XOR 2, n XOR 3 for k = 0, 1, 2.
 */
#define PP4_0_0   1
#define PP4_0_1   0
#define PP4_0_2   3
#define PP4_0_3   2
#define PP4_1_0   2
#define PP4_1_1   3
#define PP4_1_2   0
#define PP4_1_3   1
#define PP4_2_0   3
#define PP4_2_1   2
#define PP4_2_2   1
#define PP4_2_3   0

/*
 * PP8_k_n: same for eight lanes; the XOR constants for k = 0..6 are
 * 1, 6, 2, 3, 5, 7, 4.
 */
#define PP8_0_0   1
#define PP8_0_1   0
#define PP8_0_2   3
#define PP8_0_3   2
#define PP8_0_4   5
#define PP8_0_5   4
#define PP8_0_6   7
#define PP8_0_7   6

#define PP8_1_0   6
#define PP8_1_1   7
#define PP8_1_2   4
#define PP8_1_3   5
#define PP8_1_4   2
#define PP8_1_5   3
#define PP8_1_6   0
#define PP8_1_7   1

#define PP8_2_0   2
#define PP8_2_1   3
#define PP8_2_2   0
#define PP8_2_3   1
#define PP8_2_4   6
#define PP8_2_5   7
#define PP8_2_6   4
#define PP8_2_7   5

#define PP8_3_0   3
#define PP8_3_1   2
#define PP8_3_2   1
#define PP8_3_3   0
#define PP8_3_4   7
#define PP8_3_5   6
#define PP8_3_6   5
#define PP8_3_7   4

#define PP8_4_0   5
#define PP8_4_1   4
#define PP8_4_2   7
#define PP8_4_3   6
#define PP8_4_4   1
#define PP8_4_5   0
#define PP8_4_6   3
#define PP8_4_7   2

#define PP8_5_0   7
#define PP8_5_1   6
#define PP8_5_2   5
#define PP8_5_3   4
#define PP8_5_4   3
#define PP8_5_5   2
#define PP8_5_6   1
#define PP8_5_7   0

#define PP8_6_0   4
#define PP8_6_1   5
#define PP8_6_2   6
#define PP8_6_3   7
#define PP8_6_4   0
#define PP8_6_5   1
#define PP8_6_6   2
#define PP8_6_7   3

/*
 * State access macros. With SPH_SIMD_NOCOPY, these all expand to
 * nothing: no local copy of the state is declared, loaded or stored
 * (the A0.. names are then expected to be provided by other means).
 * Otherwise, the state words are copied into local variables named
 * A0.., B0.., C0.., D0.. and written back at the end.
 *
 * DECL_STATE_* carry their own trailing semicolon and are used without
 * one.
 */
#if SPH_SIMD_NOCOPY

#define DECL_STATE_SMALL
#define READ_STATE_SMALL(sc)
#define WRITE_STATE_SMALL(sc)
#define DECL_STATE_BIG
#define READ_STATE_BIG(sc)
#define WRITE_STATE_BIG(sc)

#else

#define DECL_STATE_SMALL   \
	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;

#define READ_STATE_SMALL(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		B0 = (sc)->state[ 4]; \
		B1 = (sc)->state[ 5]; \
		B2 = (sc)->state[ 6]; \
		B3 = (sc)->state[ 7]; \
		C0 = (sc)->state[ 8]; \
		C1 = (sc)->state[ 9]; \
		C2 = (sc)->state[10]; \
		C3 = (sc)->state[11]; \
		D0 = (sc)->state[12]; \
		D1 = (sc)->state[13]; \
		D2 = (sc)->state[14]; \
		D3 = (sc)->state[15]; \
	} while (0)

#define WRITE_STATE_SMALL(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = B0; \
		(sc)->state[ 5] = B1; \
		(sc)->state[ 6] = B2; \
		(sc)->state[ 7] = B3; \
		(sc)->state[ 8] = C0; \
		(sc)->state[ 9] = C1; \
		(sc)->state[10] = C2; \
		(sc)->state[11] = C3; \
		(sc)->state[12] = D0; \
		(sc)->state[13] = D1; \
		(sc)->state[14] = D2; \
		(sc)->state[15] = D3; \
	} while (0)

#define DECL_STATE_BIG   \
	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
	u32 D0, D1, D2, D3, D4, D5, D6, D7;

#define READ_STATE_BIG(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		A4 = (sc)->state[ 4]; \
		A5 = (sc)->state[ 5]; \
		A6 = (sc)->state[ 6]; \
		A7 = (sc)->state[ 7]; \
		B0 = (sc)->state[ 8]; \
		B1 = (sc)->state[ 9]; \
		B2 = (sc)->state[10]; \
		B3 = (sc)->state[11]; \
		B4 = (sc)->state[12]; \
		B5 = (sc)->state[13]; \
		B6 = (sc)->state[14]; \
		B7 = (sc)->state[15]; \
		C0 = (sc)->state[16]; \
		C1 = (sc)->state[17]; \
		C2 = (sc)->state[18]; \
		C3 = (sc)->state[19]; \
		C4 = (sc)->state[20]; \
		C5 = (sc)->state[21]; \
		C6 = (sc)->state[22]; \
		C7 = (sc)->state[23]; \
		D0 = (sc)->state[24]; \
		D1 = (sc)->state[25]; \
		D2 = (sc)->state[26]; \
		D3 = (sc)->state[27]; \
		D4 = (sc)->state[28]; \
		D5 = (sc)->state[29]; \
		D6 = (sc)->state[30]; \
		D7 = (sc)->state[31]; \
	} while (0)

#define WRITE_STATE_BIG(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = A4; \
		(sc)->state[ 5] = A5; \
		(sc)->state[ 6] = A6; \
		(sc)->state[ 7] = A7; \
		(sc)->state[ 8] = B0; \
		(sc)->state[ 9] = B1; \
		(sc)->state[10] = B2; \
		(sc)->state[11] = B3; \
		(sc)->state[12] = B4; \
		(sc)->state[13] = B5; \
		(sc)->state[14] = B6; \
		(sc)->state[15] = B7; \
		(sc)->state[16] = C0; \
		(sc)->state[17] = C1; \
		(sc)->state[18] = C2; \
		(sc)->state[19] = C3; \
		(sc)->state[20] = C4; \
		(sc)->state[21] = C5; \
		(sc)->state[22] = C6; \
		(sc)->state[23] = C7; \
		(sc)->state[24] = D0; \
		(sc)->state[25] = D1; \
		(sc)->state[26] = D2; \
		(sc)->state[27] = D3; \
		(sc)->state[28] = D4; \
		(sc)->state[29] = D5; \
		(sc)->state[30] = D6; \
		(sc)->state[31] = D7; \
	} while (0)

#endif

/*
 * One step on lane n (a literal digit):
 *   tt  = D_n + w + fun(A_n, B_n, C_n)            (mod 2^32)
 *   A_n = ROL32(tt, s) + tA_p                     (mod 2^32)
 *   D_n = C_n;  C_n = B_n;  B_n = tA_n
 * where tA_k are the pre-rotated A values declared by the caller
 * (STEP_SMALL / STEP_BIG) and p is the digit obtained by pasting the
 * permutation prefix ppb (e.g. PP4_0_) with n.
 */
#define STEP_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)

/*
 * One full step on the four lanes: all four A values are first rotated
 * left by r into tA0..tA3 (so that every lane sees the pre-step values
 * of the others), then STEP_ELT is applied to each lane with its
 * message word w0..w3, rotation s and permutation prefix pp4b.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)

/*
 * One full step on the eight lanes: all eight A values are first
 * rotated left by r into tA0..tA7, then STEP_ELT is applied to each
 * lane with its message word w0..w7, rotation s and permutation prefix
 * pp8b.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)

/*
 * M3_i_j is (i + j) mod 3, written as a digit followed by an
 * underscore so that it can be pasted after "PP4_" to select one of
 * the three 4-lane permutations.
 */
#define M3_0_0   0_
#define M3_1_0   1_
#define M3_2_0   2_
#define M3_3_0   0_
#define M3_4_0   1_
#define M3_5_0   2_
#define M3_6_0   0_
#define M3_7_0   1_

#define M3_0_1   1_
#define M3_1_1   2_
#define M3_2_1   0_
#define M3_3_1   1_
#define M3_4_1   2_
#define M3_5_1   0_
#define M3_6_1   1_
#define M3_7_1   2_

#define M3_0_2   2_
#define M3_1_2   0_
#define M3_2_2   1_
#define M3_3_2   2_
#define M3_4_2   0_
#define M3_5_2   1_
#define M3_6_2   2_
#define M3_7_2   0_

/*
 * Glue for the W_SMALL trick: "w" expands to an argument list that
 * begins with an opening parenthesis and is not closed; this macro
 * appends the remaining STEP_SMALL arguments and the closing
 * parenthesis.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)

/*
 * One round of eight steps: four with IF, then four with MAJ.
 *   ri      round prefix with trailing underscore (e.g. 0_), selecting
 *           the message words WS_<ri><step>
 *   isp     starting index (0..2) into the cycle of PP4 permutations
 *   p0..p3  rotation counts; step k uses (p[k mod 4], p[(k+1) mod 4])
 *           as (r, s)
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)

/*
 * M7_i_j is (i + j) mod 7, written as a digit followed by an
 * underscore so that it can be pasted after "PP8_" to select one of
 * the seven 8-lane permutations.
 */
#define M7_0_0   0_
#define M7_1_0   1_
#define M7_2_0   2_
#define M7_3_0   3_
#define M7_4_0   4_
#define M7_5_0   5_
#define M7_6_0   6_
#define M7_7_0   0_

#define M7_0_1   1_
#define M7_1_1   2_
#define M7_2_1   3_
#define M7_3_1   4_
#define M7_4_1   5_
#define M7_5_1   6_
#define M7_6_1   0_
#define M7_7_1   1_

#define M7_0_2   2_
#define M7_1_2   3_
#define M7_2_2   4_
#define M7_3_2   5_
#define M7_4_2   6_
#define M7_5_2   0_
#define M7_6_2   1_
#define M7_7_2   2_

#define M7_0_3   3_
#define M7_1_3   4_
#define M7_2_3   5_
#define M7_3_3   6_
#define M7_4_3   0_
#define M7_5_3   1_
#define M7_6_3   2_
#define M7_7_3   3_

/*
 * Glue for the W_BIG trick: "w" expands to an argument list that
 * begins with an opening parenthesis and is not closed; this macro
 * appends the remaining STEP_BIG arguments and the closing parenthesis.
 */
#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)

/*
 * One round of eight steps: four with IF, then four with MAJ.
 *   ri      round prefix with trailing underscore (e.g. 0_), selecting
 *           the message words WB_<ri><step>
 *   isp     starting index (0..3) into the cycle of PP8 permutations
 *   p0..p3  rotation counts; step k uses (p[k mod 4], p[(k+1) mod 4])
 *           as (r, s)
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)

#if SPH_SMALL_FOOTPRINT_SIMD
|
|
|
|
/*
 * Small-footprint code: the sixteen working words live in a local
 * array called "state"; these aliases let the STEP_* macros address
 * them by their usual names. They are removed again with #undef after
 * compress_small().
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define B0   state[ 4]
#define B1   state[ 5]
#define B2   state[ 6]
#define B3   state[ 7]
#define C0   state[ 8]
#define C1   state[ 9]
#define C2   state[10]
#define C3   state[11]
#define D0   state[12]
#define D1   state[13]
#define D2   state[14]
#define D3   state[15]

/*
 * Variant of STEP_ELT for the small-footprint code: the pre-rotated A
 * values are in an array tA[], and the permutation is given at run
 * time as an XOR mask ppb, so the permuted lane is tA[ppb ^ n]. "n"
 * must still be a literal digit since it is pasted onto A, B, C, D.
 */
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)

/*
 * Variant of STEP_SMALL for the small-footprint code: rotates the four
 * A values left by r into the local array tA[], then runs STEP2_ELT on
 * each lane. pp4b is the run-time XOR mask selecting the permutation.
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)

static void
|
|
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
|
|
{
|
|
static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
|
|
|
|
STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
|
|
STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
|
|
STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
|
|
STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
|
|
STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
|
|
STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
|
|
STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
|
|
STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
|
|
}
|
|
|
|
static void
|
|
compress_small(sph_simd_small_context *sc, int last)
|
|
{
|
|
unsigned char *x;
|
|
s32 q[128];
|
|
int i;
|
|
u32 w[32];
|
|
u32 state[16];
|
|
size_t u;
|
|
|
|
static const size_t wsp[32] = {
|
|
4 << 3, 6 << 3, 0 << 3, 2 << 3,
|
|
7 << 3, 5 << 3, 3 << 3, 1 << 3,
|
|
15 << 3, 11 << 3, 12 << 3, 8 << 3,
|
|
9 << 3, 13 << 3, 10 << 3, 14 << 3,
|
|
17 << 3, 18 << 3, 23 << 3, 20 << 3,
|
|
22 << 3, 21 << 3, 16 << 3, 19 << 3,
|
|
30 << 3, 24 << 3, 25 << 3, 31 << 3,
|
|
27 << 3, 29 << 3, 28 << 3, 26 << 3
|
|
};
|
|
|
|
x = sc->buf;
|
|
FFT128(0, 1, 0, ll);
|
|
if (last) {
|
|
for (i = 0; i < 128; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_s_f[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
} else {
|
|
for (i = 0; i < 128; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_s_n[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 16; i += 4) {
|
|
state[i + 0] = sc->state[i + 0]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 0));
|
|
state[i + 1] = sc->state[i + 1]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 1));
|
|
state[i + 2] = sc->state[i + 2]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 2));
|
|
state[i + 3] = sc->state[i + 3]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 3));
|
|
}
|
|
|
|
#define WSREAD(sb, o1, o2, mm) do { \
|
|
for (u = 0; u < 32; u += 4) { \
|
|
size_t v = wsp[(u >> 2) + (sb)]; \
|
|
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
|
|
q[v + 2 * 0 + (o2)], mm); \
|
|
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
|
|
q[v + 2 * 1 + (o2)], mm); \
|
|
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
|
|
q[v + 2 * 2 + (o2)], mm); \
|
|
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
|
|
q[v + 2 * 3 + (o2)], mm); \
|
|
} \
|
|
} while (0)
|
|
|
|
WSREAD( 0, 0, 1, 185);
|
|
one_round_small(state, w, 0, 3, 23, 17, 27);
|
|
WSREAD( 8, 0, 1, 185);
|
|
one_round_small(state, w, 2, 28, 19, 22, 7);
|
|
WSREAD(16, -128, -64, 233);
|
|
one_round_small(state, w, 1, 29, 9, 15, 5);
|
|
WSREAD(24, -191, -127, 233);
|
|
one_round_small(state, w, 0, 4, 13, 10, 25);
|
|
|
|
#undef WSREAD
|
|
|
|
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
IF, 4, 13, PP4_2_);
|
|
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
IF, 13, 10, PP4_0_);
|
|
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
IF, 10, 25, PP4_1_);
|
|
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
IF, 25, 4, PP4_2_);
|
|
|
|
memcpy(sc->state, state, sizeof state);
|
|
}
|
|
|
|
/* Drop the small-footprint aliases for the working words. */
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3

#else
|
|
|
|
/*
 * With SPH_SIMD_NOCOPY, the working words are not copied into local
 * variables: A0..D3 directly designate the state words of the context
 * pointed to by "sc".
 */
#if SPH_SIMD_NOCOPY
#define A0   (sc->state[ 0])
#define A1   (sc->state[ 1])
#define A2   (sc->state[ 2])
#define A3   (sc->state[ 3])
#define B0   (sc->state[ 4])
#define B1   (sc->state[ 5])
#define B2   (sc->state[ 6])
#define B3   (sc->state[ 7])
#define C0   (sc->state[ 8])
#define C1   (sc->state[ 9])
#define C2   (sc->state[10])
#define C3   (sc->state[11])
#define D0   (sc->state[12])
#define D1   (sc->state[13])
#define D2   (sc->state[14])
#define D3   (sc->state[15])
#endif

/*
 * Compression function for the 16-word state (used by the SIMD-224 and
 * SIMD-256 entry points), unrolled variant.
 *
 * sc    context whose buf holds the block to process and whose state is
 *       updated in place.
 * last  non-zero selects the yoff_s_f offset table, zero selects
 *       yoff_s_n.
 *
 * NOTE(review): x, q and sc are not passed to FFT128 / ONE_ROUND_SMALL
 * explicitly, so those macros presumably refer to them by name; do not
 * rename these locals without checking the macro definitions.
 */
static void
|
|
compress_small(sph_simd_small_context *sc, int last)
|
|
{
|
|
unsigned char *x;
|
|
s32 q[128];
|
|
int i;
|
|
DECL_STATE_SMALL
|
|
#if SPH_SIMD_NOCOPY
|
|
sph_u32 saved[16];
|
|
#endif
|
|
|
|
/*
 * In NOCOPY mode A0..D3 alias sc->state[], so the rounds overwrite the
 * context; keep a copy of the incoming state for the final four steps.
 */
#if SPH_SIMD_NOCOPY
|
|
memcpy(saved, sc->state, sizeof saved);
|
|
#endif
|
|
x = sc->buf;
|
|
FFT128(0, 1, 0, ll);
|
|
/*
 * Add the per-index offset to every q[] entry, reduce with REDS2 and
 * two REDS1 passes, then shift any result above 128 down by 257.
 */
if (last) {
|
|
for (i = 0; i < 128; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_s_f[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
} else {
|
|
for (i = 0; i < 128; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_s_n[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
}
|
|
READ_STATE_SMALL(sc);
|
|
/* XOR the first 64 bytes of the buffer into the 16 state words. */
A0 ^= sph_dec32le_aligned(x + 0);
|
|
A1 ^= sph_dec32le_aligned(x + 4);
|
|
A2 ^= sph_dec32le_aligned(x + 8);
|
|
A3 ^= sph_dec32le_aligned(x + 12);
|
|
B0 ^= sph_dec32le_aligned(x + 16);
|
|
B1 ^= sph_dec32le_aligned(x + 20);
|
|
B2 ^= sph_dec32le_aligned(x + 24);
|
|
B3 ^= sph_dec32le_aligned(x + 28);
|
|
C0 ^= sph_dec32le_aligned(x + 32);
|
|
C1 ^= sph_dec32le_aligned(x + 36);
|
|
C2 ^= sph_dec32le_aligned(x + 40);
|
|
C3 ^= sph_dec32le_aligned(x + 44);
|
|
D0 ^= sph_dec32le_aligned(x + 48);
|
|
D1 ^= sph_dec32le_aligned(x + 52);
|
|
D2 ^= sph_dec32le_aligned(x + 56);
|
|
D3 ^= sph_dec32le_aligned(x + 60);
|
|
/* Four rounds, each with its own index and rotation constants. */
ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
|
|
ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
|
|
ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
|
|
ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
|
|
/*
 * Four closing steps that use the state as it was before this block as
 * their word input: the saved copy in NOCOPY mode, sc->state otherwise
 * (it is only overwritten by WRITE_STATE_SMALL afterwards).
 */
#if SPH_SIMD_NOCOPY
|
|
STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
|
|
IF, 4, 13, PP4_2_);
|
|
STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
|
|
IF, 13, 10, PP4_0_);
|
|
STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
|
|
IF, 10, 25, PP4_1_);
|
|
STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
|
|
IF, 25, 4, PP4_2_);
|
|
#else
|
|
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
IF, 4, 13, PP4_2_);
|
|
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
IF, 13, 10, PP4_0_);
|
|
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
IF, 10, 25, PP4_1_);
|
|
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
IF, 25, 4, PP4_2_);
|
|
WRITE_STATE_SMALL(sc);
|
|
#endif
|
|
}
|
|
|
|
#if SPH_SIMD_NOCOPY
|
|
#undef A0
|
|
#undef A1
|
|
#undef A2
|
|
#undef A3
|
|
#undef B0
|
|
#undef B1
|
|
#undef B2
|
|
#undef B3
|
|
#undef C0
|
|
#undef C1
|
|
#undef C2
|
|
#undef C3
|
|
#undef D0
|
|
#undef D1
|
|
#undef D2
|
|
#undef D3
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#if SPH_SMALL_FOOTPRINT_SIMD
|
|
|
|
/*
 * Small-footprint build: the 32 state word names A0..D7 are aliases for
 * elements of an array called "state", which must be in scope wherever
 * the names are used (one_round_big() takes it as a parameter,
 * compress_big() declares it as a local).
 */
#define A0 state[ 0]
|
|
#define A1 state[ 1]
|
|
#define A2 state[ 2]
|
|
#define A3 state[ 3]
|
|
#define A4 state[ 4]
|
|
#define A5 state[ 5]
|
|
#define A6 state[ 6]
|
|
#define A7 state[ 7]
|
|
#define B0 state[ 8]
|
|
#define B1 state[ 9]
|
|
#define B2 state[10]
|
|
#define B3 state[11]
|
|
#define B4 state[12]
|
|
#define B5 state[13]
|
|
#define B6 state[14]
|
|
#define B7 state[15]
|
|
#define C0 state[16]
|
|
#define C1 state[17]
|
|
#define C2 state[18]
|
|
#define C3 state[19]
|
|
#define C4 state[20]
|
|
#define C5 state[21]
|
|
#define C6 state[22]
|
|
#define C7 state[23]
|
|
#define D0 state[24]
|
|
#define D1 state[25]
|
|
#define D2 state[26]
|
|
#define D3 state[27]
|
|
#define D4 state[28]
|
|
#define D5 state[29]
|
|
#define D6 state[30]
|
|
#define D7 state[31]
|
|
|
|
/*
|
|
* Not needed -- already defined for SIMD-224 / SIMD-256
|
|
*
|
|
#define STEP2_ELT(n, w, fun, s, ppb) do { \
|
|
u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
|
|
A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
|
|
D ## n = C ## n; \
|
|
C ## n = B ## n; \
|
|
B ## n = tA[n]; \
|
|
} while (0)
|
|
*/
|
|
|
|
/*
 * One step over the eight-lane state A0..A7 / B0..B7 / C0..C7 / D0..D7.
 *
 * w0..w7  message words, one per lane.
 * fun     name of the three-input boolean function handed to STEP2_ELT
 *         (callers in this file pass IF or MAJ).
 * r       rotation applied to every A lane before the update; the
 *         rotated values are kept in tA[].
 * s       rotation amount forwarded to STEP2_ELT.
 * pp8b    lane permutation index forwarded to STEP2_ELT.
 *
 * STEP2_ELT is defined earlier in the file; per the commented-out copy
 * just above, it combines lane n with tA[pp8b ^ n] and then shifts the
 * A/B/C/D words of that lane along.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
|
|
u32 tA[8]; \
|
|
tA[0] = ROL32(A0, r); \
|
|
tA[1] = ROL32(A1, r); \
|
|
tA[2] = ROL32(A2, r); \
|
|
tA[3] = ROL32(A3, r); \
|
|
tA[4] = ROL32(A4, r); \
|
|
tA[5] = ROL32(A5, r); \
|
|
tA[6] = ROL32(A6, r); \
|
|
tA[7] = ROL32(A7, r); \
|
|
STEP2_ELT(0, w0, fun, s, pp8b); \
|
|
STEP2_ELT(1, w1, fun, s, pp8b); \
|
|
STEP2_ELT(2, w2, fun, s, pp8b); \
|
|
STEP2_ELT(3, w3, fun, s, pp8b); \
|
|
STEP2_ELT(4, w4, fun, s, pp8b); \
|
|
STEP2_ELT(5, w5, fun, s, pp8b); \
|
|
STEP2_ELT(6, w6, fun, s, pp8b); \
|
|
STEP2_ELT(7, w7, fun, s, pp8b); \
|
|
} while (0)
|
|
|
|
static void
|
|
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
|
|
{
|
|
static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
|
|
|
|
STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
|
|
IF, p0, p1, pp8k[isp + 0]);
|
|
STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
|
|
IF, p1, p2, pp8k[isp + 1]);
|
|
STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
|
|
IF, p2, p3, pp8k[isp + 2]);
|
|
STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
|
|
IF, p3, p0, pp8k[isp + 3]);
|
|
STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
|
|
MAJ, p0, p1, pp8k[isp + 4]);
|
|
STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
|
|
MAJ, p1, p2, pp8k[isp + 5]);
|
|
STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
|
|
MAJ, p2, p3, pp8k[isp + 6]);
|
|
STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
|
|
MAJ, p3, p0, pp8k[isp + 7]);
|
|
}
|
|
|
|
static void
|
|
compress_big(sph_simd_big_context *sc, int last)
|
|
{
|
|
unsigned char *x;
|
|
s32 q[256];
|
|
int i;
|
|
u32 w[64];
|
|
u32 state[32];
|
|
size_t u;
|
|
|
|
static const size_t wbp[32] = {
|
|
4 << 4, 6 << 4, 0 << 4, 2 << 4,
|
|
7 << 4, 5 << 4, 3 << 4, 1 << 4,
|
|
15 << 4, 11 << 4, 12 << 4, 8 << 4,
|
|
9 << 4, 13 << 4, 10 << 4, 14 << 4,
|
|
17 << 4, 18 << 4, 23 << 4, 20 << 4,
|
|
22 << 4, 21 << 4, 16 << 4, 19 << 4,
|
|
30 << 4, 24 << 4, 25 << 4, 31 << 4,
|
|
27 << 4, 29 << 4, 28 << 4, 26 << 4
|
|
};
|
|
|
|
x = sc->buf;
|
|
FFT256(0, 1, 0, ll);
|
|
if (last) {
|
|
for (i = 0; i < 256; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_b_f[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
} else {
|
|
for (i = 0; i < 256; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_b_n[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i += 8) {
|
|
state[i + 0] = sc->state[i + 0]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 0));
|
|
state[i + 1] = sc->state[i + 1]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 1));
|
|
state[i + 2] = sc->state[i + 2]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 2));
|
|
state[i + 3] = sc->state[i + 3]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 3));
|
|
state[i + 4] = sc->state[i + 4]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 4));
|
|
state[i + 5] = sc->state[i + 5]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 5));
|
|
state[i + 6] = sc->state[i + 6]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 6));
|
|
state[i + 7] = sc->state[i + 7]
|
|
^ sph_dec32le_aligned(x + 4 * (i + 7));
|
|
}
|
|
|
|
#define WBREAD(sb, o1, o2, mm) do { \
|
|
for (u = 0; u < 64; u += 8) { \
|
|
size_t v = wbp[(u >> 3) + (sb)]; \
|
|
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
|
|
q[v + 2 * 0 + (o2)], mm); \
|
|
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
|
|
q[v + 2 * 1 + (o2)], mm); \
|
|
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
|
|
q[v + 2 * 2 + (o2)], mm); \
|
|
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
|
|
q[v + 2 * 3 + (o2)], mm); \
|
|
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
|
|
q[v + 2 * 4 + (o2)], mm); \
|
|
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
|
|
q[v + 2 * 5 + (o2)], mm); \
|
|
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
|
|
q[v + 2 * 6 + (o2)], mm); \
|
|
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
|
|
q[v + 2 * 7 + (o2)], mm); \
|
|
} \
|
|
} while (0)
|
|
|
|
WBREAD( 0, 0, 1, 185);
|
|
one_round_big(state, w, 0, 3, 23, 17, 27);
|
|
WBREAD( 8, 0, 1, 185);
|
|
one_round_big(state, w, 1, 28, 19, 22, 7);
|
|
WBREAD(16, -256, -128, 233);
|
|
one_round_big(state, w, 2, 29, 9, 15, 5);
|
|
WBREAD(24, -383, -255, 233);
|
|
one_round_big(state, w, 3, 4, 13, 10, 25);
|
|
|
|
#undef WBREAD
|
|
|
|
STEP_BIG(
|
|
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
IF, 4, 13, PP8_4_);
|
|
STEP_BIG(
|
|
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
IF, 13, 10, PP8_5_);
|
|
STEP_BIG(
|
|
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
|
|
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
|
|
IF, 10, 25, PP8_6_);
|
|
STEP_BIG(
|
|
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
|
|
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
|
|
IF, 25, 4, PP8_0_);
|
|
|
|
memcpy(sc->state, state, sizeof state);
|
|
}
|
|
|
|
#undef A0
|
|
#undef A1
|
|
#undef A2
|
|
#undef A3
|
|
#undef A4
|
|
#undef A5
|
|
#undef A6
|
|
#undef A7
|
|
#undef B0
|
|
#undef B1
|
|
#undef B2
|
|
#undef B3
|
|
#undef B4
|
|
#undef B5
|
|
#undef B6
|
|
#undef B7
|
|
#undef C0
|
|
#undef C1
|
|
#undef C2
|
|
#undef C3
|
|
#undef C4
|
|
#undef C5
|
|
#undef C6
|
|
#undef C7
|
|
#undef D0
|
|
#undef D1
|
|
#undef D2
|
|
#undef D3
|
|
#undef D4
|
|
#undef D5
|
|
#undef D6
|
|
#undef D7
|
|
|
|
#else
|
|
|
|
/*
 * When SPH_SIMD_NOCOPY is set, the 32 state word names A0..D7 are
 * aliases for sc->state[0..31], so code written in terms of these names
 * reads and writes the context structure directly. A variable called
 * "sc" must be in scope wherever the names are used.
 */
#if SPH_SIMD_NOCOPY
|
|
#define A0 (sc->state[ 0])
|
|
#define A1 (sc->state[ 1])
|
|
#define A2 (sc->state[ 2])
|
|
#define A3 (sc->state[ 3])
|
|
#define A4 (sc->state[ 4])
|
|
#define A5 (sc->state[ 5])
|
|
#define A6 (sc->state[ 6])
|
|
#define A7 (sc->state[ 7])
|
|
#define B0 (sc->state[ 8])
|
|
#define B1 (sc->state[ 9])
|
|
#define B2 (sc->state[10])
|
|
#define B3 (sc->state[11])
|
|
#define B4 (sc->state[12])
|
|
#define B5 (sc->state[13])
|
|
#define B6 (sc->state[14])
|
|
#define B7 (sc->state[15])
|
|
#define C0 (sc->state[16])
|
|
#define C1 (sc->state[17])
|
|
#define C2 (sc->state[18])
|
|
#define C3 (sc->state[19])
|
|
#define C4 (sc->state[20])
|
|
#define C5 (sc->state[21])
|
|
#define C6 (sc->state[22])
|
|
#define C7 (sc->state[23])
|
|
#define D0 (sc->state[24])
|
|
#define D1 (sc->state[25])
|
|
#define D2 (sc->state[26])
|
|
#define D3 (sc->state[27])
|
|
#define D4 (sc->state[28])
|
|
#define D5 (sc->state[29])
|
|
#define D6 (sc->state[30])
|
|
#define D7 (sc->state[31])
|
|
#endif
|
|
|
|
/*
 * Compression function for the 32-word state (used by the SIMD-384 and
 * SIMD-512 entry points), unrolled variant.
 *
 * sc    context whose buf holds the block to process and whose state is
 *       updated in place.
 * last  non-zero selects the yoff_b_f offset table, zero selects
 *       yoff_b_n.
 *
 * NOTE(review): x, q and sc are not passed to FFT256 / ONE_ROUND_BIG
 * explicitly, so those macros presumably refer to them by name; do not
 * rename these locals without checking the macro definitions.
 */
static void
|
|
compress_big(sph_simd_big_context *sc, int last)
|
|
{
|
|
unsigned char *x;
|
|
s32 q[256];
|
|
int i;
|
|
DECL_STATE_BIG
|
|
#if SPH_SIMD_NOCOPY
|
|
sph_u32 saved[32];
|
|
#endif
|
|
|
|
/*
 * In NOCOPY mode A0..D7 alias sc->state[], so the rounds overwrite the
 * context; keep a copy of the incoming state for the final four steps.
 */
#if SPH_SIMD_NOCOPY
|
|
memcpy(saved, sc->state, sizeof saved);
|
|
#endif
|
|
|
|
x = sc->buf;
|
|
FFT256(0, 1, 0, ll);
|
|
/*
 * Add the per-index offset to every q[] entry, reduce with REDS2 and
 * two REDS1 passes, then shift any result above 128 down by 257.
 */
if (last) {
|
|
for (i = 0; i < 256; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_b_f[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
} else {
|
|
for (i = 0; i < 256; i ++) {
|
|
s32 tq;
|
|
|
|
tq = q[i] + yoff_b_n[i];
|
|
tq = REDS2(tq);
|
|
tq = REDS1(tq);
|
|
tq = REDS1(tq);
|
|
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
}
|
|
}
|
|
READ_STATE_BIG(sc);
|
|
/* XOR the first 128 bytes of the buffer into the 32 state words. */
A0 ^= sph_dec32le_aligned(x + 0);
|
|
A1 ^= sph_dec32le_aligned(x + 4);
|
|
A2 ^= sph_dec32le_aligned(x + 8);
|
|
A3 ^= sph_dec32le_aligned(x + 12);
|
|
A4 ^= sph_dec32le_aligned(x + 16);
|
|
A5 ^= sph_dec32le_aligned(x + 20);
|
|
A6 ^= sph_dec32le_aligned(x + 24);
|
|
A7 ^= sph_dec32le_aligned(x + 28);
|
|
B0 ^= sph_dec32le_aligned(x + 32);
|
|
B1 ^= sph_dec32le_aligned(x + 36);
|
|
B2 ^= sph_dec32le_aligned(x + 40);
|
|
B3 ^= sph_dec32le_aligned(x + 44);
|
|
B4 ^= sph_dec32le_aligned(x + 48);
|
|
B5 ^= sph_dec32le_aligned(x + 52);
|
|
B6 ^= sph_dec32le_aligned(x + 56);
|
|
B7 ^= sph_dec32le_aligned(x + 60);
|
|
C0 ^= sph_dec32le_aligned(x + 64);
|
|
C1 ^= sph_dec32le_aligned(x + 68);
|
|
C2 ^= sph_dec32le_aligned(x + 72);
|
|
C3 ^= sph_dec32le_aligned(x + 76);
|
|
C4 ^= sph_dec32le_aligned(x + 80);
|
|
C5 ^= sph_dec32le_aligned(x + 84);
|
|
C6 ^= sph_dec32le_aligned(x + 88);
|
|
C7 ^= sph_dec32le_aligned(x + 92);
|
|
D0 ^= sph_dec32le_aligned(x + 96);
|
|
D1 ^= sph_dec32le_aligned(x + 100);
|
|
D2 ^= sph_dec32le_aligned(x + 104);
|
|
D3 ^= sph_dec32le_aligned(x + 108);
|
|
D4 ^= sph_dec32le_aligned(x + 112);
|
|
D5 ^= sph_dec32le_aligned(x + 116);
|
|
D6 ^= sph_dec32le_aligned(x + 120);
|
|
D7 ^= sph_dec32le_aligned(x + 124);
|
|
|
|
/* Four rounds, each with its own index and rotation constants. */
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
|
|
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
|
|
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
|
|
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
|
|
/*
 * Four closing steps that use the state as it was before this block as
 * their word input: the saved copy in NOCOPY mode, sc->state otherwise
 * (it is only overwritten by WRITE_STATE_BIG afterwards).
 */
#if SPH_SIMD_NOCOPY
|
|
STEP_BIG(
|
|
saved[ 0], saved[ 1], saved[ 2], saved[ 3],
|
|
saved[ 4], saved[ 5], saved[ 6], saved[ 7],
|
|
IF, 4, 13, PP8_4_);
|
|
STEP_BIG(
|
|
saved[ 8], saved[ 9], saved[10], saved[11],
|
|
saved[12], saved[13], saved[14], saved[15],
|
|
IF, 13, 10, PP8_5_);
|
|
STEP_BIG(
|
|
saved[16], saved[17], saved[18], saved[19],
|
|
saved[20], saved[21], saved[22], saved[23],
|
|
IF, 10, 25, PP8_6_);
|
|
STEP_BIG(
|
|
saved[24], saved[25], saved[26], saved[27],
|
|
saved[28], saved[29], saved[30], saved[31],
|
|
IF, 25, 4, PP8_0_);
|
|
#else
|
|
STEP_BIG(
|
|
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
IF, 4, 13, PP8_4_);
|
|
STEP_BIG(
|
|
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
IF, 13, 10, PP8_5_);
|
|
STEP_BIG(
|
|
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
|
|
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
|
|
IF, 10, 25, PP8_6_);
|
|
STEP_BIG(
|
|
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
|
|
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
|
|
IF, 25, 4, PP8_0_);
|
|
WRITE_STATE_BIG(sc);
|
|
#endif
|
|
}
|
|
|
|
#if SPH_SIMD_NOCOPY
|
|
#undef A0
|
|
#undef A1
|
|
#undef A2
|
|
#undef A3
|
|
#undef A4
|
|
#undef A5
|
|
#undef A6
|
|
#undef A7
|
|
#undef B0
|
|
#undef B1
|
|
#undef B2
|
|
#undef B3
|
|
#undef B4
|
|
#undef B5
|
|
#undef B6
|
|
#undef B7
|
|
#undef C0
|
|
#undef C1
|
|
#undef C2
|
|
#undef C3
|
|
#undef C4
|
|
#undef C5
|
|
#undef C6
|
|
#undef C7
|
|
#undef D0
|
|
#undef D1
|
|
#undef D2
|
|
#undef D3
|
|
#undef D4
|
|
#undef D5
|
|
#undef D6
|
|
#undef D7
|
|
#endif
|
|
|
|
#endif
|
|
|
|
/*
 * Initial state for SIMD-224: sixteen 32-bit words, copied into the
 * context by init_small() when called from sph_simd224_init().
 */
static const u32 IV224[] = {
|
|
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
|
|
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
|
|
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
|
|
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
|
|
};
|
|
|
|
/*
 * Initial state for SIMD-256: sixteen 32-bit words, copied into the
 * context by init_small() when called from sph_simd256_init().
 */
static const u32 IV256[] = {
|
|
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
|
|
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
|
|
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
|
|
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
|
|
};
|
|
|
|
/*
 * Initial state for SIMD-384: thirty-two 32-bit words, copied into the
 * context by init_big() when called from sph_simd384_init().
 */
static const u32 IV384[] = {
|
|
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
|
|
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
|
|
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
|
|
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
|
|
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
|
|
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
|
|
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
|
|
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
|
|
};
|
|
|
|
/*
 * Initial state for SIMD-512: thirty-two 32-bit words, copied into the
 * context by init_big() when called from sph_simd512_init().
 */
static const u32 IV512[] = {
|
|
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
|
|
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
|
|
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
|
|
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
|
|
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
|
|
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
|
|
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
|
|
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
|
|
};
|
|
|
|
static void
|
|
init_small(void *cc, const u32 *iv)
|
|
{
|
|
sph_simd_small_context *sc;
|
|
|
|
sc = cc;
|
|
memcpy(sc->state, iv, sizeof sc->state);
|
|
sc->count_low = sc->count_high = 0;
|
|
sc->ptr = 0;
|
|
}
|
|
|
|
static void
|
|
init_big(void *cc, const u32 *iv)
|
|
{
|
|
sph_simd_big_context *sc;
|
|
|
|
sc = cc;
|
|
memcpy(sc->state, iv, sizeof sc->state);
|
|
sc->count_low = sc->count_high = 0;
|
|
sc->ptr = 0;
|
|
}
|
|
|
|
static void
|
|
update_small(void *cc, const void *data, size_t len)
|
|
{
|
|
sph_simd_small_context *sc;
|
|
|
|
sc = cc;
|
|
while (len > 0) {
|
|
size_t clen;
|
|
|
|
clen = (sizeof sc->buf) - sc->ptr;
|
|
if (clen > len)
|
|
clen = len;
|
|
memcpy(sc->buf + sc->ptr, data, clen);
|
|
data = (const unsigned char *)data + clen;
|
|
len -= clen;
|
|
if ((sc->ptr += clen) == sizeof sc->buf) {
|
|
compress_small(sc, 0);
|
|
sc->ptr = 0;
|
|
sc->count_low = T32(sc->count_low + 1);
|
|
if (sc->count_low == 0)
|
|
sc->count_high ++;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
update_big(void *cc, const void *data, size_t len)
|
|
{
|
|
sph_simd_big_context *sc;
|
|
|
|
sc = cc;
|
|
while (len > 0) {
|
|
size_t clen;
|
|
|
|
clen = (sizeof sc->buf) - sc->ptr;
|
|
if (clen > len)
|
|
clen = len;
|
|
memcpy(sc->buf + sc->ptr, data, clen);
|
|
data = (const unsigned char *)data + clen;
|
|
len -= clen;
|
|
if ((sc->ptr += clen) == sizeof sc->buf) {
|
|
compress_big(sc, 0);
|
|
sc->ptr = 0;
|
|
sc->count_low = T32(sc->count_low + 1);
|
|
if (sc->count_low == 0)
|
|
sc->count_high ++;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
encode_count_small(unsigned char *dst,
|
|
u32 low, u32 high, size_t ptr, unsigned n)
|
|
{
|
|
low = T32(low << 9);
|
|
high = T32(high << 9) + (low >> 23);
|
|
low += (ptr << 3) + n;
|
|
sph_enc32le(dst, low);
|
|
sph_enc32le(dst + 4, high);
|
|
}
|
|
|
|
static void
|
|
encode_count_big(unsigned char *dst,
|
|
u32 low, u32 high, size_t ptr, unsigned n)
|
|
{
|
|
low = T32(low << 10);
|
|
high = T32(high << 10) + (low >> 22);
|
|
low += (ptr << 3) + n;
|
|
sph_enc32le(dst, low);
|
|
sph_enc32le(dst + 4, high);
|
|
}
|
|
|
|
static void
|
|
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
|
|
{
|
|
sph_simd_small_context *sc;
|
|
unsigned char *d;
|
|
size_t u;
|
|
|
|
sc = cc;
|
|
if (sc->ptr > 0 || n > 0) {
|
|
memset(sc->buf + sc->ptr, 0,
|
|
(sizeof sc->buf) - sc->ptr);
|
|
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
|
|
compress_small(sc, 0);
|
|
}
|
|
memset(sc->buf, 0, sizeof sc->buf);
|
|
encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
|
|
compress_small(sc, 1);
|
|
d = dst;
|
|
for (d = dst, u = 0; u < dst_len; u ++)
|
|
sph_enc32le(d + (u << 2), sc->state[u]);
|
|
}
|
|
|
|
static void
|
|
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
|
|
{
|
|
sph_simd_big_context *sc;
|
|
unsigned char *d;
|
|
size_t u;
|
|
|
|
sc = cc;
|
|
if (sc->ptr > 0 || n > 0) {
|
|
memset(sc->buf + sc->ptr, 0,
|
|
(sizeof sc->buf) - sc->ptr);
|
|
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
|
|
compress_big(sc, 0);
|
|
}
|
|
memset(sc->buf, 0, sizeof sc->buf);
|
|
encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
|
|
compress_big(sc, 1);
|
|
d = dst;
|
|
for (d = dst, u = 0; u < dst_len; u ++)
|
|
sph_enc32le(d + (u << 2), sc->state[u]);
|
|
}
|
|
|
|
/*
 * Initialise a SIMD-224 context: cc is handed to init_small(), which
 * treats it as a sph_simd_small_context, loads IV224 into the state and
 * clears the block counter and buffer pointer.
 */
void
|
|
sph_simd224_init(void *cc)
|
|
{
|
|
init_small(cc, IV224);
|
|
}
|
|
|
|
/*
 * Feed len bytes from data into the SIMD-224 context cc (forwarded
 * unchanged to update_small()).
 */
void
|
|
sph_simd224(void *cc, const void *data, size_t len)
|
|
{
|
|
update_small(cc, data, len);
|
|
}
|
|
|
|
/*
 * Finish a SIMD-224 computation with no extra bits: equivalent to
 * sph_simd224_addbits_and_close(cc, 0, 0, dst).
 */
void
|
|
sph_simd224_close(void *cc, void *dst)
|
|
{
|
|
sph_simd224_addbits_and_close(cc, 0, 0, dst);
|
|
}
|
|
|
|
/*
 * Append the top n bits of ub, finish the SIMD-224 computation and
 * write 7 state words (28 bytes) to dst. The context is re-initialised
 * afterwards so it can be used for a new message.
 */
void
|
|
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
{
|
|
finalize_small(cc, ub, n, dst, 7);
|
|
sph_simd224_init(cc);
|
|
}
|
|
|
|
/*
 * Initialise a SIMD-256 context: cc is handed to init_small(), which
 * treats it as a sph_simd_small_context, loads IV256 into the state and
 * clears the block counter and buffer pointer.
 */
void
|
|
sph_simd256_init(void *cc)
|
|
{
|
|
init_small(cc, IV256);
|
|
}
|
|
|
|
/*
 * Feed len bytes from data into the SIMD-256 context cc (forwarded
 * unchanged to update_small()).
 */
void
|
|
sph_simd256(void *cc, const void *data, size_t len)
|
|
{
|
|
update_small(cc, data, len);
|
|
}
|
|
|
|
/*
 * Finish a SIMD-256 computation with no extra bits: equivalent to
 * sph_simd256_addbits_and_close(cc, 0, 0, dst).
 */
void
|
|
sph_simd256_close(void *cc, void *dst)
|
|
{
|
|
sph_simd256_addbits_and_close(cc, 0, 0, dst);
|
|
}
|
|
|
|
/*
 * Append the top n bits of ub, finish the SIMD-256 computation and
 * write 8 state words (32 bytes) to dst. The context is re-initialised
 * afterwards so it can be used for a new message.
 */
void
|
|
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
{
|
|
finalize_small(cc, ub, n, dst, 8);
|
|
sph_simd256_init(cc);
|
|
}
|
|
|
|
/*
 * Initialise a SIMD-384 context: cc is handed to init_big(), which
 * treats it as a sph_simd_big_context, loads IV384 into the state and
 * clears the block counter and buffer pointer.
 */
void
|
|
sph_simd384_init(void *cc)
|
|
{
|
|
init_big(cc, IV384);
|
|
}
|
|
|
|
/*
 * Feed len bytes from data into the SIMD-384 context cc (forwarded
 * unchanged to update_big()).
 */
void
|
|
sph_simd384(void *cc, const void *data, size_t len)
|
|
{
|
|
update_big(cc, data, len);
|
|
}
|
|
|
|
/*
 * Finish a SIMD-384 computation with no extra bits: equivalent to
 * sph_simd384_addbits_and_close(cc, 0, 0, dst).
 */
void
|
|
sph_simd384_close(void *cc, void *dst)
|
|
{
|
|
sph_simd384_addbits_and_close(cc, 0, 0, dst);
|
|
}
|
|
|
|
/*
 * Append the top n bits of ub, finish the SIMD-384 computation and
 * write 12 state words (48 bytes) to dst. The context is re-initialised
 * afterwards so it can be used for a new message.
 */
void
|
|
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
{
|
|
finalize_big(cc, ub, n, dst, 12);
|
|
sph_simd384_init(cc);
|
|
}
|
|
|
|
/*
 * Initialise a SIMD-512 context: cc is handed to init_big(), which
 * treats it as a sph_simd_big_context, loads IV512 into the state and
 * clears the block counter and buffer pointer.
 */
void
|
|
sph_simd512_init(void *cc)
|
|
{
|
|
init_big(cc, IV512);
|
|
}
|
|
|
|
/*
 * Feed len bytes from data into the SIMD-512 context cc (forwarded
 * unchanged to update_big()).
 */
void
|
|
sph_simd512(void *cc, const void *data, size_t len)
|
|
{
|
|
update_big(cc, data, len);
|
|
}
|
|
|
|
/*
 * Finish a SIMD-512 computation with no extra bits: equivalent to
 * sph_simd512_addbits_and_close(cc, 0, 0, dst).
 */
void
|
|
sph_simd512_close(void *cc, void *dst)
|
|
{
|
|
sph_simd512_addbits_and_close(cc, 0, 0, dst);
|
|
}
|
|
|
|
/*
 * Append the top n bits of ub, finish the SIMD-512 computation and
 * write 16 state words (64 bytes) to dst. The context is re-initialised
 * afterwards so it can be used for a new message.
 */
void
|
|
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
{
|
|
finalize_big(cc, ub, n, dst, 16);
|
|
sph_simd512_init(cc);
|
|
}
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif |