xmr-stak-generic-opencl11/opencl/jh.cl

273 lines
9.8 KiB
Common Lisp

/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
/*
* JH implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#define SPH_JH_64 1
#define SPH_LITTLE_ENDIAN 1
#define SPH_C32(x) x
#define SPH_C64(x) x
typedef uint sph_u32;
typedef ulong sph_u64;
/*
* The internal bitslice representation may use either big-endian or
* little-endian (true bitslice operations do not care about the bit
* ordering, and the bit-swapping linear operations in JH happen to
* be invariant through endianness-swapping). The constants must be
* defined according to the chosen endianness; we use some
* byte-swapping macros for that.
*/
#if SPH_LITTLE_ENDIAN
#define C32e(x) ((SPH_C32(x) >> 24) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define dec32e_aligned sph_dec32le_aligned
#define enc32e sph_enc32le
#define C64e(x) ((SPH_C64(x) >> 56) \
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
| ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
| ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \
| ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \
| ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
| ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
| ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
#define dec64e_aligned sph_dec64le_aligned
#define enc64e sph_enc64le
#else
#define C32e(x) SPH_C32(x)
#define dec32e_aligned sph_dec32be_aligned
#define enc32e sph_enc32be
#define C64e(x) SPH_C64(x)
#define dec64e_aligned sph_dec64be_aligned
#define enc64e sph_enc64be
#endif
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
static const __constant ulong C[] =
{
0x67F815DFA2DED572UL, 0x571523B70A15847BUL, 0xF6875A4D90D6AB81UL, 0x402BD1C3C54F9F4EUL,
0x9CFA455CE03A98EAUL, 0x9A99B26699D2C503UL, 0x8A53BBF2B4960266UL, 0x31A2DB881A1456B5UL,
0xDB0E199A5C5AA303UL, 0x1044C1870AB23F40UL, 0x1D959E848019051CUL, 0xDCCDE75EADEB336FUL,
0x416BBF029213BA10UL, 0xD027BBF7156578DCUL, 0x5078AA3739812C0AUL, 0xD3910041D2BF1A3FUL,
0x907ECCF60D5A2D42UL, 0xCE97C0929C9F62DDUL, 0xAC442BC70BA75C18UL, 0x23FCC663D665DFD1UL,
0x1AB8E09E036C6E97UL, 0xA8EC6C447E450521UL, 0xFA618E5DBB03F1EEUL, 0x97818394B29796FDUL,
0x2F3003DB37858E4AUL, 0x956A9FFB2D8D672AUL, 0x6C69B8F88173FE8AUL, 0x14427FC04672C78AUL,
0xC45EC7BD8F15F4C5UL, 0x80BB118FA76F4475UL, 0xBC88E4AEB775DE52UL, 0xF4A3A6981E00B882UL,
0x1563A3A9338FF48EUL, 0x89F9B7D524565FAAUL, 0xFDE05A7C20EDF1B6UL, 0x362C42065AE9CA36UL,
0x3D98FE4E433529CEUL, 0xA74B9A7374F93A53UL, 0x86814E6F591FF5D0UL, 0x9F5AD8AF81AD9D0EUL,
0x6A6234EE670605A7UL, 0x2717B96EBE280B8BUL, 0x3F1080C626077447UL, 0x7B487EC66F7EA0E0UL,
0xC0A4F84AA50A550DUL, 0x9EF18E979FE7E391UL, 0xD48D605081727686UL, 0x62B0E5F3415A9E7EUL,
0x7A205440EC1F9FFCUL, 0x84C9F4CE001AE4E3UL, 0xD895FA9DF594D74FUL, 0xA554C324117E2E55UL,
0x286EFEBD2872DF5BUL, 0xB2C4A50FE27FF578UL, 0x2ED349EEEF7C8905UL, 0x7F5928EB85937E44UL,
0x4A3124B337695F70UL, 0x65E4D61DF128865EUL, 0xE720B95104771BC7UL, 0x8A87D423E843FE74UL,
0xF2947692A3E8297DUL, 0xC1D9309B097ACBDDUL, 0xE01BDC5BFB301B1DUL, 0xBF829CF24F4924DAUL,
0xFFBF70B431BAE7A4UL, 0x48BCF8DE0544320DUL, 0x39D3BB5332FCAE3BUL, 0xA08B29E0C1C39F45UL,
0x0F09AEF7FD05C9E5UL, 0x34F1904212347094UL, 0x95ED44E301B771A2UL, 0x4A982F4F368E3BE9UL,
0x15F66CA0631D4088UL, 0xFFAF52874B44C147UL, 0x30C60AE2F14ABB7EUL, 0xE68C6ECCC5B67046UL,
0x00CA4FBD56A4D5A4UL, 0xAE183EC84B849DDAUL, 0xADD1643045CE5773UL, 0x67255C1468CEA6E8UL,
0x16E10ECBF28CDAA3UL, 0x9A99949A5806E933UL, 0x7B846FC220B2601FUL, 0x1885D1A07FACCED1UL,
0xD319DD8DA15B5932UL, 0x46B4A5AAC01C9A50UL, 0xBA6B04E467633D9FUL, 0x7EEE560BAB19CAF6UL,
0x742128A9EA79B11FUL, 0xEE51363B35F7BDE9UL, 0x76D350755AAC571DUL, 0x01707DA3FEC2463AUL,
0x42D8A498AFC135F7UL, 0x79676B9E20ECED78UL, 0xA8DB3AEA15638341UL, 0x832C83324D3BC3FAUL,
0xF347271C1F3B40A7UL, 0x9A762DB734F04059UL, 0xFD4F21D26C4E3EE7UL, 0xEF5957DC398DFDB8UL,
0xDAEB492B490C9B8DUL, 0x0D70F36849D7A25BUL, 0x84558D7AD0AE3B7DUL, 0x658EF8E4F0E9A5F5UL,
0x533B1036F4A2B8A0UL, 0x5AEC3E759E07A80CUL, 0x4F88E85692946891UL, 0x4CBCBAF8555CB05BUL,
0x7B9487F3993BBBE3UL, 0x5D1C6B72D6F4DA75UL, 0x6DB334DC28ACAE64UL, 0x71DB28B850A5346CUL,
0x2A518D10F2E261F8UL, 0xFC75DD593364DBE3UL, 0xA23FCE43F1BCAC1CUL, 0xB043E8023CD1BB67UL,
0x75A12988CA5B0A33UL, 0x5C5316B44D19347FUL, 0x1E4D790EC3943B92UL, 0x3FAFEEB6D7757479UL,
0x21391ABEF7D4A8EAUL, 0x5127234C097EF45CUL, 0xD23C32BA5324A326UL, 0xADD5A66D4A17A344UL,
0x08C9F2AFA63E1DB5UL, 0x563C6B91983D5983UL, 0x4D608672A17CF84CUL, 0xF6C76E08CC3EE246UL,
0x5E76BCB1B333982FUL, 0x2AE6C4EFA566D62BUL, 0x36D4C1BEE8B6F406UL, 0x6321EFBC1582EE74UL,
0x69C953F40D4EC1FDUL, 0x26585806C45A7DA7UL, 0x16FAE0061614C17EUL, 0x3F9D63283DAF907EUL,
0x0CD29B00E3F2C9D2UL, 0x300CD4B730CEAA5FUL, 0x9832E0F216512A74UL, 0x9AF8CEE3D830EB0DUL,
0x9279F1B57B9EC54BUL, 0xD36886046EE651FFUL, 0x316796E6574D239BUL, 0x05750A17F3A6E6CCUL,
0xCE6C3213D98176B1UL, 0x62A205F88452173CUL, 0x47154778B3CB2BF4UL, 0x486A9323825446FFUL,
0x65655E4E0758DF38UL, 0x8E5086FC897CFCF2UL, 0x86CA0BD0442E7031UL, 0x4E477830A20940F0UL,
0x8338F7D139EEA065UL, 0xBD3A2CE437E95EF7UL, 0x6FF8130126B29721UL, 0xE7DE9FEFD1ED44A3UL,
0xD992257615DFA08BUL, 0xBE42DC12F6F7853CUL, 0x7EB027AB7CECA7D8UL, 0xDEA83EAADA7D8D53UL,
0xD86902BD93CE25AAUL, 0xF908731AFD43F65AUL, 0xA5194A17DAEF5FC0UL, 0x6A21FD4C33664D97UL,
0x701541DB3198B435UL, 0x9B54CDEDBB0F1EEAUL, 0x72409751A163D09AUL, 0xE26F4791BF9D75F6UL
};
#define Ceven_hi(r) (C[((r) << 2) + 0])
#define Ceven_lo(r) (C[((r) << 2) + 1])
#define Codd_hi(r) (C[((r) << 2) + 2])
#define Codd_lo(r) (C[((r) << 2) + 3])
#define S(x0, x1, x2, x3, cb, r) do { \
Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
} while (0)
#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \
Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
} while (0)
#define Wz(x, c, n) do { \
sph_u64 t = (x ## h & (c)) << (n); \
x ## h = ((x ## h >> (n)) & (c)) | t; \
t = (x ## l & (c)) << (n); \
x ## l = ((x ## l >> (n)) & (c)) | t; \
} while (0)
#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1)
#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2)
#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4)
#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8)
#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
#define W6(x) do { \
sph_u64 t = x ## h; \
x ## h = x ## l; \
x ## l = t; \
} while (0)
#define SL(ro) SLu(r + ro, ro)
#define SLu(r, ro) do { \
S(h0, h2, h4, h6, Ceven_, r); \
S(h1, h3, h5, h7, Codd_, r); \
L(h0, h2, h4, h6, h1, h3, h5, h7); \
W ## ro(h1); \
W ## ro(h3); \
W ## ro(h5); \
W ## ro(h7); \
} while (0)
#if SPH_SMALL_FOOTPRINT_JH
/*
* The "small footprint" 64-bit version just uses a partially unrolled
* loop.
*/
#define E8 do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL(0); \
SL(1); \
SL(2); \
SL(3); \
SL(4); \
SL(5); \
SL(6); \
} \
} while (0)
#else
/*
* On a "true 64-bit" architecture, we can unroll at will.
*/
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
SLu( 2, 2); \
SLu( 3, 3); \
SLu( 4, 4); \
SLu( 5, 5); \
SLu( 6, 6); \
SLu( 7, 0); \
SLu( 8, 1); \
SLu( 9, 2); \
SLu(10, 3); \
SLu(11, 4); \
SLu(12, 5); \
SLu(13, 6); \
SLu(14, 0); \
SLu(15, 1); \
SLu(16, 2); \
SLu(17, 3); \
SLu(18, 4); \
SLu(19, 5); \
SLu(20, 6); \
SLu(21, 0); \
SLu(22, 1); \
SLu(23, 2); \
SLu(24, 3); \
SLu(25, 4); \
SLu(26, 5); \
SLu(27, 6); \
SLu(28, 0); \
SLu(29, 1); \
SLu(30, 2); \
SLu(31, 3); \
SLu(32, 4); \
SLu(33, 5); \
SLu(34, 6); \
SLu(35, 0); \
SLu(36, 1); \
SLu(37, 2); \
SLu(38, 3); \
SLu(39, 4); \
SLu(40, 5); \
SLu(41, 6); \
} while (0)
#endif