From 52580a563688c0df9834cfc4fac2a6e4c250e8d9 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Jan 2016 00:01:50 +0100 Subject: [PATCH] Add argon2 algo support --- rc.local | 1 + stratum/algos/ar2/argon2.c | 279 +++++++++++ stratum/algos/ar2/argon2.h | 292 +++++++++++ stratum/algos/ar2/bench.c | 111 +++++ stratum/algos/ar2/blake2/blake2-impl.h | 143 ++++++ stratum/algos/ar2/blake2/blake2.h | 76 +++ stratum/algos/ar2/blake2/blamka-round-opt.h | 162 ++++++ stratum/algos/ar2/blake2/blamka-round-ref.h | 39 ++ stratum/algos/ar2/blake2b.c | 305 ++++++++++++ stratum/algos/ar2/cores.c | 341 +++++++++++++ stratum/algos/ar2/cores.h | 220 +++++++++ stratum/algos/ar2/genkat.c | 182 +++++++ stratum/algos/ar2/genkat.h | 45 ++ stratum/algos/ar2/opt.c | 185 +++++++ stratum/algos/ar2/opt.h | 49 ++ stratum/algos/ar2/ref.c | 174 +++++++ stratum/algos/ar2/ref.h | 49 ++ stratum/algos/ar2/run.c | 223 +++++++++ stratum/algos/ar2/scrypt-jane.c | 249 ++++++++++ stratum/algos/ar2/scrypt-jane.h | 33 ++ stratum/algos/ar2/sj/scrypt-jane-hash.h | 38 ++ .../algos/ar2/sj/scrypt-jane-hash_skein512.h | 188 +++++++ .../algos/ar2/sj/scrypt-jane-mix_salsa-avx.h | 381 +++++++++++++++ .../algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h | 443 +++++++++++++++++ .../algos/ar2/sj/scrypt-jane-mix_salsa-xop.h | 317 ++++++++++++ stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h | 70 +++ .../ar2/sj/scrypt-jane-mix_salsa64-avx.h | 367 ++++++++++++++ .../ar2/sj/scrypt-jane-mix_salsa64-avx2.h | 221 +++++++++ .../ar2/sj/scrypt-jane-mix_salsa64-sse2.h | 449 +++++++++++++++++ .../ar2/sj/scrypt-jane-mix_salsa64-ssse3.h | 399 +++++++++++++++ .../ar2/sj/scrypt-jane-mix_salsa64-xop.h | 335 +++++++++++++ .../algos/ar2/sj/scrypt-jane-mix_salsa64.h | 41 ++ stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h | 112 +++++ .../algos/ar2/sj/scrypt-jane-portable-x86.h | 462 ++++++++++++++++++ stratum/algos/ar2/sj/scrypt-jane-portable.h | 307 ++++++++++++ .../algos/ar2/sj/scrypt-jane-romix-basic.h | 74 +++ .../algos/ar2/sj/scrypt-jane-romix-template.h | 122 +++++ stratum/algos/ar2/sj/scrypt-jane-romix.h | 25 + stratum/algos/ar2/sj/scrypt-jane-salsa.h | 134 +++++ stratum/algos/ar2/sj/scrypt-jane-salsa64.h | 183 +++++++ .../algos/ar2/sj/scrypt-jane-test-vectors.h | 39 ++ stratum/algos/ar2/thread.c | 36 ++ stratum/algos/ar2/thread.h | 46 ++ stratum/algos/argon2a.c | 76 +++ stratum/algos/argon2a.h | 16 + stratum/algos/makefile | 21 +- stratum/algos/sha256_Y.c | 0 stratum/algos/sha256_Y.h | 0 stratum/algos/sysendian.h | 0 stratum/algos/yescrypt-opt.c | 0 stratum/algos/yescrypt-platform.c | 0 stratum/config.sample/argon2.conf | 16 + stratum/stratum.cpp | 2 + stratum/stratum.h | 1 + web/yaamp/core/functions/yaamp.php | 10 +- 55 files changed, 8076 insertions(+), 13 deletions(-) create mode 100644 stratum/algos/ar2/argon2.c create mode 100644 stratum/algos/ar2/argon2.h create mode 100644 stratum/algos/ar2/bench.c create mode 100644 stratum/algos/ar2/blake2/blake2-impl.h create mode 100644 stratum/algos/ar2/blake2/blake2.h create mode 100644 stratum/algos/ar2/blake2/blamka-round-opt.h create mode 100644 stratum/algos/ar2/blake2/blamka-round-ref.h create mode 100644 stratum/algos/ar2/blake2b.c create mode 100644 stratum/algos/ar2/cores.c create mode 100644 stratum/algos/ar2/cores.h create mode 100644 stratum/algos/ar2/genkat.c create mode 100644 stratum/algos/ar2/genkat.h create mode 100644 stratum/algos/ar2/opt.c create mode 100644 stratum/algos/ar2/opt.h create mode 100644 stratum/algos/ar2/ref.c create mode 100644 stratum/algos/ar2/ref.h create mode 100644 
stratum/algos/ar2/run.c create mode 100644 stratum/algos/ar2/scrypt-jane.c create mode 100644 stratum/algos/ar2/scrypt-jane.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-hash.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-portable-x86.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-portable.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-romix-basic.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-romix-template.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-romix.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-salsa.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-salsa64.h create mode 100644 stratum/algos/ar2/sj/scrypt-jane-test-vectors.h create mode 100644 stratum/algos/ar2/thread.c create mode 100644 stratum/algos/ar2/thread.h create mode 100644 stratum/algos/argon2a.c create mode 100644 stratum/algos/argon2a.h mode change 100755 => 100644 stratum/algos/sha256_Y.c mode change 100755 => 100644 stratum/algos/sha256_Y.h mode change 100755 => 100644 stratum/algos/sysendian.h mode change 100755 => 100644 stratum/algos/yescrypt-opt.c mode change 100755 => 100644 stratum/algos/yescrypt-platform.c create mode 100644 stratum/config.sample/argon2.conf diff --git a/rc.local b/rc.local index 68bdc8e..2c5f3c7 100644 --- a/rc.local +++ b/rc.local @@ -48,4 +48,5 @@ screen -dmS zr5 $STRATUM_DIR/run.sh zr5 screen -dmS sib $STRATUM_DIR/run.sh sib screen -dmS m7m $STRATUM_DIR/run.sh m7m screen -dmS velvet $STRATUM_DIR/run.sh velvet +screen -dmS argon2 $STRATUM_DIR/run.sh argon2 diff --git a/stratum/algos/ar2/argon2.c b/stratum/algos/ar2/argon2.c new file mode 100644 index 0000000..58ef79f --- /dev/null +++ b/stratum/algos/ar2/argon2.c @@ -0,0 +1,279 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . 
+ */ + +#include +#include +#include +#include + +#include "argon2.h" +#include "cores.h" + +/* Error messages */ +static const char *Argon2_ErrorMessage[] = { + /*{ARGON2_OK, */ "OK", + /*}, + + {ARGON2_OUTPUT_PTR_NULL, */ "Output pointer is NULL", + /*}, + +{ARGON2_OUTPUT_TOO_SHORT, */ "Output is too short", + /*}, +{ARGON2_OUTPUT_TOO_LONG, */ "Output is too long", + /*}, + +{ARGON2_PWD_TOO_SHORT, */ "Password is too short", + /*}, +{ARGON2_PWD_TOO_LONG, */ "Password is too long", + /*}, + +{ARGON2_SALT_TOO_SHORT, */ "Salt is too short", + /*}, +{ARGON2_SALT_TOO_LONG, */ "Salt is too long", + /*}, + +{ARGON2_AD_TOO_SHORT, */ "Associated data is too short", + /*}, +{ARGON2_AD_TOO_LONG, */ "Associated date is too long", + /*}, + +{ARGON2_SECRET_TOO_SHORT, */ "Secret is too short", + /*}, +{ARGON2_SECRET_TOO_LONG, */ "Secret is too long", + /*}, + +{ARGON2_TIME_TOO_SMALL, */ "Time cost is too small", + /*}, +{ARGON2_TIME_TOO_LARGE, */ "Time cost is too large", + /*}, + +{ARGON2_MEMORY_TOO_LITTLE, */ "Memory cost is too small", + /*}, +{ARGON2_MEMORY_TOO_MUCH, */ "Memory cost is too large", + /*}, + +{ARGON2_LANES_TOO_FEW, */ "Too few lanes", + /*}, +{ARGON2_LANES_TOO_MANY, */ "Too many lanes", + /*}, + +{ARGON2_PWD_PTR_MISMATCH, */ "Password pointer is NULL, but password length is not 0", + /*}, +{ARGON2_SALT_PTR_MISMATCH, */ "Salt pointer is NULL, but salt length is not 0", + /*}, +{ARGON2_SECRET_PTR_MISMATCH, */ "Secret pointer is NULL, but secret length is not 0", + /*}, +{ARGON2_AD_PTR_MISMATCH, */ "Associated data pointer is NULL, but ad length is not 0", + /*}, + +{ARGON2_MEMORY_ALLOCATION_ERROR, */ "Memory allocation error", + /*}, + +{ARGON2_FREE_MEMORY_CBK_NULL, */ "The free memory callback is NULL", + /*}, +{ARGON2_ALLOCATE_MEMORY_CBK_NULL, */ "The allocate memory callback is NULL", + /*}, + +{ARGON2_INCORRECT_PARAMETER, */ "Argon2_Context context is NULL", + /*}, +{ARGON2_INCORRECT_TYPE, */ "There is no such version of Argon2", + /*}, + +{ARGON2_OUT_PTR_MISMATCH, */ "Output pointer mismatch", + /*}, + +{ARGON2_THREADS_TOO_FEW, */ "Not enough threads", + /*}, +{ARGON2_THREADS_TOO_MANY, */ "Too many threads", + /*}, +{ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/ +}; + +int argon2d(argon2_context *context) { return argon2_core(context, Argon2_d); } + +int argon2i(argon2_context *context) { return argon2_core(context, Argon2_i); } + +int verify_d(argon2_context *context, const char *hash) { + int result; + /*if (0 == context->outlen || NULL == hash) { + return ARGON2_OUT_PTR_MISMATCH; + }*/ + + result = argon2_core(context, Argon2_d); + + if (ARGON2_OK != result) { + return result; + } + + return 0 == memcmp(hash, context->out, 32); +} + +const char *error_message(int error_code) { + enum { + /* Make sure---at compile time---that the enum size matches the array + size */ + ERROR_STRING_CHECK = + 1 / + !!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) == + ARGON2_ERROR_CODES_LENGTH) + }; + if (error_code < ARGON2_ERROR_CODES_LENGTH) { + return Argon2_ErrorMessage[(argon2_error_codes)error_code]; + } + return "Unknown error code."; +} + +/* encoding/decoding helpers */ + +/* + * Some macros for constant-time comparisons. These work over values in + * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true". 
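+ *
+ * For example, GT(x, y) computes ((unsigned)(y) - (unsigned)(x)) >> 8: when
+ * x > y the subtraction wraps around, so the low byte of the shifted result
+ * is 0xFF, while for x <= y the difference fits in 8 bits and the shift
+ * yields 0. Thus GT(5, 3) == 0xFF, GT(3, 5) == 0x00, and EQ(x, y) is 0xFF
+ * exactly when x == y, all without data-dependent branches.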
+ */ +#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF) +#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF) +#define GE(x, y) (GT(y, x) ^ 0xFF) +#define LT(x, y) GT(y, x) +#define LE(x, y) GE(y, x) + +/* + * Convert value x (0..63) to corresponding Base64 character. + */ +static int b64_byte_to_char(unsigned x) { + return (LT(x, 26) & (x + 'A')) | + (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) | + (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') | + (EQ(x, 63) & '/'); +} + +/* + * Convert some bytes to Base64. 'dst_len' is the length (in characters) + * of the output buffer 'dst'; if that buffer is not large enough to + * receive the result (including the terminating 0), then (size_t)-1 + * is returned. Otherwise, the zero-terminated Base64 string is written + * in the buffer, and the output length (counted WITHOUT the terminating + * zero) is returned. + */ +static size_t to_base64(char *dst, size_t dst_len, const void *src) { + size_t olen; + const unsigned char *buf; + unsigned acc, acc_len; + + olen = 43; + /*switch (32 % 3) { + case 2: + olen++;*/ + /* fall through */ + /*case 1: + olen += 2; + break; + }*/ + if (dst_len <= olen) { + return (size_t)-1; + } + acc = 0; + acc_len = 0; + buf = (const unsigned char *)src; + size_t src_len = 32; + while (src_len-- > 0) { + acc = (acc << 8) + (*buf++); + acc_len += 8; + while (acc_len >= 6) { + acc_len -= 6; + *dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F); + } + } + if (acc_len > 0) { + *dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F); + } + *dst++ = 0; + return olen; +} + +/* ==================================================================== */ +/* + * Code specific to Argon2i. + * + * The code below applies the following format: + * + * $argon2i$m=,t=,p=[,keyid=][,data=][$[$]] + * + * where is a decimal integer (positive, fits in an 'unsigned long') + * and is Base64-encoded data (no '=' padding characters, no newline + * or whitespace). The "keyid" is a binary identifier for a key (up to 8 + * bytes); "data" is associated data (up to 32 bytes). When the 'keyid' + * (resp. the 'data') is empty, then it is ommitted from the output. + * + * The last two binary chunks (encoded in Base64) are, in that order, + * the salt and the output. Both are optional, but you cannot have an + * output without a salt. The binary salt length is between 8 and 48 bytes. + * The output length is always exactly 32 bytes. 
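+ *
+ * With the values hard-coded in encode_string() below (m=16, t=2, p=1) and
+ * to_base64() always consuming exactly 32 bytes (43 unpadded Base64
+ * characters), the produced string has the shape
+ *
+ *   $argon2i$m=16,t=2,p=1$<43 Base64 chars of salt>$<43 Base64 chars of hash>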
+ */ + +int encode_string(char *dst, size_t dst_len, argon2_context *ctx) { +#define SS(str) \ + do { \ + size_t pp_len = strlen(str); \ + if (pp_len >= dst_len) { \ + return 0; \ + } \ + memcpy(dst, str, pp_len + 1); \ + dst += pp_len; \ + dst_len -= pp_len; \ + } while (0) + +#define SX(x) \ + do { \ + char tmp[30]; \ + sprintf(tmp, "%lu", (unsigned long)(x)); \ + SS(tmp); \ + } while (0); + +#define SB(buf) \ + do { \ + size_t sb_len = to_base64(dst, dst_len, buf); \ + if (sb_len == (size_t)-1) { \ + return 0; \ + } \ + dst += sb_len; \ + dst_len -= sb_len; \ + } while (0); + + SS("$argon2i$m="); + SX(16); + SS(",t="); + SX(2); + SS(",p="); + SX(1); + + /*if (ctx->adlen > 0) { + SS(",data="); + SB(ctx->ad, ctx->adlen); + }*/ + + /*if (ctx->saltlen == 0) + return 1;*/ + + SS("$"); + SB(ctx->salt); + + /*if (ctx->outlen32 == 0) + return 1;*/ + + SS("$"); + SB(ctx->out); + return 1; + +#undef SS +#undef SX +#undef SB +} diff --git a/stratum/algos/ar2/argon2.h b/stratum/algos/ar2/argon2.h new file mode 100644 index 0000000..cecb2c7 --- /dev/null +++ b/stratum/algos/ar2/argon2.h @@ -0,0 +1,292 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . + */ +#ifndef ARGON2_H +#define ARGON2_H + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/*************************Argon2 input parameter + * restrictions**************************************************/ + +/* Minimum and maximum number of lanes (degree of parallelism) */ +#define ARGON2_MIN_LANES UINT32_C(1) +#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) + +/* Minimum and maximum number of threads */ +#define ARGON2_MIN_THREADS UINT32_C(1) +#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) + +/* Number of synchronization points between lanes per pass */ +#define ARGON2_SYNC_POINTS UINT32_C(4) + +/* Minimum and maximum digest size in bytes */ +#define ARGON2_MIN_OUTLEN UINT32_C(4) +#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ +#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ + +#define ARGON2_MIN(a, b) ((a) < (b) ? 
(a) : (b)) +/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB) + */ +#define ARGON2_MAX_MEMORY_BITS \ + ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) +#define ARGON2_MAX_MEMORY \ + ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) + +/* Minimum and maximum number of passes */ +#define ARGON2_MIN_TIME UINT32_C(1) +#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum password length in bytes */ +#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) +#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum associated data length in bytes */ +#define ARGON2_MIN_AD_LENGTH UINT32_C(0) +#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum salt length in bytes */ +#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) +#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum key length in bytes */ +#define ARGON2_MIN_SECRET UINT32_C(0) +#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) + +#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) +#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) +#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2) +#define ARGON2_DEFAULT_FLAGS \ + (ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY) + +/* Error codes */ +typedef enum Argon2_ErrorCodes { + ARGON2_OK = 0, + + ARGON2_OUTPUT_PTR_NULL = 1, + + ARGON2_OUTPUT_TOO_SHORT = 2, + ARGON2_OUTPUT_TOO_LONG = 3, + + ARGON2_PWD_TOO_SHORT = 4, + ARGON2_PWD_TOO_LONG = 5, + + ARGON2_SALT_TOO_SHORT = 6, + ARGON2_SALT_TOO_LONG = 7, + + ARGON2_AD_TOO_SHORT = 8, + ARGON2_AD_TOO_LONG = 9, + + ARGON2_SECRET_TOO_SHORT = 10, + ARGON2_SECRET_TOO_LONG = 11, + + ARGON2_TIME_TOO_SMALL = 12, + ARGON2_TIME_TOO_LARGE = 13, + + ARGON2_MEMORY_TOO_LITTLE = 14, + ARGON2_MEMORY_TOO_MUCH = 15, + + ARGON2_LANES_TOO_FEW = 16, + ARGON2_LANES_TOO_MANY = 17, + + ARGON2_PWD_PTR_MISMATCH = 18, /* NULL ptr with non-zero length */ + ARGON2_SALT_PTR_MISMATCH = 19, /* NULL ptr with non-zero length */ + ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */ + ARGON2_AD_PTR_MISMATCH = 21, /* NULL ptr with non-zero length */ + + ARGON2_MEMORY_ALLOCATION_ERROR = 22, + + ARGON2_FREE_MEMORY_CBK_NULL = 23, + ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24, + + ARGON2_INCORRECT_PARAMETER = 25, + ARGON2_INCORRECT_TYPE = 26, + + ARGON2_OUT_PTR_MISMATCH = 27, + + ARGON2_THREADS_TOO_FEW = 28, + ARGON2_THREADS_TOO_MANY = 29, + + ARGON2_MISSING_ARGS = 30, + + ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after + this + error code */ +} argon2_error_codes; + +/* Memory allocator types --- for external allocation */ +typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); +typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); + +/* Argon2 external data structures */ + +/* + *****Context: structure to hold Argon2 inputs: + * output array and its length, + * password and its length, + * salt and its length, + * secret and its length, + * associated data and its length, + * number of passes, amount of used memory (in KBytes, can be rounded up a bit) + * number of parallel threads that will be run. + * All the parameters above affect the output hash value. + * Additionally, two function pointers can be provided to allocate and + deallocate the memory (if NULL, memory will be allocated internally). 
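+ * A minimal usage sketch for the trimmed context kept in this port (the
+ * 32-byte buffer sizes follow the PWDLEN/SALTLEN and output length
+ * hard-coded in cores.c; unused callbacks are simply left NULL):
+ *
+ *   uint8_t out[32], pwd[32], salt[32];
+ *   argon2_context ctx = { .out = out, .pwd = pwd, .salt = salt, .pwdlen = 32 };
+ *   argon2d(&ctx);  /* wipes pwd as a side effect of initial_hash() */
+ *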
+ * Also, three flags indicate whether to erase password, secret as soon as they + are pre-hashed (and thus not needed anymore), and the entire memory + **************************** + Simplest situation: you have output array out[8], password is stored in + pwd[32], salt is stored in salt[16], you do not have keys nor associated data. + You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel + lanes. + You want to erase the password, but you're OK with last pass not being erased. + You want to use the default memory allocator. + */ +typedef struct Argon2_Context { + uint8_t *out; /* output array */ + uint8_t *pwd; /* password array */ + uint8_t *salt; /* salt array */ + /*uint8_t *secret;*/ /* key array */ + /*uint8_t *ad;*/ /* associated data array */ + + allocate_fptr allocate_cbk; /* pointer to memory allocator */ + deallocate_fptr free_cbk; /* pointer to memory deallocator */ + + /*uint32_t outlen;*/ /* digest length */ + uint32_t pwdlen; /* password length */ + /*uint32_t saltlen;*/ /* salt length */ + /*uint32_t secretlen;*/ /* key length */ + /*uint32_t adlen;*/ /* associated data length */ + /*uint32_t t_cost;*/ /* number of passes */ + /*uint32_t m_cost;*/ /* amount of memory requested (KB) */ + /*uint32_t lanes;*/ /* number of lanes */ + /*uint32_t threads;*/ /* maximum number of threads */ + /*uint32_t flags;*/ /* array of bool options */ + +} argon2_context; + +/** + * Function to hash the inputs in the memory-hard fashion (uses Argon2i) + * @param out Pointer to the memory where the hash digest will be written + * @param outlen Digest length in bytes + * @param in Pointer to the input (password) + * @param inlen Input length in bytes + * @param salt Pointer to the salt + * @param saltlen Salt length in bytes + * @pre @a out must have at least @a outlen bytes allocated + * @pre @a in must be at least @inlen bytes long + * @pre @a saltlen must be at least @saltlen bytes long + * @return Zero if successful, 1 otherwise. + */ +/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen, + const void *salt, size_t saltlen, unsigned int t_cost, + unsigned int m_cost);*/ + +/* same for argon2d */ +/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen, + const void *salt, size_t saltlen, unsigned int t_cost, + unsigned int m_cost);*/ + +/* + * **************Argon2d: Version of Argon2 that picks memory blocks depending + * on the password and salt. Only for side-channel-free + * environment!!*************** + * @param context Pointer to current Argon2 context + * @return Zero if successful, a non zero error code otherwise + */ +int argon2d(argon2_context *context); + +/* + * * **************Argon2i: Version of Argon2 that picks memory blocks + *independent on the password and salt. Good for side-channels, + ******************* but worse w.r.t. 
tradeoff attacks if + *******************only one pass is used*************** + * @param context Pointer to current Argon2 context + * @return Zero if successful, a non zero error code otherwise + */ +int argon2i(argon2_context *context); + +/* + * * **************Argon2di: Reserved name*************** + * @param context Pointer to current Argon2 context + * @return Zero if successful, a non zero error code otherwise + */ +int argon2di(argon2_context *context); + +/* + * * **************Argon2ds: Argon2d hardened against GPU attacks, 20% + * slower*************** + * @param context Pointer to current Argon2 context + * @return Zero if successful, a non zero error code otherwise + */ +int argon2ds(argon2_context *context); + +/* + * * **************Argon2id: First half-pass over memory is + *password-independent, the rest are password-dependent + ********************OK against side channels: they reduce to 1/2-pass + *Argon2i*************** + * @param context Pointer to current Argon2 context + * @return Zero if successful, a non zero error code otherwise + */ +int argon2id(argon2_context *context); + +/* + * Verify if a given password is correct for Argon2d hashing + * @param context Pointer to current Argon2 context + * @param hash The password hash to verify. The length of the hash is + * specified by the context outlen member + * @return Zero if successful, a non zero error code otherwise + */ +int verify_d(argon2_context *context, const char *hash); + +/* + * Get the associated error message for given error code + * @return The error message associated with the given error code + */ +const char *error_message(int error_code); + +/* ==================================================================== */ +/* + * Code specific to Argon2i. + * + * The code below applies the following format: + * + * $argon2i$m=,t=,p=[,keyid=][,data=][$[$]] + * + * where is a decimal integer (positive, fits in an 'unsigned long') + * and is Base64-encoded data (no '=' padding characters, no newline + * or whitespace). The "keyid" is a binary identifier for a key (up to 8 + * bytes); "data" is associated data (up to 32 bytes). When the 'keyid' + * (resp. the 'data') is empty, then it is ommitted from the output. + * + * The last two binary chunks (encoded in Base64) are, in that order, + * the salt and the output. Both are optional, but you cannot have an + * output without a salt. The binary salt length is between 8 and 48 bytes. + * The output length is always exactly 32 bytes. + */ + +int encode_string(char *dst, size_t dst_len, argon2_context *ctx); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/stratum/algos/ar2/bench.c b/stratum/algos/ar2/bench.c new file mode 100644 index 0000000..7a6edc5 --- /dev/null +++ b/stratum/algos/ar2/bench.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#ifdef _MSC_VER +#include +#endif + +#include "argon2.h" + +static uint64_t rdtsc(void) { +#ifdef _MSC_VER + return __rdtsc(); +#else +#if defined(__amd64__) || defined(__x86_64__) + uint64_t rax, rdx; + __asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :); + return (rdx << 32) | rax; +#elif defined(__i386__) || defined(__i386) || defined(__X86__) + uint64_t rax; + __asm__ __volatile__("rdtsc" : "=A"(rax) : :); + return rax; +#else +#error "Not implemented!" 
+#endif +#endif +} + +/* + * Benchmarks Argon2 with salt length 16, password length 16, t_cost 1, + and different m_cost and threads + */ +static void benchmark() { +#define BENCH_OUTLEN 16 +#define BENCH_INLEN 16 + const uint32_t inlen = BENCH_INLEN; + const unsigned outlen = BENCH_OUTLEN; + unsigned char out[BENCH_OUTLEN]; + unsigned char pwd_array[BENCH_INLEN]; + unsigned char salt_array[BENCH_INLEN]; +#undef BENCH_INLEN +#undef BENCH_OUTLEN + + uint32_t t_cost = 1; + uint32_t m_cost; + uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16}; + + memset(pwd_array, 0, inlen); + memset(salt_array, 1, inlen); + + for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) { + unsigned i; + for (i = 0; i < 6; ++i) { + argon2_context context; + uint32_t thread_n = thread_test[i]; + uint64_t stop_cycles, stop_cycles_i; + clock_t stop_time; + uint64_t delta_d, delta_i; + double mcycles_d, mcycles_i, run_time; + + clock_t start_time = clock(); + uint64_t start_cycles = rdtsc(); + + context.out = out; + context.outlen = outlen; + context.pwd = pwd_array; + context.pwdlen = inlen; + context.salt = salt_array; + context.saltlen = inlen; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = t_cost; + context.m_cost = m_cost; + context.lanes = thread_n; + context.threads = thread_n; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = 0; + + argon2d(&context); + stop_cycles = rdtsc(); + argon2i(&context); + stop_cycles_i = rdtsc(); + stop_time = clock(); + + delta_d = (stop_cycles - start_cycles) / (m_cost); + delta_i = (stop_cycles_i - stop_cycles) / (m_cost); + mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20); + mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20); + printf("Argon2d %d iterations %d MiB %d threads: %2.2f cpb %2.2f " + "Mcycles \n", + t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024, + mcycles_d); + printf("Argon2i %d iterations %d MiB %d threads: %2.2f cpb %2.2f " + "Mcycles \n", + t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024, + mcycles_i); + + run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC); + printf("%2.4f seconds\n\n", run_time); + } + } +} + +int main() { + benchmark(); + return ARGON2_OK; +} diff --git a/stratum/algos/ar2/blake2/blake2-impl.h b/stratum/algos/ar2/blake2/blake2-impl.h new file mode 100644 index 0000000..115a192 --- /dev/null +++ b/stratum/algos/ar2/blake2/blake2-impl.h @@ -0,0 +1,143 @@ +#ifndef PORTABLE_BLAKE2_IMPL_H +#define PORTABLE_BLAKE2_IMPL_H + +#include +#include + +#if defined(_MSC_VER) +#define BLAKE2_INLINE __inline +#elif defined(__GNUC__) || defined(__clang__) +#define BLAKE2_INLINE __inline__ +#else +#define BLAKE2_INLINE +#endif + +/* Argon2 Team - Begin Code */ +/* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. 
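+ *
+ * Either way the helpers below behave identically: load32(), for instance,
+ * always interprets its input as little-endian, so the byte sequence
+ * {0x01, 0x02, 0x03, 0x04} loads as 0x04030201 on any platform; the
+ * NATIVE_LITTLE_ENDIAN fast path merely replaces the shift/or sequence
+ * with a memcpy().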
+*/ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif +/* Argon2 Team - End Code */ + +static BLAKE2_INLINE uint32_t load32(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static BLAKE2_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static BLAKE2_INLINE void store32(void *dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static BLAKE2_INLINE void store64(void *dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static BLAKE2_INLINE uint64_t load48(const void *src) { + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +static BLAKE2_INLINE void store48(void *dst, uint64_t w) { + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +} + +static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { + return (w >> c) | (w << (32 - c)); +} + +static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { + return (w >> c) | (w << (64 - c)); +} + +/* prevents compiler optimizing out memset() */ +static BLAKE2_INLINE void burn(void *v, size_t n) { + static void *(*const volatile memset_v)(void *, int, size_t) = &memset; + memset_v(v, 0, n); +} + +#endif diff --git a/stratum/algos/ar2/blake2/blake2.h b/stratum/algos/ar2/blake2/blake2.h new file mode 100644 index 0000000..7d8d5eb --- /dev/null +++ b/stratum/algos/ar2/blake2/blake2.h @@ -0,0 +1,76 @@ +#ifndef PORTABLE_BLAKE2_H +#define PORTABLE_BLAKE2_H + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 +}; + +#pragma pack(push, 1) +typedef struct __blake2b_param { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; 
/* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint64_t node_offset; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ +} blake2b_param; +#pragma pack(pop) + +typedef struct __blake2b_state { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + unsigned buflen; + unsigned outlen; + uint8_t last_node; + uint8_t buf[BLAKE2B_BLOCKBYTES]; +} blake2b_state; + +/* Ensure param structs have not been wrongly padded */ +/* Poor man's static_assert */ +enum { + blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), + blake2_size_check_2 = + 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) +}; + +/* Streaming API */ +int blake2b_init(blake2b_state *S, size_t outlen); +int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, + size_t keylen); +int blake2b_init_param(blake2b_state *S, const blake2b_param *P); +int blake2b_update(blake2b_state *S, const void *in, size_t inlen); +void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen); +int blake2b_final(blake2b_state *S, void *out, size_t outlen); + +/* Simple API */ +int blake2b(void *out, const void *in, const void *key, size_t keylen); + +/* Argon2 Team - Begin Code */ +int blake2b_long(void *out, const void *in); +/* Argon2 Team - End Code */ +/* Miouyouyou */ +void blake2b_too(void *out, const void *in); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/stratum/algos/ar2/blake2/blamka-round-opt.h b/stratum/algos/ar2/blake2/blamka-round-opt.h new file mode 100644 index 0000000..690686d --- /dev/null +++ b/stratum/algos/ar2/blake2/blamka-round-opt.h @@ -0,0 +1,162 @@ +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#if defined(_MSC_VER) +#include +#endif + +#include +#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) +#include +#endif + +#if !defined(__XOP__) +#if defined(__SSSE3__) +#define r16 \ + (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) +#define r24 \ + (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) \ + ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ + : (-(c) == 24) \ + ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) \ + ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) \ + ? 
_mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_slli_epi64((x), 64 - (-(c)))) +#else /* defined(__SSE2__) */ +#define _mm_roti_epi64(r, c) \ + _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) +#endif +#else +#endif + +static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { + const __m128i z = _mm_mul_epu32(x, y); + return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); +} + +#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -32); \ + D1 = _mm_roti_epi64(D1, -32); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -24); \ + B1 = _mm_roti_epi64(B1, -24); \ + } while ((void)0, 0) + +#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -16); \ + D1 = _mm_roti_epi64(D1, -16); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -63); \ + B1 = _mm_roti_epi64(B1, -63); \ + } while ((void)0, 0) + +#if defined(__SSSE3__) +#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ + __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D1, D0, 8); \ + t1 = _mm_alignr_epi8(D0, D1, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) + +#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ + __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D0, D1, 8); \ + t1 = _mm_alignr_epi8(D1, D0, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) +#else /* SSE2 */ +#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = D0; \ + __m128i t1 = B0; \ + D0 = C0; \ + C0 = C1; \ + C1 = D0; \ + D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ + D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ + B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ + B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ + } while ((void)0, 0) + +#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + t0 = B0; \ + __m128i t1 = D0; \ + B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ + B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ + D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ + D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ + } while ((void)0, 0) +#endif + +#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + } while ((void)0, 0) + +#endif diff --git a/stratum/algos/ar2/blake2/blamka-round-ref.h b/stratum/algos/ar2/blake2/blamka-round-ref.h new file mode 100644 index 0000000..f497e10 --- /dev/null +++ 
b/stratum/algos/ar2/blake2/blamka-round-ref.h @@ -0,0 +1,39 @@ +#ifndef BLAKE_ROUND_MKA_H +#define BLAKE_ROUND_MKA_H + +#include "blake2.h" +#include "blake2-impl.h" + +/*designed by the Lyra PHC team */ +static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { + const uint64_t m = UINT64_C(0xFFFFFFFF); + const uint64_t xy = (x & m) * (y & m); + return x + y + 2 * xy; +} + +#define G(a, b, c, d) \ + do { \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 32); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 24); \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 16); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \ + v12, v13, v14, v15) \ + do { \ + G(v0, v4, v8, v12); \ + G(v1, v5, v9, v13); \ + G(v2, v6, v10, v14); \ + G(v3, v7, v11, v15); \ + G(v0, v5, v10, v15); \ + G(v1, v6, v11, v12); \ + G(v2, v7, v8, v13); \ + G(v3, v4, v9, v14); \ + } while ((void)0, 0) + +#endif diff --git a/stratum/algos/ar2/blake2b.c b/stratum/algos/ar2/blake2b.c new file mode 100644 index 0000000..be691b1 --- /dev/null +++ b/stratum/algos/ar2/blake2b.c @@ -0,0 +1,305 @@ +#include +#include +#include +#include + +#include "blake2/blake2.h" +#include "blake2/blake2-impl.h" + +static const uint64_t blake2b_IV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) +}; + +static const unsigned int blake2b_sigma[12][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, +}; + +static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { + S->f[1] = (uint64_t)-1; +} + +static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { + if (S->last_node) { + blake2b_set_lastnode(S); + } + S->f[0] = (uint64_t)-1; +} + +static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, + uint64_t inc) { + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +} + +static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { + burn(S, sizeof(*S)); /* wipe */ + blake2b_set_lastblock(S); /* invalidate for further use */ +} + +static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { + memset(S, 0, sizeof(*S)); + memcpy(S->h, blake2b_IV, sizeof(S->h)); +} + + +/*void print_state(blake2b_state BlakeHash) { + printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" + "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" + "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" + "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n" + ".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n" + ".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n", + BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3], + BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7], + 
BlakeHash.t[0], BlakeHash.t[1], + BlakeHash.f[0], BlakeHash.f[1]); + printf(".buf = {"); + for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++) + printf("%" PRIu8 ", ", BlakeHash.buf[i]); + puts("\n"); + printf("}\n.buflen = %d\n.outlen = %d\n", + BlakeHash.buflen, BlakeHash.outlen); + printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node); + fflush(stdout); +}*/ + +static const blake2b_state miou = { + .h = { + UINT64_C(7640891576939301128), UINT64_C(13503953896175478587), + UINT64_C(4354685564936845355), UINT64_C(11912009170470909681), + UINT64_C(5840696475078001361), UINT64_C(11170449401992604703), + UINT64_C(2270897969802886507), UINT64_C(6620516959819538809) + }, + .t = {UINT64_C(0), UINT64_C(0)}, + .f = {UINT64_C(0), UINT64_C(0)}, + .buf = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + .buflen = 0, + .outlen = 64, + .last_node = 0 +}; + + +int blake2b_init_param(blake2b_state *S, const blake2b_param *P) { + const unsigned char *p = (const unsigned char *)P; + unsigned int i; + + if (NULL == P || NULL == S) { + return -1; + } + + blake2b_init0(S); + + /* IV XOR Parameter Block */ + for (i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P->digest_length; + return 0; +} + +void compare_buffs(uint64_t *h, size_t outlen) +{ + // printf("CMP : %d", memcmp(h, miou.h, 8*(sizeof(uint64_t)))); + printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %ld\n", miou.h[0], h[0], outlen); + fflush(stdout); +} + +/* Sequential blake2b initialization */ +int blake2b_init(blake2b_state *S, size_t outlen) { + memcpy(S, &miou, sizeof(*S)); + S->h[0] += outlen; + return 0; +} + + +void print64(const char *name, const uint64_t *array, uint16_t size) { + printf("%s = {", name); + for (uint8_t i = 0; i < size; i++) printf("UINT64_C(%" PRIu64 "), ", array[i]); + printf("};\n"); +} +int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, + size_t keylen) { + return 0; +} + +static void blake2b_compress(blake2b_state *S, const uint8_t *block) { + uint64_t m[16]; + uint64_t v[16]; + unsigned int i, r; + + for (i = 0; i < 16; ++i) { + m[i] = load64(block + i * 8); + } + + for (i = 0; i < 8; ++i) { + v[i] = S->h[i]; + } + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ S->t[0]; + v[13] = blake2b_IV[5]/* ^ S->t[1]*/; + v[14] = blake2b_IV[6] ^ S->f[0]; + v[15] = blake2b_IV[7]/* ^ S->f[1]*/; + +#define G(r, i, a, b, c, d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define ROUND(r) \ + do { \ + G(r, 0, v[0], v[4], v[8], v[12]); \ + G(r, 1, v[1], v[5], v[9], v[13]); \ + G(r, 2, v[2], v[6], v[10], v[14]); \ + G(r, 3, v[3], v[7], v[11], v[15]); \ + G(r, 4, v[0], v[5], v[10], v[15]); \ + G(r, 5, v[1], v[6], v[11], v[12]); \ + G(r, 6, v[2], v[7], v[8], v[13]); \ + G(r, 7, v[3], v[4], v[9], v[14]); \ + } while ((void)0, 0) + + for (r = 0; r < 12; ++r) ROUND(r); + + for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i 
+ 8]; + +#undef G +#undef ROUND +} + +int blake2b_update(blake2b_state *S, const void *in, size_t inlen) { + const uint8_t *pin = (const uint8_t *)in; + /* Complete current block */ + memcpy(&S->buf[4], pin, 124); + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen = 0; + pin += 124; + + register int8_t i = 7; + /* Avoid buffer copies when possible */ + while (i--) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, pin); + pin += BLAKE2B_BLOCKBYTES; + } + memcpy(&S->buf[S->buflen], pin, 4); + S->buflen += 4; + return 0; +} + +void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen) { + + memcpy(&S->buf[S->buflen], in, inlen); + S->buflen += (unsigned int)inlen; +} + +int blake2b_final(blake2b_state *S, void *out, size_t outlen) { + uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; + unsigned int i; + + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); + } + + memcpy(out, buffer, S->outlen); + + burn(buffer, sizeof(buffer)); + burn(S->buf, sizeof(S->buf)); + burn(S->h, sizeof(S->h)); + return 0; +} + +int blake2b(void *out, const void *in, const void *key, size_t keylen) +{ + blake2b_state S; + + blake2b_init(&S, 64); + my_blake2b_update(&S, in, 64); + blake2b_final(&S, out, 64); + burn(&S, sizeof(S)); + return 0; +} + +void blake2b_too(void *pout, const void *in) +{ + uint8_t *out = (uint8_t *)pout; + uint8_t out_buffer[64]; + uint8_t in_buffer[64]; + + blake2b_state blake_state; + blake2b_init(&blake_state, 64); + blake_state.buflen = blake_state.buf[1] = 4; + my_blake2b_update(&blake_state, in, 72); + blake2b_final(&blake_state, out_buffer, 64); + memcpy(out, out_buffer, 32); + out += 32; + + register uint8_t i = 29; + while (i--) { + memcpy(in_buffer, out_buffer, 64); + blake2b(out_buffer, in_buffer, NULL, 0); + memcpy(out, out_buffer, 32); + out += 32; + } + + memcpy(in_buffer, out_buffer, 64); + blake2b(out_buffer, in_buffer, NULL, 0); + memcpy(out, out_buffer, 64); + + burn(&blake_state, sizeof(blake_state)); +} + +/* Argon2 Team - Begin Code */ +int blake2b_long(void *pout, const void *in) +{ + uint8_t *out = (uint8_t *)pout; + blake2b_state blake_state; + uint8_t outlen_bytes[sizeof(uint32_t)] = {0}; + + store32(outlen_bytes, 32); + + blake2b_init(&blake_state, 32); + my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)); + blake2b_update(&blake_state, in, 1024); + blake2b_final(&blake_state, out, 32); + burn(&blake_state, sizeof(blake_state)); + return 0; + +} +/* Argon2 Team - End Code */ diff --git a/stratum/algos/ar2/cores.c b/stratum/algos/ar2/cores.c new file mode 100644 index 0000000..f4b2878 --- /dev/null +++ b/stratum/algos/ar2/cores.c @@ -0,0 +1,341 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . 
+ */ + +/*For memory wiping*/ +#ifdef _MSC_VER +#include +#include /* For SecureZeroMemory */ +#endif +#if defined __STDC_LIB_EXT1__ +#define __STDC_WANT_LIB_EXT1__ 1 +#endif +#define VC_GE_2005(version) (version >= 1400) + +#include +#include +#include +#include + +#include "argon2.h" +#include "cores.h" +#include "blake2/blake2.h" +#include "blake2/blake2-impl.h" + +#ifdef GENKAT +#include "genkat.h" +#endif + +#if defined(__clang__) +#if __has_attribute(optnone) +#define NOT_OPTIMIZED __attribute__((optnone)) +#endif +#elif defined(__GNUC__) +#define GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 40400 +#define NOT_OPTIMIZED __attribute__((optimize("O0"))) +#endif +#endif +#ifndef NOT_OPTIMIZED +#define NOT_OPTIMIZED +#endif + +/***************Instance and Position constructors**********/ +void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } + +void copy_block(block *dst, const block *src) { + memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK); +} + +void xor_block(block *dst, const block *src) { + int i; + for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { + dst->v[i] ^= src->v[i]; + } +} + +static void load_block(block *dst, const void *input) { + unsigned i; + for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { + dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); + } +} + +static void store_block(void *output, const block *src) { + unsigned i; + for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { + store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); + } +} + +/***************Memory allocators*****************/ +int allocate_memory(block **memory, uint32_t m_cost) { + if (memory != NULL) { + size_t memory_size = sizeof(block) * m_cost; + if (m_cost != 0 && + memory_size / m_cost != + sizeof(block)) { /*1. Check for multiplication overflow*/ + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + *memory = (block *)malloc(memory_size); /*2. 
Try to allocate*/ + + if (!*memory) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + return ARGON2_OK; + } else { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } +} + +void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); } + +/*********Memory functions*/ + +void clear_memory(argon2_instance_t *instance, int clear) { + if (instance->memory != NULL && clear) { + secure_wipe_memory(instance->memory, + sizeof(block) * /*instance->memory_blocks*/16); + } +} + +void free_memory(block *memory) { free(memory); } + +void finalize(const argon2_context *context, argon2_instance_t *instance) { + if (context != NULL && instance != NULL) { + block blockhash; + copy_block(&blockhash, instance->memory + 15); + + /* Hash the result */ + { + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + store_block(blockhash_bytes, &blockhash); + blake2b_long(context->out, blockhash_bytes); + secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE); + secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */ + } + +#ifdef GENKAT + print_tag(context->out, context->outlen); +#endif + + /* Clear memory */ + clear_memory(instance, 1); + + free_memory(instance->memory); + } +} + +uint32_t index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane) { + /* + * Pass 0: + * This lane : all already finished segments plus already constructed + * blocks in this segment + * Other lanes : all already finished segments + * Pass 1+: + * This lane : (SYNC_POINTS - 1) last segments plus already constructed + * blocks in this segment + * Other lanes : (SYNC_POINTS - 1) last segments + */ + uint32_t reference_area_size; + uint64_t relative_position; + uint32_t start_position, absolute_position; + + if (0 == position->pass) { + /* First pass */ + if (0 == position->slice) { + /* First slice */ + reference_area_size = + position->index - 1; /* all but the previous */ + } else { + if (same_lane) { + /* The same lane => add current segment */ + reference_area_size = + position->slice * 4 + + position->index - 1; + } else { + reference_area_size = + position->slice * 4 + + ((position->index == 0) ? (-1) : 0); + } + } + } else { + /* Second pass */ + if (same_lane) {reference_area_size = 11 + position->index;} + else {reference_area_size = 12 - (position->index == 0);} + } + + /* 1.2.4. Mapping pseudo_rand to 0.. and produce + * relative position */ + relative_position = pseudo_rand; + relative_position = relative_position * relative_position >> 32; + relative_position = reference_area_size - 1 - + (reference_area_size * relative_position >> 32); + + /* 1.2.5 Computing starting position */ + start_position = 0; + + if (0 != position->pass) { + start_position = (position->slice == ARGON2_SYNC_POINTS - 1) + ? 0 : (position->slice + 1) * 4; + } + + /* 1.2.6. 
Computing absolute position */ + absolute_position = (start_position + relative_position) % 16; + return absolute_position; +} + +void fill_memory_blocks(argon2_instance_t *instance) { + uint32_t r, s; + + for (r = 0; r < 2; ++r) { + for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { + + argon2_position_t position; + position.pass = r; + position.lane = 0; + position.slice = (uint8_t)s; + position.index = 0; + fill_segment(instance, position); + } + +#ifdef GENKAT + internal_kat(instance, r); /* Print all memory blocks */ +#endif + } +} + +void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { + /* Make the first and second block in each lane as G(H0||i||0) or + G(H0||i||1) */ + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0); + blake2b_too(blockhash_bytes, blockhash); + load_block(&instance->memory[0], blockhash_bytes); + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); + blake2b_too(blockhash_bytes, blockhash); + load_block(&instance->memory[1], blockhash_bytes); + secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); +} + + +static const blake2b_state base_hash = { + .h = { + UINT64_C(7640891576939301192), UINT64_C(13503953896175478587), + UINT64_C(4354685564936845355), UINT64_C(11912009170470909681), + UINT64_C(5840696475078001361), UINT64_C(11170449401992604703), + UINT64_C(2270897969802886507), UINT64_C(6620516959819538809) + }, + .t = {UINT64_C(0),UINT64_C(0)}, + .f = {UINT64_C(0),UINT64_C(0)}, + .buf = { + 1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0, + 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + .buflen = 28, + .outlen = 64, + .last_node = 0 +}; + +#define PWDLEN 32 +#define SALTLEN 32 +#define SECRETLEN 0 +#define ADLEN 0 +void initial_hash(uint8_t *blockhash, argon2_context *context, + argon2_type type) { + + uint8_t value[sizeof(uint32_t)]; + + /* Is it generating cache invalidation between cores ? */ + blake2b_state BlakeHash = base_hash; + BlakeHash.buf[20] = (uint8_t) type; + my_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, + PWDLEN); + + + secure_wipe_memory(context->pwd, PWDLEN); + context->pwdlen = 0; + + store32(&value, SALTLEN); + my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + my_blake2b_update(&BlakeHash, (const uint8_t *)context->salt, + SALTLEN); + + store32(&value, SECRETLEN); + my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, ADLEN); + my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); +} + +int initialize(argon2_instance_t *instance, argon2_context *context) { + /* 1. Memory allocation */ + + + allocate_memory(&(instance->memory), 16); + + /* 2. 
Initial hashing */ + /* H_0 + 8 extra bytes to produce the first blocks */ + /* Hashing all inputs */ + uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; + initial_hash(blockhash, context, instance->type); + /* Zeroing 8 extra bytes */ + secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, + ARGON2_PREHASH_SEED_LENGTH - + ARGON2_PREHASH_DIGEST_LENGTH); + +#ifdef GENKAT + initial_kat(blockhash, context, instance->type); +#endif + + /* 3. Creating first blocks, we always have at least two blocks in a slice + */ + fill_first_blocks(blockhash, instance); + /* Clearing the hash */ + secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); + + return ARGON2_OK; +} + +int argon2_core(argon2_context *context, argon2_type type) { + argon2_instance_t instance; + instance.memory = NULL; + instance.type = type; + + /* 3. Initialization: Hashing inputs, allocating memory, filling first + * blocks + */ + + int result = initialize(&instance, context); + if (ARGON2_OK != result) return result; + + /* 4. Filling memory */ + fill_memory_blocks(&instance); + + /* 5. Finalization */ + finalize(context, &instance); + + return ARGON2_OK; +} diff --git a/stratum/algos/ar2/cores.h b/stratum/algos/ar2/cores.h new file mode 100644 index 0000000..8cf5dda --- /dev/null +++ b/stratum/algos/ar2/cores.h @@ -0,0 +1,220 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . + */ + +#ifndef ARGON2_CORES_H +#define ARGON2_CORES_H + +#if defined(_MSC_VER) +#define ALIGN(n) __declspec(align(16)) +#elif defined(__GNUC__) || defined(__clang) +#define ALIGN(x) __attribute__((__aligned__(x))) +#else +#define ALIGN(x) +#endif + +/*************************Argon2 internal + * constants**************************************************/ + +enum argon2_core_constants { + /* Version of the algorithm */ + ARGON2_VERSION_NUMBER = 0x10, + + /* Memory block size in bytes */ + ARGON2_BLOCK_SIZE = 1024, + ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, + ARGON2_QWORDS_IN_BLOCK = 64, + + /* Number of pseudo-random values generated by one call to Blake in Argon2i + to + generate reference block positions */ + ARGON2_ADDRESSES_IN_BLOCK = 128, + + /* Pre-hashing digest length and its extension*/ + ARGON2_PREHASH_DIGEST_LENGTH = 64, + ARGON2_PREHASH_SEED_LENGTH = 72 +}; + +/* Argon2 primitive type */ +typedef enum Argon2_type { Argon2_d = 0, Argon2_i = 1 } argon2_type; + +/*************************Argon2 internal data + * types**************************************************/ + +/* + * Structure for the (1KB) memory block implemented as 128 64-bit words. + * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no + * bounds checking). + */ +typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } __attribute__ ((aligned (16))) block; + +/*****************Functions that work with the block******************/ + +/* Initialize each byte of the block with @in */ +void init_block_value(block *b, uint8_t in); + +/* Copy block @src to block @dst */ +void copy_block(block *dst, const block *src); + +/* XOR @src onto @dst bytewise */ +void xor_block(block *dst, const block *src); + +/* + * Argon2 instance: memory pointer, number of passes, amount of memory, type, + * and derived values. 
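+ * In this stripped-down port those values are fixed rather than carried in
+ * the struct: initialize() always allocates 16 blocks and
+ * fill_memory_blocks() always runs 2 passes over a single lane (see cores.c).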
+ * Used to evaluate the number and location of blocks to construct in each + * thread + */ +typedef struct Argon2_instance_t { + block *memory; /* Memory pointer */ + argon2_type type; + int print_internals; /* whether to print the memory blocks */ +} argon2_instance_t; + +/* + * Argon2 position: where we construct the block right now. Used to distribute + * work between threads. + */ +typedef struct Argon2_position_t { + uint32_t pass; + uint32_t lane; + uint8_t slice; + uint32_t index; +} argon2_position_t; + +/*Struct that holds the inputs for thread handling FillSegment*/ +typedef struct Argon2_thread_data { + argon2_instance_t *instance_ptr; + argon2_position_t pos; +} argon2_thread_data; + +/*************************Argon2 core + * functions**************************************************/ + +/* Allocates memory to the given pointer + * @param memory pointer to the pointer to the memory + * @param m_cost number of blocks to allocate in the memory + * @return ARGON2_OK if @memory is a valid pointer and memory is allocated + */ +int allocate_memory(block **memory, uint32_t m_cost); + +/* Function that securely cleans the memory + * @param mem Pointer to the memory + * @param s Memory size in bytes + */ +void secure_wipe_memory(void *v, size_t n); + +/* Clears memory + * @param instance pointer to the current instance + * @param clear_memory indicates if we clear the memory with zeros. + */ +void clear_memory(argon2_instance_t *instance, int clear); + +/* Deallocates memory + * @param memory pointer to the blocks + */ +void free_memory(block *memory); + +/* + * Computes absolute position of reference block in the lane following a skewed + * distribution and using a pseudo-random value as input + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rand 32-bit pseudo-random value used to determine the position + * @param same_lane Indicates if the block will be taken from the current lane. + * If so we can reference the current segment + * @pre All pointers must be valid + */ +uint32_t index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane); + +/* + * Function that validates all inputs against predefined restrictions and return + * an error code + * @param context Pointer to current Argon2 context + * @return ARGON2_OK if everything is all right, otherwise one of error codes + * (all defined in + */ +int validate_inputs(const argon2_context *context); + +/* + * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears + * password and secret if needed + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param blockhash Buffer for pre-hashing digest + * @param type Argon2 type + * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes + * allocated + */ +void initial_hash(uint8_t *blockhash, argon2_context *context, + argon2_type type); + +/* + * Function creates first 2 blocks per lane + * @param instance Pointer to the current instance + * @param blockhash Pointer to the pre-hashing digest + * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values + */ +void fill_firsts_blocks(uint8_t *blockhash, const argon2_instance_t *instance); + +/* + * Function allocates memory, hashes the inputs with Blake, and creates first + * two blocks. 
Returns the pointer to the main memory with 2 blocks per lane + * initialized + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param instance Current Argon2 instance + * @return Zero if successful, -1 if memory failed to allocate. @context->state + * will be modified if successful. + */ +int initialize(argon2_instance_t *instance, argon2_context *context); + +/* + * XORing the last block of each lane, hashing it, making the tag. Deallocates + * the memory. + * @param context Pointer to current Argon2 context (use only the out parameters + * from it) + * @param instance Pointer to current instance of Argon2 + * @pre instance->state must point to necessary amount of memory + * @pre context->out must point to outlen bytes of memory + * @pre if context->free_cbk is not NULL, it should point to a function that + * deallocates memory + */ +void finalize(const argon2_context *context, argon2_instance_t *instance); + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void fill_segment(const argon2_instance_t *instance, + argon2_position_t position); + +/* + * Function that fills the entire memory t_cost times based on the first two + * blocks in each lane + * @param instance Pointer to the current instance + */ +void fill_memory_blocks(argon2_instance_t *instance); + +/* + * Function that performs memory-hard hashing with certain degree of parallelism + * @param context Pointer to the Argon2 internal structure + * @return Error code if smth is wrong, ARGON2_OK otherwise + */ +int argon2_core(argon2_context *context, argon2_type type); + +#endif diff --git a/stratum/algos/ar2/genkat.c b/stratum/algos/ar2/genkat.c new file mode 100644 index 0000000..7aa482d --- /dev/null +++ b/stratum/algos/ar2/genkat.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include + +#include "argon2.h" +#include "cores.h" + +void initial_kat(const uint8_t *blockhash, const argon2_context *context, + argon2_type type) { + unsigned i; + + if (blockhash != NULL && context != NULL) { + printf("======================================="); + + switch (type) { + case Argon2_d: + printf("Argon2d\n"); + break; + + case Argon2_i: + printf("Argon2i\n"); + break; + + default: + break; + } + + printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag " + "length: %u bytes\n", + context->m_cost, context->t_cost, context->lanes, + context->outlen); + + printf("Password[%u]: ", context->pwdlen); + + if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { + printf("CLEARED\n"); + } else { + for (i = 0; i < context->pwdlen; ++i) { + printf("%2.2x ", ((unsigned char *)context->pwd)[i]); + } + + printf("\n"); + } + + printf("Salt[%u]: ", context->saltlen); + + for (i = 0; i < context->saltlen; ++i) { + printf("%2.2x ", ((unsigned char *)context->salt)[i]); + } + + printf("\n"); + + printf("Secret[%u]: ", context->secretlen); + + if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { + printf("CLEARED\n"); + } else { + for (i = 0; i < context->secretlen; ++i) { + printf("%2.2x ", ((unsigned char *)context->secret)[i]); + } + + printf("\n"); + } + + printf("Associated data[%u]: ", context->adlen); + + for (i = 0; i < context->adlen; ++i) { + printf("%2.2x ", ((unsigned char *)context->ad)[i]); + } + + printf("\n"); + + printf("Pre-hashing digest: "); + + for (i = 0; i < 
ARGON2_PREHASH_DIGEST_LENGTH; ++i) { + printf("%2.2x ", ((unsigned char *)blockhash)[i]); + } + + printf("\n"); + } +} + +void print_tag(const void *out, uint32_t outlen) { + unsigned i; + if (out != NULL) { + printf("Tag: "); + + for (i = 0; i < outlen; ++i) { + printf("%2.2x ", ((uint8_t *)out)[i]); + } + + printf("\n"); + } +} + +void internal_kat(const argon2_instance_t *instance, uint32_t pass) { + + if (instance != NULL) { + uint32_t i, j; + printf("\n After pass %u:\n", pass); + + for (i = 0; i < instance->memory_blocks; ++i) { + uint32_t how_many_words = + (instance->memory_blocks > ARGON2_WORDS_IN_BLOCK) + ? 1 + : ARGON2_WORDS_IN_BLOCK; + + for (j = 0; j < how_many_words; ++j) + printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j, + instance->memory[i].v[j]); + } + } +} + +static void fatal(const char *error) { + fprintf(stderr, "Error: %s\n", error); + exit(1); +} + +static void generate_testvectors(const char *type) { +#define TEST_OUTLEN 32 +#define TEST_PWDLEN 32 +#define TEST_SALTLEN 16 +#define TEST_SECRETLEN 8 +#define TEST_ADLEN 12 + argon2_context context; + + unsigned char out[TEST_OUTLEN]; + unsigned char pwd[TEST_PWDLEN]; + unsigned char salt[TEST_SALTLEN]; + unsigned char secret[TEST_SECRETLEN]; + unsigned char ad[TEST_ADLEN]; + const allocate_fptr myown_allocator = NULL; + const deallocate_fptr myown_deallocator = NULL; + + unsigned t_cost = 3; + unsigned m_cost = 16; + unsigned lanes = 4; + + memset(pwd, 1, TEST_OUTLEN); + memset(salt, 2, TEST_SALTLEN); + memset(secret, 3, TEST_SECRETLEN); + memset(ad, 4, TEST_ADLEN); + + context.out = out; + context.outlen = TEST_OUTLEN; + context.pwd = pwd; + context.pwdlen = TEST_PWDLEN; + context.salt = salt; + context.saltlen = TEST_SALTLEN; + context.secret = secret; + context.secretlen = TEST_SECRETLEN; + context.ad = ad; + context.adlen = TEST_ADLEN; + context.t_cost = t_cost; + context.m_cost = m_cost; + context.lanes = lanes; + context.threads = lanes; + context.allocate_cbk = myown_allocator; + context.free_cbk = myown_deallocator; + context.flags = 0; + +#undef TEST_OUTLEN +#undef TEST_PWDLEN +#undef TEST_SALTLEN +#undef TEST_SECRETLEN +#undef TEST_ADLEN + + if (!strcmp(type, "d")) { + argon2d(&context); + } else if (!strcmp(type, "i")) { + argon2i(&context); + } else + fatal("wrong Argon2 type"); +} + +int main(int argc, char *argv[]) { + const char *type = (argc > 1) ? argv[1] : "i"; + generate_testvectors(type); + return ARGON2_OK; +} diff --git a/stratum/algos/ar2/genkat.h b/stratum/algos/ar2/genkat.h new file mode 100644 index 0000000..9c776bf --- /dev/null +++ b/stratum/algos/ar2/genkat.h @@ -0,0 +1,45 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . 
+ */ + +#ifndef ARGON2_KAT_H +#define ARGON2_KAT_H + +/* + * Initial KAT function that prints the inputs to the file + * @param blockhash Array that contains pre-hashing digest + * @param context Holds inputs + * @param type Argon2 type + * @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes + * @pre context member pointers must point to allocated memory of size according + * to the length values + */ +void initial_kat(const uint8_t *blockhash, const argon2_context *context, + argon2_type type); + +/* + * Function that prints the output tag + * @param out output array pointer + * @param outlen digest length + * @pre out must point to @a outlen bytes + **/ +void print_tag(const void *out, uint32_t outlen); + +/* + * Function that prints the internal state at given moment + * @param instance pointer to the current instance + * @param pass current pass number + * @pre instance must have necessary memory allocated + **/ +void internal_kat(const argon2_instance_t *instance, uint32_t pass); + +#endif diff --git a/stratum/algos/ar2/opt.c b/stratum/algos/ar2/opt.c new file mode 100644 index 0000000..755a894 --- /dev/null +++ b/stratum/algos/ar2/opt.c @@ -0,0 +1,185 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . + */ + +#include +#include +#include +#include +#include + +#include + +#include "argon2.h" +#include "cores.h" +#include "opt.h" + +#include "blake2/blake2.h" +#include "blake2/blamka-round-opt.h" + +void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block) +{ + __m128i block_XY[ARGON2_QWORDS_IN_BLOCK] __attribute__ ((aligned (16))); + uint32_t i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm_xor_si128( + state[i], _mm_load_si128(&ref_block[i])); + } + + BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]); + BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]); + BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]); + BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]); + BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]); + BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]); + BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]); + BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]); + /*for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], + state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], + state[8 * i + 6], state[8 * i + 7]); + }*/ + + BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]); + BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]); + BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]); + BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]); + BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]); + 
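/*
 * (Editor's note, not part of the original patch.) The eight BLAKE2_ROUND
 * calls on consecutive registers above correspond to ref.c's first loop
 * ("Apply Blake2 on columns of 64-bit words"); the strided rounds started
 * just above and finished by the three calls below correspond to ref.c's
 * second, row-wise loop. Each __m128i here packs two of the 128 64-bit
 * words of a block, which is why eight registers cover one row or column.
 */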
BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]); + BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]); + BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]); + /*for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], + state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], + state[8 * 6 + i], state[8 * 7 + i]); + }*/ + + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128(state[i], block_XY[i]); + _mm_storeu_si128(&next_block[i], state[i]); + } +} + +static const uint64_t bad_rands[32] = { + UINT64_C(17023632018251376180), UINT64_C(4911461131397773491), + UINT64_C(15927076453364631751), UINT64_C(7860239898779391109), + + UINT64_C(11820267568857244377), UINT64_C(12188179869468676617), + UINT64_C(3732913385414474778), UINT64_C(7651458777762572084), + + UINT64_C(3062274162574341415), UINT64_C(17922653540258786897), + UINT64_C(17393848266100524980), UINT64_C(8539695715554563839), + + UINT64_C(13824538050656654359), UINT64_C(12078939433126460936), + UINT64_C(15331979418564540430), UINT64_C(12058346794217174273), + + UINT64_C(13593922096015221049), UINT64_C(18356682276374416500), + UINT64_C(4968040514092703824), UINT64_C(11202790346130235567), + + UINT64_C(2276229735041314644), UINT64_C(220837743321691382), + UINT64_C(4861211596230784273), UINT64_C(6330592584132590331), + + UINT64_C(3515580430960296763), UINT64_C(9869356316971855173), + UINT64_C(485533243489193056), UINT64_C(14596447761048148032), + + UINT64_C(16531790085730132900), UINT64_C(17328824500878824371), + UINT64_C(8548260058287621283), UINT64_C(8641748798041936364) +}; + +void generate_addresses(const argon2_instance_t *instance, + const argon2_position_t *position, + uint64_t *pseudo_rands) +{ + uint8_t offset = position->pass * 16 + position->slice * 4; + pseudo_rands[0] = bad_rands[offset++]; + pseudo_rands[1] = bad_rands[offset++]; + pseudo_rands[2] = bad_rands[offset++]; + pseudo_rands[3] = bad_rands[offset++]; + + /*if ((position->pass == 1 && position->slice == 3)) + print64("pseudo_rands", pseudo_rands, 4);*/ +} + +#define SEGMENT_LENGTH 4 +#define LANE_LENGTH 16 +#define POS_LANE 0 + +void fill_segment(const argon2_instance_t *instance, + argon2_position_t position) +{ + block *ref_block = NULL, *curr_block = NULL; + uint64_t pseudo_rand, ref_index; + uint32_t prev_offset, curr_offset; + uint8_t i; + __m128i state[64]; + int data_independent_addressing = (instance->type == Argon2_i); + + /* Pseudo-random values that determine the reference block position */ + uint64_t *pseudo_rands = NULL; + + pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32); + + if (data_independent_addressing) { + generate_addresses(instance, &position, pseudo_rands); + } + + i = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + i = 2; /* we have already generated the first two blocks */ + } + + /*printf("Position.lane = %d\nPosition.slice = %d\nStarting index : %d\n", position.lane, position.slice, starting_index);*/ + /* Offset of the current block */ + curr_offset = position.slice * 4 + i; + + if (0 == curr_offset % 16) { + /* Last block in this lane */ + prev_offset = curr_offset + /*instance->lane_length - 1*/15; + } else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (; i < SEGMENT_LENGTH; + ++i, ++curr_offset, 
++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % LANE_LENGTH == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + pseudo_rand = pseudo_rands[i]; + } else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1); + + /* 2 Creating a new block */ + ref_block = instance->memory + ref_index; + curr_block = instance->memory + curr_offset; + fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v); + } + + free(pseudo_rands); +} diff --git a/stratum/algos/ar2/opt.h b/stratum/algos/ar2/opt.h new file mode 100644 index 0000000..ec89b96 --- /dev/null +++ b/stratum/algos/ar2/opt.h @@ -0,0 +1,49 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . + */ + +#ifndef ARGON2_OPT_H +#define ARGON2_OPT_H + +/* + * Function fills a new memory block. Differs from the + * @param state Pointer to the just produced block. Content will be updated(!) + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be constructed + * @pre all block pointers must be valid + */ +void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block); + +/* + * Generate pseudo-random values to reference blocks in the segment and puts + * them into the array + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rands Pointer to the array of 64-bit values + * @pre pseudo_rands must point to @a instance->segment_length allocated values + */ +void generate_addresses(const argon2_instance_t *instance, + const argon2_position_t *position, + uint64_t *pseudo_rands); + +/* + * Function that fills the segment using previous segments also from other + * threads. + * Identical to the reference code except that it calls optimized FillBlock() + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void fill_segment(const argon2_instance_t *instance, + argon2_position_t position); + +#endif /* ARGON2_OPT_H */ diff --git a/stratum/algos/ar2/ref.c b/stratum/algos/ar2/ref.c new file mode 100644 index 0000000..98ae07c --- /dev/null +++ b/stratum/algos/ar2/ref.c @@ -0,0 +1,174 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . 
+ */ + +#include +#include +#include + +#include "argon2.h" +#include "cores.h" +#include "ref.h" + +#include "blake2/blamka-round-ref.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +void fill_block(const block *prev_block, const block *ref_block, + block *next_block) { + block blockR, block_tmp; + unsigned i; + + copy_block(&blockR, ref_block); + xor_block(&blockR, prev_block); + copy_block(&block_tmp, &blockR); + + /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then + (16,17,..31)... finally (112,113,...127) */ + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND_NOMSG( + blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], + blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], + blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], + blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], + blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], + blockR.v[16 * i + 15]); + } + + /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then + (2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */ + for (i = 0; i < 8; i++) { + BLAKE2_ROUND_NOMSG( + blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], + blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], + blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], + blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], + blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], + blockR.v[2 * i + 113]); + } + + copy_block(next_block, &block_tmp); + xor_block(next_block, &blockR); +} + +void generate_addresses(const argon2_instance_t *instance, + const argon2_position_t *position, + uint64_t *pseudo_rands) { + block zero_block, input_block, address_block; + uint32_t i; + + init_block_value(&zero_block, 0); + init_block_value(&input_block, 0); + init_block_value(&address_block, 0); + + if (instance != NULL && position != NULL) { + input_block.v[0] = position->pass; + input_block.v[1] = position->lane; + input_block.v[2] = position->slice; + input_block.v[3] = 16; + input_block.v[4] = 2; + input_block.v[5] = instance->type; + + for (i = 0; i < 4; ++i) { + if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { + input_block.v[6]++; + fill_block(&zero_block, &input_block, &address_block); + fill_block(&zero_block, &address_block, &address_block); + } + + pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; + } + } +} + +void fill_segment(const argon2_instance_t *instance, + argon2_position_t position) { + block *ref_block = NULL, *curr_block = NULL; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index; + uint32_t i; + int data_independent_addressing = (instance->type == Argon2_i); + /* Pseudo-random values that determine the reference block position */ + uint64_t *pseudo_rands = NULL; + + if (instance == NULL) { + return; + } + + pseudo_rands = + (uint64_t *)malloc(sizeof(uint64_t) * 4); + + if (pseudo_rands == NULL) { + return; + } + + if (data_independent_addressing) { + generate_addresses(instance, &position, pseudo_rands); + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * 16 + + position.slice * 4 + starting_index; + + if (0 == curr_offset % 16) { + /* Last block in this lane */ + prev_offset = curr_offset + 16 - 1; + } else { + /* Previous block */ + prev_offset 
= curr_offset - 1; + } + + for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % 16 == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + pseudo_rand = pseudo_rands[i]; + } else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % 1; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + 16 * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + fill_block(instance->memory + prev_offset, ref_block, curr_block); + } + + free(pseudo_rands); +} diff --git a/stratum/algos/ar2/ref.h b/stratum/algos/ar2/ref.h new file mode 100644 index 0000000..7ee22ee --- /dev/null +++ b/stratum/algos/ar2/ref.h @@ -0,0 +1,49 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . + */ + +#ifndef ARGON2_REF_H +#define ARGON2_REF_H + +/* + * Function fills a new memory block + * @param prev_block Pointer to the previous block + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be constructed + * @pre all block pointers must be valid + */ +void fill_block(const block *prev_block, const block *ref_block, + block *next_block); + +/* + * Generate pseudo-random values to reference blocks in the segment and puts + * them into the array + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rands Pointer to the array of 64-bit values + * @pre pseudo_rands must point to @a instance->segment_length allocated values + */ +void generate_addresses(const argon2_instance_t *instance, + const argon2_position_t *position, + uint64_t *pseudo_rands); + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void fill_segment(const argon2_instance_t *instance, + argon2_position_t position); + +#endif /* ARGON2_REF_H */ diff --git a/stratum/algos/ar2/run.c b/stratum/algos/ar2/run.c new file mode 100644 index 0000000..2b1b30a --- /dev/null +++ b/stratum/algos/ar2/run.c @@ -0,0 +1,223 @@ +/* + * Argon2 source code package + * + * Written by Daniel Dinu and Dmitry Khovratovich, 2015 + * + * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. + * + * You should have received a copy of the CC0 Public Domain Dedication along + * with + * this software. If not, see + * . 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "argon2.h" +#include "cores.h" + +#define T_COST_DEF 3 +#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */ +#define LANES_DEF 1 +#define THREADS_DEF 1 +#define OUT_LEN 32 +#define SALT_LEN 16 + +#define UNUSED_PARAMETER(x) (void)(x) + +static void usage(const char *cmd) { + printf("Usage: %s pwd salt [-y version] [-t iterations] [-m memory] [-p " + "parallelism]\n", + cmd); + + printf("Parameters:\n"); + printf("\tpwd\t\tThe password to hash\n"); + printf("\tsalt\t\tThe salt to use, at most 16 characters\n"); + printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n"); + printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n", + T_COST_DEF); + printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n", + LOG_M_COST_DEF); + printf("\t-p N\t\tSets parallelism to N threads (default %d)\n", + THREADS_DEF); +} + +static void fatal(const char *error) { + fprintf(stderr, "Error: %s\n", error); + exit(1); +} + +/* +Runs Argon2 with certain inputs and parameters, inputs not cleared. Prints the +Base64-encoded hash string +@out output array with at least 32 bytes allocated +@pwd NULL-terminated string, presumably from argv[] +@salt salt array with at least SALTLEN_DEF bytes allocated +@t_cost number of iterations +@m_cost amount of requested memory in KB +@lanes amount of requested parallelism +@threads actual parallelism +@type String, only "d" and "i" are accepted +*/ +static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost, + uint32_t m_cost, uint32_t lanes, uint32_t threads, + const char *type) { + clock_t start_time, stop_time; + unsigned pwd_length; + argon2_context context; + int i; + + start_time = clock(); + + if (!pwd) { + fatal("password missing"); + } + + if (!salt) { + secure_wipe_memory(pwd, strlen(pwd)); + fatal("salt missing"); + } + + pwd_length = strlen(pwd); + + UNUSED_PARAMETER(threads); + + context.out = out; + context.outlen = OUT_LEN; + context.pwd = (uint8_t *)pwd; + context.pwdlen = pwd_length; + context.salt = salt; + context.saltlen = SALT_LEN; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = t_cost; + context.m_cost = m_cost; + context.lanes = lanes; + context.threads = lanes; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = ARGON2_FLAG_CLEAR_PASSWORD; + + if (!strcmp(type, "d")) { + int result = argon2d(&context); + if (result != ARGON2_OK) + fatal(error_message(result)); + } else if (!strcmp(type, "i")) { + int result = argon2i(&context); + if (result != ARGON2_OK) + fatal(error_message(result)); + } else { + secure_wipe_memory(pwd, strlen(pwd)); + fatal("wrong Argon2 type"); + } + + stop_time = clock(); + + /* add back when proper decoding */ + /* + char encoded[300]; + encode_string(encoded, sizeof encoded, &context); + printf("%s\n", encoded); + */ + printf("Hash:\t\t"); + for (i = 0; i < context.outlen; ++i) { + printf("%02x", context.out[i]); + } + printf("\n"); + + printf("%2.3f seconds\n", + ((double)stop_time - start_time) / (CLOCKS_PER_SEC)); +} + +int main(int argc, char *argv[]) { + unsigned char out[OUT_LEN]; + uint32_t m_cost = 1 << LOG_M_COST_DEF; + uint32_t t_cost = T_COST_DEF; + uint32_t lanes = LANES_DEF; + uint32_t threads = THREADS_DEF; + char *pwd = NULL; + uint8_t salt[SALT_LEN]; + const char *type = "i"; + int i; + + if (argc < 3) { + usage(argv[0]); + return ARGON2_MISSING_ARGS; + } + + /* get password and salt from command line */ + 
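/*
 * Editor's note (illustration only, not part of the patch): assuming the
 * binary built from this file is named "argon2", a typical invocation is
 *
 *     ./argon2 password somesalt -d -t 3 -m 12
 *
 * i.e. Argon2d with 3 iterations and 2^12 KiB of memory. main() then prints
 * the Type/Iterations/Memory/Parallelism lines and run() prints the 32-byte
 * tag in hex plus the elapsed time.
 */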
pwd = argv[1]; + if (strlen(argv[2]) > SALT_LEN) { + fatal("salt too long"); + } + memset(salt, 0x00, SALT_LEN); /* pad with null bytes */ + memcpy(salt, argv[2], strlen(argv[2])); + + /* parse options */ + for (i = 3; i < argc; i++) { + const char *a = argv[i]; + unsigned long input = 0; + if (!strcmp(a, "-m")) { + if (i < argc - 1) { + i++; + input = strtoul(argv[i], NULL, 10); + if (input == 0 || input == ULONG_MAX || + input > ARGON2_MAX_MEMORY_BITS) { + fatal("bad numeric input for -m"); + } + m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF)); + if (m_cost > ARGON2_MAX_MEMORY) { + fatal("m_cost overflow"); + } + continue; + } else { + fatal("missing -m argument"); + } + } else if (!strcmp(a, "-t")) { + if (i < argc - 1) { + i++; + input = strtoul(argv[i], NULL, 10); + if (input == 0 || input == ULONG_MAX || + input > ARGON2_MAX_TIME) { + fatal("bad numeric input for -t"); + } + t_cost = input; + continue; + } else { + fatal("missing -t argument"); + } + } else if (!strcmp(a, "-p")) { + if (i < argc - 1) { + i++; + input = strtoul(argv[i], NULL, 10); + if (input == 0 || input == ULONG_MAX || + input > ARGON2_MAX_THREADS || input > ARGON2_MAX_LANES) { + fatal("bad numeric input for -p"); + } + threads = input; + lanes = threads; + continue; + } else { + fatal("missing -p argument"); + } + } else if (!strcmp(a, "-d")) { + type = "d"; + } else { + fatal("unknown argument"); + } + } + printf("Type:\t\tArgon2%c\n", type[0]); + printf("Iterations:\t%" PRIu32 " \n", t_cost); + printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost); + printf("Parallelism:\t%" PRIu32 " \n", lanes); + run(out, pwd, salt, t_cost, m_cost, lanes, threads, type); + + return ARGON2_OK; +} diff --git a/stratum/algos/ar2/scrypt-jane.c b/stratum/algos/ar2/scrypt-jane.c new file mode 100644 index 0000000..70fa626 --- /dev/null +++ b/stratum/algos/ar2/scrypt-jane.c @@ -0,0 +1,249 @@ +/* + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + + Public Domain or MIT License, whichever is easier +*/ + +#include + +#if defined( _WINDOWS ) +#if !defined( QT_GUI ) +extern "C" { +#endif +#endif + +#include "scrypt-jane.h" + +#include "sj/scrypt-jane-portable.h" +#include "sj/scrypt-jane-hash.h" +#include "sj/scrypt-jane-romix.h" +#include "sj/scrypt-jane-test-vectors.h" + +#define scrypt_maxNfactor 30 /* (1 << (30 + 1)) = ~2 billion */ +#if (SCRYPT_BLOCK_BYTES == 64) +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 128) +#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 256) +#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 512) +#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ +#endif +#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */ +#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */ + +#include +//#include + +static void NORETURN +scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +static int scrypt_power_on_self_test(void) +{ + const scrypt_test_setting *t; + uint8_t test_digest[64]; + uint32_t i; + int res = 7, scrypt_valid; + + if (!scrypt_test_mix()) { +#if !defined(SCRYPT_TEST) + 
scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); +#endif + res &= ~1; + } + + if (!scrypt_test_hash()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); +#endif + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + } + + if (!scrypt_valid) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); +#endif + res &= ~4; + } + + return res; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#ifdef SCRYPT_TEST_SPEED + +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc scrypt_alloc(uint64_t size) +{ + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void scrypt_free(scrypt_aligned_alloc *aa) { + mem_bump = 0; +} + +#else + +static scrypt_aligned_alloc scrypt_alloc(uint64_t size) +{ + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void scrypt_free(scrypt_aligned_alloc *aa) +{ + free(aa->mem); +} + +#endif /* SCRYPT_TEST_SPEED */ + + +void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, + uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) +{ + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, r, p, chunk_bytes, i; + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); +#endif + +#if !defined(SCRYPT_TEST) + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } +#endif + + if (Nfactor > scrypt_maxNfactor) + scrypt_fatal_error("scrypt: N out of range"); + if (rfactor > scrypt_maxrfactor) + scrypt_fatal_error("scrypt: r out of range"); + if (pfactor > scrypt_maxpfactor) + scrypt_fatal_error("scrypt: p out of range"); + + N = (1 << (Nfactor + 1)); + r = (1 << rfactor); + p = (1 << pfactor); + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); + + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); + + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), 
(scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); + + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); + + scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); + + scrypt_free(&V); + scrypt_free(&YX); +} + +#define Nfactor 8 +#define rfactor 0 +#define pfactor 0 +#if (SCRYPT_BLOCK_BYTES == 64) +#define chunk_bytes 128 +#elif (SCRYPT_BLOCK_BYTES == 128) +#define chunk_bytes 256 +#elif (SCRYPT_BLOCK_BYTES == 256) +#define chunk_bytes 512 +#elif (SCRYPT_BLOCK_BYTES == 512) +#define chunk_bytes 1024 +#endif + +void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out) +{ + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); +#endif + +/* +#if !defined(SCRYPT_TEST) + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } +#endif +*/ + V = scrypt_alloc((uint64_t)512 * chunk_bytes); + YX = scrypt_alloc(2 * chunk_bytes); + + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes); + + /* 2: X = ROMix(X) */ + scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1); + + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32); + + scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes); + + scrypt_free(&V); + scrypt_free(&YX); +} + +#if defined( _WINDOWS ) +#if !defined( QT_GUI ) +} /* extern "C" */ +#endif +#endif diff --git a/stratum/algos/ar2/scrypt-jane.h b/stratum/algos/ar2/scrypt-jane.h new file mode 100644 index 0000000..e71e460 --- /dev/null +++ b/stratum/algos/ar2/scrypt-jane.h @@ -0,0 +1,33 @@ +#ifndef AR2_SCRYPT_JANE_H +#define AR2_SCRYPT_JANE_H + +//#define SCRYPT_CHOOSE_COMPILETIME +//#define SCRYPT_TEST +#define SCRYPT_SKEIN512 +#define SCRYPT_SALSA64 + +/* + Nfactor: Increases CPU & Memory Hardness + N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used + + rfactor: Increases Memory Hardness + r = (1 << rfactor): How large a chunk is + + pfactor: Increases CPU Hardness + p = (1 << pfactor): Number of times to mix the main chunk + + A block is the basic mixing unit (salsa/chacha block = 64 bytes) + A chunk is (2 * r) blocks + + ~Memory used = (N + 2) * ((2 * r) * block size) +*/ + +#include +#include + +typedef void (*scrypt_fatal_errorfn)(const char *msg); +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); + +void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); +void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out); +#endif /* AR2_SCRYPT_JANE_H */ diff --git a/stratum/algos/ar2/sj/scrypt-jane-hash.h b/stratum/algos/ar2/sj/scrypt-jane-hash.h new file mode 100644 index 0000000..3a48bf5 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-hash.h @@ -0,0 +1,38 @@ +#if defined(SCRYPT_SKEIN512) +#include "scrypt-jane-hash_skein512.h" +#else + #define SCRYPT_HASH "ERROR" + #define SCRYPT_HASH_BLOCK_SIZE 64 + #define SCRYPT_HASH_DIGEST_SIZE 64 + typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; + typedef 
uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + static void scrypt_hash_init(scrypt_hash_state *S) {} + static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} + static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} + static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; + #error must define a hash function! +#endif + +#include "scrypt-jane-pbkdf2.h" + +#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ + +static int +scrypt_test_hash(void) { + scrypt_hash_state st; + scrypt_hash_digest hash, final; + uint8_t msg[SCRYPT_TEST_HASH_LEN]; + size_t i; + + for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) + msg[i] = (uint8_t)i; + + scrypt_hash_init(&st); + for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { + scrypt_hash(hash, msg, i); + scrypt_hash_update(&st, hash, sizeof(hash)); + } + scrypt_hash_finish(&st, final); + return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); +} + diff --git a/stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h b/stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h new file mode 100644 index 0000000..a95d46b --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h @@ -0,0 +1,188 @@ +#define SCRYPT_HASH "Skein-512" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t X[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +#include + +static void +skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { + uint64_t X[8], key[8], Xt[9+18], T[3+1]; + size_t r; + + while (blocks--) { + T[0] = S->T[0] + add; + T[1] = S->T[1]; + T[2] = T[0] ^ T[1]; + key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; + key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; + key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; + key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; + key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; + key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; + key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; + key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; + Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; + in += SCRYPT_HASH_BLOCK_SIZE; + + for (r = 0; r < 18; r++) + Xt[r + 9] = Xt[r + 0]; + + for (r = 0; r < 18; r += 2) { + X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; + + X[0] += Xt[r + 1]; + X[1] += Xt[r + 2]; + X[2] += Xt[r + 3]; + X[3] += Xt[r + 4]; + X[4] += Xt[r + 5]; + X[5] += Xt[r + 6] + T[1]; + X[6] += Xt[r + 7] + T[2]; + X[7] 
+= Xt[r + 8] + r + 1; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + + X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; + + X[0] += Xt[r + 2]; + X[1] += Xt[r + 3]; + X[2] += Xt[r + 4]; + X[3] += Xt[r + 5]; + X[4] += Xt[r + 6]; + X[5] += Xt[r + 7] + T[1]; + X[6] += Xt[r + 8] + T[2]; + X[7] += Xt[r + 9] + r + 2; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + } + + S->X[0] = key[0] ^ X[0]; + S->X[1] = key[1] ^ X[1]; + S->X[2] = key[2] ^ X[2]; + S->X[3] = key[3] ^ X[3]; + S->X[4] = key[4] ^ X[4]; + S->X[5] = key[5] ^ X[5]; + S->X[6] = key[6] ^ X[6]; + S->X[7] = key[7] ^ X[7]; + + S->T[0] = T[0]; + S->T[1] = T[1] & ~0x4000000000000000ull; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->X[0] = 0x4903ADFF749C51CEull; + S->X[1] = 0x0D95DE399746DF03ull; + S->X[2] = 0x8FD1934127C79BCEull; + S->X[3] = 0x9A255629FF352CB1ull; + S->X[4] = 0x5DB62599DF6CA7B0ull; + S->X[5] = 0xEABE394CA9D5C3F4ull; + S->X[6] = 0x991112C71A75B523ull; + S->X[7] = 0xAE18A40B660FCC33ull; + S->T[0] = 0x0000000000000000ull; + S->T[1] = 0x7000000000000000ull; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ + if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { + /* handle the previous data, we know there is enough for at least one block */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + memcpy(S->buffer + S->leftover, in, want); + in += want; + inlen -= want; + S->leftover = 0; + skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); + } + + /* handle the current data if there's more than one block */ + if (inlen > SCRYPT_HASH_BLOCK_SIZE) { + blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); + inlen -= blocks; + in += blocks; + } + } + + /* handle leftover data */ + memcpy(S->buffer + S->leftover, in, inlen); + S->leftover += inlen; +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); + S->T[1] |= 0x8000000000000000ull; + skein512_blocks(S, S->buffer, 1, S->leftover); + + memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); + S->T[0] = 0; + S->T[1] = 0xff00000000000000ull; + skein512_blocks(S, S->buffer, 1, 8); + + U64TO8_LE(&hash[ 0], S->X[0]); + U64TO8_LE(&hash[ 8], S->X[1]); + U64TO8_LE(&hash[16], S->X[2]); + U64TO8_LE(&hash[24], S->X[3]); + U64TO8_LE(&hash[32], S->X[4]); + U64TO8_LE(&hash[40], S->X[5]); + U64TO8_LE(&hash[48], S->X[6]); + U64TO8_LE(&hash[56], S->X[7]); +} + + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 
0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, + 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, + 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, + 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, +}; diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h new file mode 100644 index 0000000..259fae4 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h @@ -0,0 +1,381 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(vpshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vpshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(vpshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub eax, 2) + aj(ja scrypt_salsa_avx_loop) + a3(vpaddd 
xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(vpshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vpshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(vpshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub rax, 2) + aj(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + 
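/*
 * (Editor's note, not part of the original patch.) Together with the shr/add
 * just below, rax = Bout + (((r8 + r9) & ~0x7f) >> 1) interleaves the output:
 * even-numbered mixed blocks land in the first half of the output chunk and
 * odd-numbered ones in the second half, the same
 * scrypt_block(Bout, (i / 2) + half) indexing used by the intrinsic
 * scrypt_ChunkMix_avx() further down (r8 toggles between 0 and the chunk
 * size once per block).
 */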
a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] 
= x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_AVX) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-AVX" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h new file mode 100644 index 0000000..00f4a3c --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h @@ -0,0 +1,443 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa xmm6,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub eax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + aj(ja scrypt_salsa_sse2_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd 
xmm2,xmm6) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub rax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + aj(ja scrypt_salsa_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + 
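+ /* rax = Bout + 64*((i/2) + (i odd ? r : 0)): even-numbered blocks land in the first half of the output chunk, odd-numbered blocks in the second, matching scrypt_block(Bout, (i/2) + half) in the intrinsic version */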
a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +static void NOINLINE asm_calling_convention +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if 
defined(SCRYPT_SALSA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-SSE2" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif + +/* used by avx,etc as well */ +#if defined(SCRYPT_SALSA_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + static void asm_calling_convention + salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { + uint32_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif + diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h new file mode 100644 index 0000000..1d014d2 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h @@ -0,0 +1,317 @@ +/* x86 */ +#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_xop_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, 
xmm4) + a2(sub eax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_xop_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || 
!defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_XOP + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = _mm_add_epi32(x1, x0); + x4 = _mm_roti_epi32(x4, 7); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x0, x3); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x3, x2); + x4 = _mm_roti_epi32(x4, 13); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x2, x1); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x3, x0); + x4 = _mm_roti_epi32(x4, 7); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x0, x1); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x1, x2); + x4 = _mm_roti_epi32(x4, 13); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x2, x3); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x1 = _mm_shuffle_epi32(x1, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x3 = _mm_shuffle_epi32(x3, 0x39); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_XOP) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-XOP" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h new file mode 100644 index 0000000..33f3340 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h @@ -0,0 +1,70 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa20/8 Ref" + +#undef SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_BASIC + +static void +salsa_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = 
state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + t = a+d; t = ROTL32(t, 7); b ^= t; \ + t = b+a; t = ROTL32(t, 9); c ^= t; \ + t = c+b; t = ROTL32(t, 13); d ^= t; \ + t = d+c; t = ROTL32(t, 18); a ^= t; \ + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x5, x9,x13, x1) + quarter(x10,x14, x2, x6) + quarter(x15, x3, x7,x11) + quarter( x0, x1, x2, x3) + quarter( x5, x6, x7, x4) + quarter(x10,x11, x8, x9) + quarter(x15,x12,x13,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif + diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h new file mode 100644 index 0000000..663d833 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h @@ -0,0 +1,367 @@ +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_avx_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vpsrlq 
xmm8, xmm10, 51) + a3(vpsrlq xmm9, xmm11, 51) + a3(vpsllq xmm10, xmm10, 13) + a3(vpsllq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm8) + a3(vpxor xmm5, xmm5, xmm9) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vpsrlq xmm10, xmm8, 25) + a3(vpsrlq xmm11, xmm9, 25) + a3(vpsllq xmm8, xmm8, 39) + a3(vpsllq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vpsrlq xmm10, xmm8, 51) + a3(vpsrlq xmm11, xmm9, 51) + a3(vpsllq xmm8, xmm8, 13) + a3(vpsllq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm10) + a3(vpxor xmm4, xmm4, xmm11) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vpsrlq xmm8, xmm10, 25) + a3(vpsrlq xmm9, xmm11, 25) + a3(vpsllq xmm10, xmm10, 39) + a3(vpsllq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_avx_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_avx_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX + +static void asm_calling_convention +scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi 
*)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 
= x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h new file mode 100644 index 0000000..8181302 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h @@ -0,0 +1,221 @@ +/* x64 */ +#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx2) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa ymm0,[rax+0]) + a2(vmovdqa ymm1,[rax+32]) + a2(vmovdqa ymm2,[rax+64]) + a2(vmovdqa ymm3,[rax+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor1) + a3(vpxor ymm0,ymm0,[r9+0]) + a3(vpxor ymm1,ymm1,[r9+32]) + a3(vpxor ymm2,ymm2,[r9+64]) + a3(vpxor ymm3,ymm3,[r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx2_loop:) + a2(and rdx, rdx) + a3(vpxor ymm0,ymm0,[rsi+r9+0]) + a3(vpxor ymm1,ymm1,[rsi+r9+32]) + a3(vpxor ymm2,ymm2,[rsi+r9+64]) + a3(vpxor ymm3,ymm3,[rsi+r9+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor2) + a3(vpxor ymm0,ymm0,[rdx+r9+0]) + a3(vpxor ymm1,ymm1,[rdx+r9+32]) + a3(vpxor ymm2,ymm2,[rdx+r9+64]) + a3(vpxor ymm3,ymm3,[rdx+r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor2:) + a2(vmovdqa ymm6,ymm0) + a2(vmovdqa ymm7,ymm1) + a2(vmovdqa ymm8,ymm2) + a2(vmovdqa ymm9,ymm3) + a2(mov rax,4) + a1(scrypt_salsa64_avx2_loop: ) + a3(vpaddq ymm4, ymm1, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm3, ymm3, ymm4) + a3(vpaddq ymm4, ymm0, ymm3) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm2, ymm2, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm2, ymm2, ymm4) + a3(vpaddq ymm4, ymm3, ymm2) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm1, ymm1, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm2, ymm1) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm1, ymm1, 0x39) + a3(vpermq ymm10, ymm2, 0x4e) + a3(vpxor ymm0, ymm0, ymm4) + a3(vpermq ymm3, ymm3, 0x93) + a3(vpaddq ymm4, ymm3, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm0, ymm1) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm10, ymm10, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm10, ymm10, ymm4) + a3(vpaddq ymm4, ymm1, ymm10) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm3, ymm3, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpermq ymm1, ymm1, 0x93) + 
a3(vpxor ymm3, ymm3, ymm4) + a3(vpermq ymm2, ymm10, 0x4e) + a3(vpaddq ymm4, ymm10, ymm3) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm3, ymm3, 0x39) + a3(vpxor ymm0, ymm0, ymm4) + a1(dec rax) + aj(jnz scrypt_salsa64_avx2_loop) + a3(vpaddq ymm0,ymm0,ymm6) + a3(vpaddq ymm1,ymm1,ymm7) + a3(vpaddq ymm2,ymm2,ymm8) + a3(vpaddq ymm3,ymm3,ymm9) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],ymm0) + a2(vmovdqa [rax+32],ymm1) + a2(vmovdqa [rax+64],ymm2) + a2(vmovdqa [rax+96],ymm3) + aj(jne scrypt_ChunkMix_avx2_loop) + a1(vzeroupper) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX2 + +static void asm_calling_convention +scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1); + y0 = ymmp[0]; + y1 = ymmp[1]; + y2 = ymmp[2]; + y3 = ymmp[3]; + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + ymmp = (ymmi *)scrypt_block(Bin, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + t0 = y0; + t1 = y1; + t2 = y2; + t3 = y3; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm256_add_epi64(y0, y1); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y3); + z1 = _mm256_srli_epi64(z0, 64-39); + y1 = _mm256_xor_si256(y1, z1); + z0 = _mm256_slli_epi64(z0, 39); + y1 = _mm256_xor_si256(y1, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); + y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3)); + z0 = _mm256_add_epi64(y1, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + z0 = _mm256_add_epi64(y0, y3); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y1 = _mm256_xor_si256(y1, z0); + z0 = _mm256_add_epi64(y1, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y1); + z1 = _mm256_srli_epi64(z0, 64-39); + y3 = _mm256_xor_si256(y3, z1); + z0 = _mm256_slli_epi64(z0, 39); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3)); + y2 = _mm256_permute4x64_epi64(y2, 
_MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1)); + } + + y0 = _mm256_add_epi64(y0, t0); + y1 = _mm256_add_epi64(y1, t1); + y2 = _mm256_add_epi64(y2, t2); + y3 = _mm256_add_epi64(y3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half); + ymmp[0] = y0; + ymmp[1] = y1; + ymmp[2] = y2; + ymmp[3] = y3; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX2) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h new file mode 100644 index 0000000..971d98a --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h @@ -0,0 +1,449 @@ +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_sse2_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) 
+ a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + aj(ja scrypt_salsa64_sse2_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_sse2_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSE2 + +static void asm_calling_convention +scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, 
blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + 
x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSE2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif + +/* sse3/avx use this as well */ +#if defined(SCRYPT_SALSA64_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + + static void asm_calling_convention + salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { + uint64_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif \ No newline at end of file diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h new file mode 100644 index 0000000..21e94c9 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h @@ -0,0 +1,399 @@ +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + 
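+ /* and rdx,rdx sets ZF when Bxor is NULL; the pxor instructions below leave EFLAGS untouched, so the later jz still skips the optional second xor pass */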
a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_ssse3_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + aj(ja scrypt_salsa64_ssse3_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) 
+ a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_ssse3_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSSE3 + +static void asm_calling_convention +scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = 
_mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSSE3" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h new file mode 100644 index 0000000..34f1b40 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h @@ -0,0 +1,335 @@ +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor 
xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_xop_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vprotq xmm10, xmm10, 13) + a3(vprotq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vprotq xmm8, xmm8, 39) + a3(vprotq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vprotq xmm8, xmm8, 13) + a3(vprotq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vprotq xmm10, xmm10, 39) + a3(vprotq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_xop_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + 
aj(jne scrypt_ChunkMix_xop_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_XOP + +static void asm_calling_convention +scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z0 = _mm_roti_epi64(z0, 39); + z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z0 = _mm_roti_epi64(z0, 39); 
+ z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_XOP) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-XOP" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h new file mode 100644 index 0000000..2aec04f --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h @@ -0,0 +1,41 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa64/8 Ref" + +#undef SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_BASIC + +static void +salsa64_core_basic(uint64_t state[16]) { + const size_t rounds = 8; + uint64_t v[16], t; + size_t i; + + for (i = 0; i < 16; i++) v[i] = state[i]; + + #define G(a,b,c,d) \ + t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ + t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ + t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ + t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ + + for (i = 0; i < rounds; i += 2) { + G( 0, 4, 8,12); + G( 5, 9,13, 1); + G(10,14, 2, 6); + G(15, 3, 7,11); + G( 0, 1, 2, 3); + G( 5, 6, 7, 4); + G(10,11, 8, 9); + G(15,12,13,14); + } + + for (i = 0; i < 16; i++) state[i] += v[i]; + + #undef G +} + +#endif + diff --git a/stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h b/stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h new file mode 100644 index 0000000..ddd8742 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h @@ -0,0 +1,112 @@ +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void +scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { + scrypt_hash_state st; + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void +scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) 
*/ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + + scrypt_ensure_zero(pad, sizeof(pad)); +} + +static void +scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void +scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); + + scrypt_ensure_zero(st, sizeof(*st)); +} + +static void +scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, blocks; + uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for (c = 0; c < N - 1; c++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); + scrypt_hmac_finish(&work, u); + + /* T[i] ^= UX */ + for (j = 0; j < sizeof(u); j++) + ti[j] ^= u[j]; + } + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } + + scrypt_ensure_zero(ti, sizeof(ti)); + scrypt_ensure_zero(u, sizeof(u)); + scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); + scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); +} diff --git a/stratum/algos/ar2/sj/scrypt-jane-portable-x86.h b/stratum/algos/ar2/sj/scrypt-jane-portable-x86.h new file mode 100644 index 0000000..5be2db9 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-portable-x86.h @@ -0,0 +1,462 @@ +#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) + #define X86ASM + + /* gcc 2.95 royally screws up stack alignments on variables */ + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) + #define X86ASM_SSE + #define X86ASM_SSE2 + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) + #define X86ASM_SSSE3 + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) + #define X86ASM_AVX + #define X86ASM_XOP + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700))) + #define X86ASM_AVX2 + #endif +#endif + +#if defined(CPU_X86_64) && defined(COMPILER_GCC) + #define X86_64ASM + #define X86_64ASM_SSE2 + #if (COMPILER_GCC >= 40102) + #define X86_64ASM_SSSE3 + #endif + #if (COMPILER_GCC >= 40400) + #define X86_64ASM_AVX + #define X86_64ASM_XOP + #endif + #if (COMPILER_GCC >= 40700) + #define X86_64ASM_AVX2 + #endif +#endif + +#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) + #define X86_INTRINSIC + #if defined(CPU_X86_64) || defined(X86ASM_SSE) + #define X86_INTRINSIC_SSE + #endif + #if defined(CPU_X86_64) || defined(X86ASM_SSE2) + #define X86_INTRINSIC_SSE2 + #endif + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2005) + #define X86_INTRINSIC_SSSE3 + #endif + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1) + #define X86_INTRINSIC_AVX + #define X86_INTRINSIC_XOP + #endif + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2012) + #define X86_INTRINSIC_AVX2 + #endif +#endif + +#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) + #define X86_INTRINSIC + #if defined(__SSE__) + #define X86_INTRINSIC_SSE + #endif + #if defined(__SSE2__) + #define X86_INTRINSIC_SSE2 + #endif + #if defined(__SSSE3__) + #define X86_INTRINSIC_SSSE3 + #endif + #if defined(__AVX__) + #define X86_INTRINSIC_AVX + #endif + #if defined(__XOP__) + #define X86_INTRINSIC_XOP + #endif + #if defined(__AVX2__) + #define X86_INTRINSIC_AVX2 + #endif +#endif + +/* only use simd on windows (or SSE2 on gcc)! 
*/ +#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) + #if defined(X86_INTRINSIC_SSE) + #include + #include + typedef __m64 qmm; + typedef __m128 xmm; + typedef __m128d xmmd; + #endif + #if defined(X86_INTRINSIC_SSE2) + #include + typedef __m128i xmmi; + #endif + #if defined(X86_INTRINSIC_SSSE3) + #include + #endif + #if defined(X86_INTRINSIC_AVX) + #include + #endif + #if defined(X86_INTRINSIC_XOP) + #if defined(COMPILER_MSVC) + #include + #else + #include + #endif + #endif + #if defined(X86_INTRINSIC_AVX2) + typedef __m256i ymmi; + #endif +#endif + +#if defined(X86_INTRINSIC_SSE2) + typedef union packedelem8_t { + uint8_t u[16]; + xmmi v; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + xmmi v; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + xmmi v; + } packedelem64; +#else + typedef union packedelem8_t { + uint8_t u[16]; + uint32_t dw[4]; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + uint8_t b[16]; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + uint8_t b[16]; + } packedelem64; +#endif + +#if defined(X86_INTRINSIC_SSSE3) + static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; +#endif + +/* + x86 inline asm for gcc/msvc. usage: + + asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) + asm_naked_fn(name) + a1(..) + a2(.., ..) + a3(.., .., ..) + 64bit OR 0 paramters: a1(ret) + 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters + asm_naked_fn_end(name) +*/ + +#if defined(X86ASM) || defined(X86_64ASM) + +#if defined(COMPILER_MSVC) + #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ + #define a1(x) __asm {x} + #define a2(x, y) __asm {x, y} + #define a3(x, y, z) __asm {x, y, z} + #define a4(x, y, z, w) __asm {x, y, z, w} + #define aj(x) __asm {x} + #define asm_align8 a1(ALIGN 8) + #define asm_align16 a1(ALIGN 16) + + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn + #define asm_naked_fn(fn) { + #define asm_naked_fn_end(fn) } +#elif defined(COMPILER_GCC) + #define GNU_AS1(x) #x ";\n" + #define GNU_AS2(x, y) #x ", " #y ";\n" + #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" + #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" + #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" + #define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n" + + #define a1(x) GNU_AS1(x) + #define a2(x, y) GNU_AS2(x, y) + #define a3(x, y, z) GNU_AS3(x, y, z) + #define a4(x, y, z, w) GNU_AS4(x, y, z, w) + #define aj(x) GNU_ASJ(x) + #define asm_align8 ".p2align 3,,7" + #define asm_align16 ".p2align 4,,15" + + #if defined(OS_WINDOWS) + #define asm_calling_convention CDECL + #define aret(n) a1(ret) + + #if defined(X86_64ASM) + #define asm_naked_fn(fn) ; __asm__ ( \ + ".text\n" \ + asm_align16 GNU_ASFN(fn) \ + "subq $136, %rsp;" \ + "movdqa %xmm6, 0(%rsp);" \ + "movdqa %xmm7, 16(%rsp);" \ + "movdqa %xmm8, 32(%rsp);" \ + "movdqa %xmm9, 48(%rsp);" \ + "movdqa %xmm10, 64(%rsp);" \ + "movdqa %xmm11, 80(%rsp);" \ + "movdqa %xmm12, 96(%rsp);" \ + "movq %rdi, 112(%rsp);" \ + "movq %rsi, 120(%rsp);" \ + "movq %rcx, %rdi;" \ + "movq %rdx, %rsi;" \ + "movq %r8, %rdx;" \ + "movq %r9, %rcx;" \ + "call 1f;" \ + "movdqa 0(%rsp), %xmm6;" \ + "movdqa 16(%rsp), %xmm7;" \ + "movdqa 32(%rsp), %xmm8;" \ + "movdqa 
48(%rsp), %xmm9;" \ + "movdqa 64(%rsp), %xmm10;" \ + "movdqa 80(%rsp), %xmm11;" \ + "movdqa 96(%rsp), %xmm12;" \ + "movq 112(%rsp), %rdi;" \ + "movq 120(%rsp), %rsi;" \ + "addq $136, %rsp;" \ + "ret;" \ + ".intel_syntax noprefix;" \ + ".p2align 4,,15;" \ + "1:;" + #else + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + #else + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + + #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); + + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" + #define asm_gcc_parms() ".att_syntax prefix;" + #define asm_gcc_trashed() __asm__ __volatile__("" ::: + #define asm_gcc_end() ); +#else + need x86 asm +#endif + +#endif /* X86ASM || X86_64ASM */ + + +#if defined(CPU_X86) || defined(CPU_X86_64) + +typedef enum cpu_flags_x86_t { + cpu_mmx = 1 << 0, + cpu_sse = 1 << 1, + cpu_sse2 = 1 << 2, + cpu_sse3 = 1 << 3, + cpu_ssse3 = 1 << 4, + cpu_sse4_1 = 1 << 5, + cpu_sse4_2 = 1 << 6, + cpu_avx = 1 << 7, + cpu_xop = 1 << 8, + cpu_avx2 = 1 << 9 +} cpu_flags_x86; + +typedef enum cpu_vendors_x86_t { + cpu_nobody, + cpu_intel, + cpu_amd +} cpu_vendors_x86; + +typedef struct x86_regs_t { + uint32_t eax, ebx, ecx, edx; +} x86_regs; + +#if defined(X86ASM) +asm_naked_fn_proto(int, has_cpuid)(void) +asm_naked_fn(has_cpuid) + a1(pushfd) + a1(pop eax) + a2(mov ecx, eax) + a2(xor eax, 0x200000) + a1(push eax) + a1(popfd) + a1(pushfd) + a1(pop eax) + a2(xor eax, ecx) + a2(shr eax, 21) + a2(and eax, 1) + a1(push ecx) + a1(popfd) + a1(ret) +asm_naked_fn_end(has_cpuid) +#endif /* X86ASM */ + + +static void NOINLINE +get_cpuid(x86_regs *regs, uint32_t flags) { +#if defined(COMPILER_MSVC) + __cpuid((int *)regs, (int)flags); +#else + #if defined(CPU_X86_64) + #define cpuid_bx rbx + #else + #define cpuid_bx ebx + #endif + + asm_gcc() + a1(push cpuid_bx) + a2(xor ecx, ecx) + a1(cpuid) + a2(mov [%1 + 0], eax) + a2(mov [%1 + 4], ebx) + a2(mov [%1 + 8], ecx) + a2(mov [%1 + 12], edx) + a1(pop cpuid_bx) + asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" + asm_gcc_end() +#endif +} + +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) +static uint64_t NOINLINE +get_xgetbv(uint32_t flags) { +#if defined(COMPILER_MSVC) + return _xgetbv(flags); +#else + uint32_t lo, hi; + asm_gcc() + a1(xgetbv) + asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) + asm_gcc_end() + return ((uint64_t)lo | ((uint64_t)hi << 32)); +#endif +} +#endif // AVX support + +#if defined(SCRYPT_TEST_SPEED) +size_t cpu_detect_mask = (size_t)-1; +#endif + +static size_t +detect_cpu(void) { + union { uint8_t s[12]; uint32_t i[3]; } vendor_string; + //cpu_vendors_x86 vendor = cpu_nobody; + x86_regs regs; + uint32_t max_level, max_ext_level; + size_t cpu_flags = 0; +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + uint64_t xgetbv_flags; +#endif + +#if defined(CPU_X86) + if (!has_cpuid()) + return cpu_flags; +#endif + + get_cpuid(®s, 0); + max_level = regs.eax; + vendor_string.i[0] = regs.ebx; + vendor_string.i[1] = regs.edx; + vendor_string.i[2] = regs.ecx; + + //if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) + // vendor = cpu_intel; + //else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) + // vendor = cpu_amd; + + if (max_level & 0x00000500) { + /* "Intel P5 pre-B0" */ + cpu_flags |= 
cpu_mmx; + return cpu_flags; + } + + if (max_level < 1) + return cpu_flags; + + get_cpuid(®s, 1); +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + /* xsave/xrestore */ + if (regs.ecx & (1 << 27)) { + xgetbv_flags = get_xgetbv(0); + if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; + } +#endif + if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; + if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2; + if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; + if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; + if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; + if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; + if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; + + if (cpu_flags & cpu_avx) { + if (max_level >= 7) { + get_cpuid(®s, 7); + if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2; + } + + get_cpuid(®s, 0x80000000); + max_ext_level = regs.eax; + if (max_ext_level >= 0x80000001) { + get_cpuid(®s, 0x80000001); + if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop; + } + } + + +#if defined(SCRYPT_TEST_SPEED) + cpu_flags &= cpu_detect_mask; +#endif + + return cpu_flags; +} + +#if defined(SCRYPT_TEST_SPEED) +static const char * +get_top_cpuflag_desc(size_t flag) { + if (flag & cpu_avx2) return "AVX2"; + else if (flag & cpu_xop) return "XOP"; + else if (flag & cpu_avx) return "AVX"; + else if (flag & cpu_sse4_2) return "SSE4.2"; + else if (flag & cpu_sse4_1) return "SSE4.1"; + else if (flag & cpu_ssse3) return "SSSE3"; + else if (flag & cpu_sse2) return "SSE2"; + else if (flag & cpu_sse) return "SSE"; + else if (flag & cpu_mmx) return "MMX"; + else return "Basic"; +} +#endif + +/* enable the highest system-wide option */ +#if defined(SCRYPT_CHOOSE_COMPILETIME) + #if !defined(__AVX2__) + #undef X86_64ASM_AVX2 + #undef X86ASM_AVX2 + #undef X86_INTRINSIC_AVX2 + #endif + #if !defined(__XOP__) + #undef X86_64ASM_XOP + #undef X86ASM_XOP + #undef X86_INTRINSIC_XOP + #endif + #if !defined(__AVX__) + #undef X86_64ASM_AVX + #undef X86ASM_AVX + #undef X86_INTRINSIC_AVX + #endif + #if !defined(__SSSE3__) + #undef X86_64ASM_SSSE3 + #undef X86ASM_SSSE3 + #undef X86_INTRINSIC_SSSE3 + #endif + #if !defined(__SSE2__) + #undef X86_64ASM_SSE2 + #undef X86ASM_SSE2 + #undef X86_INTRINSIC_SSE2 + #endif +#endif + +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/stratum/algos/ar2/sj/scrypt-jane-portable.h b/stratum/algos/ar2/sj/scrypt-jane-portable.h new file mode 100644 index 0000000..f1c2d26 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-portable.h @@ -0,0 +1,307 @@ +/* determine os */ +#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) + #include + #include + #define OS_WINDOWS +#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) + #include + #include + #include + + #define OS_SOLARIS +#else + #include + #include + #include /* need this to define BSD */ + #include + #include + + #define OS_NIX + #if defined(__linux__) + #include + #define OS_LINUX + #elif defined(BSD) + #define OS_BSD + + #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) + #define OS_OSX + #elif defined(macintosh) || defined(Macintosh) + #define OS_MAC + #elif defined(__OpenBSD__) + #define OS_OPENBSD + #endif + #endif +#endif + + +/* determine compiler */ +#if defined(_MSC_VER) + #define COMPILER_MSVC_VS6 120000000 + #define COMPILER_MSVC_VS6PP 121000000 + #define COMPILER_MSVC_VS2002 130000000 + #define COMPILER_MSVC_VS2003 131000000 + #define COMPILER_MSVC_VS2005 140050727 + #define COMPILER_MSVC_VS2008 150000000 + #define 
COMPILER_MSVC_VS2008SP1 150030729 + #define COMPILER_MSVC_VS2010 160000000 + #define COMPILER_MSVC_VS2010SP1 160040219 + #define COMPILER_MSVC_VS2012RC 170000000 + #define COMPILER_MSVC_VS2012 170050727 + + #if _MSC_FULL_VER > 100000000 + #define COMPILER_MSVC (_MSC_FULL_VER) + #else + #define COMPILER_MSVC (_MSC_FULL_VER * 10) + #endif + + #if ((_MSC_VER == 1200) && defined(_mm_free)) + #undef COMPILER_MSVC + #define COMPILER_MSVC COMPILER_MSVC_VS6PP + #endif + + #pragma warning(disable : 4127) /* conditional expression is constant */ + #pragma warning(disable : 4100) /* unreferenced formal parameter */ + + #define _CRT_SECURE_NO_WARNINGS + #include + #include /* _rotl */ + #include + + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned __int64 uint64_t; + typedef signed __int64 int64_t; + + #define ROTL32(a,b) _rotl(a,b) + #define ROTR32(a,b) _rotr(a,b) + #define ROTL64(a,b) _rotl64(a,b) + #define ROTR64(a,b) _rotr64(a,b) + #undef NOINLINE + #define NOINLINE __declspec(noinline) + #undef NORETURN + #define NORETURN + #undef INLINE + #define INLINE __forceinline + #undef FASTCALL + #define FASTCALL __fastcall + #undef CDECL + #define CDECL __cdecl + #undef STDCALL + #define STDCALL __stdcall + #undef NAKED + #define NAKED __declspec(naked) + #define ALIGN(n) __declspec(align(n)) +#endif +#if defined(__ICC) + #define COMPILER_INTEL +#endif +#if defined(__GNUC__) + #if (__GNUC__ >= 3) + #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ + #else + #define COMPILER_GCC_PATCHLEVEL 0 + #endif + #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) + #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) + #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) + #undef NOINLINE + #if (COMPILER_GCC >= 30000) + #define NOINLINE __attribute__((noinline)) + #else + #define NOINLINE + #endif + #undef NORETURN + #if (COMPILER_GCC >= 30000) + #define NORETURN __attribute__((noreturn)) + #else + #define NORETURN + #endif + #undef INLINE + #if (COMPILER_GCC >= 30000) + #define INLINE __attribute__((always_inline)) + #else + #define INLINE inline + #endif + #undef FASTCALL + #if (COMPILER_GCC >= 30400) + #define FASTCALL __attribute__((fastcall)) + #else + #define FASTCALL + #endif + #undef CDECL + #define CDECL __attribute__((cdecl)) + #undef STDCALL + #define STDCALL __attribute__((stdcall)) + #define ALIGN(n) __attribute__((aligned(n))) + #include +#endif +#if defined(__MINGW32__) || defined(__MINGW64__) + #define COMPILER_MINGW +#endif +#if defined(__PATHCC__) + #define COMPILER_PATHCC +#endif + +#define OPTIONAL_INLINE +#if defined(OPTIONAL_INLINE) + #undef OPTIONAL_INLINE + #define OPTIONAL_INLINE INLINE +#else + #define OPTIONAL_INLINE +#endif + +#define CRYPTO_FN NOINLINE STDCALL + +/* determine cpu */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) + #define CPU_X86_64 +#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) + #define CPU_X86 500 +#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) + #define CPU_X86 400 +#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) + #define CPU_X86 300 +#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) + 
#define CPU_IA64 +#endif + +#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) + #define CPU_SPARC + #if defined(__sparcv9) + #define CPU_SPARC64 + #endif +#endif + +#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) + #define CPU_64BITS + #undef FASTCALL + #define FASTCALL + #undef CDECL + #define CDECL + #undef STDCALL + #define STDCALL +#endif + +#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) + #define CPU_PPC + #if defined(_ARCH_PWR7) + #define CPU_POWER7 + #elif defined(__64BIT__) + #define CPU_PPC64 + #else + #define CPU_PPC32 + #endif +#endif + +#if defined(__hppa__) || defined(__hppa) + #define CPU_HPPA +#endif + +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + #define CPU_ALPHA +#endif + +/* endian */ + +#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ + (defined(CPU_X86) || defined(CPU_X86_64)) || \ + (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) +#define CPU_LE +#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ + (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB)) +#define CPU_BE +#else + /* unknown endian! */ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +static void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" 
+ "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + +#if !defined(asm_calling_convention) +#define asm_calling_convention +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-romix-basic.h b/stratum/algos/ar2/sj/scrypt-jane-romix-basic.h new file mode 100644 index 0000000..57ba649 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-romix-basic.h @@ -0,0 +1,74 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void asm_calling_convention +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { + (void)blocks; (void)nblocks; +} + +/* romix pre/post endian conversion function */ +static void asm_calling_convention +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#else + (void)blocks; (void)nblocks; +#endif +} + +/* chunkmix test function */ +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#else + scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#endif + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/stratum/algos/ar2/sj/scrypt-jane-romix-template.h 
b/stratum/algos/ar2/sj/scrypt-jane-romix-template.h new file mode 100644 index 0000000..373ae60 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-romix-template.h @@ -0,0 +1,122 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void asm_calling_convention +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block; +#else + scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block; +#endif + uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2); + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1); + } + SCRYPT_CHUNKMIX_FN(X, block, NULL, 1); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < /*N*/512; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511; + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1); + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511; + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1); + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, 2); +} + +#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ + + +#undef SCRYPT_CHUNKMIX_FN +#undef SCRYPT_ROMIX_FN +#undef SCRYPT_MIX_FN +#undef SCRYPT_ROMIX_TANGLE_FN +#undef SCRYPT_ROMIX_UNTANGLE_FN + diff --git a/stratum/algos/ar2/sj/scrypt-jane-romix.h 
b/stratum/algos/ar2/sj/scrypt-jane-romix.h new file mode 100644 index 0000000..8ed6e9e --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-romix.h @@ -0,0 +1,25 @@ +#if defined(SCRYPT_SALSA) +#include "scrypt-jane-salsa.h" +#elif defined(SCRYPT_SALSA64) +#include "scrypt-jane-salsa64.h" +#else + #define SCRYPT_MIX_BASE "ERROR" + typedef uint32_t scrypt_mix_word_t; + #define SCRYPT_WORDTO8_LE U32TO8_LE + #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + #define SCRYPT_BLOCK_BYTES 64 + #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + #if !defined(SCRYPT_CHOOSE_COMPILETIME) + static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} + static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; } + #else + static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} + #endif + static int scrypt_test_mix(void) { return 0; } + #error must define a mix function! +#endif + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_MIX +#define SCRYPT_MIX SCRYPT_MIX_BASE +#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-salsa.h b/stratum/algos/ar2/sj/scrypt-jane-salsa.h new file mode 100644 index 0000000..df0a3e0 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-salsa.h @@ -0,0 +1,134 @@ +#define SCRYPT_MIX_BASE "Salsa20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa-xop.h" +#include "scrypt-jane-mix_salsa-avx.h" +#include "scrypt-jane-mix_salsa-sse2.h" +#include "scrypt-jane-mix_salsa.h" + +#if defined(SCRYPT_SALSA_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN salsa_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix(void) { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; 
+ else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations(void) { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + + +static int +scrypt_test_mix(void) { + static const uint8_t expected[16] = { + 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} diff --git a/stratum/algos/ar2/sj/scrypt-jane-salsa64.h b/stratum/algos/ar2/sj/scrypt-jane-salsa64.h new file mode 100644 index 0000000..96b7813 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-salsa64.h @@ -0,0 +1,183 @@ +#define SCRYPT_MIX_BASE "Salsa64/8" + +typedef uint64_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U64TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP + +#define SCRYPT_BLOCK_BYTES 128 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa64-avx2.h" +#include "scrypt-jane-mix_salsa64-xop.h" +#include "scrypt-jane-mix_salsa64-avx.h" +#include "scrypt-jane-mix_salsa64-ssse3.h" +#include "scrypt-jane-mix_salsa64-sse2.h" +#include "scrypt-jane-mix_salsa64.h" + +#if defined(SCRYPT_SALSA64_AVX2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #define SCRYPT_CHUNKMIX_FN 
scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa64_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix(void) { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + return scrypt_ROMix_avx2; + else +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations(void) { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + flags |= cpu_avx2; +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix(void) { + static const uint8_t expected[16] = { + 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h b/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h new file mode 100644 index 0000000..f5b15a3 --- /dev/null +++ b/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h @@ -0,0 +1,39 @@ +typedef struct scrypt_test_setting_t { + const char *pw, *salt; + uint8_t Nfactor, rfactor, pfactor; +} scrypt_test_setting; + 
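Editor's note: the Nfactor/rfactor/pfactor fields above are scrypt-jane's log2-style encoding of the usual scrypt parameters. If I am reading the scrypt-jane convention correctly (N = 1 << (Nfactor + 1), r = 1 << rfactor, p = 1 << pfactor), the second setting below, {"password", "NaCl", 9, 3, 4}, is the classic N=1024, r=8, p=16 scrypt test vector, while the ROMix template earlier in this patch hard-codes the equivalent of N=512, r=1. A hypothetical helper, not part of the patch, that expands the factors under that assumed convention:

    #include <stdint.h>

    /* Hypothetical helper: expand scrypt-jane's factor encoding into plain
       scrypt parameters (convention assumed, see note above). */
    static void scrypt_jane_expand_factors(uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor,
                                           uint32_t *N, uint32_t *r, uint32_t *p)
    {
        *N = 1u << (Nfactor + 1);  /* CPU/memory cost */
        *r = 1u << rfactor;        /* block size multiplier */
        *p = 1u << pfactor;        /* parallelism */
    }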
+static const scrypt_test_setting post_settings[] = { + {"", "", 3, 0, 0}, + {"password", "NaCl", 9, 3, 4}, + {0, 0, 0, 0, 0} +}; + +#if defined(SCRYPT_SKEIN512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, + 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, + 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, + 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, + {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, + 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, + 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, + 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, + 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, + 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, + 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, + {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, + 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, + 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, + 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} + }; + #endif +#else + static const uint8_t post_vectors[][64] = {{0}}; +#endif + diff --git a/stratum/algos/ar2/thread.c b/stratum/algos/ar2/thread.c new file mode 100644 index 0000000..0c4edea --- /dev/null +++ b/stratum/algos/ar2/thread.c @@ -0,0 +1,36 @@ +#include "thread.h" +#if defined(_WIN32) +#include +#endif + +int argon2_thread_create(argon2_thread_handle_t *handle, + argon2_thread_func_t func, void *args) { + if (NULL == handle || func == NULL) { + return -1; + } +#if defined(_WIN32) + *handle = _beginthreadex(NULL, 0, func, args, 0, NULL); + return *handle != 0 ? 0 : -1; +#else + return pthread_create(handle, NULL, func, args); +#endif +} + +int argon2_thread_join(argon2_thread_handle_t handle) { +#if defined(_WIN32) + if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) { + return CloseHandle((HANDLE)handle) != 0 ? 0 : -1; + } + return -1; +#else + return pthread_join(handle, NULL); +#endif +} + +void argon2_thread_exit(void) { +#if defined(_WIN32) + _endthreadex(0); +#else + pthread_exit(NULL); +#endif +} diff --git a/stratum/algos/ar2/thread.h b/stratum/algos/ar2/thread.h new file mode 100644 index 0000000..57c4ce5 --- /dev/null +++ b/stratum/algos/ar2/thread.h @@ -0,0 +1,46 @@ +#ifndef ARGON2_THREAD_H +#define ARGON2_THREAD_H +/* + Here we implement an abstraction layer for the simpĺe requirements + of the Argon2 code. We only require 3 primitives---thread creation, + joining, and termination---so full emulation of the pthreads API + is unwarranted. Currently we wrap pthreads and Win32 threads. + + The API defines 2 types: the function pointer type, + argon2_thread_func_t, + and the type of the thread handle---argon2_thread_handle_t. 
+*/ +#if defined(_WIN32) +#include <process.h> +typedef unsigned(__stdcall *argon2_thread_func_t)(void *); +typedef uintptr_t argon2_thread_handle_t; +#else +#include <pthread.h> +typedef void *(*argon2_thread_func_t)(void *); +typedef pthread_t argon2_thread_handle_t; +#endif + +/* Creates a thread + * @param handle pointer to a thread handle, which is the output of this + * function. Must not be NULL. + * @param func A function pointer for the thread's entry point. Must not be + * NULL. + * @param args Pointer that is passed as an argument to @func. May be NULL. + * @return 0 if @handle and @func are valid pointers and a thread is successfully + * created. + */ +int argon2_thread_create(argon2_thread_handle_t *handle, + argon2_thread_func_t func, void *args); + +/* Waits for a thread to terminate + * @param handle Handle to a thread created with argon2_thread_create. + * @return 0 if @handle is a valid handle, and joining completed successfully. +*/ +int argon2_thread_join(argon2_thread_handle_t handle); + +/* Terminate the current thread. Must be run inside a thread created by + * argon2_thread_create. +*/ +void argon2_thread_exit(void); + +#endif diff --git a/stratum/algos/argon2a.c b/stratum/algos/argon2a.c new file mode 100644 index 0000000..f099694 --- /dev/null +++ b/stratum/algos/argon2a.c @@ -0,0 +1,76 @@ +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdio.h> + +#include "sysendian.h" + +#include "ar2/argon2.h" +#include "ar2/cores.h" +#include "ar2/scrypt-jane.h" + +#define _ALIGN(x) __attribute__ ((aligned(x))) + +#define T_COSTS 2 +#define M_COSTS 16 +#define MASK 8 +#define ZERO 0 + +static void argon_call(void *out, void *in, void *salt, int type) +{ + argon2_context context = { 0 }; + + context.out = (uint8_t *)out; + context.pwd = (uint8_t *)in; + context.salt = (uint8_t *)salt; + + argon2_core(&context, type); +} + +static void bin2hex(char *s, const unsigned char *p, size_t len) +{ + for (size_t i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); +} + +static char *abin2hex(const unsigned char *p, size_t len) +{ + char *s = (char*) malloc((len * 2) + 1); + if (!s) + return NULL; + bin2hex(s, p, len); + return s; +} + +static void applog_data(void *pdata) +{ + char* hex = abin2hex((unsigned char*)pdata, 80); + fprintf(stderr, "%s\n", hex); + free(hex); +} + +void argon2_hash(const char* input, char* output, uint32_t len) +{ + // these uint512 in the c++ source of the client are backed by an array of uint32 + uint32_t _ALIGN(32) hashA[8], hashB[8], hashC[8]; + uint32_t _ALIGN(32) endian[20], *in; + + in = (uint32_t*) input; + for (int i=0; i + +void argon2_hash(const char* input, char* output, uint32_t len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/stratum/algos/makefile b/stratum/algos/makefile index 4044c9f..2c060d3 100644 --- a/stratum/algos/makefile +++ b/stratum/algos/makefile @@ -12,26 +12,27 @@ SOURCES=lyra2re.c lyra2v2.c Lyra2.c Sponge.c blake.c scrypt.c c11.c x11.c x13.c skein2.c zr5.c bmw.c luffa.c pentablake.c whirlpool.c whirlpoolx.c blakecoin.c \ yescrypt.c yescrypt-opt.c sha256_Y.c \ m7m.c magimath.cpp velvet.c \ + argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/thread.c ar2/scrypt-jane.c \ hive.c pomelo.c \ sib.c gost.c -OBJECTS=$(SOURCES:.c=.o) $(SOURCES:.cpp=.o) +OBJECTS=$(SOURCES:%.c=%.o) $(SOURCES:%.cpp=%.o) OUTPUT=libalgos.a all: $(SOURCES) $(OUTPUT) -$(OUTPUT): $(OBJECTS) +$(OUTPUT): $(OBJECTS) ar rc $@ $(OBJECTS) - + .cpp.o: - $(CC) $(CFLAGS) $< - + $(CC) $(CFLAGS) $< -o $@ + .c.o: - $(CC) $(CFLAGS) $< + $(CC) $(CFLAGS) $< -o $@ # $(CC)
$(CFLAGS) -std=gnu99 -Wno-pointer-sign -Wno-pointer-to-int-cast -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16 -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores -DUSE_ASM -pg $< - -clean: - rm *.o - +clean: + rm -f *.o + rm -f ar2/*.o ar2/*.a + diff --git a/stratum/algos/sha256_Y.c b/stratum/algos/sha256_Y.c old mode 100755 new mode 100644 diff --git a/stratum/algos/sha256_Y.h b/stratum/algos/sha256_Y.h old mode 100755 new mode 100644 diff --git a/stratum/algos/sysendian.h b/stratum/algos/sysendian.h old mode 100755 new mode 100644 diff --git a/stratum/algos/yescrypt-opt.c b/stratum/algos/yescrypt-opt.c old mode 100755 new mode 100644 diff --git a/stratum/algos/yescrypt-platform.c b/stratum/algos/yescrypt-platform.c old mode 100755 new mode 100644 diff --git a/stratum/config.sample/argon2.conf b/stratum/config.sample/argon2.conf new file mode 100644 index 0000000..d34f2a4 --- /dev/null +++ b/stratum/config.sample/argon2.conf @@ -0,0 +1,16 @@ +[TCP] +server = yaamp.com +port = 4234 +password = tu8tu5 + +[SQL] +host = yaampdb +database = yaamp +username = root +password = patofpaq + +[STRATUM] +algo = argon2 +difficulty = 2 +max_ttf = 40000 + diff --git a/stratum/stratum.cpp b/stratum/stratum.cpp index 3b185fe..a4e6a8c 100644 --- a/stratum/stratum.cpp +++ b/stratum/stratum.cpp @@ -118,6 +118,8 @@ YAAMP_ALGO g_algos[] = {"hive", hive_hash, 0x10000, 0, 0}, {"m7m", m7m_hash, 0x10000, 0, 0}, {"velvet", velvet_hash, 0x10000, 0, 0}, + {"argon2", argon2_hash, 0x10000, 0, sha256_hash_hex }, + {"sib", sib_hash, 1, 0, 0}, {"whirlcoin", whirlpool_hash, 1, 0, sha256_hash_hex }, /* old sha merkleroot */ diff --git a/stratum/stratum.h b/stratum/stratum.h index 4315f04..6e8b64f 100644 --- a/stratum/stratum.h +++ b/stratum/stratum.h @@ -154,4 +154,5 @@ void sha256_double_hash_hex(const char *input, char *output, unsigned int len); #include "algos/sib.h" #include "algos/m7m.h" #include "algos/velvet.h" +#include "algos/argon2a.h" diff --git a/web/yaamp/core/functions/yaamp.php b/web/yaamp/core/functions/yaamp.php index a917005..ce547f5 100755 --- a/web/yaamp/core/functions/yaamp.php +++ b/web/yaamp/core/functions/yaamp.php @@ -7,6 +7,7 @@ function yaamp_get_algos() 'sha256', 'scrypt', 'scryptn', + 'argon2', 'blake', 'keccak', 'luffa', @@ -47,12 +48,12 @@ function yaamp_get_algo_norm($algo) 'scryptn' => 1.0, 'x11' => 1.0, 'x13' => 1.0, - 'zr5' => 1.0, + 'argon2' => 1.0, + 'lyra2' => 1.0, + 'lyra2v2' => 1.0, 'myr-gr' => 1.0, 'nist5' => 1.0, 'neoscrypt' => 1.0, - 'lyra2' => 1.0, - 'lyra2v2' => 1.0, 'quark' => 1.0, 'qubit' => 1.0, 'skein' => 1.0, @@ -62,6 +63,7 @@ function yaamp_get_algo_norm($algo) 'velvet' => 1.0, 'whirlpool' => 1.0, 'yescrypt' => 1.0, + 'zr5' => 1.0, ); if(!isset($a[$algo])) @@ -82,6 +84,7 @@ function getAlgoColors($algo) 'x13' => '#ffd880', 'x14' => '#a0f0c0', 'x15' => '#f0b0a0', + 'argon2' => '#e0d0e0', 'blake' => '#f0f0f0', 'groestl' => '#d0a0a0', 'dmd-gr' => '#a0c0f0', @@ -127,6 +130,7 @@ function getAlgoPort($algo) 'quark' => 4033, 'whirlpool' => 4133, 'neoscrypt' => 4233, + 'argon2' => 4234, 'scryptn' => 4333, 'lyra2' => 4433, 'lyra2v2' => 4533,