mirror of
https://github.com/LBRYFoundation/pool.git
synced 2025-08-23 17:37:25 +00:00
Add argon2 algo support
This commit is contained in:
parent
f4371654d7
commit
52580a5636
55 changed files with 8076 additions and 13 deletions
1
rc.local
1
rc.local
|
@ -48,4 +48,5 @@ screen -dmS zr5 $STRATUM_DIR/run.sh zr5
|
|||
screen -dmS sib $STRATUM_DIR/run.sh sib
|
||||
screen -dmS m7m $STRATUM_DIR/run.sh m7m
|
||||
screen -dmS velvet $STRATUM_DIR/run.sh velvet
|
||||
screen -dmS argon2 $STRATUM_DIR/run.sh argon2
|
||||
|
||||
|
|
279
stratum/algos/ar2/argon2.c
Normal file
279
stratum/algos/ar2/argon2.c
Normal file
|
@ -0,0 +1,279 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
/*
 * Error messages, indexed directly by argon2_error_codes.
 *
 * Order and count MUST match the argon2_error_codes enum in argon2.h:
 * error_message() verifies the count at compile time and indexes this
 * table with the error code.
 */
static const char *Argon2_ErrorMessage[] = {
    /* ARGON2_OK */ "OK",
    /* ARGON2_OUTPUT_PTR_NULL */ "Output pointer is NULL",
    /* ARGON2_OUTPUT_TOO_SHORT */ "Output is too short",
    /* ARGON2_OUTPUT_TOO_LONG */ "Output is too long",
    /* ARGON2_PWD_TOO_SHORT */ "Password is too short",
    /* ARGON2_PWD_TOO_LONG */ "Password is too long",
    /* ARGON2_SALT_TOO_SHORT */ "Salt is too short",
    /* ARGON2_SALT_TOO_LONG */ "Salt is too long",
    /* ARGON2_AD_TOO_SHORT */ "Associated data is too short",
    /* ARGON2_AD_TOO_LONG */ "Associated data is too long", /* fix: was "date" */
    /* ARGON2_SECRET_TOO_SHORT */ "Secret is too short",
    /* ARGON2_SECRET_TOO_LONG */ "Secret is too long",
    /* ARGON2_TIME_TOO_SMALL */ "Time cost is too small",
    /* ARGON2_TIME_TOO_LARGE */ "Time cost is too large",
    /* ARGON2_MEMORY_TOO_LITTLE */ "Memory cost is too small",
    /* ARGON2_MEMORY_TOO_MUCH */ "Memory cost is too large",
    /* ARGON2_LANES_TOO_FEW */ "Too few lanes",
    /* ARGON2_LANES_TOO_MANY */ "Too many lanes",
    /* ARGON2_PWD_PTR_MISMATCH */ "Password pointer is NULL, but password length is not 0",
    /* ARGON2_SALT_PTR_MISMATCH */ "Salt pointer is NULL, but salt length is not 0",
    /* ARGON2_SECRET_PTR_MISMATCH */ "Secret pointer is NULL, but secret length is not 0",
    /* ARGON2_AD_PTR_MISMATCH */ "Associated data pointer is NULL, but ad length is not 0",
    /* ARGON2_MEMORY_ALLOCATION_ERROR */ "Memory allocation error",
    /* ARGON2_FREE_MEMORY_CBK_NULL */ "The free memory callback is NULL",
    /* ARGON2_ALLOCATE_MEMORY_CBK_NULL */ "The allocate memory callback is NULL",
    /* ARGON2_INCORRECT_PARAMETER */ "Argon2_Context context is NULL",
    /* ARGON2_INCORRECT_TYPE */ "There is no such version of Argon2",
    /* ARGON2_OUT_PTR_MISMATCH */ "Output pointer mismatch",
    /* ARGON2_THREADS_TOO_FEW */ "Not enough threads",
    /* ARGON2_THREADS_TOO_MANY */ "Too many threads",
    /* ARGON2_MISSING_ARGS */ "Missing arguments",
};
|
||||
|
||||
/* Argon2d: memory addressing depends on password/salt (see argon2.h notes:
   only for side-channel-free environments). Thin wrapper over argon2_core. */
int argon2d(argon2_context *context) { return argon2_core(context, Argon2_d); }

/* Argon2i: memory addressing independent of password/salt (side-channel
   resistant per argon2.h notes). Thin wrapper over argon2_core. */
int argon2i(argon2_context *context) { return argon2_core(context, Argon2_i); }
|
||||
|
||||
/*
 * Verify a password against a stored Argon2d hash.
 * Recomputes the hash via argon2_core into context->out, then compares the
 * first 32 bytes against `hash`.
 *
 * @param context  Argon2 inputs (password, salt, ...); out receives the hash
 * @param hash     Expected 32-byte hash to compare against
 * @return the argon2_core error code on failure; otherwise 1 when the
 *         hashes match and 0 when they differ (same as the original
 *         `0 == memcmp(...)` result).
 */
int verify_d(argon2_context *context, const char *hash) {
    int result;
    uint8_t diff;
    size_t i;

    /* Guard restored: the original commented-out check referenced the
       removed outlen field; a NULL hash/out would otherwise be dereferenced. */
    if (NULL == context || NULL == hash || NULL == context->out) {
        return ARGON2_OUT_PTR_MISMATCH;
    }

    result = argon2_core(context, Argon2_d);

    if (ARGON2_OK != result) {
        return result;
    }

    /* Constant-time comparison: memcmp may return early at the first
       differing byte, leaking the mismatch position through timing. */
    diff = 0;
    for (i = 0; i < 32; ++i) {
        diff |= (uint8_t)((uint8_t)hash[i] ^ context->out[i]);
    }
    return 0 == diff;
}
|
||||
|
||||
const char *error_message(int error_code) {
|
||||
enum {
|
||||
/* Make sure---at compile time---that the enum size matches the array
|
||||
size */
|
||||
ERROR_STRING_CHECK =
|
||||
1 /
|
||||
!!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) ==
|
||||
ARGON2_ERROR_CODES_LENGTH)
|
||||
};
|
||||
if (error_code < ARGON2_ERROR_CODES_LENGTH) {
|
||||
return Argon2_ErrorMessage[(argon2_error_codes)error_code];
|
||||
}
|
||||
return "Unknown error code.";
|
||||
}
|
||||
|
||||
/* encoding/decoding helpers */

/*
 * Some macros for constant-time comparisons. These work over values in
 * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true".
 * (The subtraction's borrow ends up in bit 8 after the shift.)
 */
#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF)
#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF)
#define GE(x, y) (GT(y, x) ^ 0xFF)
#define LT(x, y) GT(y, x)
#define LE(x, y) GE(y, x)

/*
 * Convert value x (0..63) to corresponding Base64 character.
 * Branch-free on purpose: for any valid x exactly one of the masked
 * alternatives is non-zero, so no data-dependent branch is taken.
 */
static int b64_byte_to_char(unsigned x) {
    return (LT(x, 26) & (x + 'A')) |
           (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) |
           (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') |
           (EQ(x, 63) & '/');
}
|
||||
|
||||
/*
 * Convert some bytes to Base64. 'dst_len' is the length (in characters)
 * of the output buffer 'dst'; if that buffer is not large enough to
 * receive the result (including the terminating 0), then (size_t)-1
 * is returned. Otherwise, the zero-terminated Base64 string is written
 * in the buffer, and the output length (counted WITHOUT the terminating
 * zero) is returned.
 *
 * NOTE(review): this trimmed version is hard-wired to a 32-byte input --
 * 'src' is always read as exactly 32 bytes, which encode to 43 Base64
 * characters (no padding), hence olen = 43. The commented-out
 * general-length padding switch from upstream was removed here.
 */
static size_t to_base64(char *dst, size_t dst_len, const void *src) {
    size_t olen;
    const unsigned char *buf;
    unsigned acc, acc_len; /* bit accumulator and count of buffered bits */

    olen = 43;
    /* (upstream general-length `switch (len % 3)` adjustment removed:
       input length is fixed at 32 bytes) */
    if (dst_len <= olen) {
        return (size_t)-1; /* need olen chars plus the terminating NUL */
    }
    acc = 0;
    acc_len = 0;
    buf = (const unsigned char *)src;
    size_t src_len = 32; /* fixed input length, see note above */
    while (src_len-- > 0) {
        acc = (acc << 8) + (*buf++);
        acc_len += 8;
        while (acc_len >= 6) {
            acc_len -= 6;
            *dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F);
        }
    }
    if (acc_len > 0) {
        /* flush remaining bits, left-aligned in the final character */
        *dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F);
    }
    *dst++ = 0;
    return olen;
}
|
||||
|
||||
/* ==================================================================== */
/*
 * Code specific to Argon2i.
 *
 * The code below applies the following format:
 *
 * $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
 *
 * where <num> is a decimal integer (positive, fits in an 'unsigned long')
 * and <bin> is Base64-encoded data (no '=' padding characters, no newline
 * or whitespace). The "keyid" is a binary identifier for a key (up to 8
 * bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
 * (resp. the 'data') is empty, then it is omitted from the output.
 *
 * The last two binary chunks (encoded in Base64) are, in that order,
 * the salt and the output. Both are optional, but you cannot have an
 * output without a salt. The binary salt length is between 8 and 48 bytes.
 * The output length is always exactly 32 bytes.
 */

/*
 * Serialize ctx into dst using the format above. In this trimmed version
 * the cost parameters are hard-coded (m=16, t=2, p=1) and the keyid/data
 * fields are never emitted.
 * Returns 1 on success, 0 when dst (of size dst_len) is too small.
 */
int encode_string(char *dst, size_t dst_len, argon2_context *ctx) {
/* SS: append literal string `str`, or bail out of encode_string with 0 if
   it does not fit. Copies pp_len + 1 bytes so dst stays NUL-terminated;
   the bound check guarantees pp_len + 1 <= dst_len. Advances dst/dst_len. */
#define SS(str)                                                                \
    do {                                                                       \
        size_t pp_len = strlen(str);                                           \
        if (pp_len >= dst_len) {                                               \
            return 0;                                                          \
        }                                                                      \
        memcpy(dst, str, pp_len + 1);                                          \
        dst += pp_len;                                                         \
        dst_len -= pp_len;                                                     \
    } while (0)

/* SX: append decimal rendering of x (tmp[30] comfortably holds any
   unsigned long). NOTE(review): trailing ';' after while(0) kept as-is. */
#define SX(x)                                                                  \
    do {                                                                       \
        char tmp[30];                                                          \
        sprintf(tmp, "%lu", (unsigned long)(x));                               \
        SS(tmp);                                                               \
    } while (0);

/* SB: append Base64 of buf (always read as 32 bytes by to_base64),
   or bail out with 0 if it does not fit. */
#define SB(buf)                                                                \
    do {                                                                       \
        size_t sb_len = to_base64(dst, dst_len, buf);                          \
        if (sb_len == (size_t)-1) {                                           \
            return 0;                                                          \
        }                                                                      \
        dst += sb_len;                                                         \
        dst_len -= sb_len;                                                     \
    } while (0);

    SS("$argon2i$m=");
    SX(16);
    SS(",t=");
    SX(2);
    SS(",p=");
    SX(1);

    /* upstream optional fields removed with the ad/adlen context members:
    if (ctx->adlen > 0) {
        SS(",data=");
        SB(ctx->ad, ctx->adlen);
    }*/

    /*if (ctx->saltlen == 0)
        return 1;*/

    SS("$");
    SB(ctx->salt);

    /*if (ctx->outlen32 == 0)
        return 1;*/

    SS("$");
    SB(ctx->out);
    return 1;

#undef SS
#undef SX
#undef SB
}
|
292
stratum/algos/ar2/argon2.h
Normal file
292
stratum/algos/ar2/argon2.h
Normal file
|
@ -0,0 +1,292 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#ifndef ARGON2_H
|
||||
#define ARGON2_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*************************Argon2 input parameter
|
||||
* restrictions**************************************************/
|
||||
|
||||
/* Minimum and maximum number of lanes (degree of parallelism) */
|
||||
#define ARGON2_MIN_LANES UINT32_C(1)
|
||||
#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of threads */
|
||||
#define ARGON2_MIN_THREADS UINT32_C(1)
|
||||
#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Number of synchronization points between lanes per pass */
|
||||
#define ARGON2_SYNC_POINTS UINT32_C(4)
|
||||
|
||||
/* Minimum and maximum digest size in bytes */
|
||||
#define ARGON2_MIN_OUTLEN UINT32_C(4)
|
||||
#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
|
||||
#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
|
||||
|
||||
#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB)
|
||||
*/
|
||||
#define ARGON2_MAX_MEMORY_BITS \
|
||||
ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
|
||||
#define ARGON2_MAX_MEMORY \
|
||||
ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
|
||||
|
||||
/* Minimum and maximum number of passes */
|
||||
#define ARGON2_MIN_TIME UINT32_C(1)
|
||||
#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum password length in bytes */
|
||||
#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum associated data length in bytes */
|
||||
#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum salt length in bytes */
|
||||
#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
|
||||
#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum key length in bytes */
|
||||
#define ARGON2_MIN_SECRET UINT32_C(0)
|
||||
#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
|
||||
|
||||
#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
|
||||
#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
|
||||
#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2)
|
||||
#define ARGON2_DEFAULT_FLAGS \
|
||||
(ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY)
|
||||
|
||||
/* Error codes returned by the argon2_* entry points; each code indexes the
   Argon2_ErrorMessage table in argon2.c, so values must stay contiguous. */
typedef enum Argon2_ErrorCodes {
    ARGON2_OK = 0,

    ARGON2_OUTPUT_PTR_NULL = 1,

    ARGON2_OUTPUT_TOO_SHORT = 2,
    ARGON2_OUTPUT_TOO_LONG = 3,

    ARGON2_PWD_TOO_SHORT = 4,
    ARGON2_PWD_TOO_LONG = 5,

    ARGON2_SALT_TOO_SHORT = 6,
    ARGON2_SALT_TOO_LONG = 7,

    ARGON2_AD_TOO_SHORT = 8,
    ARGON2_AD_TOO_LONG = 9,

    ARGON2_SECRET_TOO_SHORT = 10,
    ARGON2_SECRET_TOO_LONG = 11,

    ARGON2_TIME_TOO_SMALL = 12,
    ARGON2_TIME_TOO_LARGE = 13,

    ARGON2_MEMORY_TOO_LITTLE = 14,
    ARGON2_MEMORY_TOO_MUCH = 15,

    ARGON2_LANES_TOO_FEW = 16,
    ARGON2_LANES_TOO_MANY = 17,

    ARGON2_PWD_PTR_MISMATCH = 18,    /* NULL ptr with non-zero length */
    ARGON2_SALT_PTR_MISMATCH = 19,   /* NULL ptr with non-zero length */
    ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */
    ARGON2_AD_PTR_MISMATCH = 21,     /* NULL ptr with non-zero length */

    ARGON2_MEMORY_ALLOCATION_ERROR = 22,

    ARGON2_FREE_MEMORY_CBK_NULL = 23,
    ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24,

    ARGON2_INCORRECT_PARAMETER = 25,
    ARGON2_INCORRECT_TYPE = 26,

    ARGON2_OUT_PTR_MISMATCH = 27,

    ARGON2_THREADS_TOO_FEW = 28,
    ARGON2_THREADS_TOO_MANY = 29,

    ARGON2_MISSING_ARGS = 30,

    /* Count of codes; error_message() sizes its table check against this.
       Do NOT remove; do NOT add error codes after this error code. */
    ARGON2_ERROR_CODES_LENGTH
} argon2_error_codes;
|
||||
|
||||
/* Memory allocator types --- for external allocation, used through
   argon2_context.allocate_cbk / free_cbk (NULL selects the internal
   allocator, per the context comment below). The allocate_fptr
   return-value convention is defined by the consumer in cores.c --
   presumably 0 on success; confirm there. */
typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
|
||||
|
||||
/* Argon2 external data structures */

/*
 * Context: structure holding the Argon2 inputs:
 *  - output array (out), password (pwd) and its length (pwdlen),
 *  - salt array (salt),
 *  - optional external memory allocator callbacks.
 *
 * All input parameters affect the output hash value.
 * Two function pointers can be provided to allocate and deallocate the
 * memory (if NULL, memory will be allocated internally).
 *
 * NOTE(review): this tree has stripped most upstream fields (outlen,
 * saltlen, secret, ad, t_cost, m_cost, lanes, threads, flags are commented
 * out) -- presumably the costs are hard-coded in cores.c for the pool's
 * argon2 algo; confirm there. bench.c in this same commit still assigns
 * the removed fields and likely no longer compiles against this struct.
 */
typedef struct Argon2_Context {
    uint8_t *out;  /* output array */
    uint8_t *pwd;  /* password array */
    uint8_t *salt; /* salt array */
    /*uint8_t *secret;*/ /* key array */
    /*uint8_t *ad;*/ /* associated data array */

    allocate_fptr allocate_cbk; /* pointer to memory allocator */
    deallocate_fptr free_cbk;   /* pointer to memory deallocator */

    /*uint32_t outlen;*/ /* digest length */
    uint32_t pwdlen; /* password length */
    /*uint32_t saltlen;*/ /* salt length */
    /*uint32_t secretlen;*/ /* key length */
    /*uint32_t adlen;*/ /* associated data length */
    /*uint32_t t_cost;*/ /* number of passes */
    /*uint32_t m_cost;*/ /* amount of memory requested (KB) */
    /*uint32_t lanes;*/ /* number of lanes */
    /*uint32_t threads;*/ /* maximum number of threads */
    /*uint32_t flags;*/ /* array of bool options */

} argon2_context;
|
||||
|
||||
/*
 * Upstream one-shot helpers, removed in this trimmed tree; prototypes kept
 * for reference only.
 */
/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen,
                   const void *salt, size_t saltlen, unsigned int t_cost,
                   unsigned int m_cost);*/

/* same for argon2d */
/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen,
                   const void *salt, size_t saltlen, unsigned int t_cost,
                   unsigned int m_cost);*/

/*
 * Argon2d: version of Argon2 that picks memory blocks depending on the
 * password and salt. Only for side-channel-free environments!
 * @param context Pointer to current Argon2 context
 * @return Zero if successful, a non zero error code otherwise
 */
int argon2d(argon2_context *context);

/*
 * Argon2i: version of Argon2 that picks memory blocks independent of the
 * password and salt. Good for side channels, but worse w.r.t. tradeoff
 * attacks if only one pass is used.
 * @param context Pointer to current Argon2 context
 * @return Zero if successful, a non zero error code otherwise
 */
int argon2i(argon2_context *context);

/*
 * Argon2di: reserved name.
 * @param context Pointer to current Argon2 context
 * @return Zero if successful, a non zero error code otherwise
 */
int argon2di(argon2_context *context);

/*
 * Argon2ds: Argon2d hardened against GPU attacks, 20% slower.
 * @param context Pointer to current Argon2 context
 * @return Zero if successful, a non zero error code otherwise
 */
int argon2ds(argon2_context *context);

/*
 * Argon2id: first half-pass over memory is password-independent, the rest
 * are password-dependent. OK against side channels: they reduce to
 * 1/2-pass Argon2i.
 * @param context Pointer to current Argon2 context
 * @return Zero if successful, a non zero error code otherwise
 */
int argon2id(argon2_context *context);

/*
 * Verify if a given password is correct for Argon2d hashing.
 * @param context Pointer to current Argon2 context
 * @param hash The password hash to verify (the implementation in argon2.c
 *             compares exactly 32 bytes)
 * @return the argon2_core error code on failure; otherwise 1 when the hash
 *         matches and 0 when it does not (see verify_d in argon2.c -- the
 *         previous "Zero if successful" comment did not match the code)
 */
int verify_d(argon2_context *context, const char *hash);

/*
 * Get the associated error message for given error code
 * @return The error message associated with the given error code
 */
const char *error_message(int error_code);

/* ==================================================================== */
/*
 * Code specific to Argon2i.
 *
 * encode_string() produces the format:
 *
 * $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
 *
 * where <num> is a decimal integer (positive, fits in an 'unsigned long')
 * and <bin> is Base64-encoded data (no '=' padding characters, no newline
 * or whitespace). The "keyid" is a binary identifier for a key (up to 8
 * bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
 * (resp. the 'data') is empty, then it is omitted from the output.
 *
 * The last two binary chunks (encoded in Base64) are, in that order,
 * the salt and the output. Both are optional, but you cannot have an
 * output without a salt. The binary salt length is between 8 and 48 bytes.
 * The output length is always exactly 32 bytes.
 *
 * Returns 1 on success, 0 when dst (of size dst_len) is too small.
 */
int encode_string(char *dst, size_t dst_len, argon2_context *ctx);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
111
stratum/algos/ar2/bench.c
Normal file
111
stratum/algos/ar2/bench.c
Normal file
|
@ -0,0 +1,111 @@
|
|||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "argon2.h"
|
||||
|
||||
/*
 * Read the CPU timestamp counter (x86/x86-64 only; #error elsewhere).
 * NOTE(review): raw TSC ticks are affected by frequency scaling and core
 * migration -- acceptable for this rough benchmark only.
 */
static uint64_t rdtsc(void) {
#ifdef _MSC_VER
    return __rdtsc();
#else
#if defined(__amd64__) || defined(__x86_64__)
    /* 64-bit: rdtsc returns the counter split across edx:eax */
    uint64_t rax, rdx;
    __asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
    return (rdx << 32) | rax;
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
    /* 32-bit: the "=A" constraint captures edx:eax as one 64-bit value */
    uint64_t rax;
    __asm__ __volatile__("rdtsc" : "=A"(rax) : :);
    return rax;
#else
#error "Not implemented!"
#endif
#endif
}
||||
|
||||
/*
 * Benchmarks Argon2 with salt length 16, password length 16, t_cost 1,
 * and different m_cost and threads.
 *
 * NOTE(review): this assigns context.outlen/.saltlen/.secret/.secretlen/
 * .ad/.adlen/.t_cost/.m_cost/.lanes/.threads/.flags, all of which are
 * commented out of argon2_context in this tree's argon2.h -- this bench
 * presumably no longer compiles here; confirm before relying on it.
 * NOTE(review): the printf calls pass uint32_t for "%d" -- use PRIu32 if
 * this file is ever revived.
 */
static void benchmark() {
#define BENCH_OUTLEN 16
#define BENCH_INLEN 16
    const uint32_t inlen = BENCH_INLEN;
    const unsigned outlen = BENCH_OUTLEN;
    unsigned char out[BENCH_OUTLEN];
    unsigned char pwd_array[BENCH_INLEN];
    unsigned char salt_array[BENCH_INLEN];
#undef BENCH_INLEN
#undef BENCH_OUTLEN

    uint32_t t_cost = 1;
    uint32_t m_cost;
    uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16};

    memset(pwd_array, 0, inlen);
    memset(salt_array, 1, inlen);

    /* sweep memory cost from 2^10 to 2^22 KiB-blocks, doubling each step */
    for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
        unsigned i;
        for (i = 0; i < 6; ++i) {
            argon2_context context;
            uint32_t thread_n = thread_test[i];
            uint64_t stop_cycles, stop_cycles_i;
            clock_t stop_time;
            uint64_t delta_d, delta_i;
            double mcycles_d, mcycles_i, run_time;

            clock_t start_time = clock();
            uint64_t start_cycles = rdtsc();

            context.out = out;
            context.outlen = outlen;
            context.pwd = pwd_array;
            context.pwdlen = inlen;
            context.salt = salt_array;
            context.saltlen = inlen;
            context.secret = NULL;
            context.secretlen = 0;
            context.ad = NULL;
            context.adlen = 0;
            context.t_cost = t_cost;
            context.m_cost = m_cost;
            context.lanes = thread_n;
            context.threads = thread_n;
            context.allocate_cbk = NULL;
            context.free_cbk = NULL;
            context.flags = 0;

            /* time Argon2d, then Argon2i back-to-back on the same context */
            argon2d(&context);
            stop_cycles = rdtsc();
            argon2i(&context);
            stop_cycles_i = rdtsc();
            stop_time = clock();

            /* cycles per memory block, and total megacycles per variant */
            delta_d = (stop_cycles - start_cycles) / (m_cost);
            delta_i = (stop_cycles_i - stop_cycles) / (m_cost);
            mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20);
            mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20);
            printf("Argon2d %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
                   "Mcycles \n",
                   t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024,
                   mcycles_d);
            printf("Argon2i %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
                   "Mcycles \n",
                   t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024,
                   mcycles_i);

            run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
            printf("%2.4f seconds\n\n", run_time);
        }
    }
}
|
||||
|
||||
/* Entry point: run the full benchmark sweep, exit with ARGON2_OK (0). */
int main() {
    benchmark();
    return ARGON2_OK;
}
|
143
stratum/algos/ar2/blake2/blake2-impl.h
Normal file
143
stratum/algos/ar2/blake2/blake2-impl.h
Normal file
|
@ -0,0 +1,143 @@
|
|||
#ifndef PORTABLE_BLAKE2_IMPL_H
|
||||
#define PORTABLE_BLAKE2_IMPL_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Portable spelling of "inline" for the compilers this tree targets. */
#if defined(_MSC_VER)
#define BLAKE2_INLINE __inline
#elif defined(__GNUC__) || defined(__clang__)
#define BLAKE2_INLINE __inline__
#else
#define BLAKE2_INLINE
#endif

/* Argon2 Team - Begin Code */
/*
   Little-endian platform detection. Not an exhaustive list, but it should
   cover the majority of modern platforms. Additionally, the code will
   always be correct --- this is only a performance tweak: when the check
   misfires, the byte-by-byte fallback paths below are used instead.
*/
#if (defined(__BYTE_ORDER__) &&                                                \
     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) ||                           \
    defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
    defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) ||       \
    defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) ||                \
    defined(_M_ARM)
#define NATIVE_LITTLE_ENDIAN
#endif
/* Argon2 Team - End Code */

/* Read a 32-bit little-endian value from (possibly unaligned) memory. */
static BLAKE2_INLINE uint32_t load32(const void *src) {
#if defined(NATIVE_LITTLE_ENDIAN)
    uint32_t value;
    memcpy(&value, src, sizeof value);
    return value;
#else
    const uint8_t *bytes = (const uint8_t *)src;
    return (uint32_t)bytes[0] | ((uint32_t)bytes[1] << 8) |
           ((uint32_t)bytes[2] << 16) | ((uint32_t)bytes[3] << 24);
#endif
}
|
||||
|
||||
/* Read a 64-bit little-endian value from (possibly unaligned) memory. */
static BLAKE2_INLINE uint64_t load64(const void *src) {
#if defined(NATIVE_LITTLE_ENDIAN)
    uint64_t w;
    memcpy(&w, src, sizeof w);
    return w;
#else
    const uint8_t *p = (const uint8_t *)src;
    uint64_t w = *p++;
    w |= (uint64_t)(*p++) << 8;
    w |= (uint64_t)(*p++) << 16;
    w |= (uint64_t)(*p++) << 24;
    w |= (uint64_t)(*p++) << 32;
    w |= (uint64_t)(*p++) << 40;
    w |= (uint64_t)(*p++) << 48;
    w |= (uint64_t)(*p++) << 56;
    return w;
#endif
}

/* Write a 32-bit value to memory in little-endian byte order. */
static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
    memcpy(dst, &w, sizeof w);
#else
    uint8_t *p = (uint8_t *)dst;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
#endif
}

/* Write a 64-bit value to memory in little-endian byte order. */
static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
    memcpy(dst, &w, sizeof w);
#else
    uint8_t *p = (uint8_t *)dst;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
#endif
}

/* Read a 48-bit little-endian value (always byte-by-byte: no 48-bit type). */
static BLAKE2_INLINE uint64_t load48(const void *src) {
    const uint8_t *p = (const uint8_t *)src;
    uint64_t w = *p++;
    w |= (uint64_t)(*p++) << 8;
    w |= (uint64_t)(*p++) << 16;
    w |= (uint64_t)(*p++) << 24;
    w |= (uint64_t)(*p++) << 32;
    w |= (uint64_t)(*p++) << 40;
    return w;
}

/* Write the low 48 bits of w to memory in little-endian byte order. */
static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
    uint8_t *p = (uint8_t *)dst;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
}

/* Rotate w right by c bits. NOTE(review): c must be in 1..31 -- c == 0 or
   c >= 32 makes one of the shifts undefined behavior; callers are assumed
   to pass fixed in-range constants (confirm at call sites). */
static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
    return (w >> c) | (w << (32 - c));
}

/* Rotate w right by c bits. NOTE(review): c must be in 1..63 (see above). */
static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
    return (w >> c) | (w << (64 - c));
}

/* prevents compiler optimizing out memset(): the call goes through a
   volatile function pointer, so the zeroization of secrets cannot be
   elided as a dead store. */
static BLAKE2_INLINE void burn(void *v, size_t n) {
    static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
    memset_v(v, 0, n);
}
|
||||
|
||||
#endif
|
76
stratum/algos/ar2/blake2/blake2.h
Normal file
76
stratum/algos/ar2/blake2/blake2.h
Normal file
|
@ -0,0 +1,76 @@
|
|||
#ifndef PORTABLE_BLAKE2_H
|
||||
#define PORTABLE_BLAKE2_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* BLAKE2b size constants, in bytes. */
enum blake2b_constant {
    BLAKE2B_BLOCKBYTES = 128,   /* internal compression block size */
    BLAKE2B_OUTBYTES = 64,      /* maximum digest length */
    BLAKE2B_KEYBYTES = 64,      /* maximum key length */
    BLAKE2B_SALTBYTES = 16,
    BLAKE2B_PERSONALBYTES = 16
};

/* BLAKE2b parameter block: must be exactly 64 bytes with this packed
   layout (the static assert below checks it). The trailing number on each
   field is the running byte offset after that field.
   NOTE(review): the "__blake2b_param" tag uses a double-underscore name,
   which C reserves for the implementation -- rename if ever touched. */
#pragma pack(push, 1)
typedef struct __blake2b_param {
    uint8_t digest_length;                   /* 1 */
    uint8_t key_length;                      /* 2 */
    uint8_t fanout;                          /* 3 */
    uint8_t depth;                           /* 4 */
    uint32_t leaf_length;                    /* 8 */
    uint64_t node_offset;                    /* 16 */
    uint8_t node_depth;                      /* 17 */
    uint8_t inner_length;                    /* 18 */
    uint8_t reserved[14];                    /* 32 */
    uint8_t salt[BLAKE2B_SALTBYTES];         /* 48 */
    uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
} blake2b_param;
#pragma pack(pop)

/* Incremental (streaming) BLAKE2b hashing state.
   NOTE(review): same reserved-identifier concern for "__blake2b_state". */
typedef struct __blake2b_state {
    uint64_t h[8];                  /* internal hash state words */
    uint64_t t[2];                  /* counter words */
    uint64_t f[2];                  /* finalization flag words */
    unsigned buflen;                /* bytes currently buffered in buf */
    unsigned outlen;                /* requested digest length */
    uint8_t last_node;
    uint8_t buf[BLAKE2B_BLOCKBYTES]; /* pending input block */
} blake2b_state;
|
||||
|
||||
/* Ensure param structs have not been wrongly padded */
|
||||
/* Poor man's static_assert */
|
||||
enum {
|
||||
blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
|
||||
blake2_size_check_2 =
|
||||
1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
|
||||
};
|
||||
|
||||
/* Streaming API */
|
||||
int blake2b_init(blake2b_state *S, size_t outlen);
|
||||
int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
|
||||
size_t keylen);
|
||||
int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
|
||||
int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
int blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
||||
|
||||
/* Simple API */
|
||||
int blake2b(void *out, const void *in, const void *key, size_t keylen);
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
int blake2b_long(void *out, const void *in);
|
||||
/* Argon2 Team - End Code */
|
||||
/* Miouyouyou */
|
||||
void blake2b_too(void *out, const void *in);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
162
stratum/algos/ar2/blake2/blamka-round-opt.h
Normal file
162
stratum/algos/ar2/blake2/blamka-round-opt.h
Normal file
|
@ -0,0 +1,162 @@
|
|||
#ifndef BLAKE_ROUND_MKA_OPT_H
|
||||
#define BLAKE_ROUND_MKA_OPT_H
|
||||
|
||||
#include "blake2-impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
/* Emulation of XOP's _mm_roti_epi64 (rotate each 64-bit lane right by -c,
 * c is a negative literal at every call site).  With SSSE3, rotations by
 * 16/24/32 become byte/word shuffles (cheaper than shift+xor); 63 becomes
 * shift+add (x+x == x<<1).  The ternary chain folds at compile time since
 * c is a constant. */
#if !defined(__XOP__)
#if defined(__SSSE3__)
/* Byte-shuffle masks implementing rotate-right by 16 and 24 bits per lane. */
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c)                                                  \
    (-(c) == 32)                                                              \
        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                     \
        : (-(c) == 24)                                                        \
              ? _mm_shuffle_epi8((x), r24)                                    \
              : (-(c) == 16)                                                  \
                    ? _mm_shuffle_epi8((x), r16)                              \
                    : (-(c) == 63)                                            \
                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),          \
                                          _mm_add_epi64((x), (x)))            \
                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)),          \
                                          _mm_slli_epi64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
/* Generic SSE2 fallback: rotate right = shift-right XOR shift-left. */
#define _mm_roti_epi64(r, c)                                                  \
    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#endif
#else
/* __XOP__: the real _mm_roti_epi64 intrinsic exists; nothing to define. */
#endif
|
||||
|
||||
/* BlaMka mixing primitive (two 64-bit lanes at once):
 * per lane computes x + y + 2 * (low32(x) * low32(y)).
 * _mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane, and the
 * doubled product (z+z) makes the function non-linear, which is what
 * distinguishes Argon2's round from plain BLAKE2b. */
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
    const __m128i z = _mm_mul_epu32(x, y);
    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}
|
||||
|
||||
/* G1/G2: the two halves of the BLAKE2b G function with fBlaMka substituted
 * for plain addition, vectorized over two column pairs ({A0..D0}, {A1..D1}).
 * G1 performs the 32/24-bit rotation half, G2 the 16/63-bit half; running
 * G1 then G2 equals one full G application on both pairs. */
#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                    \
    do {                                                                      \
        A0 = fBlaMka(A0, B0);                                                 \
        A1 = fBlaMka(A1, B1);                                                 \
                                                                              \
        D0 = _mm_xor_si128(D0, A0);                                           \
        D1 = _mm_xor_si128(D1, A1);                                           \
                                                                              \
        D0 = _mm_roti_epi64(D0, -32);                                         \
        D1 = _mm_roti_epi64(D1, -32);                                         \
                                                                              \
        C0 = fBlaMka(C0, D0);                                                 \
        C1 = fBlaMka(C1, D1);                                                 \
                                                                              \
        B0 = _mm_xor_si128(B0, C0);                                           \
        B1 = _mm_xor_si128(B1, C1);                                           \
                                                                              \
        B0 = _mm_roti_epi64(B0, -24);                                         \
        B1 = _mm_roti_epi64(B1, -24);                                         \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                    \
    do {                                                                      \
        A0 = fBlaMka(A0, B0);                                                 \
        A1 = fBlaMka(A1, B1);                                                 \
                                                                              \
        D0 = _mm_xor_si128(D0, A0);                                           \
        D1 = _mm_xor_si128(D1, A1);                                           \
                                                                              \
        D0 = _mm_roti_epi64(D0, -16);                                         \
        D1 = _mm_roti_epi64(D1, -16);                                         \
                                                                              \
        C0 = fBlaMka(C0, D0);                                                 \
        C1 = fBlaMka(C1, D1);                                                 \
                                                                              \
        B0 = _mm_xor_si128(B0, C0);                                           \
        B1 = _mm_xor_si128(B1, C1);                                           \
                                                                              \
        B0 = _mm_roti_epi64(B0, -63);                                         \
        B1 = _mm_roti_epi64(B1, -63);                                         \
    } while ((void)0, 0)
|
||||
|
||||
/* DIAGONALIZE rotates the B/C/D rows of the 4x4 state (held as pairs of
 * 128-bit registers) so the subsequent G1/G2 pass mixes diagonals instead
 * of columns; UNDIAGONALIZE restores row order.  The SSSE3 variant uses
 * _mm_alignr_epi8 (byte-wise concatenate-and-shift); the SSE2 fallback
 * builds the same permutations from unpack-hi/lo pairs. */
#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                           \
    do {                                                                      \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                              \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                              \
        B0 = t0;                                                              \
        B1 = t1;                                                              \
                                                                              \
        t0 = C0;                                                              \
        C0 = C1;                                                              \
        C1 = t0;                                                              \
                                                                              \
        t0 = _mm_alignr_epi8(D1, D0, 8);                                      \
        t1 = _mm_alignr_epi8(D0, D1, 8);                                      \
        D0 = t1;                                                              \
        D1 = t0;                                                              \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                         \
    do {                                                                      \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                              \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                              \
        B0 = t0;                                                              \
        B1 = t1;                                                              \
                                                                              \
        t0 = C0;                                                              \
        C0 = C1;                                                              \
        C1 = t0;                                                              \
                                                                              \
        t0 = _mm_alignr_epi8(D0, D1, 8);                                      \
        t1 = _mm_alignr_epi8(D1, D0, 8);                                      \
        D0 = t1;                                                              \
        D1 = t0;                                                              \
    } while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                           \
    do {                                                                      \
        __m128i t0 = D0;                                                      \
        __m128i t1 = B0;                                                      \
        D0 = C0;                                                              \
        C0 = C1;                                                              \
        C1 = D0;                                                              \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));              \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));              \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));              \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));              \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                         \
    do {                                                                      \
        __m128i t0 = C0;                                                      \
        C0 = C1;                                                              \
        C1 = t0;                                                              \
        t0 = B0;                                                              \
        __m128i t1 = D0;                                                      \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));              \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));              \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));              \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));              \
    } while ((void)0, 0)
#endif
|
||||
|
||||
/* One full BlaMka round over a 4x4 state of 64-bit words held in eight
 * __m128i registers: column step (G1+G2), diagonalize, diagonal step
 * (G1+G2), then restore row order. */
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                          \
    do {                                                                      \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
                                                                              \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                          \
                                                                              \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
                                                                              \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                        \
    } while ((void)0, 0)
|
||||
|
||||
#endif
|
39
stratum/algos/ar2/blake2/blamka-round-ref.h
Normal file
39
stratum/algos/ar2/blake2/blamka-round-ref.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#ifndef BLAKE_ROUND_MKA_H
|
||||
#define BLAKE_ROUND_MKA_H
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
|
||||
/*designed by the Lyra PHC team */
|
||||
/* BlaMka permutation primitive (designed by the Lyra PHC team):
 * x + y + 2 * (low32(x) * low32(y)).  The truncated multiply makes the
 * mixing non-linear while staying cheap on 64-bit hardware. */
static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t mask32 = UINT64_C(0xFFFFFFFF);
    const uint64_t prod = (x & mask32) * (y & mask32);
    return x + y + (prod << 1); /* << 1 == * 2, exact for unsigned */
}
|
||||
|
||||
/* Scalar reference version of the BlaMka quarter-round: BLAKE2b's G with
 * additions replaced by fBlaMka, rotation schedule 32/24/16/63 unchanged. */
#define G(a, b, c, d)                                                         \
    do {                                                                      \
        a = fBlaMka(a, b);                                                    \
        d = rotr64(d ^ a, 32);                                                \
        c = fBlaMka(c, d);                                                    \
        b = rotr64(b ^ c, 24);                                                \
        a = fBlaMka(a, b);                                                    \
        d = rotr64(d ^ a, 16);                                                \
        c = fBlaMka(c, d);                                                    \
        b = rotr64(b ^ c, 63);                                                \
    } while ((void)0, 0)

/* One message-less BlaMka round on 16 words: four column G's followed by
 * four diagonal G's ("NOMSG" because Argon2's compression has no message
 * schedule, unlike real BLAKE2b). */
#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
                           v12, v13, v14, v15)                                \
    do {                                                                      \
        G(v0, v4, v8, v12);                                                   \
        G(v1, v5, v9, v13);                                                   \
        G(v2, v6, v10, v14);                                                  \
        G(v3, v7, v11, v15);                                                  \
        G(v0, v5, v10, v15);                                                  \
        G(v1, v6, v11, v12);                                                  \
        G(v2, v7, v8, v13);                                                   \
        G(v3, v4, v9, v14);                                                   \
    } while ((void)0, 0)
|
||||
|
||||
#endif
|
305
stratum/algos/ar2/blake2b.c
Normal file
305
stratum/algos/ar2/blake2b.c
Normal file
|
@ -0,0 +1,305 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
/* Standard BLAKE2b initialization vector (fractional parts of sqrt of the
 * first eight primes — same constants as SHA-512). */
static const uint64_t blake2b_IV[8] = {
    UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
    UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
    UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
    UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)
};

/* BLAKE2b message schedule: rows 10 and 11 repeat rows 0 and 1, giving the
 * 12 rounds of the permutation. */
static const unsigned int blake2b_sigma[12][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
    {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
    {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
    {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
    {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};
|
||||
|
||||
/* Mark the state as hashing the last node of a tree (sets f[1]). */
static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
    S->f[1] = (uint64_t)-1;
}

/* Mark the state as processing its final block (sets f[0], and f[1] too
 * when tree-hashing the last node). */
static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
    if (S->last_node) {
        blake2b_set_lastnode(S);
    }
    S->f[0] = (uint64_t)-1;
}

/* Add @inc bytes to the 128-bit message counter, with carry into t[1]. */
static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S,
                                                    uint64_t inc) {
    S->t[0] += inc;
    S->t[1] += (S->t[0] < inc); /* carry */
}

/* Wipe the state and flag it final so accidental reuse produces garbage
 * rather than leaking data. */
static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
    burn(S, sizeof(*S));      /* wipe */
    blake2b_set_lastblock(S); /* invalidate for further use */
}

/* Zero the state and load the standard IV into the chaining value. */
static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
    memset(S, 0, sizeof(*S));
    memcpy(S->h, blake2b_IV, sizeof(S->h));
}
|
||||
|
||||
|
||||
/*void print_state(blake2b_state BlakeHash) {
|
||||
printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n",
|
||||
BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3],
|
||||
BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7],
|
||||
BlakeHash.t[0], BlakeHash.t[1],
|
||||
BlakeHash.f[0], BlakeHash.f[1]);
|
||||
printf(".buf = {");
|
||||
for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++)
|
||||
printf("%" PRIu8 ", ", BlakeHash.buf[i]);
|
||||
puts("\n");
|
||||
printf("}\n.buflen = %d\n.outlen = %d\n",
|
||||
BlakeHash.buflen, BlakeHash.outlen);
|
||||
printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node);
|
||||
fflush(stdout);
|
||||
}*/
|
||||
|
||||
/* Precomputed BLAKE2b state used by blake2b_init(): the IV already XORed
 * with a serialized parameter block, apparently with the digest_length
 * field left at zero so init can fold outlen in by addition.
 * NOTE(review): assumed derived from upstream blake2b_init_param output —
 * TODO confirm the h[] values against the reference implementation. */
static const blake2b_state miou = {
    .h = {
        UINT64_C(7640891576939301128), UINT64_C(13503953896175478587),
        UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
        UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
        UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
    },
    .t = {UINT64_C(0), UINT64_C(0)},
    .f = {UINT64_C(0), UINT64_C(0)},
    .buf = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    },
    .buflen = 0,
    .outlen = 64,
    .last_node = 0
};
|
||||
|
||||
|
||||
/* Initialize @S from a parameter block @P: load the IV, then XOR in the
 * 64-byte serialized parameter structure word by word.
 * Returns 0 on success, -1 if either pointer is NULL. */
int blake2b_init_param(blake2b_state *S, const blake2b_param *P) {
    const unsigned char *raw = (const unsigned char *)P;
    unsigned int word;

    if (S == NULL || P == NULL) {
        return -1;
    }

    blake2b_init0(S);

    /* IV XOR Parameter Block */
    for (word = 0; word < 8; ++word) {
        S->h[word] ^= load64(&raw[word * sizeof(S->h[word])]);
    }

    S->outlen = P->digest_length;
    return 0;
}
|
||||
|
||||
void compare_buffs(uint64_t *h, size_t outlen)
|
||||
{
|
||||
// printf("CMP : %d", memcmp(h, miou.h, 8*(sizeof(uint64_t))));
|
||||
printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %ld\n", miou.h[0], h[0], outlen);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/* Sequential blake2b initialization */
|
||||
/* Sequential blake2b initialization — shortcut for this port: copy the
 * precomputed state `miou` instead of serializing a parameter block, then
 * fold the requested digest length into h[0].
 * NOTE(review): addition (rather than XOR) into h[0] only matches
 * blake2b_init_param if miou's digest_length bits are zero and outlen stays
 * small — presumably true for the 32/64 used here; confirm before reuse. */
int blake2b_init(blake2b_state *S, size_t outlen) {
    memcpy(S, &miou, sizeof(*S));
    S->h[0] += outlen;
    return 0;
}
|
||||
|
||||
|
||||
/* Debug helper: dump @size 64-bit words of @array as a C initializer list
 * on stdout, prefixed with @name. */
void print64(const char *name, const uint64_t *array, uint16_t size) {
    printf("%s = {", name);
    /* The index must be at least as wide as @size: the original uint8_t
     * index wrapped at 255 and looped forever for size > 255. */
    for (uint16_t i = 0; i < size; i++) printf("UINT64_C(%" PRIu64 "), ", array[i]);
    printf("};\n");
}
|
||||
/* Keyed initialization is not used by this stripped-down Argon2 port; the
 * function is kept only so the public blake2 API stays complete.
 * Always reports success without touching @S. */
int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
                     size_t keylen) {
    (void)S;
    (void)outlen;
    (void)key;
    (void)keylen;
    return 0;
}
|
||||
|
||||
/* BLAKE2b compression function: absorb one 128-byte @block into S->h.
 * Standard 12-round construction, except that the XORs of t[1] and f[1]
 * into v[13]/v[15] are deliberately commented out — this port never
 * overflows t[0] and never tree-hashes, so both are always zero.
 * NOTE(review): that shortcut ties this function to those assumptions. */
static void blake2b_compress(blake2b_state *S, const uint8_t *block) {
    uint64_t m[16]; /* message words */
    uint64_t v[16]; /* working state */
    unsigned int i, r;

    for (i = 0; i < 16; ++i) {
        m[i] = load64(block + i * 8);
    }

    /* v[0..7] = chaining value, v[8..15] = IV xored with counters/flags. */
    for (i = 0; i < 8; ++i) {
        v[i] = S->h[i];
    }

    v[8] = blake2b_IV[0];
    v[9] = blake2b_IV[1];
    v[10] = blake2b_IV[2];
    v[11] = blake2b_IV[3];
    v[12] = blake2b_IV[4] ^ S->t[0];
    v[13] = blake2b_IV[5]/* ^ S->t[1]*/;
    v[14] = blake2b_IV[6] ^ S->f[0];
    v[15] = blake2b_IV[7]/* ^ S->f[1]*/;

/* Quarter-round: mixes two message words selected by the sigma schedule. */
#define G(r, i, a, b, c, d)                                                    \
    do {                                                                       \
        a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
        d = rotr64(d ^ a, 32);                                                 \
        c = c + d;                                                             \
        b = rotr64(b ^ c, 24);                                                 \
        a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
        d = rotr64(d ^ a, 16);                                                 \
        c = c + d;                                                             \
        b = rotr64(b ^ c, 63);                                                 \
    } while ((void)0, 0)

/* Full round: four column mixes then four diagonal mixes. */
#define ROUND(r)                                                               \
    do {                                                                       \
        G(r, 0, v[0], v[4], v[8], v[12]);                                      \
        G(r, 1, v[1], v[5], v[9], v[13]);                                      \
        G(r, 2, v[2], v[6], v[10], v[14]);                                     \
        G(r, 3, v[3], v[7], v[11], v[15]);                                     \
        G(r, 4, v[0], v[5], v[10], v[15]);                                     \
        G(r, 5, v[1], v[6], v[11], v[12]);                                     \
        G(r, 6, v[2], v[7], v[8], v[13]);                                      \
        G(r, 7, v[3], v[4], v[9], v[14]);                                      \
    } while ((void)0, 0)

    for (r = 0; r < 12; ++r) ROUND(r);

    /* Feed-forward: fold both halves of the working state into h. */
    for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

#undef G
#undef ROUND
}
|
||||
|
||||
/* Hard-coded update path used only by blake2b_long(): @inlen is IGNORED and
 * the input is assumed to be exactly 1024 bytes (one Argon2 block) arriving
 * while 4 bytes (the length prefix) already sit in S->buf.
 * Layout: 124 bytes complete the first 128-byte block, seven full blocks
 * follow directly from @in, and the last 4 bytes are buffered for final().
 * (124 + 7*128 + 4 = 1024.)  Always returns 0. */
int blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
    const uint8_t *pin = (const uint8_t *)in;
    /* Complete current block (4 bytes already buffered). */
    memcpy(&S->buf[4], pin, 124);
    blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
    blake2b_compress(S, S->buf);
    S->buflen = 0;
    pin += 124;

    register int8_t i = 7;
    /* Compress seven full blocks straight from the input, no buffering. */
    while (i--) {
        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
        blake2b_compress(S, pin);
        pin += BLAKE2B_BLOCKBYTES;
    }
    /* Buffer the trailing 4 bytes for blake2b_final(). */
    memcpy(&S->buf[S->buflen], pin, 4);
    S->buflen += 4;
    return 0;
}
|
||||
|
||||
/* Buffer-only update: append @inlen bytes to S->buf without compressing.
 * Callers must guarantee S->buflen + inlen <= BLAKE2B_BLOCKBYTES — there is
 * no bounds check, and nothing is hashed until blake2b_final(). */
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen) {

    memcpy(&S->buf[S->buflen], in, inlen);
    S->buflen += (unsigned int)inlen;
}
|
||||
|
||||
/* Finalize the hash: pad and compress the buffered tail, then write
 * S->outlen digest bytes to @out.
 * NOTE(review): the @outlen parameter is ignored — the copy length comes
 * from S->outlen set at init; callers must keep the two consistent.
 * Wipes the temp buffer and the sensitive parts of the state before
 * returning.  Always returns 0. */
int blake2b_final(blake2b_state *S, void *out, size_t outlen) {
    uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
    unsigned int i;

    blake2b_increment_counter(S, S->buflen);
    blake2b_set_lastblock(S);
    memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
    blake2b_compress(S, S->buf);

    for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
        store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
    }

    memcpy(out, buffer, S->outlen);

    burn(buffer, sizeof(buffer));
    burn(S->buf, sizeof(S->buf));
    burn(S->h, sizeof(S->h));
    return 0;
}
|
||||
|
||||
int blake2b(void *out, const void *in, const void *key, size_t keylen)
|
||||
{
|
||||
blake2b_state S;
|
||||
|
||||
blake2b_init(&S, 64);
|
||||
my_blake2b_update(&S, in, 64);
|
||||
blake2b_final(&S, out, 64);
|
||||
burn(&S, sizeof(S));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Specialized blake2b_long for building the first Argon2 blocks: expand a
 * 72-byte input (H0 || index || lane) into 1024 output bytes.
 * Scheme: V1 = H(le32(1024) || in); then V(i+1) = H(Vi); output is the
 * first 32 bytes of V1..V30 followed by all 64 bytes of V31
 * (32 + 29*32 + 64 = 1024). */
void blake2b_too(void *pout, const void *in)
{
    uint8_t *out = (uint8_t *)pout;
    uint8_t out_buffer[64];
    uint8_t in_buffer[64];

    blake2b_state blake_state;
    blake2b_init(&blake_state, 64);
    /* Pre-load the 4-byte little-endian length prefix 1024 (0x00000400):
     * buf[1] = 4 sets it, and buflen = 4 accounts for it. */
    blake_state.buflen = blake_state.buf[1] = 4;
    my_blake2b_update(&blake_state, in, 72);
    blake2b_final(&blake_state, out_buffer, 64);
    memcpy(out, out_buffer, 32);
    out += 32;

    register uint8_t i = 29;
    while (i--) {
        /* Re-hash the previous 64-byte digest, emit its first half. */
        memcpy(in_buffer, out_buffer, 64);
        blake2b(out_buffer, in_buffer, NULL, 0);
        memcpy(out, out_buffer, 32);
        out += 32;
    }

    /* Final iteration contributes its full 64 bytes. */
    memcpy(in_buffer, out_buffer, 64);
    blake2b(out_buffer, in_buffer, NULL, 0);
    memcpy(out, out_buffer, 64);

    burn(&blake_state, sizeof(blake_state));
}
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
/* Argon2 Team - Begin Code */
/* Fixed-size variant of Argon2's H': hash a 1024-byte block (@in) into a
 * 32-byte tag (@out), with the output length serialized as a 4-byte
 * little-endian prefix per the Argon2 spec.  Since 32 <= 64 this is a
 * single BLAKE2b call, no expansion loop.  Always returns 0. */
int blake2b_long(void *pout, const void *in)
{
    uint8_t *out = (uint8_t *)pout;
    blake2b_state blake_state;
    uint8_t outlen_bytes[sizeof(uint32_t)] = {0};

    store32(outlen_bytes, 32);

    blake2b_init(&blake_state, 32);
    my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
    blake2b_update(&blake_state, in, 1024);
    blake2b_final(&blake_state, out, 32);
    burn(&blake_state, sizeof(blake_state));
    return 0;

}
/* Argon2 Team - End Code */
|
||||
/* Argon2 Team - End Code */
|
341
stratum/algos/ar2/cores.c
Normal file
341
stratum/algos/ar2/cores.c
Normal file
|
@ -0,0 +1,341 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
/*For memory wiping*/
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#include <winbase.h> /* For SecureZeroMemory */
|
||||
#endif
|
||||
#if defined __STDC_LIB_EXT1__
|
||||
#define __STDC_WANT_LIB_EXT1__ 1
|
||||
#endif
|
||||
#define VC_GE_2005(version) (version >= 1400)
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
#ifdef GENKAT
|
||||
#include "genkat.h"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#if __has_attribute(optnone)
|
||||
#define NOT_OPTIMIZED __attribute__((optnone))
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#define GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#if GCC_VERSION >= 40400
|
||||
#define NOT_OPTIMIZED __attribute__((optimize("O0")))
|
||||
#endif
|
||||
#endif
|
||||
#ifndef NOT_OPTIMIZED
|
||||
#define NOT_OPTIMIZED
|
||||
#endif
|
||||
|
||||
/***************Instance and Position constructors**********/
|
||||
/* Fill every byte of the 1 KiB block with @in. */
void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }

/* Copy all 128 words of @src into @dst. */
void copy_block(block *dst, const block *src) {
    memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK);
}

/* XOR @src into @dst word by word. */
void xor_block(block *dst, const block *src) {
    int i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
        dst->v[i] ^= src->v[i];
    }
}

/* Deserialize a block from little-endian bytes (endian-safe via load64). */
static void load_block(block *dst, const void *input) {
    unsigned i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
        dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
    }
}

/* Serialize a block to little-endian bytes (endian-safe via store64). */
static void store_block(void *output, const block *src) {
    unsigned i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
        store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
    }
}
|
||||
|
||||
/***************Memory allocators*****************/
|
||||
/* Allocate @m_cost 1 KiB blocks into *@memory.
 * Returns ARGON2_OK on success, ARGON2_MEMORY_ALLOCATION_ERROR when
 * @memory is NULL, the size computation would overflow, or malloc fails. */
int allocate_memory(block **memory, uint32_t m_cost) {
    size_t memory_size;

    if (memory == NULL) {
        return ARGON2_MEMORY_ALLOCATION_ERROR;
    }

    /* 1. Reject multiplication overflow before allocating. */
    memory_size = sizeof(block) * m_cost;
    if (m_cost != 0 && memory_size / m_cost != sizeof(block)) {
        return ARGON2_MEMORY_ALLOCATION_ERROR;
    }

    /* 2. Try to allocate. */
    *memory = (block *)malloc(memory_size);
    if (*memory == NULL) {
        return ARGON2_MEMORY_ALLOCATION_ERROR;
    }

    return ARGON2_OK;
}
|
||||
|
||||
/* Best-effort secure wipe of @n bytes at @v.  Writes through a volatile
 * pointer so the compiler cannot elide the clear as a dead store — a plain
 * memset() before free() is routinely optimized away, leaking key material
 * (CERT MSC06-C). */
void secure_wipe_memory(void *v, size_t n) {
    volatile unsigned char *p = (volatile unsigned char *)v;
    while (n--) {
        *p++ = 0;
    }
}
|
||||
|
||||
/*********Memory functions*/
|
||||
|
||||
/* Wipe the instance's working memory if @clear is set.
 * NOTE(review): the block count is hard-coded to 16, matching the fixed
 * m_cost used everywhere in this port (see the commented-out
 * instance->memory_blocks). */
void clear_memory(argon2_instance_t *instance, int clear) {
    if (instance->memory != NULL && clear) {
        secure_wipe_memory(instance->memory,
                           sizeof(block) * /*instance->memory_blocks*/16);
    }
}

/* Release the block array (free(NULL) is a safe no-op). */
void free_memory(block *memory) { free(memory); }
|
||||
|
||||
/* Produce the final tag: hash the last block (index 15 of the fixed
 * 16-block memory) into context->out via blake2b_long, then wipe and free
 * the working memory.  Silently does nothing if either pointer is NULL. */
void finalize(const argon2_context *context, argon2_instance_t *instance) {
    if (context != NULL && instance != NULL) {
        block blockhash;
        copy_block(&blockhash, instance->memory + 15);

        /* Hash the result */
        {
            uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
            store_block(blockhash_bytes, &blockhash);
            blake2b_long(context->out, blockhash_bytes);
            secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE);
            secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */
        }

#ifdef GENKAT
        print_tag(context->out, context->outlen);
#endif

        /* Clear memory */
        clear_memory(instance, 1);

        free_memory(instance->memory);
    }
}
|
||||
|
||||
/* Map a 32-bit pseudo-random value to the absolute index of a reference
 * block, skewed toward recent blocks per the Argon2 spec.  Specialized for
 * this port's fixed geometry: 1 lane, 16 blocks, 4 blocks per segment,
 * 2 passes (hence the literals 4, 11, 12 and the final "% 16").
 * NOTE(review): @instance is unused here — kept for signature parity with
 * the upstream reference implementation. */
uint32_t index_alpha(const argon2_instance_t *instance,
                     const argon2_position_t *position, uint32_t pseudo_rand,
                     int same_lane) {
    /*
     * Pass 0:
     * This lane : all already finished segments plus already constructed
     * blocks in this segment
     * Other lanes : all already finished segments
     * Pass 1+:
     * This lane : (SYNC_POINTS - 1) last segments plus already constructed
     * blocks in this segment
     * Other lanes : (SYNC_POINTS - 1) last segments
     */
    uint32_t reference_area_size;
    uint64_t relative_position;
    uint32_t start_position, absolute_position;

    if (0 == position->pass) {
        /* First pass */
        if (0 == position->slice) {
            /* First slice */
            reference_area_size =
                position->index - 1; /* all but the previous */
        } else {
            if (same_lane) {
                /* The same lane => add current segment */
                reference_area_size =
                    position->slice * 4 +
                    position->index - 1;
            } else {
                reference_area_size =
                    position->slice * 4 +
                    ((position->index == 0) ? (-1) : 0);
            }
        }
    } else {
        /* Second pass: window is the last 3 segments (12 blocks) plus the
         * already-built part of the current one. */
        if (same_lane) {reference_area_size = 11 + position->index;}
        else {reference_area_size = 12 - (position->index == 0);}
    }

    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
     * relative position (quadratic skew: squaring biases toward newer
     * blocks) */
    relative_position = pseudo_rand;
    relative_position = relative_position * relative_position >> 32;
    relative_position = reference_area_size - 1 -
                        (reference_area_size * relative_position >> 32);

    /* 1.2.5 Computing starting position */
    start_position = 0;

    if (0 != position->pass) {
        start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
                             ? 0 : (position->slice + 1) * 4;
    }

    /* 1.2.6. Computing absolute position (mod 16 total blocks) */
    absolute_position = (start_position + relative_position) % 16;
    return absolute_position;
}
|
||||
|
||||
/* Run the memory-filling phase: two passes over the single lane, each pass
 * split into ARGON2_SYNC_POINTS segments filled in order. */
void fill_memory_blocks(argon2_instance_t *instance) {
    uint32_t pass, slice;

    for (pass = 0; pass < 2; ++pass) {
        for (slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
            argon2_position_t position;

            position.pass = pass;
            position.lane = 0;
            position.slice = (uint8_t)slice;
            position.index = 0;
            fill_segment(instance, position);
        }

#ifdef GENKAT
        internal_kat(instance, pass); /* Print all memory blocks */
#endif
    }
}
|
||||
|
||||
/* Build the first two memory blocks of the lane as G(H0||0||0) and
 * G(H0||1||0): the 4-byte block index and 4-byte lane index are written
 * after the 64-byte pre-hash digest, expanded to 1024 bytes with
 * blake2b_too, and loaded into memory[0] / memory[1]. */
void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
    /* Make the first and second block in each lane as G(H0||i||0) or
       G(H0||i||1) */
    uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0);
    blake2b_too(blockhash_bytes, blockhash);
    load_block(&instance->memory[0], blockhash_bytes);

    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
    blake2b_too(blockhash_bytes, blockhash);
    load_block(&instance->memory[1], blockhash_bytes);
    /* Wipe the expanded seed material. */
    secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
}
|
||||
|
||||
|
||||
/* Precomputed H0 prefix state for initial_hash(): a 64-byte-digest BLAKE2b
 * state whose buffer already holds the fixed Argon2 parameter prefix
 * (little-endian 32-bit words — presumably lanes=1, outlen=32, m_cost=16,
 * t_cost=2, version=16, type placeholder at buf[20], pwdlen=32; TODO
 * confirm field order against the Argon2 spec).  buflen=28 accounts for
 * those prefix bytes. */
static const blake2b_state base_hash = {
    .h = {
        UINT64_C(7640891576939301192), UINT64_C(13503953896175478587),
        UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
        UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
        UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
    },
    .t = {UINT64_C(0),UINT64_C(0)},
    .f = {UINT64_C(0),UINT64_C(0)},
    .buf = {
        1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0,
        0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    .buflen = 28,
    .outlen = 64,
    .last_node = 0
};
|
||||
|
||||
/* Fixed input geometry for this pool's argon2 variant. */
#define PWDLEN 32
#define SALTLEN 32
#define SECRETLEN 0
#define ADLEN 0
/* Compute H0 into @blockhash (ARGON2_PREHASH_DIGEST_LENGTH bytes): start
 * from the precomputed base_hash prefix, patch the type byte into the
 * buffered parameter block, then absorb pwd, salt (with its length), and
 * the empty secret/AD length fields.  The password is wiped from the
 * context as soon as it has been absorbed. */
void initial_hash(uint8_t *blockhash, argon2_context *context,
                  argon2_type type) {

    uint8_t value[sizeof(uint32_t)];

    /* Is it generating cache invalidation between cores ? */
    blake2b_state BlakeHash = base_hash;
    /* buf[20] is the serialized `type` field of the parameter prefix. */
    BlakeHash.buf[20] = (uint8_t) type;
    my_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
                      PWDLEN);

    /* The password is no longer needed — wipe it immediately. */
    secure_wipe_memory(context->pwd, PWDLEN);
    context->pwdlen = 0;

    store32(&value, SALTLEN);
    my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    my_blake2b_update(&BlakeHash, (const uint8_t *)context->salt,
                      SALTLEN);

    store32(&value, SECRETLEN);
    my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    store32(&value, ADLEN);
    my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
}
|
||||
|
||||
/* Allocate the fixed 16-block working memory, compute H0, and construct
 * the first two blocks of the lane.
 * Returns ARGON2_OK on success or the allocation error code. */
int initialize(argon2_instance_t *instance, argon2_context *context) {
    /* 1. Memory allocation.  The result was previously ignored: a failed
     * malloc left instance->memory NULL and fill_first_blocks then
     * dereferenced it. */
    int result = allocate_memory(&(instance->memory), 16);
    if (ARGON2_OK != result) {
        return result;
    }

    /* 2. Initial hashing */
    /* H_0 + 8 extra bytes to produce the first blocks */
    /* Hashing all inputs */
    uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
    initial_hash(blockhash, context, instance->type);
    /* Zeroing 8 extra bytes (block/lane index slots appended after H0). */
    secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
                       ARGON2_PREHASH_SEED_LENGTH -
                           ARGON2_PREHASH_DIGEST_LENGTH);

#ifdef GENKAT
    initial_kat(blockhash, context, instance->type);
#endif

    /* 3. Creating first blocks, we always have at least two blocks in a
     * slice */
    fill_first_blocks(blockhash, instance);
    /* Clearing the hash: H0 is key-derived and must not linger. */
    secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);

    return ARGON2_OK;
}
|
||||
|
||||
/* Top-level driver: run the full Argon2 computation for @context with the
 * given primitive @type and write the tag to context->out.
 * Returns ARGON2_OK, or the error code from initialization. */
int argon2_core(argon2_context *context, argon2_type type) {
    argon2_instance_t instance;
    int rc;

    instance.memory = NULL;
    instance.type = type;

    /* Hash the inputs, allocate the working memory and build the first
     * two blocks. */
    rc = initialize(&instance, context);
    if (rc != ARGON2_OK) {
        return rc;
    }

    /* Fill the memory over both passes. */
    fill_memory_blocks(&instance);

    /* Extract the tag, then wipe and release the working memory. */
    finalize(context, &instance);

    return ARGON2_OK;
}
|
220
stratum/algos/ar2/cores.h
Normal file
220
stratum/algos/ar2/cores.h
Normal file
|
@ -0,0 +1,220 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_CORES_H
|
||||
#define ARGON2_CORES_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGN(n) __declspec(align(16))
|
||||
#elif defined(__GNUC__) || defined(__clang)
|
||||
#define ALIGN(x) __attribute__((__aligned__(x)))
|
||||
#else
|
||||
#define ALIGN(x)
|
||||
#endif
|
||||
|
||||
/*************************Argon2 internal
|
||||
* constants**************************************************/
|
||||
|
||||
enum argon2_core_constants {
    /* Version of the algorithm (1.0 encoded as 0x10) */
    ARGON2_VERSION_NUMBER = 0x10,

    /* Memory block size in bytes, and the same block as 64-bit words and
     * as 128-bit qwords */
    ARGON2_BLOCK_SIZE = 1024,
    ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
    ARGON2_QWORDS_IN_BLOCK = 64,

    /* Number of pseudo-random values generated by one call to Blake in Argon2i
       to
       generate reference block positions */
    ARGON2_ADDRESSES_IN_BLOCK = 128,

    /* Pre-hashing digest length (H0) and its extension with the 8-byte
     * block/lane index suffix */
    ARGON2_PREHASH_DIGEST_LENGTH = 64,
    ARGON2_PREHASH_SEED_LENGTH = 72
};
|
||||
|
||||
/* Argon2 primitive type */
|
||||
/* Argon2d uses data-dependent addressing; Argon2i data-independent. */
typedef enum Argon2_type { Argon2_d = 0, Argon2_i = 1 } argon2_type;
|
||||
|
||||
/*************************Argon2 internal data
|
||||
* types**************************************************/
|
||||
|
||||
/*
|
||||
* Structure for the (1KB) memory block implemented as 128 64-bit words.
|
||||
* Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
|
||||
* bounds checking).
|
||||
*/
|
||||
/* 1 KiB memory block as 128 64-bit words, 16-byte aligned for SSE loads. */
typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } __attribute__ ((aligned (16))) block;
|
||||
|
||||
/*****************Functions that work with the block******************/
|
||||
|
||||
/* Initialize each byte of the block with @in */
|
||||
void init_block_value(block *b, uint8_t in);
|
||||
|
||||
/* Copy block @src to block @dst */
|
||||
void copy_block(block *dst, const block *src);
|
||||
|
||||
/* XOR @src onto @dst bytewise */
|
||||
void xor_block(block *dst, const block *src);
|
||||
|
||||
/*
|
||||
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
|
||||
* and derived values.
|
||||
* Used to evaluate the number and location of blocks to construct in each
|
||||
* thread
|
||||
*/
|
||||
typedef struct Argon2_instance_t {
    block *memory;       /* Memory pointer (fixed 16 blocks in this port) */
    argon2_type type;    /* Argon2d or Argon2i */
    int print_internals; /* whether to print the memory blocks */
} argon2_instance_t;

/*
 * Argon2 position: where we construct the block right now. Used to distribute
 * work between threads.
 */
typedef struct Argon2_position_t {
    uint32_t pass;  /* current pass (0 or 1 here) */
    uint32_t lane;  /* lane index (always 0 in this port) */
    uint8_t slice;  /* segment index within the pass */
    uint32_t index; /* block index within the segment */
} argon2_position_t;

/* Struct that holds the inputs for a thread handling FillSegment */
typedef struct Argon2_thread_data {
    argon2_instance_t *instance_ptr;
    argon2_position_t pos;
} argon2_thread_data;
|
||||
|
||||
/*************************Argon2 core
|
||||
* functions**************************************************/
|
||||
|
||||
/* Allocates memory to the given pointer
|
||||
* @param memory pointer to the pointer to the memory
|
||||
* @param m_cost number of blocks to allocate in the memory
|
||||
* @return ARGON2_OK if @memory is a valid pointer and memory is allocated
|
||||
*/
|
||||
int allocate_memory(block **memory, uint32_t m_cost);
|
||||
|
||||
/* Function that securely cleans the memory
|
||||
* @param mem Pointer to the memory
|
||||
* @param s Memory size in bytes
|
||||
*/
|
||||
void secure_wipe_memory(void *v, size_t n);
|
||||
|
||||
/* Clears memory
|
||||
* @param instance pointer to the current instance
|
||||
* @param clear_memory indicates if we clear the memory with zeros.
|
||||
*/
|
||||
void clear_memory(argon2_instance_t *instance, int clear);
|
||||
|
||||
/* Deallocates memory
|
||||
* @param memory pointer to the blocks
|
||||
*/
|
||||
void free_memory(block *memory);
|
||||
|
||||
/*
|
||||
* Computes absolute position of reference block in the lane following a skewed
|
||||
* distribution and using a pseudo-random value as input
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rand 32-bit pseudo-random value used to determine the position
|
||||
* @param same_lane Indicates if the block will be taken from the current lane.
|
||||
* If so we can reference the current segment
|
||||
* @pre All pointers must be valid
|
||||
*/
|
||||
uint32_t index_alpha(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position, uint32_t pseudo_rand,
|
||||
int same_lane);
|
||||
|
||||
/*
|
||||
* Function that validates all inputs against predefined restrictions and return
|
||||
* an error code
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return ARGON2_OK if everything is all right, otherwise one of error codes
|
||||
* (all defined in <argon2.h>
|
||||
*/
|
||||
int validate_inputs(const argon2_context *context);
|
||||
|
||||
/*
|
||||
* Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
|
||||
* password and secret if needed
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param blockhash Buffer for pre-hashing digest
|
||||
* @param type Argon2 type
|
||||
* @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
|
||||
* allocated
|
||||
*/
|
||||
void initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function creates first 2 blocks per lane
|
||||
* @param instance Pointer to the current instance
|
||||
* @param blockhash Pointer to the pre-hashing digest
|
||||
* @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
|
||||
*/
|
||||
void fill_firsts_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function allocates memory, hashes the inputs with Blake, and creates first
|
||||
* two blocks. Returns the pointer to the main memory with 2 blocks per lane
|
||||
* initialized
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param instance Current Argon2 instance
|
||||
* @return Zero if successful, -1 if memory failed to allocate. @context->state
|
||||
* will be modified if successful.
|
||||
*/
|
||||
int initialize(argon2_instance_t *instance, argon2_context *context);
|
||||
|
||||
/*
|
||||
* XORing the last block of each lane, hashing it, making the tag. Deallocates
|
||||
* the memory.
|
||||
* @param context Pointer to current Argon2 context (use only the out parameters
|
||||
* from it)
|
||||
* @param instance Pointer to current instance of Argon2
|
||||
* @pre instance->state must point to necessary amount of memory
|
||||
* @pre context->out must point to outlen bytes of memory
|
||||
* @pre if context->free_cbk is not NULL, it should point to a function that
|
||||
* deallocates memory
|
||||
*/
|
||||
void finalize(const argon2_context *context, argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
/*
|
||||
* Function that fills the entire memory t_cost times based on the first two
|
||||
* blocks in each lane
|
||||
* @param instance Pointer to the current instance
|
||||
*/
|
||||
void fill_memory_blocks(argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function that performs memory-hard hashing with certain degree of parallelism
|
||||
* @param context Pointer to the Argon2 internal structure
|
||||
* @return Error code if smth is wrong, ARGON2_OK otherwise
|
||||
*/
|
||||
int argon2_core(argon2_context *context, argon2_type type);
|
||||
|
||||
#endif
|
182
stratum/algos/ar2/genkat.c
Normal file
182
stratum/algos/ar2/genkat.c
Normal file
|
@ -0,0 +1,182 @@
|
|||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type) {
|
||||
unsigned i;
|
||||
|
||||
if (blockhash != NULL && context != NULL) {
|
||||
printf("=======================================");
|
||||
|
||||
switch (type) {
|
||||
case Argon2_d:
|
||||
printf("Argon2d\n");
|
||||
break;
|
||||
|
||||
case Argon2_i:
|
||||
printf("Argon2i\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag "
|
||||
"length: %u bytes\n",
|
||||
context->m_cost, context->t_cost, context->lanes,
|
||||
context->outlen);
|
||||
|
||||
printf("Password[%u]: ", context->pwdlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->pwdlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->pwd)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Salt[%u]: ", context->saltlen);
|
||||
|
||||
for (i = 0; i < context->saltlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->salt)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Secret[%u]: ", context->secretlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->secretlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->secret)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Associated data[%u]: ", context->adlen);
|
||||
|
||||
for (i = 0; i < context->adlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->ad)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Pre-hashing digest: ");
|
||||
|
||||
for (i = 0; i < ARGON2_PREHASH_DIGEST_LENGTH; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)blockhash)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* Prints the output tag as space-separated hex bytes; no-op for NULL out. */
void print_tag(const void *out, uint32_t outlen) {
    const uint8_t *bytes = (const uint8_t *)out;
    uint32_t k;

    if (bytes == NULL) {
        return;
    }

    printf("Tag: ");
    for (k = 0; k < outlen; ++k) {
        printf("%2.2x ", bytes[k]);
    }
    printf("\n");
}
|
||||
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass) {
|
||||
|
||||
if (instance != NULL) {
|
||||
uint32_t i, j;
|
||||
printf("\n After pass %u:\n", pass);
|
||||
|
||||
for (i = 0; i < instance->memory_blocks; ++i) {
|
||||
uint32_t how_many_words =
|
||||
(instance->memory_blocks > ARGON2_WORDS_IN_BLOCK)
|
||||
? 1
|
||||
: ARGON2_WORDS_IN_BLOCK;
|
||||
|
||||
for (j = 0; j < how_many_words; ++j)
|
||||
printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j,
|
||||
instance->memory[i].v[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Reports a fatal error on stderr and terminates the process (exit 1). */
static void fatal(const char *error) {
    fprintf(stderr, "Error: %s\n", error);
    exit(1);
}
|
||||
|
||||
static void generate_testvectors(const char *type) {
|
||||
#define TEST_OUTLEN 32
|
||||
#define TEST_PWDLEN 32
|
||||
#define TEST_SALTLEN 16
|
||||
#define TEST_SECRETLEN 8
|
||||
#define TEST_ADLEN 12
|
||||
argon2_context context;
|
||||
|
||||
unsigned char out[TEST_OUTLEN];
|
||||
unsigned char pwd[TEST_PWDLEN];
|
||||
unsigned char salt[TEST_SALTLEN];
|
||||
unsigned char secret[TEST_SECRETLEN];
|
||||
unsigned char ad[TEST_ADLEN];
|
||||
const allocate_fptr myown_allocator = NULL;
|
||||
const deallocate_fptr myown_deallocator = NULL;
|
||||
|
||||
unsigned t_cost = 3;
|
||||
unsigned m_cost = 16;
|
||||
unsigned lanes = 4;
|
||||
|
||||
memset(pwd, 1, TEST_OUTLEN);
|
||||
memset(salt, 2, TEST_SALTLEN);
|
||||
memset(secret, 3, TEST_SECRETLEN);
|
||||
memset(ad, 4, TEST_ADLEN);
|
||||
|
||||
context.out = out;
|
||||
context.outlen = TEST_OUTLEN;
|
||||
context.pwd = pwd;
|
||||
context.pwdlen = TEST_PWDLEN;
|
||||
context.salt = salt;
|
||||
context.saltlen = TEST_SALTLEN;
|
||||
context.secret = secret;
|
||||
context.secretlen = TEST_SECRETLEN;
|
||||
context.ad = ad;
|
||||
context.adlen = TEST_ADLEN;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = lanes;
|
||||
context.threads = lanes;
|
||||
context.allocate_cbk = myown_allocator;
|
||||
context.free_cbk = myown_deallocator;
|
||||
context.flags = 0;
|
||||
|
||||
#undef TEST_OUTLEN
|
||||
#undef TEST_PWDLEN
|
||||
#undef TEST_SALTLEN
|
||||
#undef TEST_SECRETLEN
|
||||
#undef TEST_ADLEN
|
||||
|
||||
if (!strcmp(type, "d")) {
|
||||
argon2d(&context);
|
||||
} else if (!strcmp(type, "i")) {
|
||||
argon2i(&context);
|
||||
} else
|
||||
fatal("wrong Argon2 type");
|
||||
}
|
||||
|
||||
/* Entry point: generates Argon2 test vectors; argv[1] selects the type
 * ("d" or "i"), defaulting to Argon2i when no argument is given. */
int main(int argc, char *argv[]) {
    const char *type = (argc > 1) ? argv[1] : "i";
    generate_testvectors(type);
    return ARGON2_OK;
}
|
45
stratum/algos/ar2/genkat.h
Normal file
45
stratum/algos/ar2/genkat.h
Normal file
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_KAT_H
|
||||
#define ARGON2_KAT_H
|
||||
|
||||
/*
|
||||
* Initial KAT function that prints the inputs to the file
|
||||
* @param blockhash Array that contains pre-hashing digest
|
||||
* @param context Holds inputs
|
||||
* @param type Argon2 type
|
||||
* @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes
|
||||
* @pre context member pointers must point to allocated memory of size according
|
||||
* to the length values
|
||||
*/
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function that prints the output tag
|
||||
* @param out output array pointer
|
||||
* @param outlen digest length
|
||||
* @pre out must point to @a outlen bytes
|
||||
**/
|
||||
void print_tag(const void *out, uint32_t outlen);
|
||||
|
||||
/*
|
||||
* Function that prints the internal state at given moment
|
||||
* @param instance pointer to the current instance
|
||||
* @param pass current pass number
|
||||
* @pre instance must have necessary memory allocated
|
||||
**/
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass);
|
||||
|
||||
#endif
|
185
stratum/algos/ar2/opt.c
Normal file
185
stratum/algos/ar2/opt.c
Normal file
|
@ -0,0 +1,185 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "opt.h"
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blamka-round-opt.h"
|
||||
|
||||
void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
|
||||
{
|
||||
__m128i block_XY[ARGON2_QWORDS_IN_BLOCK] __attribute__ ((aligned (16)));
|
||||
uint32_t i;
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_load_si128(&ref_block[i]));
|
||||
}
|
||||
|
||||
BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]);
|
||||
BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]);
|
||||
BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]);
|
||||
BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]);
|
||||
BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]);
|
||||
BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]);
|
||||
BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]);
|
||||
BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}*/
|
||||
|
||||
BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]);
|
||||
BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]);
|
||||
BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]);
|
||||
BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]);
|
||||
BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]);
|
||||
BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]);
|
||||
BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]);
|
||||
BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}*/
|
||||
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128(&next_block[i], state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Precomputed pseudo-random reference values for Argon2i, 4 values per
 * (pass, slice) pair, indexed as pass * 16 + slice * 4 in
 * generate_addresses(). NOTE(review): valid only for this variant's fixed
 * parameters (2 passes, 4 slices, single lane) — confirm they match the
 * output of the reference address generator for those parameters.
 */
static const uint64_t bad_rands[32] = {
    UINT64_C(17023632018251376180), UINT64_C(4911461131397773491),
    UINT64_C(15927076453364631751), UINT64_C(7860239898779391109),

    UINT64_C(11820267568857244377), UINT64_C(12188179869468676617),
    UINT64_C(3732913385414474778), UINT64_C(7651458777762572084),

    UINT64_C(3062274162574341415), UINT64_C(17922653540258786897),
    UINT64_C(17393848266100524980), UINT64_C(8539695715554563839),

    UINT64_C(13824538050656654359), UINT64_C(12078939433126460936),
    UINT64_C(15331979418564540430), UINT64_C(12058346794217174273),

    UINT64_C(13593922096015221049), UINT64_C(18356682276374416500),
    UINT64_C(4968040514092703824), UINT64_C(11202790346130235567),

    UINT64_C(2276229735041314644), UINT64_C(220837743321691382),
    UINT64_C(4861211596230784273), UINT64_C(6330592584132590331),

    UINT64_C(3515580430960296763), UINT64_C(9869356316971855173),
    UINT64_C(485533243489193056), UINT64_C(14596447761048148032),

    UINT64_C(16531790085730132900), UINT64_C(17328824500878824371),
    UINT64_C(8548260058287621283), UINT64_C(8641748798041936364)
};
|
||||
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands)
|
||||
{
|
||||
uint8_t offset = position->pass * 16 + position->slice * 4;
|
||||
pseudo_rands[0] = bad_rands[offset++];
|
||||
pseudo_rands[1] = bad_rands[offset++];
|
||||
pseudo_rands[2] = bad_rands[offset++];
|
||||
pseudo_rands[3] = bad_rands[offset++];
|
||||
|
||||
/*if ((position->pass == 1 && position->slice == 3))
|
||||
print64("pseudo_rands", pseudo_rands, 4);*/
|
||||
}
|
||||
|
||||
#define SEGMENT_LENGTH 4
|
||||
#define LANE_LENGTH 16
|
||||
#define POS_LANE 0
|
||||
|
||||
/*
 * Fills one segment (SEGMENT_LENGTH = 4 blocks) using the optimized SSE
 * fill_block(). Hard-coded for a single lane of LANE_LENGTH = 16 blocks:
 * all offsets are computed without a lane term (position.lane is ignored).
 * @param instance Pointer to the current instance
 * @param position Current position (pass and slice used; index is set here)
 */
void fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position)
{
    block *ref_block = NULL, *curr_block = NULL;
    uint64_t pseudo_rand, ref_index;
    uint32_t prev_offset, curr_offset;
    uint8_t i;
    /* Running state kept in registers/stack across the whole segment;
     * seeded from the previous block below. */
    __m128i state[64];
    /* Argon2i takes addresses from a precomputed table; Argon2d reads them
     * from the previous block's first word. */
    int data_independent_addressing = (instance->type == Argon2_i);

    /* Pseudo-random values that determine the reference block position */
    uint64_t *pseudo_rands = NULL;

    /* 32 bytes = 4 x uint64_t.
     * NOTE(review): malloc result is not NULL-checked (ref.c checks it). */
    pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32);

    if (data_independent_addressing) {
        generate_addresses(instance, &position, pseudo_rands);
    }

    i = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        i = 2; /* we have already generated the first two blocks */
    }

    /*printf("Position.lane = %d\nPosition.slice = %d\nStarting index : %d\n", position.lane, position.slice, starting_index);*/
    /* Offset of the current block */
    curr_offset = position.slice * 4 + i;

    if (0 == curr_offset % 16) {
        /* Last block in this lane (wrap around within the single lane) */
        prev_offset = curr_offset + /*instance->lane_length - 1*/15;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

    /* Seed the register state with the previous block's contents. */
    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);

    for (; i < SEGMENT_LENGTH;
         ++i, ++curr_offset, ++prev_offset) {
        /*1.1 Rotating prev_offset if needed */
        if (curr_offset % LANE_LENGTH == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
            pseudo_rand = pseudo_rands[i];
        } else {
            pseudo_rand = instance->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block */
        /* (single lane: always the current lane, hence same_lane = 1) */

        /* 1.2.3 Computing the number of possible reference block within the
         * lane.
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);

        /* 2 Creating a new block */
        ref_block = instance->memory + ref_index;
        curr_block = instance->memory + curr_offset;
        fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
    }

    free(pseudo_rands);
}
|
49
stratum/algos/ar2/opt.h
Normal file
49
stratum/algos/ar2/opt.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_OPT_H
|
||||
#define ARGON2_OPT_H
|
||||
|
||||
/*
|
||||
* Function fills a new memory block. Differs from the
|
||||
* @param state Pointer to the just produced block. Content will be updated(!)
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads.
|
||||
* Identical to the reference code except that it calls optimized FillBlock()
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_OPT_H */
|
174
stratum/algos/ar2/ref.c
Normal file
174
stratum/algos/ar2/ref.c
Normal file
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "ref.h"
|
||||
|
||||
#include "blake2/blamka-round-ref.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
#include "blake2/blake2.h"
|
||||
|
||||
/*
 * Fills a new memory block (reference version). Computes
 *   next_block = P(prev_block ^ ref_block) ^ (prev_block ^ ref_block)
 * where the permutation P applies a BLAKE2 round to each column group of
 * 16 words, then to each row group.
 * @param prev_block Pointer to the previous block
 * @param ref_block  Pointer to the reference block
 * @param next_block Pointer to the block to be constructed
 * @pre all block pointers must be valid
 */
void fill_block(const block *prev_block, const block *ref_block,
                block *next_block) {
    block blockR, block_tmp;
    unsigned r;

    /* blockR = ref ^ prev; block_tmp keeps a copy for the final XOR. */
    copy_block(&blockR, ref_block);
    xor_block(&blockR, prev_block);
    copy_block(&block_tmp, &blockR);

    /* Apply Blake2 on columns of 64-bit words: (0,1,...,15), then
       (16,17,..,31) ... finally (112,113,...,127) */
    for (r = 0; r < 8; r++) {
        BLAKE2_ROUND_NOMSG(
            blockR.v[16 * r], blockR.v[16 * r + 1], blockR.v[16 * r + 2],
            blockR.v[16 * r + 3], blockR.v[16 * r + 4], blockR.v[16 * r + 5],
            blockR.v[16 * r + 6], blockR.v[16 * r + 7], blockR.v[16 * r + 8],
            blockR.v[16 * r + 9], blockR.v[16 * r + 10], blockR.v[16 * r + 11],
            blockR.v[16 * r + 12], blockR.v[16 * r + 13], blockR.v[16 * r + 14],
            blockR.v[16 * r + 15]);
    }

    /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...,112,113), then
       (2,3,18,19,...,114,115) ... finally (14,15,30,31,...,126,127) */
    for (r = 0; r < 8; r++) {
        BLAKE2_ROUND_NOMSG(
            blockR.v[2 * r], blockR.v[2 * r + 1], blockR.v[2 * r + 16],
            blockR.v[2 * r + 17], blockR.v[2 * r + 32], blockR.v[2 * r + 33],
            blockR.v[2 * r + 48], blockR.v[2 * r + 49], blockR.v[2 * r + 64],
            blockR.v[2 * r + 65], blockR.v[2 * r + 80], blockR.v[2 * r + 81],
            blockR.v[2 * r + 96], blockR.v[2 * r + 97], blockR.v[2 * r + 112],
            blockR.v[2 * r + 113]);
    }

    /* Feed-forward: next = (ref ^ prev) ^ P(ref ^ prev). */
    copy_block(next_block, &block_tmp);
    xor_block(next_block, &blockR);
}
|
||||
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands) {
|
||||
block zero_block, input_block, address_block;
|
||||
uint32_t i;
|
||||
|
||||
init_block_value(&zero_block, 0);
|
||||
init_block_value(&input_block, 0);
|
||||
init_block_value(&address_block, 0);
|
||||
|
||||
if (instance != NULL && position != NULL) {
|
||||
input_block.v[0] = position->pass;
|
||||
input_block.v[1] = position->lane;
|
||||
input_block.v[2] = position->slice;
|
||||
input_block.v[3] = 16;
|
||||
input_block.v[4] = 2;
|
||||
input_block.v[5] = instance->type;
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
|
||||
input_block.v[6]++;
|
||||
fill_block(&zero_block, &input_block, &address_block);
|
||||
fill_block(&zero_block, &address_block, &address_block);
|
||||
}
|
||||
|
||||
pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Fills one segment (4 blocks) of a lane, reference implementation.
 * Hard-coded for this variant: lane length 16 blocks, segment length 4,
 * single lane (the modulo-1 lane computation below always yields 0).
 * @param instance Pointer to the current instance
 * @param position Current position (pass, lane, slice; index is set here)
 */
void fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position) {
    block *ref_block = NULL, *curr_block = NULL;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index;
    uint32_t i;
    /* Argon2i derives addresses independently of the data; Argon2d reads
     * them from the previous block's first word. */
    int data_independent_addressing = (instance->type == Argon2_i);
    /* Pseudo-random values that determine the reference block position */
    uint64_t *pseudo_rands = NULL;

    if (instance == NULL) {
        return;
    }

    pseudo_rands =
        (uint64_t *)malloc(sizeof(uint64_t) * 4);

    if (pseudo_rands == NULL) {
        return;
    }

    if (data_independent_addressing) {
        generate_addresses(instance, &position, pseudo_rands);
    }

    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */
    }

    /* Offset of the current block (lane stride 16, slice stride 4) */
    curr_offset = position.lane * 16 +
                  position.slice * 4 + starting_index;

    if (0 == curr_offset % 16) {
        /* Last block in this lane (wrap around within the lane) */
        prev_offset = curr_offset + 16 - 1;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

    for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) {
        /*1.1 Rotating prev_offset if needed */
        if (curr_offset % 16 == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
            pseudo_rand = pseudo_rands[i];
        } else {
            pseudo_rand = instance->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block
         * (always 0 here: modulo by 1 since this variant is single-lane) */
        ref_lane = ((pseudo_rand >> 32)) % 1;

        if ((position.pass == 0) && (position.slice == 0)) {
            /* Can not reference other lanes yet */
            ref_lane = position.lane;
        }

        /* 1.2.3 Computing the number of possible reference block within the
         * lane.
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                ref_lane == position.lane);

        /* 2 Creating a new block */
        ref_block =
            instance->memory + 16 * ref_lane + ref_index;
        curr_block = instance->memory + curr_offset;
        fill_block(instance->memory + prev_offset, ref_block, curr_block);
    }

    free(pseudo_rands);
}
|
49
stratum/algos/ar2/ref.h
Normal file
49
stratum/algos/ar2/ref.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_REF_H
|
||||
#define ARGON2_REF_H
|
||||
|
||||
/*
|
||||
* Function fills a new memory block
|
||||
* @param prev_block Pointer to the previous block
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_block(const block *prev_block, const block *ref_block,
|
||||
block *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_REF_H */
|
223
stratum/algos/ar2/run.c
Normal file
223
stratum/algos/ar2/run.c
Normal file
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
#define T_COST_DEF 3
|
||||
#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */
|
||||
#define LANES_DEF 1
|
||||
#define THREADS_DEF 1
|
||||
#define OUT_LEN 32
|
||||
#define SALT_LEN 16
|
||||
|
||||
#define UNUSED_PARAMETER(x) (void)(x)
|
||||
|
||||
static void usage(const char *cmd) {
|
||||
printf("Usage: %s pwd salt [-y version] [-t iterations] [-m memory] [-p "
|
||||
"parallelism]\n",
|
||||
cmd);
|
||||
|
||||
printf("Parameters:\n");
|
||||
printf("\tpwd\t\tThe password to hash\n");
|
||||
printf("\tsalt\t\tThe salt to use, at most 16 characters\n");
|
||||
printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n");
|
||||
printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n",
|
||||
T_COST_DEF);
|
||||
printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n",
|
||||
LOG_M_COST_DEF);
|
||||
printf("\t-p N\t\tSets parallelism to N threads (default %d)\n",
|
||||
THREADS_DEF);
|
||||
}
|
||||
|
||||
/* Reports a fatal error on stderr and terminates the process (exit 1). */
static void fatal(const char *error) {
    fprintf(stderr, "Error: %s\n", error);
    exit(1);
}
|
||||
|
||||
/*
|
||||
Runs Argon2 with certain inputs and parameters, inputs not cleared. Prints the
|
||||
Base64-encoded hash string
|
||||
@out output array with at least 32 bytes allocated
|
||||
@pwd NULL-terminated string, presumably from argv[]
|
||||
@salt salt array with at least SALTLEN_DEF bytes allocated
|
||||
@t_cost number of iterations
|
||||
@m_cost amount of requested memory in KB
|
||||
@lanes amount of requested parallelism
|
||||
@threads actual parallelism
|
||||
@type String, only "d" and "i" are accepted
|
||||
*/
|
||||
static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost,
                uint32_t m_cost, uint32_t lanes, uint32_t threads,
                const char *type) {
    clock_t start_time, stop_time;
    unsigned pwd_length;
    argon2_context context;
    int i;

    start_time = clock();

    if (!pwd) {
        fatal("password missing");
    }

    if (!salt) {
        /* Wipe the password before aborting so it does not linger in memory. */
        secure_wipe_memory(pwd, strlen(pwd));
        fatal("salt missing");
    }

    pwd_length = strlen(pwd);

    /* The effective thread count is taken from `lanes` below; the `threads`
     * parameter is deliberately ignored. */
    UNUSED_PARAMETER(threads);

    context.out = out;
    context.outlen = OUT_LEN;           /* always emit a 32-byte hash */
    context.pwd = (uint8_t *)pwd;
    context.pwdlen = pwd_length;
    context.salt = salt;
    context.saltlen = SALT_LEN;         /* salt buffer is zero-padded to 16 */
    context.secret = NULL;
    context.secretlen = 0;
    context.ad = NULL;
    context.adlen = 0;
    context.t_cost = t_cost;
    context.m_cost = m_cost;
    context.lanes = lanes;
    context.threads = lanes;            /* NOTE(review): uses lanes, not the threads arg */
    context.allocate_cbk = NULL;        /* use the library's default allocator */
    context.free_cbk = NULL;
    context.flags = ARGON2_FLAG_CLEAR_PASSWORD; /* library wipes pwd after hashing */

    /* Dispatch on variant: "d" = Argon2d, "i" = Argon2i; anything else aborts. */
    if (!strcmp(type, "d")) {
        int result = argon2d(&context);
        if (result != ARGON2_OK)
            fatal(error_message(result));
    } else if (!strcmp(type, "i")) {
        int result = argon2i(&context);
        if (result != ARGON2_OK)
            fatal(error_message(result));
    } else {
        secure_wipe_memory(pwd, strlen(pwd));
        fatal("wrong Argon2 type");
    }

    stop_time = clock();

    /* add back when proper decoding */
    /*
    char encoded[300];
    encode_string(encoded, sizeof encoded, &context);
    printf("%s\n", encoded);
    */
    /* Print the raw hash as lowercase hex. */
    printf("Hash:\t\t");
    for (i = 0; i < context.outlen; ++i) {
        printf("%02x", context.out[i]);
    }
    printf("\n");

    /* Wall-clock-ish timing via clock(); measures CPU time, not elapsed time. */
    printf("%2.3f seconds\n",
           ((double)stop_time - start_time) / (CLOCKS_PER_SEC));
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
unsigned char out[OUT_LEN];
|
||||
uint32_t m_cost = 1 << LOG_M_COST_DEF;
|
||||
uint32_t t_cost = T_COST_DEF;
|
||||
uint32_t lanes = LANES_DEF;
|
||||
uint32_t threads = THREADS_DEF;
|
||||
char *pwd = NULL;
|
||||
uint8_t salt[SALT_LEN];
|
||||
const char *type = "i";
|
||||
int i;
|
||||
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
return ARGON2_MISSING_ARGS;
|
||||
}
|
||||
|
||||
/* get password and salt from command line */
|
||||
pwd = argv[1];
|
||||
if (strlen(argv[2]) > SALT_LEN) {
|
||||
fatal("salt too long");
|
||||
}
|
||||
memset(salt, 0x00, SALT_LEN); /* pad with null bytes */
|
||||
memcpy(salt, argv[2], strlen(argv[2]));
|
||||
|
||||
/* parse options */
|
||||
for (i = 3; i < argc; i++) {
|
||||
const char *a = argv[i];
|
||||
unsigned long input = 0;
|
||||
if (!strcmp(a, "-m")) {
|
||||
if (i < argc - 1) {
|
||||
i++;
|
||||
input = strtoul(argv[i], NULL, 10);
|
||||
if (input == 0 || input == ULONG_MAX ||
|
||||
input > ARGON2_MAX_MEMORY_BITS) {
|
||||
fatal("bad numeric input for -m");
|
||||
}
|
||||
m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF));
|
||||
if (m_cost > ARGON2_MAX_MEMORY) {
|
||||
fatal("m_cost overflow");
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
fatal("missing -m argument");
|
||||
}
|
||||
} else if (!strcmp(a, "-t")) {
|
||||
if (i < argc - 1) {
|
||||
i++;
|
||||
input = strtoul(argv[i], NULL, 10);
|
||||
if (input == 0 || input == ULONG_MAX ||
|
||||
input > ARGON2_MAX_TIME) {
|
||||
fatal("bad numeric input for -t");
|
||||
}
|
||||
t_cost = input;
|
||||
continue;
|
||||
} else {
|
||||
fatal("missing -t argument");
|
||||
}
|
||||
} else if (!strcmp(a, "-p")) {
|
||||
if (i < argc - 1) {
|
||||
i++;
|
||||
input = strtoul(argv[i], NULL, 10);
|
||||
if (input == 0 || input == ULONG_MAX ||
|
||||
input > ARGON2_MAX_THREADS || input > ARGON2_MAX_LANES) {
|
||||
fatal("bad numeric input for -p");
|
||||
}
|
||||
threads = input;
|
||||
lanes = threads;
|
||||
continue;
|
||||
} else {
|
||||
fatal("missing -p argument");
|
||||
}
|
||||
} else if (!strcmp(a, "-d")) {
|
||||
type = "d";
|
||||
} else {
|
||||
fatal("unknown argument");
|
||||
}
|
||||
}
|
||||
printf("Type:\t\tArgon2%c\n", type[0]);
|
||||
printf("Iterations:\t%" PRIu32 " \n", t_cost);
|
||||
printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost);
|
||||
printf("Parallelism:\t%" PRIu32 " \n", lanes);
|
||||
run(out, pwd, salt, t_cost, m_cost, lanes, threads, type);
|
||||
|
||||
return ARGON2_OK;
|
||||
}
|
249
stratum/algos/ar2/scrypt-jane.c
Normal file
249
stratum/algos/ar2/scrypt-jane.c
Normal file
|
@ -0,0 +1,249 @@
|
|||
/*
|
||||
scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
|
||||
|
||||
Public Domain or MIT License, whichever is easier
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "scrypt-jane.h"
|
||||
|
||||
#include "sj/scrypt-jane-portable.h"
|
||||
#include "sj/scrypt-jane-hash.h"
|
||||
#include "sj/scrypt-jane-romix.h"
|
||||
#include "sj/scrypt-jane-test-vectors.h"
|
||||
|
||||
#define scrypt_maxNfactor 30 /* (1 << (30 + 1)) = ~2 billion */
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
|
||||
#endif
|
||||
#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
|
||||
#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */
|
||||
|
||||
#include <stdio.h>
|
||||
//#include <malloc.h>
|
||||
|
||||
/* Default fatal-error handler: print the message and terminate with status 1.
 * Installed unless scrypt_set_fatal_error() replaced it. Never returns. */
static void NORETURN
scrypt_fatal_error_default(const char *msg) {
	fprintf(stderr, "%s\n", msg);
	exit(1);
}
|
||||
|
||||
/* Current fatal-error callback; invoked on self-test or allocation failure. */
static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;

/* Install a user-supplied fatal-error handler. Passing NULL installs NULL,
 * which would crash on the next fatal error -- callers must pass a valid fn. */
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
	scrypt_fatal_error = fn;
}
|
||||
|
||||
/* Run the mix-function, hash-function and full-scrypt self-tests.
 * Returns a bitmask: bit 0 = mix OK, bit 1 = hash OK, bit 2 = vectors OK
 * (7 means everything passed). Outside SCRYPT_TEST builds each failure
 * calls scrypt_fatal_error() and so normally never returns a cleared bit. */
static int scrypt_power_on_self_test(void)
{
	const scrypt_test_setting *t;
	uint8_t test_digest[64];
	uint32_t i;
	int res = 7, scrypt_valid;

	if (!scrypt_test_mix()) {
#if !defined(SCRYPT_TEST)
		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
#endif
		res &= ~1;
	}

	if (!scrypt_test_hash()) {
#if !defined(SCRYPT_TEST)
		scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
#endif
		res &= ~2;
	}

	/* Compare full scrypt output against the compiled-in test vectors. */
	for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) {
		t = post_settings + i;
		scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest));
		scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest));
	}

	if (!scrypt_valid) {
#if !defined(SCRYPT_TEST)
		scrypt_fatal_error("scrypt: scrypt power-on-self-test failed");
#endif
		res &= ~4;
	}

	return res;
}
|
||||
|
||||
/* Pair of pointers for an over-allocated buffer: `mem` is what malloc
 * returned (and what must be freed), `ptr` is the SCRYPT_BLOCK_BYTES-aligned
 * view the algorithm actually uses. */
typedef struct scrypt_aligned_alloc_t {
	uint8_t *mem, *ptr;
} scrypt_aligned_alloc;

#ifdef SCRYPT_TEST_SPEED

/* Benchmark mode: one ~1 GiB arena allocated once and handed out with a bump
 * pointer, so repeated runs do not pay malloc cost. The arena is never freed. */
static uint8_t *mem_base = (uint8_t *)0;
static size_t mem_bump = 0;

/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
{
	scrypt_aligned_alloc aa;
	if (!mem_base) {
		mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
		if (!mem_base)
			scrypt_fatal_error("scrypt: out of memory");
		/* Round the base up to the next SCRYPT_BLOCK_BYTES boundary. */
		mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	}
	aa.mem = mem_base + mem_bump;
	aa.ptr = aa.mem;
	mem_bump += (size_t)size;
	return aa;
}

/* Benchmark mode "free": just reset the bump pointer for the next run. */
static void scrypt_free(scrypt_aligned_alloc *aa) {
	mem_bump = 0;
}

#else

/* Normal mode: malloc `size` plus alignment slack, align `ptr` up to a
 * SCRYPT_BLOCK_BYTES boundary. Aborts via scrypt_fatal_error on failure. */
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
{
	static const size_t max_alloc = (size_t)-1;
	scrypt_aligned_alloc aa;
	size += (SCRYPT_BLOCK_BYTES - 1);
	/* Guard against uint64_t sizes that do not fit in size_t (32-bit hosts). */
	if (size > max_alloc)
		scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
	aa.mem = (uint8_t *)malloc((size_t)size);
	aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	if (!aa.mem)
		scrypt_fatal_error("scrypt: out of memory");
	return aa;
}

static void scrypt_free(scrypt_aligned_alloc *aa)
{
	free(aa->mem);
}

#endif /* SCRYPT_TEST_SPEED */
|
||||
|
||||
|
||||
/* Full scrypt KDF (PBKDF2 -> ROMix per lane -> PBKDF2).
 * N = 1 << (Nfactor + 1) chunks, r = 1 << rfactor blocks-per-half-chunk,
 * p = 1 << pfactor parallel lanes; writes `bytes` bytes of output to `out`.
 * Aborts via scrypt_fatal_error on out-of-range factors, allocation failure,
 * or (first call only) a failed power-on self-test. */
void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len,
	uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes)
{
	scrypt_aligned_alloc YX, V;
	uint8_t *X, *Y;
	uint32_t N, r, p, chunk_bytes, i;

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	/* Pick the best ROMix implementation for this CPU at runtime. */
	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif

#if !defined(SCRYPT_TEST)
	/* Run the self-test exactly once per process. */
	static int power_on_self_test = 0;
	if (!power_on_self_test) {
		power_on_self_test = 1;
		if (!scrypt_power_on_self_test())
			scrypt_fatal_error("scrypt: power on self test failed");
	}
#endif

	if (Nfactor > scrypt_maxNfactor)
		scrypt_fatal_error("scrypt: N out of range");
	if (rfactor > scrypt_maxrfactor)
		scrypt_fatal_error("scrypt: r out of range");
	if (pfactor > scrypt_maxpfactor)
		scrypt_fatal_error("scrypt: p out of range");

	N = (1 << (Nfactor + 1));
	r = (1 << rfactor);
	p = (1 << pfactor);

	chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
	V = scrypt_alloc((uint64_t)N * chunk_bytes);   /* scratch: N chunks */
	YX = scrypt_alloc((p + 1) * chunk_bytes);      /* Y (1 chunk) + X (p chunks) */

	/* 1: X = PBKDF2(password, salt) */
	Y = YX.ptr;
	X = Y + chunk_bytes;
	scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p);

	/* 2: X = ROMix(X), one chunk per lane */
	for (i = 0; i < p; i++)
		scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r);

	/* 3: Out = PBKDF2(password, X) */
	scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes);

	/* X/Y hold key-derived material; wipe before releasing. */
	scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes);

	scrypt_free(&V);
	scrypt_free(&YX);
}
|
||||
|
||||
/* Fixed parameters for my_scrypt(): Nfactor 8 -> N = 1 << 9 = 512 chunks,
 * r = 1, p = 1; chunk_bytes precomputed per SCRYPT_BLOCK_BYTES. */
#define Nfactor 8
#define rfactor 0
#define pfactor 0
#if (SCRYPT_BLOCK_BYTES == 64)
#define chunk_bytes 128
#elif (SCRYPT_BLOCK_BYTES == 128)
#define chunk_bytes 256
#elif (SCRYPT_BLOCK_BYTES == 256)
#define chunk_bytes 512
#elif (SCRYPT_BLOCK_BYTES == 512)
#define chunk_bytes 1024
#endif

/* Specialized scrypt with the fixed N=512, r=1, p=1 parameters above and a
 * fixed 32-byte output -- the hot path used by the pool's hashing code.
 * Skips the power-on self-test (commented out below). */
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out)
{
	scrypt_aligned_alloc YX, V;
	uint8_t *X, *Y;

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	/* Pick the best ROMix implementation for this CPU at runtime. */
	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif

	/*
#if !defined(SCRYPT_TEST)
	static int power_on_self_test = 0;
	if (!power_on_self_test) {
		power_on_self_test = 1;
		if (!scrypt_power_on_self_test())
			scrypt_fatal_error("scrypt: power on self test failed");
	}
#endif
	*/
	V = scrypt_alloc((uint64_t)512 * chunk_bytes); /* N = 512 scratch chunks */
	YX = scrypt_alloc(2 * chunk_bytes);            /* Y + X, one chunk each */

	/* 1: X = PBKDF2(password, salt) */
	Y = YX.ptr;
	X = Y + chunk_bytes;
	scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes);

	/* 2: X = ROMix(X) */
	scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1);

	/* 3: Out = PBKDF2(password, X), fixed 32-byte output */
	scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32);

	/* Wipe key-derived working memory before releasing. */
	scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes);

	scrypt_free(&V);
	scrypt_free(&YX);
}
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
#endif
|
33
stratum/algos/ar2/scrypt-jane.h
Normal file
33
stratum/algos/ar2/scrypt-jane.h
Normal file
|
@ -0,0 +1,33 @@
|
|||
#ifndef AR2_SCRYPT_JANE_H
#define AR2_SCRYPT_JANE_H

/* Build configuration: Skein-512 as the hash, Salsa20/8 with 64-bit words
 * ("salsa64") as the mix function. */
//#define SCRYPT_CHOOSE_COMPILETIME
//#define SCRYPT_TEST
#define SCRYPT_SKEIN512
#define SCRYPT_SALSA64

/*
	Nfactor: Increases CPU & Memory Hardness
	N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used

	rfactor: Increases Memory Hardness
	r = (1 << rfactor): How large a chunk is

	pfactor: Increases CPU Hardness
	p = (1 << pfactor): Number of times to mix the main chunk

	A block is the basic mixing unit (salsa/chacha block = 64 bytes)
	A chunk is (2 * r) blocks

	~Memory used = (N + 2) * ((2 * r) * block size)
*/

#include <stdlib.h>
#include <stdint.h>

/* Handler invoked on unrecoverable errors (self-test or allocation failure). */
typedef void (*scrypt_fatal_errorfn)(const char *msg);
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);

/* General scrypt KDF; factors are exponents (see comment above).
 * NOTE(review): declared with unsigned char here but defined with uint8_t in
 * scrypt-jane.c -- identical on all supported platforms, but worth unifying. */
void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes);
/* Fixed-parameter variant (N=512, r=1, p=1, 32-byte output). */
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out);
#endif /* AR2_SCRYPT_JANE_H */
|
38
stratum/algos/ar2/sj/scrypt-jane-hash.h
Normal file
38
stratum/algos/ar2/sj/scrypt-jane-hash.h
Normal file
|
@ -0,0 +1,38 @@
|
|||
/* Select the configured hash backend; the #else branch is a stub that only
 * exists to keep editors/analyzers happy -- it deliberately #errors out. */
#if defined(SCRYPT_SKEIN512)
#include "scrypt-jane-hash_skein512.h"
#else
#define SCRYPT_HASH "ERROR"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64
typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
static void scrypt_hash_init(scrypt_hash_state *S) {}
static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
#error must define a hash function!
#endif

#include "scrypt-jane-pbkdf2.h"

#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */

/* Self-test for the hash backend: hash prefixes of a fixed message of every
 * length 0..257, feed each digest into a running hash, and compare the final
 * digest against the compiled-in expected value. Returns nonzero on match. */
static int
scrypt_test_hash(void) {
	scrypt_hash_state st;
	scrypt_hash_digest hash, final;
	uint8_t msg[SCRYPT_TEST_HASH_LEN];
	size_t i;

	for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
		msg[i] = (uint8_t)i;

	scrypt_hash_init(&st);
	for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
		scrypt_hash(hash, msg, i);
		scrypt_hash_update(&st, hash, sizeof(hash));
	}
	scrypt_hash_finish(&st, final);
	return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}
|
||||
|
188
stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h
Normal file
188
stratum/algos/ar2/sj/scrypt-jane-hash_skein512.h
Normal file
|
@ -0,0 +1,188 @@
|
|||
#define SCRYPT_HASH "Skein-512"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

/* Skein-512 streaming state: chaining value X, tweak T, plus a one-block
 * buffer for partial input. */
typedef struct scrypt_hash_state_t {
	uint64_t X[8], T[2];
	uint32_t leftover;
	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
} scrypt_hash_state;

#include <stdio.h>

/* Process `blocks` consecutive 64-byte blocks through the Skein-512 (Threefish-512
 * based) compression function. `add` is the number of message bytes this block
 * contributes to the tweak's running length (SCRYPT_HASH_BLOCK_SIZE for full
 * blocks, the remaining byte count for the final block).
 * Statement order here is the algorithm -- do not reorder. */
static void
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
	uint64_t X[8], key[8], Xt[9+18], T[3+1];
	size_t r;

	while (blocks--) {
		/* Update the 128-bit tweak and derive the parity words. */
		T[0] = S->T[0] + add;
		T[1] = S->T[1];
		T[2] = T[0] ^ T[1];
		/* Load the block little-endian as the Threefish key; inject the
		 * chaining value (plus tweak words) as the first subkey. */
		key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
		key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
		key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
		key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
		key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
		key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
		key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
		key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
		/* Key-schedule parity word (Threefish C240 constant). */
		Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
		in += SCRYPT_HASH_BLOCK_SIZE;

		/* Replicate the subkey words so the schedule can be indexed linearly. */
		for (r = 0; r < 18; r++)
			Xt[r + 9] = Xt[r + 0];

		/* 72 rounds, unrolled 8 at a time with subkey injection every 4. */
		for (r = 0; r < 18; r += 2) {
			X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
			X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
			X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
			X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
			X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
			X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
			X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
			X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
			X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
			X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
			X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
			X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
			X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
			X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
			X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
			X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];

			X[0] += Xt[r + 1];
			X[1] += Xt[r + 2];
			X[2] += Xt[r + 3];
			X[3] += Xt[r + 4];
			X[4] += Xt[r + 5];
			X[5] += Xt[r + 6] + T[1];
			X[6] += Xt[r + 7] + T[2];
			X[7] += Xt[r + 8] + r + 1;

			/* Rotate the tweak words for the next subkey. */
			T[3] = T[0];
			T[0] = T[1];
			T[1] = T[2];
			T[2] = T[3];

			X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
			X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
			X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
			X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
			X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
			X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
			X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
			X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
			X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
			X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
			X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
			X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
			X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
			X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
			X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
			X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];

			X[0] += Xt[r + 2];
			X[1] += Xt[r + 3];
			X[2] += Xt[r + 4];
			X[3] += Xt[r + 5];
			X[4] += Xt[r + 6];
			X[5] += Xt[r + 7] + T[1];
			X[6] += Xt[r + 8] + T[2];
			X[7] += Xt[r + 9] + r + 2;

			T[3] = T[0];
			T[0] = T[1];
			T[1] = T[2];
			T[2] = T[3];
		}

		/* Feed-forward: new chaining value = ciphertext ^ plaintext (key). */
		S->X[0] = key[0] ^ X[0];
		S->X[1] = key[1] ^ X[1];
		S->X[2] = key[2] ^ X[2];
		S->X[3] = key[3] ^ X[3];
		S->X[4] = key[4] ^ X[4];
		S->X[5] = key[5] ^ X[5];
		S->X[6] = key[6] ^ X[6];
		S->X[7] = key[7] ^ X[7];

		/* Persist the tweak, clearing the "first block" flag bit. */
		S->T[0] = T[0];
		S->T[1] = T[1] & ~0x4000000000000000ull;
	}
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->X[0] = 0x4903ADFF749C51CEull;
|
||||
S->X[1] = 0x0D95DE399746DF03ull;
|
||||
S->X[2] = 0x8FD1934127C79BCEull;
|
||||
S->X[3] = 0x9A255629FF352CB1ull;
|
||||
S->X[4] = 0x5DB62599DF6CA7B0ull;
|
||||
S->X[5] = 0xEABE394CA9D5C3F4ull;
|
||||
S->X[6] = 0x991112C71A75B523ull;
|
||||
S->X[7] = 0xAE18A40B660FCC33ull;
|
||||
S->T[0] = 0x0000000000000000ull;
|
||||
S->T[1] = 0x7000000000000000ull;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
/* Absorb `inlen` bytes into the streaming state. Buffers partial blocks;
 * always keeps at least one byte buffered so the final (tweaked) block can
 * be processed by scrypt_hash_finish(). */
static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
	size_t blocks, want;

	/* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
	if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
		/* handle the previous data, we know there is enough for at least one block */
		if (S->leftover) {
			want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
			memcpy(S->buffer + S->leftover, in, want);
			in += want;
			inlen -= want;
			S->leftover = 0;
			skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
		}

		/* handle the current data if there's more than one block */
		if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
			/* Round down to whole blocks while keeping >=1 byte for the buffer. */
			blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
			skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
			inlen -= blocks;
			in += blocks;
		}
	}

	/* handle leftover data */
	memcpy(S->buffer + S->leftover, in, inlen);
	S->leftover += inlen;
}
|
||||
|
||||
/* Finalize: process the buffered final block with the "last block" tweak bit
 * set, then run the output transform (a block with the 0xff.. output tweak)
 * and serialize the chaining value little-endian into hash[64]. */
static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
	/* Zero-pad the final partial block; `add` = real byte count (leftover). */
	memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
	S->T[1] |= 0x8000000000000000ull; /* mark final message block */
	skein512_blocks(S, S->buffer, 1, S->leftover);

	/* Output stage: hash an all-zero block under the output tweak. */
	memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
	S->T[0] = 0;
	S->T[1] = 0xff00000000000000ull;
	skein512_blocks(S, S->buffer, 1, 8);

	U64TO8_LE(&hash[ 0], S->X[0]);
	U64TO8_LE(&hash[ 8], S->X[1]);
	U64TO8_LE(&hash[16], S->X[2]);
	U64TO8_LE(&hash[24], S->X[3]);
	U64TO8_LE(&hash[32], S->X[4]);
	U64TO8_LE(&hash[40], S->X[5]);
	U64TO8_LE(&hash[48], S->X[6]);
	U64TO8_LE(&hash[56], S->X[7]);
}
|
||||
|
||||
|
||||
/* Expected final digest for scrypt_test_hash()'s fixed message schedule
 * (see scrypt-jane-hash.h); compared via scrypt_verify at self-test time. */
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
	0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
	0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
	0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
	0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
};
|
381
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h
Normal file
381
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h
Normal file
|
@ -0,0 +1,381 @@
|
|||
/* x86 */
|
||||
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,32)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[ecx+eax+0])
|
||||
a3(vpxor xmm1,xmm1,[ecx+eax+16])
|
||||
a3(vpxor xmm2,xmm2,[ecx+eax+32])
|
||||
a3(vpxor xmm3,xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and eax, eax)
|
||||
a3(vpxor xmm0,xmm0,[esi+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[esi+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[esi+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[esi+ecx+48])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[eax+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[eax+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[eax+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [esp+0],xmm0)
|
||||
a2(vmovdqa [esp+16],xmm1)
|
||||
a2(vmovdqa xmm6,xmm2)
|
||||
a2(vmovdqa xmm7,xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_salsa_avx_loop: )
|
||||
a3(vpaddd xmm4, xmm1, xmm0)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpshufd xmm3, xmm3, 0x93)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(vpshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm0)
|
||||
a3(vpshufd xmm1, xmm1, 0x39)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm1, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpshufd xmm1, xmm1, 0x93)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(vpshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(vpshufd xmm3, xmm3, 0x39)
|
||||
a2(sub eax, 2)
|
||||
aj(ja scrypt_salsa_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,[esp+0])
|
||||
a3(vpaddd xmm1,xmm1,[esp+16])
|
||||
a3(vpaddd xmm2,xmm2,xmm6)
|
||||
a3(vpaddd xmm3,xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(vmovdqa [eax+0],xmm0)
|
||||
a2(vmovdqa [eax+16],xmm1)
|
||||
a2(vmovdqa [eax+32],xmm2)
|
||||
a2(vmovdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
aj(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa xmm8,xmm0)
|
||||
a2(vmovdqa xmm9,xmm1)
|
||||
a2(vmovdqa xmm10,xmm2)
|
||||
a2(vmovdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa_avx_loop: )
|
||||
a3(vpaddd xmm4, xmm1, xmm0)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpshufd xmm3, xmm3, 0x93)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(vpshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm0)
|
||||
a3(vpshufd xmm1, xmm1, 0x39)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm1, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpshufd xmm1, xmm1, 0x93)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(vpshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(vpshufd xmm3, xmm3, 0x39)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,xmm8)
|
||||
a3(vpaddd xmm1,xmm1,xmm9)
|
||||
a3(vpaddd xmm2,xmm2,xmm10)
|
||||
a3(vpaddd xmm3,xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
aj(jne scrypt_ChunkMix_avx_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
static void asm_calling_convention NOINLINE
|
||||
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x4 = x1;
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x0;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x3;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x2;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x4 = x3;
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x39);
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x0;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x1;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x2;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x39);
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
/* uses salsa_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-AVX"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
443
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h
Normal file
443
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h
Normal file
|
@ -0,0 +1,443 @@
|
|||
/* x86 */
|
||||
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
/*
 * scrypt BlockMix (Salsa20/8 core) over one chunk — 32-bit x86 naked asm.
 * Emitted through the a1/a2/a3/aj/aret inline-asm macros (defined elsewhere).
 * cdecl-style stack args after the four pushes:
 *   [ebp+20] = Bout, [ebp+24] = Bin, [ebp+28] = Bxor (may be 0), [ebp+32] = r.
 * 32 bytes of 64-byte-aligned scratch are carved from the stack to spill
 * xmm0/xmm1 (only xmm0-7 exist on x86), with xmm6/xmm7 holding the other half.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	/* aligned spill area; edx = chunk size in bytes (r * 128) */
	a2(sub esp,32)
	a2(and esp,~63)
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	/* test Bxor for NULL (sets ZF) */
	a2(and eax, eax)
	/* X = last 64-byte block of Bin (optionally ^ Bxor) */
	a2(movdqa xmm0,[ecx+esi+0])
	a2(movdqa xmm1,[ecx+esi+16])
	a2(movdqa xmm2,[ecx+esi+32])
	a2(movdqa xmm3,[ecx+esi+48])
	aj(jz scrypt_ChunkMix_sse2_no_xor1)
	a2(pxor xmm0,[ecx+eax+0])
	a2(pxor xmm1,[ecx+eax+16])
	a2(pxor xmm2,[ecx+eax+32])
	a2(pxor xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_sse2_no_xor1:)
	/* ecx = byte offset of current block, ebx = odd/even toggle */
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	a1(scrypt_ChunkMix_sse2_loop:)
	a2(and eax, eax)
	a2(pxor xmm0,[esi+ecx+0])
	a2(pxor xmm1,[esi+ecx+16])
	a2(pxor xmm2,[esi+ecx+32])
	a2(pxor xmm3,[esi+ecx+48])
	aj(jz scrypt_ChunkMix_sse2_no_xor2)
	a2(pxor xmm0,[eax+ecx+0])
	a2(pxor xmm1,[eax+ecx+16])
	a2(pxor xmm2,[eax+ecx+32])
	a2(pxor xmm3,[eax+ecx+48])
	a1(scrypt_ChunkMix_sse2_no_xor2:)
	/* save input state for the feed-forward add after the rounds */
	a2(movdqa [esp+0],xmm0)
	a2(movdqa [esp+16],xmm1)
	a2(movdqa xmm6,xmm2)
	a2(movdqa xmm7,xmm3)
	a2(mov eax,8)
	/* Salsa20/8: two rounds per iteration; rotates are pslld/psrld pairs */
	a1(scrypt_salsa_sse2_loop: )
	a2(movdqa xmm4, xmm1)
	a2(paddd xmm4, xmm0)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 7)
	a2(psrld xmm5, 25)
	a2(pxor xmm3, xmm4)
	a2(movdqa xmm4, xmm0)
	a2(pxor xmm3, xmm5)
	a2(paddd xmm4, xmm3)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 9)
	a2(psrld xmm5, 23)
	a2(pxor xmm2, xmm4)
	a2(movdqa xmm4, xmm3)
	a2(pxor xmm2, xmm5)
	a3(pshufd xmm3, xmm3, 0x93)
	a2(paddd xmm4, xmm2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 13)
	a2(psrld xmm5, 19)
	a2(pxor xmm1, xmm4)
	a2(movdqa xmm4, xmm2)
	a2(pxor xmm1, xmm5)
	a3(pshufd xmm2, xmm2, 0x4e)
	a2(paddd xmm4, xmm1)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 18)
	a2(psrld xmm5, 14)
	a2(pxor xmm0, xmm4)
	a2(movdqa xmm4, xmm3)
	a2(pxor xmm0, xmm5)
	a3(pshufd xmm1, xmm1, 0x39)
	a2(paddd xmm4, xmm0)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 7)
	a2(psrld xmm5, 25)
	a2(pxor xmm1, xmm4)
	a2(movdqa xmm4, xmm0)
	a2(pxor xmm1, xmm5)
	a2(paddd xmm4, xmm1)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 9)
	a2(psrld xmm5, 23)
	a2(pxor xmm2, xmm4)
	a2(movdqa xmm4, xmm1)
	a2(pxor xmm2, xmm5)
	a3(pshufd xmm1, xmm1, 0x93)
	a2(paddd xmm4, xmm2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 13)
	a2(psrld xmm5, 19)
	a2(pxor xmm3, xmm4)
	a2(movdqa xmm4, xmm2)
	a2(pxor xmm3, xmm5)
	a3(pshufd xmm2, xmm2, 0x4e)
	a2(paddd xmm4, xmm3)
	a2(sub eax, 2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 18)
	a2(psrld xmm5, 14)
	a2(pxor xmm0, xmm4)
	a3(pshufd xmm3, xmm3, 0x39)
	a2(pxor xmm0, xmm5)
	aj(ja scrypt_salsa_sse2_loop)
	/* feed-forward add of the saved input state */
	a2(paddd xmm0,[esp+0])
	a2(paddd xmm1,[esp+16])
	a2(paddd xmm2,xmm6)
	a2(paddd xmm3,xmm7)
	/* compute interleaved output offset: even blocks first half, odd second */
	a2(lea eax,[ebx+ecx])
	a2(xor ebx,edx)
	a2(and eax,~0x7f)
	a2(add ecx,64)
	a2(shr eax,1)
	a2(add eax, edi)
	a2(cmp ecx,edx)
	a2(movdqa [eax+0],xmm0)
	a2(movdqa [eax+16],xmm1)
	a2(movdqa [eax+32],xmm2)
	a2(movdqa [eax+48],xmm3)
	/* reload Bxor (eax was clobbered as the round counter) */
	a2(mov eax,[ebp+28])
	aj(jne scrypt_ChunkMix_sse2_loop)
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
/*
 * scrypt BlockMix (Salsa20/8 core) over one chunk — x86-64 naked asm.
 * Register args (SysV order assumed by the surrounding macros — TODO confirm
 * against the asm macro layer): rdi = Bout, rsi = Bin, rdx = Bxor (may be 0),
 * ecx = r. xmm8-xmm11 hold the saved input state, so no stack spills needed.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* test Bxor for NULL (sets ZF) */
	a2(and rdx, rdx)
	/* X = last 64-byte block of Bin (optionally ^ Bxor) */
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	aj(jz scrypt_ChunkMix_sse2_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a1(scrypt_ChunkMix_sse2_no_xor1:)
	/* r9 = byte offset of current block, r8 = odd/even toggle */
	a2(xor r9,r9)
	a2(xor r8,r8)
	a1(scrypt_ChunkMix_sse2_loop:)
	a2(and rdx, rdx)
	a2(pxor xmm0,[rsi+r9+0])
	a2(pxor xmm1,[rsi+r9+16])
	a2(pxor xmm2,[rsi+r9+32])
	a2(pxor xmm3,[rsi+r9+48])
	aj(jz scrypt_ChunkMix_sse2_no_xor2)
	a2(pxor xmm0,[rdx+r9+0])
	a2(pxor xmm1,[rdx+r9+16])
	a2(pxor xmm2,[rdx+r9+32])
	a2(pxor xmm3,[rdx+r9+48])
	a1(scrypt_ChunkMix_sse2_no_xor2:)
	/* save input state for the feed-forward add after the rounds */
	a2(movdqa xmm8,xmm0)
	a2(movdqa xmm9,xmm1)
	a2(movdqa xmm10,xmm2)
	a2(movdqa xmm11,xmm3)
	a2(mov rax,8)
	/* Salsa20/8: two rounds per iteration; rotates are pslld/psrld pairs */
	a1(scrypt_salsa_sse2_loop: )
	a2(movdqa xmm4, xmm1)
	a2(paddd xmm4, xmm0)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 7)
	a2(psrld xmm5, 25)
	a2(pxor xmm3, xmm4)
	a2(movdqa xmm4, xmm0)
	a2(pxor xmm3, xmm5)
	a2(paddd xmm4, xmm3)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 9)
	a2(psrld xmm5, 23)
	a2(pxor xmm2, xmm4)
	a2(movdqa xmm4, xmm3)
	a2(pxor xmm2, xmm5)
	a3(pshufd xmm3, xmm3, 0x93)
	a2(paddd xmm4, xmm2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 13)
	a2(psrld xmm5, 19)
	a2(pxor xmm1, xmm4)
	a2(movdqa xmm4, xmm2)
	a2(pxor xmm1, xmm5)
	a3(pshufd xmm2, xmm2, 0x4e)
	a2(paddd xmm4, xmm1)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 18)
	a2(psrld xmm5, 14)
	a2(pxor xmm0, xmm4)
	a2(movdqa xmm4, xmm3)
	a2(pxor xmm0, xmm5)
	a3(pshufd xmm1, xmm1, 0x39)
	a2(paddd xmm4, xmm0)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 7)
	a2(psrld xmm5, 25)
	a2(pxor xmm1, xmm4)
	a2(movdqa xmm4, xmm0)
	a2(pxor xmm1, xmm5)
	a2(paddd xmm4, xmm1)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 9)
	a2(psrld xmm5, 23)
	a2(pxor xmm2, xmm4)
	a2(movdqa xmm4, xmm1)
	a2(pxor xmm2, xmm5)
	a3(pshufd xmm1, xmm1, 0x93)
	a2(paddd xmm4, xmm2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 13)
	a2(psrld xmm5, 19)
	a2(pxor xmm3, xmm4)
	a2(movdqa xmm4, xmm2)
	a2(pxor xmm3, xmm5)
	a3(pshufd xmm2, xmm2, 0x4e)
	a2(paddd xmm4, xmm3)
	a2(sub rax, 2)
	a2(movdqa xmm5, xmm4)
	a2(pslld xmm4, 18)
	a2(psrld xmm5, 14)
	a2(pxor xmm0, xmm4)
	a3(pshufd xmm3, xmm3, 0x39)
	a2(pxor xmm0, xmm5)
	aj(ja scrypt_salsa_sse2_loop)
	/* feed-forward add of the saved input state */
	a2(paddd xmm0,xmm8)
	a2(paddd xmm1,xmm9)
	a2(paddd xmm2,xmm10)
	a2(paddd xmm3,xmm11)
	/* compute interleaved output offset: even blocks first half, odd second */
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0x7f)
	a2(add r9,64)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(movdqa [rax+0],xmm0)
	a2(movdqa [rax+16],xmm1)
	a2(movdqa [rax+32],xmm2)
	a2(movdqa [rax+48],xmm3)
	aj(jne scrypt_ChunkMix_sse2_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
static void NOINLINE asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x4 = x1;
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x0;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x3;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x2;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x4 = x3;
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x39);
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x0;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x1;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x2;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x39);
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-SSE2"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
||||
|
||||
/* used by avx,etc as well */
|
||||
#if defined(SCRYPT_SALSA_INCLUDED)
|
||||
/*
|
||||
Default layout:
|
||||
0 1 2 3
|
||||
4 5 6 7
|
||||
8 9 10 11
|
||||
12 13 14 15
|
||||
|
||||
SSE2 layout:
|
||||
0 5 10 15
|
||||
12 1 6 11
|
||||
8 13 2 7
|
||||
4 9 14 3
|
||||
*/
|
||||
|
||||
static void asm_calling_convention
|
||||
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
|
||||
uint32_t t;
|
||||
while (count--) {
|
||||
t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
|
||||
t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
|
||||
t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
|
||||
t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
|
||||
t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
|
||||
t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
|
||||
blocks += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
317
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h
Normal file
317
stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h
Normal file
|
@ -0,0 +1,317 @@
|
|||
/* x86 */
|
||||
#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_XOP
|
||||
|
||||
/*
 * scrypt BlockMix (Salsa20/8 core) over one chunk — 32-bit x86 naked asm,
 * AMD XOP build: vprotd performs the lane rotates in one instruction.
 * Stack args after the four pushes:
 *   [ebp+20] = Bout, [ebp+24] = Bin, [ebp+28] = Bxor (may be 0), [ebp+32] = r.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	/* aligned spill area; edx = chunk size in bytes (r * 128) */
	a2(sub esp,32)
	a2(and esp,~63)
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	/* test Bxor for NULL (sets ZF) */
	a2(and eax, eax)
	/* X = last 64-byte block of Bin (optionally ^ Bxor) */
	a2(movdqa xmm0,[ecx+esi+0])
	a2(movdqa xmm1,[ecx+esi+16])
	a2(movdqa xmm2,[ecx+esi+32])
	a2(movdqa xmm3,[ecx+esi+48])
	aj(jz scrypt_ChunkMix_xop_no_xor1)
	a3(vpxor xmm0,xmm0,[ecx+eax+0])
	a3(vpxor xmm1,xmm1,[ecx+eax+16])
	a3(vpxor xmm2,xmm2,[ecx+eax+32])
	a3(vpxor xmm3,xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_xop_no_xor1:)
	/* ecx = byte offset of current block, ebx = odd/even toggle */
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	a1(scrypt_ChunkMix_xop_loop:)
	a2(and eax, eax)
	a3(vpxor xmm0,xmm0,[esi+ecx+0])
	a3(vpxor xmm1,xmm1,[esi+ecx+16])
	a3(vpxor xmm2,xmm2,[esi+ecx+32])
	a3(vpxor xmm3,xmm3,[esi+ecx+48])
	aj(jz scrypt_ChunkMix_xop_no_xor2)
	a3(vpxor xmm0,xmm0,[eax+ecx+0])
	a3(vpxor xmm1,xmm1,[eax+ecx+16])
	a3(vpxor xmm2,xmm2,[eax+ecx+32])
	a3(vpxor xmm3,xmm3,[eax+ecx+48])
	a1(scrypt_ChunkMix_xop_no_xor2:)
	/* save input state for the feed-forward add after the rounds */
	a2(vmovdqa [esp+0],xmm0)
	a2(vmovdqa [esp+16],xmm1)
	a2(vmovdqa xmm6,xmm2)
	a2(vmovdqa xmm7,xmm3)
	a2(mov eax,8)
	/* Salsa20/8: two rounds per iteration, vprotd = 32-bit lane rotate */
	a1(scrypt_salsa_xop_loop: )
	a3(vpaddd xmm4, xmm1, xmm0)
	a3(vprotd xmm4, xmm4, 7)
	a3(vpxor xmm3, xmm3, xmm4)
	a3(vpaddd xmm4, xmm0, xmm3)
	a3(vprotd xmm4, xmm4, 9)
	a3(vpxor xmm2, xmm2, xmm4)
	a3(vpaddd xmm4, xmm3, xmm2)
	a3(vprotd xmm4, xmm4, 13)
	a3(vpxor xmm1, xmm1, xmm4)
	a3(vpaddd xmm4, xmm2, xmm1)
	a3(pshufd xmm3, xmm3, 0x93)
	a3(vprotd xmm4, xmm4, 18)
	a3(pshufd xmm2, xmm2, 0x4e)
	a3(vpxor xmm0, xmm0, xmm4)
	a3(pshufd xmm1, xmm1, 0x39)
	a3(vpaddd xmm4, xmm3, xmm0)
	a3(vprotd xmm4, xmm4, 7)
	a3(vpxor xmm1, xmm1, xmm4)
	a3(vpaddd xmm4, xmm0, xmm1)
	a3(vprotd xmm4, xmm4, 9)
	a3(vpxor xmm2, xmm2, xmm4)
	a3(vpaddd xmm4, xmm1, xmm2)
	a3(vprotd xmm4, xmm4, 13)
	a3(vpxor xmm3, xmm3, xmm4)
	a3(pshufd xmm1, xmm1, 0x93)
	a3(vpaddd xmm4, xmm2, xmm3)
	a3(pshufd xmm2, xmm2, 0x4e)
	a3(vprotd xmm4, xmm4, 18)
	a3(pshufd xmm3, xmm3, 0x39)
	a3(vpxor xmm0, xmm0, xmm4)
	a2(sub eax, 2)
	aj(ja scrypt_salsa_xop_loop)
	/* feed-forward add of the saved input state */
	a3(vpaddd xmm0,xmm0,[esp+0])
	a3(vpaddd xmm1,xmm1,[esp+16])
	a3(vpaddd xmm2,xmm2,xmm6)
	a3(vpaddd xmm3,xmm3,xmm7)
	/* compute interleaved output offset: even blocks first half, odd second */
	a2(lea eax,[ebx+ecx])
	a2(xor ebx,edx)
	a2(and eax,~0x7f)
	a2(add ecx,64)
	a2(shr eax,1)
	a2(add eax, edi)
	a2(cmp ecx,edx)
	a2(vmovdqa [eax+0],xmm0)
	a2(vmovdqa [eax+16],xmm1)
	a2(vmovdqa [eax+32],xmm2)
	a2(vmovdqa [eax+48],xmm3)
	/* reload Bxor (eax was clobbered as the round counter) */
	a2(mov eax,[ebp+28])
	aj(jne scrypt_ChunkMix_xop_loop)
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_xop)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA_XOP
|
||||
|
||||
/*
 * scrypt BlockMix (Salsa20/8 core) over one chunk — x86-64 naked asm,
 * AMD XOP build: vprotd performs the lane rotates in one instruction.
 * Register args (SysV order assumed by the surrounding macros — TODO confirm
 * against the asm macro layer): rdi = Bout, rsi = Bin, rdx = Bxor (may be 0),
 * ecx = r. xmm8-xmm11 hold the saved input state.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* test Bxor for NULL (sets ZF) */
	a2(and rdx, rdx)
	/* X = last 64-byte block of Bin (optionally ^ Bxor) */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	aj(jz scrypt_ChunkMix_xop_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a1(scrypt_ChunkMix_xop_no_xor1:)
	/* r9 = byte offset of current block, r8 = odd/even toggle */
	a2(xor r9,r9)
	a2(xor r8,r8)
	a1(scrypt_ChunkMix_xop_loop:)
	a2(and rdx, rdx)
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	aj(jz scrypt_ChunkMix_xop_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a1(scrypt_ChunkMix_xop_no_xor2:)
	/* save input state for the feed-forward add after the rounds */
	a2(vmovdqa xmm8,xmm0)
	a2(vmovdqa xmm9,xmm1)
	a2(vmovdqa xmm10,xmm2)
	a2(vmovdqa xmm11,xmm3)
	a2(mov rax,8)
	/* Salsa20/8: two rounds per iteration, vprotd = 32-bit lane rotate */
	a1(scrypt_salsa_xop_loop: )
	a3(vpaddd xmm4, xmm1, xmm0)
	a3(vprotd xmm4, xmm4, 7)
	a3(vpxor xmm3, xmm3, xmm4)
	a3(vpaddd xmm4, xmm0, xmm3)
	a3(vprotd xmm4, xmm4, 9)
	a3(vpxor xmm2, xmm2, xmm4)
	a3(vpaddd xmm4, xmm3, xmm2)
	a3(vprotd xmm4, xmm4, 13)
	a3(vpxor xmm1, xmm1, xmm4)
	a3(vpaddd xmm4, xmm2, xmm1)
	a3(pshufd xmm3, xmm3, 0x93)
	a3(vprotd xmm4, xmm4, 18)
	a3(pshufd xmm2, xmm2, 0x4e)
	a3(vpxor xmm0, xmm0, xmm4)
	a3(pshufd xmm1, xmm1, 0x39)
	a3(vpaddd xmm4, xmm3, xmm0)
	a3(vprotd xmm4, xmm4, 7)
	a3(vpxor xmm1, xmm1, xmm4)
	a3(vpaddd xmm4, xmm0, xmm1)
	a3(vprotd xmm4, xmm4, 9)
	a3(vpxor xmm2, xmm2, xmm4)
	a3(vpaddd xmm4, xmm1, xmm2)
	a3(vprotd xmm4, xmm4, 13)
	a3(vpxor xmm3, xmm3, xmm4)
	a3(pshufd xmm1, xmm1, 0x93)
	a3(vpaddd xmm4, xmm2, xmm3)
	a3(pshufd xmm2, xmm2, 0x4e)
	a3(vprotd xmm4, xmm4, 18)
	a3(pshufd xmm3, xmm3, 0x39)
	a3(vpxor xmm0, xmm0, xmm4)
	a2(sub rax, 2)
	aj(ja scrypt_salsa_xop_loop)
	/* feed-forward add of the saved input state */
	a3(vpaddd xmm0,xmm0,xmm8)
	a3(vpaddd xmm1,xmm1,xmm9)
	a3(vpaddd xmm2,xmm2,xmm10)
	a3(vpaddd xmm3,xmm3,xmm11)
	/* compute interleaved output offset: even blocks first half, odd second */
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0x7f)
	a2(add r9,64)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	aj(jne scrypt_ChunkMix_xop_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_xop)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_XOP
|
||||
|
||||
static void asm_calling_convention NOINLINE
|
||||
scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x4 = _mm_add_epi32(x1, x0);
|
||||
x4 = _mm_roti_epi32(x4, 7);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = _mm_add_epi32(x0, x3);
|
||||
x4 = _mm_roti_epi32(x4, 9);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = _mm_add_epi32(x3, x2);
|
||||
x4 = _mm_roti_epi32(x4, 13);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = _mm_add_epi32(x2, x1);
|
||||
x4 = _mm_roti_epi32(x4, 18);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x93);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x39);
|
||||
x4 = _mm_add_epi32(x3, x0);
|
||||
x4 = _mm_roti_epi32(x4, 7);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = _mm_add_epi32(x0, x1);
|
||||
x4 = _mm_roti_epi32(x4, 9);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = _mm_add_epi32(x1, x2);
|
||||
x4 = _mm_roti_epi32(x4, 13);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = _mm_add_epi32(x2, x3);
|
||||
x4 = _mm_roti_epi32(x4, 18);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x93);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x39);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_XOP)
|
||||
/* uses salsa_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-XOP"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
70
stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h
Normal file
70
stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h
Normal file
|
@ -0,0 +1,70 @@
|
|||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa20/8 Ref"
|
||||
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_BASIC
|
||||
|
||||
/*
 * Portable reference Salsa20/8 core: applies 8 rounds (4 double-rounds)
 * of the Salsa quarter-round to the 16-word state, then adds the original
 * input words back in (the feed-forward), all in place.
 * Word layout here is the default (untangled) Salsa order.
 */
static void
salsa_core_basic(uint32_t state[16]) {
	size_t rounds = 8;
	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;

	/* load the state into locals; originals stay in state[] for the add */
	x0 = state[0];
	x1 = state[1];
	x2 = state[2];
	x3 = state[3];
	x4 = state[4];
	x5 = state[5];
	x6 = state[6];
	x7 = state[7];
	x8 = state[8];
	x9 = state[9];
	x10 = state[10];
	x11 = state[11];
	x12 = state[12];
	x13 = state[13];
	x14 = state[14];
	x15 = state[15];

	/* Salsa quarter-round: b ^= rotl(a+d,7); c ^= rotl(b+a,9);
	   d ^= rotl(c+b,13); a ^= rotl(d+c,18). Uses the shared temp t. */
	#define quarter(a,b,c,d) \
		t = a+d; t = ROTL32(t,  7); b ^= t; \
		t = b+a; t = ROTL32(t,  9); c ^= t; \
		t = c+b; t = ROTL32(t, 13); d ^= t; \
		t = d+c; t = ROTL32(t, 18); a ^= t; \

	/* each iteration = one column round + one row round */
	for (; rounds; rounds -= 2) {
		quarter( x0, x4, x8,x12)
		quarter( x5, x9,x13, x1)
		quarter(x10,x14, x2, x6)
		quarter(x15, x3, x7,x11)
		quarter( x0, x1, x2, x3)
		quarter( x5, x6, x7, x4)
		quarter(x10,x11, x8, x9)
		quarter(x15,x12,x13,x14)
	}

	/* feed-forward: add the input state back into the round output */
	state[0] += x0;
	state[1] += x1;
	state[2] += x2;
	state[3] += x3;
	state[4] += x4;
	state[5] += x5;
	state[6] += x6;
	state[7] += x7;
	state[8] += x8;
	state[9] += x9;
	state[10] += x10;
	state[11] += x11;
	state[12] += x12;
	state[13] += x13;
	state[14] += x14;
	state[15] += x15;

	#undef quarter
}
|
||||
|
||||
#endif
|
||||
|
367
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h
Normal file
367
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx.h
Normal file
|
@ -0,0 +1,367 @@
|
|||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_avx_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vpsrlq xmm8, xmm10, 51)
|
||||
a3(vpsrlq xmm9, xmm11, 51)
|
||||
a3(vpsllq xmm10, xmm10, 13)
|
||||
a3(vpsllq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm8)
|
||||
a3(vpxor xmm5, xmm5, xmm9)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vpsrlq xmm10, xmm8, 25)
|
||||
a3(vpsrlq xmm11, xmm9, 25)
|
||||
a3(vpsllq xmm8, xmm8, 39)
|
||||
a3(vpsllq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vpsrlq xmm10, xmm8, 51)
|
||||
a3(vpsrlq xmm11, xmm9, 51)
|
||||
a3(vpsllq xmm8, xmm8, 13)
|
||||
a3(vpsllq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm10)
|
||||
a3(vpxor xmm4, xmm4, xmm11)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vpsrlq xmm8, xmm10, 25)
|
||||
a3(vpsrlq xmm9, xmm11, 25)
|
||||
a3(vpsllq xmm10, xmm10, 39)
|
||||
a3(vpsllq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa64_avx_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-AVX"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
221
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
Normal file
221
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
Normal file
|
@ -0,0 +1,221 @@
|
|||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx2)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa ymm0,[rax+0])
|
||||
a2(vmovdqa ymm1,[rax+32])
|
||||
a2(vmovdqa ymm2,[rax+64])
|
||||
a2(vmovdqa ymm3,[rax+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor1)
|
||||
a3(vpxor ymm0,ymm0,[r9+0])
|
||||
a3(vpxor ymm1,ymm1,[r9+32])
|
||||
a3(vpxor ymm2,ymm2,[r9+64])
|
||||
a3(vpxor ymm3,ymm3,[r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor ymm0,ymm0,[rsi+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rsi+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rsi+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rsi+r9+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor2)
|
||||
a3(vpxor ymm0,ymm0,[rdx+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rdx+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rdx+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rdx+r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor2:)
|
||||
a2(vmovdqa ymm6,ymm0)
|
||||
a2(vmovdqa ymm7,ymm1)
|
||||
a2(vmovdqa ymm8,ymm2)
|
||||
a2(vmovdqa ymm9,ymm3)
|
||||
a2(mov rax,4)
|
||||
a1(scrypt_salsa64_avx2_loop: )
|
||||
a3(vpaddq ymm4, ymm1, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm3)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm2, ymm2, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm2, ymm2, ymm4)
|
||||
a3(vpaddq ymm4, ymm3, ymm2)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm1, ymm1, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm2, ymm1)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm1, ymm1, 0x39)
|
||||
a3(vpermq ymm10, ymm2, 0x4e)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a3(vpermq ymm3, ymm3, 0x93)
|
||||
a3(vpaddq ymm4, ymm3, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm1)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm10, ymm10, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm10, ymm10, ymm4)
|
||||
a3(vpaddq ymm4, ymm1, ymm10)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm3, ymm3, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpermq ymm1, ymm1, 0x93)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpermq ymm2, ymm10, 0x4e)
|
||||
a3(vpaddq ymm4, ymm10, ymm3)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm3, ymm3, 0x39)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a1(dec rax)
|
||||
aj(jnz scrypt_salsa64_avx2_loop)
|
||||
a3(vpaddq ymm0,ymm0,ymm6)
|
||||
a3(vpaddq ymm1,ymm1,ymm7)
|
||||
a3(vpaddq ymm2,ymm2,ymm8)
|
||||
a3(vpaddq ymm3,ymm3,ymm9)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],ymm0)
|
||||
a2(vmovdqa [rax+32],ymm1)
|
||||
a2(vmovdqa [rax+64],ymm2)
|
||||
a2(vmovdqa [rax+96],ymm3)
|
||||
aj(jne scrypt_ChunkMix_avx2_loop)
|
||||
a1(vzeroupper)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
y0 = ymmp[0];
|
||||
y1 = ymmp[1];
|
||||
y2 = ymmp[2];
|
||||
y3 = ymmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
t0 = y0;
|
||||
t1 = y1;
|
||||
t2 = y2;
|
||||
t3 = y3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm256_add_epi64(y0, y1);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y3);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y1 = _mm256_xor_si256(y1, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
|
||||
z0 = _mm256_add_epi64(y1, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
z0 = _mm256_add_epi64(y0, y3);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
z0 = _mm256_add_epi64(y1, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y1);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y3 = _mm256_xor_si256(y3, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
|
||||
}
|
||||
|
||||
y0 = _mm256_add_epi64(y0, t0);
|
||||
y1 = _mm256_add_epi64(y1, t1);
|
||||
y2 = _mm256_add_epi64(y2, t2);
|
||||
y3 = _mm256_add_epi64(y3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
ymmp[0] = y0;
|
||||
ymmp[1] = y1;
|
||||
ymmp[2] = y2;
|
||||
ymmp[3] = y3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-AVX2"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
449
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
Normal file
449
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
Normal file
|
@ -0,0 +1,449 @@
|
|||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_sse2_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
aj(ja scrypt_salsa64_sse2_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSE2"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
|
||||
/* sse3/avx use this as well */
|
||||
#if defined(SCRYPT_SALSA64_INCLUDED)
|
||||
/*
|
||||
Default layout:
|
||||
0 1 2 3
|
||||
4 5 6 7
|
||||
8 9 10 11
|
||||
12 13 14 15
|
||||
|
||||
SSE2 layout:
|
||||
0 5 10 15
|
||||
12 1 6 11
|
||||
8 13 2 7
|
||||
4 9 14 3
|
||||
*/
|
||||
|
||||
|
||||
static void asm_calling_convention
|
||||
salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
|
||||
uint64_t t;
|
||||
while (count--) {
|
||||
t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
|
||||
t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
|
||||
t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
|
||||
t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
|
||||
t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
|
||||
t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
|
||||
blocks += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
399
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
Normal file
399
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
Normal file
|
@ -0,0 +1,399 @@
|
|||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
/*
 * scrypt BlockMix with the Salsa64/8 core, hand-written x86-64 SSSE3 assembly
 * (emitted through scrypt-jane's a1/a2/a3/aj naked-function macros).
 * Arguments (SysV order): rdi=Bout, rsi=Bin, rdx=Bxor (may be NULL), ecx=r.
 * 128 bytes of aligned stack are used to hold X before the 8 Salsa64 rounds
 * so the feedforward addition can be done at the end.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
	a2(shl rcx,7)
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* sets ZF when Bxor == NULL; tested by the jz below */
	a2(and rdx, rdx)
	/* X = B_{2r-1} (last 128-byte block of Bin) */
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a2(movdqa xmm4,[rax+64])
	a2(movdqa xmm5,[rax+80])
	a2(movdqa xmm6,[rax+96])
	a2(movdqa xmm7,[rax+112])
	aj(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a2(pxor xmm4,[r9+64])
	a2(pxor xmm5,[r9+80])
	a2(pxor xmm6,[r9+96])
	a2(pxor xmm7,[r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* per-block loop: r9 = byte offset of current block, r8 toggles halves */
	a1(scrypt_ChunkMix_ssse3_loop:)
	a2(and rdx, rdx)
	a2(pxor xmm0,[rsi+r9+0])
	a2(pxor xmm1,[rsi+r9+16])
	a2(pxor xmm2,[rsi+r9+32])
	a2(pxor xmm3,[rsi+r9+48])
	a2(pxor xmm4,[rsi+r9+64])
	a2(pxor xmm5,[rsi+r9+80])
	a2(pxor xmm6,[rsi+r9+96])
	a2(pxor xmm7,[rsi+r9+112])
	aj(jz scrypt_ChunkMix_ssse3_no_xor2)
	a2(pxor xmm0,[rdx+r9+0])
	a2(pxor xmm1,[rdx+r9+16])
	a2(pxor xmm2,[rdx+r9+32])
	a2(pxor xmm3,[rdx+r9+48])
	a2(pxor xmm4,[rdx+r9+64])
	a2(pxor xmm5,[rdx+r9+80])
	a2(pxor xmm6,[rdx+r9+96])
	a2(pxor xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor2:)
	/* save X for the feedforward addition after the rounds */
	a2(movdqa [rsp+0],xmm0)
	a2(movdqa [rsp+16],xmm1)
	a2(movdqa [rsp+32],xmm2)
	a2(movdqa [rsp+48],xmm3)
	a2(movdqa [rsp+64],xmm4)
	a2(movdqa [rsp+80],xmm5)
	a2(movdqa [rsp+96],xmm6)
	a2(movdqa [rsp+112],xmm7)
	a2(mov rax,8)
	/* 8 Salsa64 rounds, two per iteration */
	a1(scrypt_salsa64_ssse3_loop: )
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm4, xmm10)
	a2(pxor xmm5, xmm11)
	a2(pxor xmm4, xmm8)
	a2(pxor xmm5, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm4)
	a2(paddq xmm11, xmm5)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm4)
	a2(movdqa xmm9, xmm5)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	/* re-diagonalize with palignr (the SSSE3 win over plain SSE2) */
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
	a2(sub rax, 2)
	/* second (odd) round of the pair */
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm5, xmm10)
	a2(pxor xmm4, xmm11)
	a2(pxor xmm5, xmm8)
	a2(pxor xmm4, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm5)
	a2(paddq xmm11, xmm4)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm5)
	a2(movdqa xmm9, xmm4)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
	aj(ja scrypt_salsa64_ssse3_loop)
	/* feedforward: X += saved input */
	a2(paddq xmm0,[rsp+0])
	a2(paddq xmm1,[rsp+16])
	a2(paddq xmm2,[rsp+32])
	a2(paddq xmm3,[rsp+48])
	a2(paddq xmm4,[rsp+64])
	a2(paddq xmm5,[rsp+80])
	a2(paddq xmm6,[rsp+96])
	a2(paddq xmm7,[rsp+112])
	/* compute Bout address: even blocks to first half, odd to second */
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(movdqa [rax+0],xmm0)
	a2(movdqa [rax+16],xmm1)
	a2(movdqa [rax+32],xmm2)
	a2(movdqa [rax+48],xmm3)
	a2(movdqa [rax+64],xmm4)
	a2(movdqa [rax+80],xmm5)
	a2(movdqa [rax+96],xmm6)
	a2(movdqa [rax+112],xmm7)
	aj(jne scrypt_ChunkMix_ssse3_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
/*
 * scrypt BlockMix with the Salsa64/8 core, SSSE3 intrinsics.
 * Bout <- BlockMix(Bin ^ Bxor); Bxor may be NULL. Each 128-byte block is
 * held in eight __m128i registers; the state layout is the diagonal SSE2
 * layout produced by salsa64_core_tangle_sse2. _mm_alignr_epi8 performs the
 * 64-bit lane rotation between half-rounds.
 */
static void asm_calling_convention
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		/* keep a copy for the feedforward addition after the rounds */
		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

		for (rounds = 8; rounds; rounds -= 2) {
			/* even round: rotations by 32 (pshufd), 13, 39, 32 */
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			/* re-diagonalize via 64-bit lane rotation */
			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);

			/* odd round: same pattern, x4/x5 roles swapped */
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z2);
			x4 = _mm_xor_si128(x4, z3);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		/* feedforward */
		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSSE3"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
335
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h
Normal file
335
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64-xop.h
Normal file
|
@ -0,0 +1,335 @@
|
|||
/* x64 */
|
||||
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
/*
 * scrypt BlockMix with the Salsa64/8 core, x86-64 AMD XOP assembly.
 * Same structure as the SSSE3 version; vprotq performs the 64-bit rotates
 * in one instruction instead of the shift/shift/xor triple.
 * Arguments (SysV order): rdi=Bout, rsi=Bin, rdx=Bxor (may be NULL), ecx=r.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
	a2(shl rcx,7)
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* sets ZF when Bxor == NULL */
	a2(and rdx, rdx)
	/* X = B_{2r-1} */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	a2(vmovdqa xmm4,[rax+64])
	a2(vmovdqa xmm5,[rax+80])
	a2(vmovdqa xmm6,[rax+96])
	a2(vmovdqa xmm7,[rax+112])
	aj(jz scrypt_ChunkMix_xop_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a3(vpxor xmm4,xmm4,[r9+64])
	a3(vpxor xmm5,xmm5,[r9+80])
	a3(vpxor xmm6,xmm6,[r9+96])
	a3(vpxor xmm7,xmm7,[r9+112])
	a1(scrypt_ChunkMix_xop_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* per-block loop: r9 = byte offset, r8 toggles output halves */
	a1(scrypt_ChunkMix_xop_loop:)
	a2(and rdx, rdx)
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	a3(vpxor xmm4,xmm4,[rsi+r9+64])
	a3(vpxor xmm5,xmm5,[rsi+r9+80])
	a3(vpxor xmm6,xmm6,[rsi+r9+96])
	a3(vpxor xmm7,xmm7,[rsi+r9+112])
	aj(jz scrypt_ChunkMix_xop_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a3(vpxor xmm4,xmm4,[rdx+r9+64])
	a3(vpxor xmm5,xmm5,[rdx+r9+80])
	a3(vpxor xmm6,xmm6,[rdx+r9+96])
	a3(vpxor xmm7,xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_xop_no_xor2:)
	/* save X for the feedforward addition */
	a2(vmovdqa [rsp+0],xmm0)
	a2(vmovdqa [rsp+16],xmm1)
	a2(vmovdqa [rsp+32],xmm2)
	a2(vmovdqa [rsp+48],xmm3)
	a2(vmovdqa [rsp+64],xmm4)
	a2(vmovdqa [rsp+80],xmm5)
	a2(vmovdqa [rsp+96],xmm6)
	a2(vmovdqa [rsp+112],xmm7)
	a2(mov rax,8)
	/* 8 Salsa64 rounds, two per iteration */
	a1(scrypt_salsa64_xop_loop: )
	a3(vpaddq xmm8, xmm0, xmm2)
	a3(vpaddq xmm9, xmm1, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm6, xmm6, xmm8)
	a3(vpxor xmm7, xmm7, xmm9)
	a3(vpaddq xmm10, xmm0, xmm6)
	a3(vpaddq xmm11, xmm1, xmm7)
	a3(vprotq xmm10, xmm10, 13)
	a3(vprotq xmm11, xmm11, 13)
	a3(vpxor xmm4, xmm4, xmm10)
	a3(vpxor xmm5, xmm5, xmm11)
	a3(vpaddq xmm8, xmm6, xmm4)
	a3(vpaddq xmm9, xmm7, xmm5)
	a3(vprotq xmm8, xmm8, 39)
	a3(vprotq xmm9, xmm9, 39)
	a3(vpxor xmm2, xmm2, xmm8)
	a3(vpxor xmm3, xmm3, xmm9)
	a3(vpaddq xmm10, xmm4, xmm2)
	a3(vpaddq xmm11, xmm5, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm0, xmm0, xmm10)
	a3(vpxor xmm1, xmm1, xmm11)
	/* re-diagonalize with vpalignr */
	a2(vmovdqa xmm8, xmm2)
	a2(vmovdqa xmm9, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm9, xmm8, 8)
	a4(vpalignr xmm7, xmm8, xmm9, 8)
	/* second (odd) round of the pair */
	a3(vpaddq xmm10, xmm0, xmm2)
	a3(vpaddq xmm11, xmm1, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm6, xmm6, xmm10)
	a3(vpxor xmm7, xmm7, xmm11)
	a3(vpaddq xmm8, xmm0, xmm6)
	a3(vpaddq xmm9, xmm1, xmm7)
	a3(vprotq xmm8, xmm8, 13)
	a3(vprotq xmm9, xmm9, 13)
	a3(vpxor xmm5, xmm5, xmm8)
	a3(vpxor xmm4, xmm4, xmm9)
	a3(vpaddq xmm10, xmm6, xmm5)
	a3(vpaddq xmm11, xmm7, xmm4)
	a3(vprotq xmm10, xmm10, 39)
	a3(vprotq xmm11, xmm11, 39)
	a3(vpxor xmm2, xmm2, xmm10)
	a3(vpxor xmm3, xmm3, xmm11)
	a3(vpaddq xmm8, xmm5, xmm2)
	a3(vpaddq xmm9, xmm4, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm0, xmm0, xmm8)
	a3(vpxor xmm1, xmm1, xmm9)
	a2(vmovdqa xmm10, xmm2)
	a2(vmovdqa xmm11, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm11, xmm10, 8)
	a4(vpalignr xmm7, xmm10, xmm11, 8)
	a2(sub rax, 2)
	aj(ja scrypt_salsa64_xop_loop)
	/* feedforward: X += saved input */
	a3(vpaddq xmm0,xmm0,[rsp+0])
	a3(vpaddq xmm1,xmm1,[rsp+16])
	a3(vpaddq xmm2,xmm2,[rsp+32])
	a3(vpaddq xmm3,xmm3,[rsp+48])
	a3(vpaddq xmm4,xmm4,[rsp+64])
	a3(vpaddq xmm5,xmm5,[rsp+80])
	a3(vpaddq xmm6,xmm6,[rsp+96])
	a3(vpaddq xmm7,xmm7,[rsp+112])
	/* compute Bout address: even blocks to first half, odd to second */
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	a2(vmovdqa [rax+64],xmm4)
	a2(vmovdqa [rax+80],xmm5)
	a2(vmovdqa [rax+96],xmm6)
	a2(vmovdqa [rax+112],xmm7)
	aj(jne scrypt_ChunkMix_xop_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_xop)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
/*
 * scrypt BlockMix with the Salsa64/8 core, XOP intrinsics.
 * Identical structure to the SSSE3 intrinsic version, but the 13- and 39-bit
 * 64-bit rotates use _mm_roti_epi64 (one XOP instruction) rather than the
 * srli/slli/xor pair. Bxor may be NULL.
 */
static void asm_calling_convention
scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		/* keep a copy for the feedforward addition after the rounds */
		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

		for (rounds = 8; rounds; rounds -= 2) {
			/* even round */
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z0 = _mm_roti_epi64(z0, 13);
			z1 = _mm_roti_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z0 = _mm_roti_epi64(z0, 39);
			z1 = _mm_roti_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			/* re-diagonalize via 64-bit lane rotation */
			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);

			/* odd round: same pattern, x4/x5 roles swapped */
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z0 = _mm_roti_epi64(z0, 13);
			z1 = _mm_roti_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z0 = _mm_roti_epi64(z0, 39);
			z1 = _mm_roti_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		/* feedforward */
		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-XOP"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
41
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h
Normal file
41
stratum/algos/ar2/sj/scrypt-jane-mix_salsa64.h
Normal file
|
@ -0,0 +1,41 @@
|
|||
/* Portable reference implementation of the Salsa64/8 core, used when no
   SIMD variant was selected (or at runtime-choice builds). */
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8 Ref"

#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_BASIC

/*
 * Salsa64/8 permutation over a 16-word (uint64_t) state, in place.
 * Works on a local copy v[], then adds the result back into state[]
 * (the feedforward). ROTL64 is the project's 64-bit left-rotate macro.
 */
static void
salsa64_core_basic(uint64_t state[16]) {
	const size_t rounds = 8;
	uint64_t v[16], t;
	size_t i;

	for (i = 0; i < 16; i++) v[i] = state[i];

/* quarter-round: rotation constants 32, 13, 39, 32 (64-bit Salsa variant) */
#define G(a,b,c,d) \
	t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
	t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
	t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
	t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \

	/* one column round + one row round per iteration */
	for (i = 0; i < rounds; i += 2) {
		G( 0, 4, 8,12);
		G( 5, 9,13, 1);
		G(10,14, 2, 6);
		G(15, 3, 7,11);
		G( 0, 1, 2, 3);
		G( 5, 6, 7, 4);
		G(10,11, 8, 9);
		G(15,12,13,14);
	}

	/* feedforward */
	for (i = 0; i < 16; i++) state[i] += v[i];

#undef G
}

#endif
|
||||
|
112
stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h
Normal file
112
stratum/algos/ar2/sj/scrypt-jane-pbkdf2.h
Normal file
|
@ -0,0 +1,112 @@
|
|||
/* HMAC state: two keyed hash streams, inner = H(K ^ ipad || ...),
   outer = H(K ^ opad || ...). */
typedef struct scrypt_hmac_state_t {
	scrypt_hash_state inner, outer;
} scrypt_hmac_state;
|
||||
|
||||
|
||||
static void
|
||||
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
|
||||
scrypt_hash_state st;
|
||||
scrypt_hash_init(&st);
|
||||
scrypt_hash_update(&st, m, mlen);
|
||||
scrypt_hash_finish(&st, hash);
|
||||
}
|
||||
|
||||
/* hmac */
|
||||
static void
|
||||
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
|
||||
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
|
||||
size_t i;
|
||||
|
||||
scrypt_hash_init(&st->inner);
|
||||
scrypt_hash_init(&st->outer);
|
||||
|
||||
if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
|
||||
/* use the key directly if it's <= blocksize bytes */
|
||||
memcpy(pad, key, keylen);
|
||||
} else {
|
||||
/* if it's > blocksize bytes, hash it */
|
||||
scrypt_hash(pad, key, keylen);
|
||||
}
|
||||
|
||||
/* inner = (key ^ 0x36) */
|
||||
/* h(inner || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= 0x36;
|
||||
scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
/* outer = (key ^ 0x5c) */
|
||||
/* h(outer || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= (0x5c ^ 0x36);
|
||||
scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
scrypt_ensure_zero(pad, sizeof(pad));
|
||||
}
|
||||
|
||||
/* Absorb message bytes into the HMAC (feeds the inner stream only). */
static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
	/* h(inner || m...) */
	scrypt_hash_update(&st->inner, m, mlen);
}
|
||||
|
||||
/*
 * Finalize the HMAC: mac = H(outer || H(inner || m)).
 * Consumes the state (it is zeroized on exit and must be re-initialized,
 * or copied beforehand, for further use — scrypt_pbkdf2 relies on copying).
 */
static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
	/* h(inner || m) */
	scrypt_hash_digest innerhash;
	scrypt_hash_finish(&st->inner, innerhash);

	/* h(outer || h(inner || m)) */
	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
	scrypt_hash_finish(&st->outer, mac);

	scrypt_ensure_zero(st, sizeof(*st));
}
|
||||
|
||||
/*
 * PBKDF2-HMAC (RFC 2898 §5.2): derive `bytes` bytes into `out` from
 * (password, salt) with N iterations. The partially-keyed HMAC states are
 * precomputed once and copied per invocation instead of re-running the key
 * schedule. Note: inside scrypt N is 1 or a small constant, not a work factor.
 */
static void
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti, u;
	uint8_t be[4];
	uint32_t i, j, blocks;
	uint64_t c;

	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);
		memcpy(u, ti, sizeof(u));

		/* T[i] = U1 ^ U2 ^ U3... */
		for (c = 0; c < N - 1; c++) {
			/* UX = hmac(password, U{X-1}) */
			work = hmac_pw;
			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
			scrypt_hmac_finish(&work, u);

			/* T[i] ^= UX */
			for (j = 0; j < sizeof(u); j++)
				ti[j] ^= u[j];
		}

		/* last block may be partial; out/bytes past it are never read */
		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
		out += SCRYPT_HASH_DIGEST_SIZE;
		bytes -= SCRYPT_HASH_DIGEST_SIZE;
	}

	/* wipe key-derived material */
	scrypt_ensure_zero(ti, sizeof(ti));
	scrypt_ensure_zero(u, sizeof(u));
	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
}
|
462
stratum/algos/ar2/sj/scrypt-jane-portable-x86.h
Normal file
462
stratum/algos/ar2/sj/scrypt-jane-portable-x86.h
Normal file
|
@ -0,0 +1,462 @@
|
|||
/* Compile-time CPU/compiler capability detection for the x86 code paths.
   Three independent axes are probed: 32-bit inline asm (X86ASM_*), 64-bit
   GCC asm (X86_64ASM_*), and compiler intrinsics (X86_INTRINSIC_*). */
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
	#define X86ASM

	/* gcc 2.95 royally screws up stack alignments on variables */
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
		#define X86ASM_SSE
		#define X86ASM_SSE2
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
		#define X86ASM_SSSE3
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
		#define X86ASM_AVX
		#define X86ASM_XOP
	#endif
	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
		#define X86ASM_AVX2
	#endif
#endif

/* x86-64 inline asm is only emitted for gcc-compatible compilers */
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
	#define X86_64ASM
	#define X86_64ASM_SSE2
	#if (COMPILER_GCC >= 40102)
		#define X86_64ASM_SSSE3
	#endif
	#if (COMPILER_GCC >= 40400)
		#define X86_64ASM_AVX
		#define X86_64ASM_XOP
	#endif
	#if (COMPILER_GCC >= 40700)
		#define X86_64ASM_AVX2
	#endif
#endif

/* MSVC: no 64-bit inline asm, so intrinsics are mandatory on x64 */
#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
	#define X86_INTRINSIC
	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
		#define X86_INTRINSIC_SSE2
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
		#define X86_INTRINSIC_AVX
		#define X86_INTRINSIC_XOP
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

/* GCC intrinsics path: follow the target feature macros set by -m flags */
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
	#define X86_INTRINSIC
	#if defined(__SSE__)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(__SSE2__)
		#define X86_INTRINSIC_SSE2
	#endif
	#if defined(__SSSE3__)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if defined(__AVX__)
		#define X86_INTRINSIC_AVX
	#endif
	#if defined(__XOP__)
		#define X86_INTRINSIC_XOP
	#endif
	#if defined(__AVX2__)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

/* only use simd on windows (or SSE2 on gcc)! */
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
	#if defined(X86_INTRINSIC_SSE)
		#include <mmintrin.h>
		#include <xmmintrin.h>
		typedef __m64 qmm;
		typedef __m128 xmm;
		typedef __m128d xmmd;
	#endif
	#if defined(X86_INTRINSIC_SSE2)
		#include <emmintrin.h>
		typedef __m128i xmmi;
	#endif
	#if defined(X86_INTRINSIC_SSSE3)
		#include <tmmintrin.h>
	#endif
	#if defined(X86_INTRINSIC_AVX)
		#include <immintrin.h>
	#endif
	#if defined(X86_INTRINSIC_XOP)
		#if defined(COMPILER_MSVC)
			#include <intrin.h>
		#else
			#include <x86intrin.h>
		#endif
	#endif
	#if defined(X86_INTRINSIC_AVX2)
		typedef __m256i ymmi;
	#endif
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
64bit OR 0 paramters: a1(ret)
|
||||
32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define aj(x) __asm {x}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define aj(x) GNU_ASJ(x)
|
||||
#define asm_align8 ".p2align 3,,7"
|
||||
#define asm_align16 ".p2align 4,,15"
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
|
||||
#if defined(X86_64ASM)
|
||||
#define asm_naked_fn(fn) ; __asm__ ( \
|
||||
".text\n" \
|
||||
asm_align16 GNU_ASFN(fn) \
|
||||
"subq $136, %rsp;" \
|
||||
"movdqa %xmm6, 0(%rsp);" \
|
||||
"movdqa %xmm7, 16(%rsp);" \
|
||||
"movdqa %xmm8, 32(%rsp);" \
|
||||
"movdqa %xmm9, 48(%rsp);" \
|
||||
"movdqa %xmm10, 64(%rsp);" \
|
||||
"movdqa %xmm11, 80(%rsp);" \
|
||||
"movdqa %xmm12, 96(%rsp);" \
|
||||
"movq %rdi, 112(%rsp);" \
|
||||
"movq %rsi, 120(%rsp);" \
|
||||
"movq %rcx, %rdi;" \
|
||||
"movq %rdx, %rsi;" \
|
||||
"movq %r8, %rdx;" \
|
||||
"movq %r9, %rcx;" \
|
||||
"call 1f;" \
|
||||
"movdqa 0(%rsp), %xmm6;" \
|
||||
"movdqa 16(%rsp), %xmm7;" \
|
||||
"movdqa 32(%rsp), %xmm8;" \
|
||||
"movdqa 48(%rsp), %xmm9;" \
|
||||
"movdqa 64(%rsp), %xmm10;" \
|
||||
"movdqa 80(%rsp), %xmm11;" \
|
||||
"movdqa 96(%rsp), %xmm12;" \
|
||||
"movq 112(%rsp), %rdi;" \
|
||||
"movq 120(%rsp), %rsi;" \
|
||||
"addq $136, %rsp;" \
|
||||
"ret;" \
|
||||
".intel_syntax noprefix;" \
|
||||
".p2align 4,,15;" \
|
||||
"1:;"
|
||||
#else
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7,
|
||||
cpu_xop = 1 << 8,
|
||||
cpu_avx2 = 1 << 9
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a2(xor ecx, ecx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
//cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs;
|
||||
uint32_t max_level, max_ext_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
//if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
// vendor = cpu_intel;
|
||||
//else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
// vendor = cpu_amd;
|
||||
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
/* xsave/xrestore */
|
||||
if (regs.ecx & (1 << 27)) {
|
||||
xgetbv_flags = get_xgetbv(0);
|
||||
if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
|
||||
}
|
||||
#endif
|
||||
if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
|
||||
if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
|
||||
if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
|
||||
if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
|
||||
if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
if (cpu_flags & cpu_avx) {
|
||||
if (max_level >= 7) {
|
||||
get_cpuid(®s, 7);
|
||||
if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
|
||||
}
|
||||
|
||||
get_cpuid(®s, 0x80000000);
|
||||
max_ext_level = regs.eax;
|
||||
if (max_ext_level >= 0x80000001) {
|
||||
get_cpuid(®s, 0x80000001);
|
||||
if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx2) return "AVX2";
|
||||
else if (flag & cpu_xop) return "XOP";
|
||||
else if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX2__)
|
||||
#undef X86_64ASM_AVX2
|
||||
#undef X86ASM_AVX2
|
||||
#undef X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#if !defined(__XOP__)
|
||||
#undef X86_64ASM_XOP
|
||||
#undef X86ASM_XOP
|
||||
#undef X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
307
stratum/algos/ar2/sj/scrypt-jane-portable.h
Normal file
307
stratum/algos/ar2/sj/scrypt-jane-portable.h
Normal file
|
@ -0,0 +1,307 @@
|
|||
/* determine os */
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#include <wincrypt.h>
|
||||
#define OS_WINDOWS
|
||||
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_SOLARIS
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h> /* need this to define BSD */
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_NIX
|
||||
#if defined(__linux__)
|
||||
#include <endian.h>
|
||||
#define OS_LINUX
|
||||
#elif defined(BSD)
|
||||
#define OS_BSD
|
||||
|
||||
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
|
||||
#define OS_OSX
|
||||
#elif defined(macintosh) || defined(Macintosh)
|
||||
#define OS_MAC
|
||||
#elif defined(__OpenBSD__)
|
||||
#define OS_OPENBSD
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* determine compiler */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC_VS6 120000000
|
||||
#define COMPILER_MSVC_VS6PP 121000000
|
||||
#define COMPILER_MSVC_VS2002 130000000
|
||||
#define COMPILER_MSVC_VS2003 131000000
|
||||
#define COMPILER_MSVC_VS2005 140050727
|
||||
#define COMPILER_MSVC_VS2008 150000000
|
||||
#define COMPILER_MSVC_VS2008SP1 150030729
|
||||
#define COMPILER_MSVC_VS2010 160000000
|
||||
#define COMPILER_MSVC_VS2010SP1 160040219
|
||||
#define COMPILER_MSVC_VS2012RC 170000000
|
||||
#define COMPILER_MSVC_VS2012 170050727
|
||||
|
||||
#if _MSC_FULL_VER > 100000000
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER)
|
||||
#else
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER * 10)
|
||||
#endif
|
||||
|
||||
#if ((_MSC_VER == 1200) && defined(_mm_free))
|
||||
#undef COMPILER_MSVC
|
||||
#define COMPILER_MSVC COMPILER_MSVC_VS6PP
|
||||
#endif
|
||||
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4100) /* unreferenced formal parameter */
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#include <float.h>
|
||||
#include <stdlib.h> /* _rotl */
|
||||
#include <intrin.h>
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int64 int64_t;
|
||||
|
||||
#define ROTL32(a,b) _rotl(a,b)
|
||||
#define ROTR32(a,b) _rotr(a,b)
|
||||
#define ROTL64(a,b) _rotl64(a,b)
|
||||
#define ROTR64(a,b) _rotr64(a,b)
|
||||
#undef NOINLINE
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#undef NORETURN
|
||||
#define NORETURN
|
||||
#undef INLINE
|
||||
#define INLINE __forceinline
|
||||
#undef FASTCALL
|
||||
#define FASTCALL __fastcall
|
||||
#undef CDECL
|
||||
#define CDECL __cdecl
|
||||
#undef STDCALL
|
||||
#define STDCALL __stdcall
|
||||
#undef NAKED
|
||||
#define NAKED __declspec(naked)
|
||||
#define ALIGN(n) __declspec(align(n))
|
||||
#endif
|
||||
#if defined(__ICC)
|
||||
#define COMPILER_INTEL
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
#if (__GNUC__ >= 3)
|
||||
#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
|
||||
#else
|
||||
#define COMPILER_GCC_PATCHLEVEL 0
|
||||
#endif
|
||||
#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
|
||||
#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
|
||||
#undef NOINLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#undef NORETURN
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NORETURN __attribute__((noreturn))
|
||||
#else
|
||||
#define NORETURN
|
||||
#endif
|
||||
#undef INLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define INLINE __attribute__((always_inline))
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#undef FASTCALL
|
||||
#if (COMPILER_GCC >= 30400)
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#else
|
||||
#define FASTCALL
|
||||
#endif
|
||||
#undef CDECL
|
||||
#define CDECL __attribute__((cdecl))
|
||||
#undef STDCALL
|
||||
#define STDCALL __attribute__((stdcall))
|
||||
#define ALIGN(n) __attribute__((aligned(n)))
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||
#define COMPILER_MINGW
|
||||
#endif
|
||||
#if defined(__PATHCC__)
|
||||
#define COMPILER_PATHCC
|
||||
#endif
|
||||
|
||||
#define OPTIONAL_INLINE
|
||||
#if defined(OPTIONAL_INLINE)
|
||||
#undef OPTIONAL_INLINE
|
||||
#define OPTIONAL_INLINE INLINE
|
||||
#else
|
||||
#define OPTIONAL_INLINE
|
||||
#endif
|
||||
|
||||
#define CRYPTO_FN NOINLINE STDCALL
|
||||
|
||||
/* determine cpu */
|
||||
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
|
||||
#define CPU_X86_64
|
||||
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
|
||||
#define CPU_X86 500
|
||||
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
|
||||
#define CPU_X86 400
|
||||
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
|
||||
#define CPU_X86 300
|
||||
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
|
||||
#define CPU_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
|
||||
#define CPU_SPARC
|
||||
#if defined(__sparcv9)
|
||||
#define CPU_SPARC64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
|
||||
#define CPU_64BITS
|
||||
#undef FASTCALL
|
||||
#define FASTCALL
|
||||
#undef CDECL
|
||||
#define CDECL
|
||||
#undef STDCALL
|
||||
#define STDCALL
|
||||
#endif
|
||||
|
||||
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
|
||||
#define CPU_PPC
|
||||
#if defined(_ARCH_PWR7)
|
||||
#define CPU_POWER7
|
||||
#elif defined(__64BIT__)
|
||||
#define CPU_PPC64
|
||||
#else
|
||||
#define CPU_PPC32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__hppa__) || defined(__hppa)
|
||||
#define CPU_HPPA
|
||||
#endif
|
||||
|
||||
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
#define CPU_ALPHA
|
||||
#endif
|
||||
|
||||
/* endian */
|
||||
|
||||
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
|
||||
(defined(CPU_X86) || defined(CPU_X86_64)) || \
|
||||
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
|
||||
#define CPU_LE
|
||||
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
|
||||
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
|
||||
#define CPU_BE
|
||||
#else
|
||||
/* unknown endian! */
|
||||
#endif
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
|
||||
|
||||
#define U8TO32_LE(p) \
|
||||
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
|
||||
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
|
||||
|
||||
#define U32TO8_BE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
|
||||
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U8TO64_BE(p) \
|
||||
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
|
||||
|
||||
#define U64TO8_BE(p, v) \
|
||||
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
|
||||
U32TO8_BE((p) + 4, (uint32_t)((v) ));
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v) )); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
|
||||
#define U32_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
|
||||
(v) = ((v) << 16) | ((v) >> 16); \
|
||||
}
|
||||
|
||||
#define U64_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
|
||||
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
|
||||
(v) = ((v) << 32) | ((v) >> 32); \
|
||||
}
|
||||
|
||||
static int
|
||||
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
|
||||
uint32_t differentbits = 0;
|
||||
while (len--)
|
||||
differentbits |= (*x++ ^ *y++);
|
||||
return (1 & ((differentbits - 1) >> 8));
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_ensure_zero(void *p, size_t len) {
|
||||
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
|
||||
__stosb((unsigned char *)p, 0, len);
|
||||
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushl %%edi;\n"
|
||||
"pushl %%ecx;\n"
|
||||
"rep stosb;\n"
|
||||
"popl %%ecx;\n"
|
||||
"popl %%edi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rdi;\n"
|
||||
"pushq %%rcx;\n"
|
||||
"rep stosb;\n"
|
||||
"popq %%rcx;\n"
|
||||
"popq %%rdi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#else
|
||||
volatile uint8_t *b = (volatile uint8_t *)p;
|
||||
size_t i;
|
||||
for (i = 0; i < len; i++)
|
||||
b[i] = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "scrypt-jane-portable-x86.h"
|
||||
|
||||
#if !defined(asm_calling_convention)
|
||||
#define asm_calling_convention
|
||||
#endif
|
74
stratum/algos/ar2/sj/scrypt-jane-romix-basic.h
Normal file
74
stratum/algos/ar2/sj/scrypt-jane-romix-basic.h
Normal file
|
@ -0,0 +1,74 @@
|
|||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
/* function type returned by scrypt_getROMix, used with cpu detection */
|
||||
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
(void)blocks; (void)nblocks;
|
||||
}
|
||||
|
||||
/* romix pre/post endian conversion function */
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
#if !defined(CPU_LE)
|
||||
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
|
||||
size_t i;
|
||||
if (endian_test.w == 0x100) {
|
||||
nblocks *= SCRYPT_BLOCK_WORDS;
|
||||
for (i = 0; i < nblocks; i++) {
|
||||
SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)blocks; (void)nblocks;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* chunkmix test function */
|
||||
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
|
||||
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
|
||||
|
||||
static int
|
||||
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
|
||||
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
|
||||
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#endif
|
||||
uint8_t final[16];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < words; i++) {
|
||||
v = (scrypt_mix_word_t)i;
|
||||
v = (v << 8) | v;
|
||||
v = (v << 16) | v;
|
||||
chunk[0][i] = v;
|
||||
}
|
||||
|
||||
prefn(chunk[0], blocks);
|
||||
mixfn(chunk[1], chunk[0], NULL, r);
|
||||
postfn(chunk[1], blocks);
|
||||
|
||||
/* grab the last 16 bytes of the final block */
|
||||
for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
|
||||
SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
|
||||
}
|
||||
|
||||
return scrypt_verify(expected, final, 16);
|
||||
}
|
||||
|
||||
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
|
||||
return base + (i * len);
|
||||
}
|
||||
|
||||
/* returns a pointer to block i */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
|
||||
return base + (i * SCRYPT_BLOCK_WORDS);
|
||||
}
|
122
stratum/algos/ar2/sj/scrypt-jane-romix-template.h
Normal file
122
stratum/algos/ar2/sj/scrypt-jane-romix-template.h
Normal file
|
@ -0,0 +1,122 @@
|
|||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
|
||||
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix
|
||||
#endif
|
||||
|
||||
#undef SCRYPT_HAVE_ROMIX
|
||||
#define SCRYPT_HAVE_ROMIX
|
||||
|
||||
#if !defined(SCRYPT_CHUNKMIX_FN)
|
||||
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
|
||||
|
||||
/*
|
||||
Bout = ChunkMix(Bin)
|
||||
|
||||
2*r: number of blocks in the chunk
|
||||
*/
|
||||
static void asm_calling_convention
|
||||
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#endif
|
||||
uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
block = scrypt_block(Bin, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] = block[i];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] ^= block[i];
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
block = scrypt_block(Bin, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
}
|
||||
SCRYPT_MIX_FN(X);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
block = scrypt_block(Bout, (i / 2) + half);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
block[j] = X[j];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
X = ROMix(X)
|
||||
|
||||
X: chunk to mix
|
||||
Y: scratch chunk
|
||||
N: number of rounds
|
||||
V[N]: array of chunks to randomly index in to
|
||||
2*r: number of blocks in a chunk
|
||||
*/
|
||||
|
||||
static void NOINLINE FASTCALL
|
||||
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
|
||||
uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1);
|
||||
}
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, 1);
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < /*N*/512; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, 2);
|
||||
}
|
||||
|
||||
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
|
||||
|
||||
|
||||
#undef SCRYPT_CHUNKMIX_FN
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#undef SCRYPT_MIX_FN
|
||||
#undef SCRYPT_ROMIX_TANGLE_FN
|
||||
#undef SCRYPT_ROMIX_UNTANGLE_FN
|
||||
|
25
stratum/algos/ar2/sj/scrypt-jane-romix.h
Normal file
25
stratum/algos/ar2/sj/scrypt-jane-romix.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
#if defined(SCRYPT_SALSA)
|
||||
#include "scrypt-jane-salsa.h"
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
#include "scrypt-jane-salsa64.h"
|
||||
#else
|
||||
#define SCRYPT_MIX_BASE "ERROR"
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
|
||||
static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
|
||||
#else
|
||||
static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
|
||||
#endif
|
||||
static int scrypt_test_mix(void) { return 0; }
|
||||
#error must define a mix function!
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX SCRYPT_MIX_BASE
|
||||
#endif
|
134
stratum/algos/ar2/sj/scrypt-jane-salsa.h
Normal file
134
stratum/algos/ar2/sj/scrypt-jane-salsa.h
Normal file
|
@ -0,0 +1,134 @@
|
|||
#define SCRYPT_MIX_BASE "Salsa20/8"
|
||||
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa-xop.h"
|
||||
#include "scrypt-jane-mix_salsa-avx.h"
|
||||
#include "scrypt-jane-mix_salsa-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA_XOP)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_MIX_FN salsa_core_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
return scrypt_ROMix_xop;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
flags |= cpu_xop;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static int
|
||||
scrypt_test_mix(void) {
|
||||
static const uint8_t expected[16] = {
|
||||
0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
183
stratum/algos/ar2/sj/scrypt-jane-salsa64.h
Normal file
183
stratum/algos/ar2/sj/scrypt-jane-salsa64.h
Normal file
|
@ -0,0 +1,183 @@
|
|||
#define SCRYPT_MIX_BASE "Salsa64/8"
|
||||
|
||||
typedef uint64_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U64TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 128
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa64-avx2.h"
|
||||
#include "scrypt-jane-mix_salsa64-xop.h"
|
||||
#include "scrypt-jane-mix_salsa64-avx.h"
|
||||
#include "scrypt-jane-mix_salsa64-ssse3.h"
|
||||
#include "scrypt-jane-mix_salsa64-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa64.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa64_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
return scrypt_ROMix_avx2;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
return scrypt_ROMix_xop;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
flags |= cpu_avx2;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
flags |= cpu_xop;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
flags |= cpu_ssse3;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
scrypt_test_mix(void) {
|
||||
static const uint8_t expected[16] = {
|
||||
0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
39
stratum/algos/ar2/sj/scrypt-jane-test-vectors.h
Normal file
39
stratum/algos/ar2/sj/scrypt-jane-test-vectors.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
typedef struct scrypt_test_setting_t {
|
||||
const char *pw, *salt;
|
||||
uint8_t Nfactor, rfactor, pfactor;
|
||||
} scrypt_test_setting;
|
||||
|
||||
static const scrypt_test_setting post_settings[] = {
|
||||
{"", "", 3, 0, 0},
|
||||
{"password", "NaCl", 9, 3, 4},
|
||||
{0, 0, 0, 0, 0}
|
||||
};
|
||||
|
||||
#if defined(SCRYPT_SKEIN512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
|
||||
0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
|
||||
0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
|
||||
0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
|
||||
{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
|
||||
0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
|
||||
0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
|
||||
0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
|
||||
0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
|
||||
0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
|
||||
0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
|
||||
{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
|
||||
0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
|
||||
0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
|
||||
0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
|
||||
};
|
||||
#endif
|
||||
#else
|
||||
static const uint8_t post_vectors[][64] = {{0}};
|
||||
#endif
|
||||
|
36
stratum/algos/ar2/thread.c
Normal file
36
stratum/algos/ar2/thread.c
Normal file
|
@ -0,0 +1,36 @@
|
|||
#include "thread.h"
|
||||
#if defined(_WIN32)
|
||||
#include <Windows.h>
|
||||
#endif
|
||||
|
||||
int argon2_thread_create(argon2_thread_handle_t *handle,
|
||||
argon2_thread_func_t func, void *args) {
|
||||
if (NULL == handle || func == NULL) {
|
||||
return -1;
|
||||
}
|
||||
#if defined(_WIN32)
|
||||
*handle = _beginthreadex(NULL, 0, func, args, 0, NULL);
|
||||
return *handle != 0 ? 0 : -1;
|
||||
#else
|
||||
return pthread_create(handle, NULL, func, args);
|
||||
#endif
|
||||
}
|
||||
|
||||
int argon2_thread_join(argon2_thread_handle_t handle) {
|
||||
#if defined(_WIN32)
|
||||
if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) {
|
||||
return CloseHandle((HANDLE)handle) != 0 ? 0 : -1;
|
||||
}
|
||||
return -1;
|
||||
#else
|
||||
return pthread_join(handle, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void argon2_thread_exit(void) {
|
||||
#if defined(_WIN32)
|
||||
_endthreadex(0);
|
||||
#else
|
||||
pthread_exit(NULL);
|
||||
#endif
|
||||
}
|
46
stratum/algos/ar2/thread.h
Normal file
46
stratum/algos/ar2/thread.h
Normal file
|
@ -0,0 +1,46 @@
|
|||
#ifndef ARGON2_THREAD_H
|
||||
#define ARGON2_THREAD_H
|
||||
/*
|
||||
Here we implement an abstraction layer for the simpĺe requirements
|
||||
of the Argon2 code. We only require 3 primitives---thread creation,
|
||||
joining, and termination---so full emulation of the pthreads API
|
||||
is unwarranted. Currently we wrap pthreads and Win32 threads.
|
||||
|
||||
The API defines 2 types: the function pointer type,
|
||||
argon2_thread_func_t,
|
||||
and the type of the thread handle---argon2_thread_handle_t.
|
||||
*/
|
||||
#if defined(_WIN32)
|
||||
#include <process.h>
|
||||
typedef unsigned(__stdcall *argon2_thread_func_t)(void *);
|
||||
typedef uintptr_t argon2_thread_handle_t;
|
||||
#else
|
||||
#include <pthread.h>
|
||||
typedef void *(*argon2_thread_func_t)(void *);
|
||||
typedef pthread_t argon2_thread_handle_t;
|
||||
#endif
|
||||
|
||||
/* Creates a thread
|
||||
* @param handle pointer to a thread handle, which is the output of this
|
||||
* function. Must not be NULL.
|
||||
* @param func A function pointer for the thread's entry point. Must not be
|
||||
* NULL.
|
||||
* @param args Pointer that is passed as an argument to @func. May be NULL.
|
||||
* @return 0 if @handle and @func are valid pointers and a thread is successfuly
|
||||
* created.
|
||||
*/
|
||||
int argon2_thread_create(argon2_thread_handle_t *handle,
|
||||
argon2_thread_func_t func, void *args);
|
||||
|
||||
/* Waits for a thread to terminate
|
||||
* @param handle Handle to a thread created with argon2_thread_create.
|
||||
* @return 0 if @handle is a valid handle, and joining completed successfully.
|
||||
*/
|
||||
int argon2_thread_join(argon2_thread_handle_t handle);
|
||||
|
||||
/* Terminate the current thread. Must be run inside a thread created by
|
||||
* argon2_thread_create.
|
||||
*/
|
||||
void argon2_thread_exit(void);
|
||||
|
||||
#endif
|
76
stratum/algos/argon2a.c
Normal file
76
stratum/algos/argon2a.c
Normal file
|
@ -0,0 +1,76 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sysendian.h"
|
||||
|
||||
#include "ar2/argon2.h"
|
||||
#include "ar2/cores.h"
|
||||
#include "ar2/scrypt-jane.h"
|
||||
|
||||
#define _ALIGN(x) __attribute__ ((aligned(x)))
|
||||
|
||||
#define T_COSTS 2
|
||||
#define M_COSTS 16
|
||||
#define MASK 8
|
||||
#define ZERO 0
|
||||
|
||||
static void argon_call(void *out, void *in, void *salt, int type)
|
||||
{
|
||||
argon2_context context = { 0 };
|
||||
|
||||
context.out = (uint8_t *)out;
|
||||
context.pwd = (uint8_t *)in;
|
||||
context.salt = (uint8_t *)salt;
|
||||
|
||||
argon2_core(&context, type);
|
||||
}
|
||||
|
||||
static void bin2hex(char *s, const unsigned char *p, size_t len)
|
||||
{
|
||||
for (size_t i = 0; i < len; i++)
|
||||
sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
|
||||
}
|
||||
|
||||
static char *abin2hex(const unsigned char *p, size_t len)
|
||||
{
|
||||
char *s = (char*) malloc((len * 2) + 1);
|
||||
if (!s)
|
||||
return NULL;
|
||||
bin2hex(s, p, len);
|
||||
return s;
|
||||
}
|
||||
|
||||
static void applog_data(void *pdata)
|
||||
{
|
||||
char* hex = abin2hex((unsigned char*)pdata, 80);
|
||||
fprintf(stderr, "%s\n", hex);
|
||||
free(hex);
|
||||
}
|
||||
|
||||
void argon2_hash(const char* input, char* output, uint32_t len)
|
||||
{
|
||||
// these uint512 in the c++ source of the client are backed by an array of uint32
|
||||
uint32_t _ALIGN(32) hashA[8], hashB[8], hashC[8];
|
||||
uint32_t _ALIGN(32) endian[20], *in;
|
||||
|
||||
in = (uint32_t*) input;
|
||||
for (int i=0; i<len/4; i++)
|
||||
endian[i] = in[i];
|
||||
// be32enc(&endian[i], in[i]);
|
||||
//applog_data((void*) endian);
|
||||
|
||||
my_scrypt((unsigned char *)endian, len,
|
||||
(unsigned char *)endian, len,
|
||||
(unsigned char *)hashA);
|
||||
|
||||
argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);
|
||||
|
||||
my_scrypt((const unsigned char *)hashB, 32,
|
||||
(const unsigned char *)hashB, 32,
|
||||
(unsigned char *)hashC);
|
||||
|
||||
memcpy(output, hashC, 32);
|
||||
}
|
||||
|
16
stratum/algos/argon2a.h
Normal file
16
stratum/algos/argon2a.h
Normal file
|
@ -0,0 +1,16 @@
|
|||
#ifndef ARGON2A_H
|
||||
#define ARGON2A_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
void argon2_hash(const char* input, char* output, uint32_t len);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -12,26 +12,27 @@ SOURCES=lyra2re.c lyra2v2.c Lyra2.c Sponge.c blake.c scrypt.c c11.c x11.c x13.c
|
|||
skein2.c zr5.c bmw.c luffa.c pentablake.c whirlpool.c whirlpoolx.c blakecoin.c \
|
||||
yescrypt.c yescrypt-opt.c sha256_Y.c \
|
||||
m7m.c magimath.cpp velvet.c \
|
||||
argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/thread.c ar2/scrypt-jane.c \
|
||||
hive.c pomelo.c \
|
||||
sib.c gost.c
|
||||
|
||||
OBJECTS=$(SOURCES:.c=.o) $(SOURCES:.cpp=.o)
|
||||
OBJECTS=$(SOURCES:%.c=%.o) $(SOURCES:%.cpp=%.o)
|
||||
OUTPUT=libalgos.a
|
||||
|
||||
all: $(SOURCES) $(OUTPUT)
|
||||
|
||||
$(OUTPUT): $(OBJECTS)
|
||||
$(OUTPUT): $(OBJECTS)
|
||||
ar rc $@ $(OBJECTS)
|
||||
|
||||
|
||||
.cpp.o:
|
||||
$(CC) $(CFLAGS) $<
|
||||
|
||||
$(CC) $(CFLAGS) $< -o $@
|
||||
|
||||
.c.o:
|
||||
$(CC) $(CFLAGS) $<
|
||||
$(CC) $(CFLAGS) $< -o $@
|
||||
|
||||
# $(CC) $(CFLAGS) -std=gnu99 -Wno-pointer-sign -Wno-pointer-to-int-cast -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16 -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores -DUSE_ASM -pg $<
|
||||
|
||||
clean:
|
||||
rm *.o
|
||||
|
||||
|
||||
clean:
|
||||
rm -f *.o
|
||||
rm -f ar2/*.o ar2/*.a
|
||||
|
||||
|
|
0
stratum/algos/sha256_Y.c
Executable file → Normal file
0
stratum/algos/sha256_Y.c
Executable file → Normal file
0
stratum/algos/sha256_Y.h
Executable file → Normal file
0
stratum/algos/sha256_Y.h
Executable file → Normal file
0
stratum/algos/sysendian.h
Executable file → Normal file
0
stratum/algos/sysendian.h
Executable file → Normal file
0
stratum/algos/yescrypt-opt.c
Executable file → Normal file
0
stratum/algos/yescrypt-opt.c
Executable file → Normal file
0
stratum/algos/yescrypt-platform.c
Executable file → Normal file
0
stratum/algos/yescrypt-platform.c
Executable file → Normal file
16
stratum/config.sample/argon2.conf
Normal file
16
stratum/config.sample/argon2.conf
Normal file
|
@ -0,0 +1,16 @@
|
|||
[TCP]
|
||||
server = yaamp.com
|
||||
port = 4234
|
||||
password = tu8tu5
|
||||
|
||||
[SQL]
|
||||
host = yaampdb
|
||||
database = yaamp
|
||||
username = root
|
||||
password = patofpaq
|
||||
|
||||
[STRATUM]
|
||||
algo = argon2
|
||||
difficulty = 2
|
||||
max_ttf = 40000
|
||||
|
|
@ -118,6 +118,8 @@ YAAMP_ALGO g_algos[] =
|
|||
{"hive", hive_hash, 0x10000, 0, 0},
|
||||
{"m7m", m7m_hash, 0x10000, 0, 0},
|
||||
{"velvet", velvet_hash, 0x10000, 0, 0},
|
||||
{"argon2", argon2_hash, 0x10000, 0, sha256_hash_hex },
|
||||
|
||||
{"sib", sib_hash, 1, 0, 0},
|
||||
|
||||
{"whirlcoin", whirlpool_hash, 1, 0, sha256_hash_hex }, /* old sha merkleroot */
|
||||
|
|
|
@ -154,4 +154,5 @@ void sha256_double_hash_hex(const char *input, char *output, unsigned int len);
|
|||
#include "algos/sib.h"
|
||||
#include "algos/m7m.h"
|
||||
#include "algos/velvet.h"
|
||||
#include "algos/argon2a.h"
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ function yaamp_get_algos()
|
|||
'sha256',
|
||||
'scrypt',
|
||||
'scryptn',
|
||||
'argon2',
|
||||
'blake',
|
||||
'keccak',
|
||||
'luffa',
|
||||
|
@ -47,12 +48,12 @@ function yaamp_get_algo_norm($algo)
|
|||
'scryptn' => 1.0,
|
||||
'x11' => 1.0,
|
||||
'x13' => 1.0,
|
||||
'zr5' => 1.0,
|
||||
'argon2' => 1.0,
|
||||
'lyra2' => 1.0,
|
||||
'lyra2v2' => 1.0,
|
||||
'myr-gr' => 1.0,
|
||||
'nist5' => 1.0,
|
||||
'neoscrypt' => 1.0,
|
||||
'lyra2' => 1.0,
|
||||
'lyra2v2' => 1.0,
|
||||
'quark' => 1.0,
|
||||
'qubit' => 1.0,
|
||||
'skein' => 1.0,
|
||||
|
@ -62,6 +63,7 @@ function yaamp_get_algo_norm($algo)
|
|||
'velvet' => 1.0,
|
||||
'whirlpool' => 1.0,
|
||||
'yescrypt' => 1.0,
|
||||
'zr5' => 1.0,
|
||||
);
|
||||
|
||||
if(!isset($a[$algo]))
|
||||
|
@ -82,6 +84,7 @@ function getAlgoColors($algo)
|
|||
'x13' => '#ffd880',
|
||||
'x14' => '#a0f0c0',
|
||||
'x15' => '#f0b0a0',
|
||||
'argon2' => '#e0d0e0',
|
||||
'blake' => '#f0f0f0',
|
||||
'groestl' => '#d0a0a0',
|
||||
'dmd-gr' => '#a0c0f0',
|
||||
|
@ -127,6 +130,7 @@ function getAlgoPort($algo)
|
|||
'quark' => 4033,
|
||||
'whirlpool' => 4133,
|
||||
'neoscrypt' => 4233,
|
||||
'argon2' => 4234,
|
||||
'scryptn' => 4333,
|
||||
'lyra2' => 4433,
|
||||
'lyra2v2' => 4533,
|
||||
|
|
Loading…
Add table
Reference in a new issue