From 9d1c34f89438a523aa0fd06c871e2411f515ae56 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 25 Jan 2016 02:25:22 +0100 Subject: [PATCH] argon2: clean stratum code remove dead code detected after cpuminer implementation --- .../ar2/{scrypt-jane.c => ar2-scrypt-jane.c} | 2 +- .../ar2/{scrypt-jane.h => ar2-scrypt-jane.h} | 0 .../algos/ar2/sj/scrypt-jane-mix_salsa-avx.h | 381 --------------- .../algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h | 443 ------------------ .../algos/ar2/sj/scrypt-jane-mix_salsa-xop.h | 317 ------------- stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h | 70 --- stratum/algos/ar2/sj/scrypt-jane-romix.h | 4 +- stratum/algos/ar2/sj/scrypt-jane-salsa.h | 134 ------ .../algos/ar2/sj/scrypt-jane-test-vectors.h | 13 +- stratum/algos/ar2/thread.c | 36 -- stratum/algos/ar2/thread.h | 46 -- stratum/algos/argon2a.c | 44 +- stratum/algos/makefile | 2 +- 13 files changed, 10 insertions(+), 1482 deletions(-) rename stratum/algos/ar2/{scrypt-jane.c => ar2-scrypt-jane.c} (99%) rename stratum/algos/ar2/{scrypt-jane.h => ar2-scrypt-jane.h} (100%) delete mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h delete mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h delete mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h delete mode 100644 stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h delete mode 100644 stratum/algos/ar2/sj/scrypt-jane-salsa.h delete mode 100644 stratum/algos/ar2/thread.c delete mode 100644 stratum/algos/ar2/thread.h diff --git a/stratum/algos/ar2/scrypt-jane.c b/stratum/algos/ar2/ar2-scrypt-jane.c similarity index 99% rename from stratum/algos/ar2/scrypt-jane.c rename to stratum/algos/ar2/ar2-scrypt-jane.c index 70fa626..e75b73b 100644 --- a/stratum/algos/ar2/scrypt-jane.c +++ b/stratum/algos/ar2/ar2-scrypt-jane.c @@ -12,7 +12,7 @@ extern "C" { #endif #endif -#include "scrypt-jane.h" +#include "ar2-scrypt-jane.h" #include "sj/scrypt-jane-portable.h" #include "sj/scrypt-jane-hash.h" diff --git a/stratum/algos/ar2/scrypt-jane.h b/stratum/algos/ar2/ar2-scrypt-jane.h similarity index 100% rename from stratum/algos/ar2/scrypt-jane.h rename to stratum/algos/ar2/ar2-scrypt-jane.h diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h deleted file mode 100644 index 259fae4..0000000 --- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h +++ /dev/null @@ -1,381 +0,0 @@ -/* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_avx_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa xmm6,xmm2) - a2(vmovdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(vpshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(vpaddd xmm4, xmm3, xmm0) - a3(vpshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(vpshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(vpshufd xmm3, xmm3, 0x39) - a2(sub eax, 2) - aj(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,xmm6) - a3(vpaddd xmm3,xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_avx_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(vpshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(vpaddd xmm4, xmm3, xmm0) - a3(vpshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(vpshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(vpshufd xmm3, xmm3, 0x39) - a2(sub rax, 2) - aj(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_avx_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_AVX - -static void asm_calling_convention NOINLINE -scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); - x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_AVX) - /* uses salsa_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-AVX" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h deleted file mode 100644 index 00f4a3c..0000000 --- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h +++ /dev/null @@ -1,443 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa [esp+16],xmm1) - a2(movdqa xmm6,xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub eax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - aj(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,[esp+16]) - a2(paddd xmm2,xmm6) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_sse2_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub rax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - aj(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_sse2_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -static void NOINLINE asm_calling_convention -scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); - x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-SSE2" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif - -/* used by avx,etc as well */ -#if defined(SCRYPT_SALSA_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - static void asm_calling_convention - salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { - uint32_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif - diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h deleted file mode 100644 index 1d014d2..0000000 --- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h +++ /dev/null @@ -1,317 +0,0 @@ -/* x86 */ -#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_XOP - -asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_xop) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_xop_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_xop_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_xop_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_xop_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_xop_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa xmm6,xmm2) - a2(vmovdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_xop_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vprotd xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vprotd xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vprotd xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(pshufd xmm3, xmm3, 0x93) - a3(vprotd xmm4, xmm4, 18) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpaddd xmm4, xmm3, xmm0) - a3(vprotd xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vprotd xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vprotd xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm4) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpaddd xmm4, xmm2, xmm3) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vprotd xmm4, xmm4, 18) - a3(pshufd xmm3, xmm3, 0x39) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub eax, 2) - aj(ja scrypt_salsa_xop_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,xmm6) - a3(vpaddd xmm3,xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_xop_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_xop) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA_XOP - -asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_xop) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_xop_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_xop_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_xop_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_xop_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_xop_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_xop_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vprotd xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vprotd xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vprotd xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(pshufd xmm3, xmm3, 0x93) - a3(vprotd xmm4, xmm4, 18) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpaddd xmm4, xmm3, xmm0) - a3(vprotd xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vprotd xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vprotd xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm4) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpaddd xmm4, xmm2, xmm3) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vprotd xmm4, xmm4, 18) - a3(pshufd xmm3, xmm3, 0x39) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub rax, 2) - aj(ja scrypt_salsa_xop_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_xop_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_xop) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_XOP - -static void asm_calling_convention NOINLINE -scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = _mm_add_epi32(x1, x0); - x4 = _mm_roti_epi32(x4, 7); - x3 = _mm_xor_si128(x3, x4); - x4 = _mm_add_epi32(x0, x3); - x4 = _mm_roti_epi32(x4, 9); - x2 = _mm_xor_si128(x2, x4); - x4 = _mm_add_epi32(x3, x2); - x4 = _mm_roti_epi32(x4, 13); - x1 = _mm_xor_si128(x1, x4); - x4 = _mm_add_epi32(x2, x1); - x4 = _mm_roti_epi32(x4, 18); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x93); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x3, x0); - x4 = _mm_roti_epi32(x4, 7); - x1 = _mm_xor_si128(x1, x4); - x4 = _mm_add_epi32(x0, x1); - x4 = _mm_roti_epi32(x4, 9); - x2 = _mm_xor_si128(x2, x4); - x4 = _mm_add_epi32(x1, x2); - x4 = _mm_roti_epi32(x4, 13); - x3 = _mm_xor_si128(x3, x4); - x4 = _mm_add_epi32(x2, x3); - x4 = _mm_roti_epi32(x4, 18); - x0 = _mm_xor_si128(x0, x4); - x1 = _mm_shuffle_epi32(x1, 0x93); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x3 = _mm_shuffle_epi32(x3, 0x39); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_XOP) - /* uses salsa_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-XOP" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif diff --git a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h deleted file mode 100644 index 33f3340..0000000 --- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h +++ /dev/null @@ -1,70 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "Salsa20/8 Ref" - -#undef SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_BASIC - -static void -salsa_core_basic(uint32_t state[16]) { - size_t rounds = 8; - uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; - - x0 = state[0]; - x1 = state[1]; - x2 = state[2]; - x3 = state[3]; - x4 = state[4]; - x5 = state[5]; - x6 = state[6]; - x7 = state[7]; - x8 = state[8]; - x9 = state[9]; - x10 = state[10]; - x11 = state[11]; - x12 = state[12]; - x13 = state[13]; - x14 = state[14]; - x15 = state[15]; - - #define quarter(a,b,c,d) \ - t = a+d; t = ROTL32(t, 7); b ^= t; \ - t = b+a; t = ROTL32(t, 9); c ^= t; \ - t = c+b; t = ROTL32(t, 13); d ^= t; \ - t = d+c; t = ROTL32(t, 18); a ^= t; \ - - for (; rounds; rounds -= 2) { - quarter( x0, x4, x8,x12) - quarter( x5, x9,x13, x1) - quarter(x10,x14, x2, x6) - quarter(x15, x3, x7,x11) - quarter( x0, x1, x2, x3) - quarter( x5, x6, x7, x4) - quarter(x10,x11, x8, x9) - quarter(x15,x12,x13,x14) - } - - state[0] += x0; - state[1] += x1; - state[2] += x2; - state[3] += x3; - state[4] += x4; - state[5] += x5; - state[6] += x6; - state[7] += x7; - state[8] += x8; - state[9] += x9; - state[10] += x10; - state[11] += x11; - state[12] += x12; - state[13] += x13; - state[14] += x14; - state[15] += x15; - - #undef quarter -} - -#endif - diff --git a/stratum/algos/ar2/sj/scrypt-jane-romix.h b/stratum/algos/ar2/sj/scrypt-jane-romix.h index 8ed6e9e..cf4ac2f 100644 --- a/stratum/algos/ar2/sj/scrypt-jane-romix.h +++ b/stratum/algos/ar2/sj/scrypt-jane-romix.h @@ -1,6 +1,4 @@ -#if defined(SCRYPT_SALSA) -#include "scrypt-jane-salsa.h" -#elif defined(SCRYPT_SALSA64) +#ifdef SCRYPT_SALSA64 #include "scrypt-jane-salsa64.h" #else #define SCRYPT_MIX_BASE "ERROR" diff --git a/stratum/algos/ar2/sj/scrypt-jane-salsa.h b/stratum/algos/ar2/sj/scrypt-jane-salsa.h deleted file mode 100644 index df0a3e0..0000000 --- a/stratum/algos/ar2/sj/scrypt-jane-salsa.h +++ /dev/null @@ -1,134 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa20/8" - -typedef uint32_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U32TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - -#define SCRYPT_BLOCK_BYTES 64 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa-xop.h" -#include "scrypt-jane-mix_salsa-avx.h" -#include "scrypt-jane-mix_salsa-sse2.h" -#include "scrypt-jane-mix_salsa.h" - -#if defined(SCRYPT_SALSA_XOP) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop - #define SCRYPT_ROMIX_FN scrypt_ROMix_xop - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_MIX_FN salsa_core_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix(void) { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_XOP) - if (cpuflags & cpu_xop) - return scrypt_ROMix_xop; - else -#endif - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations(void) { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA_XOP) - if (cpuflags & cpu_xop) - flags |= cpu_xop; -#endif - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif - - -static int -scrypt_test_mix(void) { - static const uint8_t expected[16] = { - 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_XOP) - if (cpuflags & cpu_xop) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} diff --git a/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h b/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h index f5b15a3..20fd0cf 100644 --- a/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h +++ b/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h @@ -10,18 +10,7 @@ static const scrypt_test_setting post_settings[] = { }; #if defined(SCRYPT_SKEIN512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, - 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, - 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, - 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, - {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, - 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, - 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, - 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} - }; - #elif defined(SCRYPT_SALSA64) + #ifdef SCRYPT_SALSA64 static const uint8_t post_vectors[][64] = { {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, diff --git a/stratum/algos/ar2/thread.c b/stratum/algos/ar2/thread.c deleted file mode 100644 index 0c4edea..0000000 --- a/stratum/algos/ar2/thread.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "thread.h" -#if defined(_WIN32) -#include -#endif - -int argon2_thread_create(argon2_thread_handle_t *handle, - argon2_thread_func_t func, void *args) { - if (NULL == handle || func == NULL) { - return -1; - } -#if defined(_WIN32) - *handle = _beginthreadex(NULL, 0, func, args, 0, NULL); - return *handle != 0 ? 0 : -1; -#else - return pthread_create(handle, NULL, func, args); -#endif -} - -int argon2_thread_join(argon2_thread_handle_t handle) { -#if defined(_WIN32) - if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) { - return CloseHandle((HANDLE)handle) != 0 ? 0 : -1; - } - return -1; -#else - return pthread_join(handle, NULL); -#endif -} - -void argon2_thread_exit(void) { -#if defined(_WIN32) - _endthreadex(0); -#else - pthread_exit(NULL); -#endif -} diff --git a/stratum/algos/ar2/thread.h b/stratum/algos/ar2/thread.h deleted file mode 100644 index 57c4ce5..0000000 --- a/stratum/algos/ar2/thread.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef ARGON2_THREAD_H -#define ARGON2_THREAD_H -/* - Here we implement an abstraction layer for the simpĺe requirements - of the Argon2 code. We only require 3 primitives---thread creation, - joining, and termination---so full emulation of the pthreads API - is unwarranted. Currently we wrap pthreads and Win32 threads. - - The API defines 2 types: the function pointer type, - argon2_thread_func_t, - and the type of the thread handle---argon2_thread_handle_t. -*/ -#if defined(_WIN32) -#include -typedef unsigned(__stdcall *argon2_thread_func_t)(void *); -typedef uintptr_t argon2_thread_handle_t; -#else -#include -typedef void *(*argon2_thread_func_t)(void *); -typedef pthread_t argon2_thread_handle_t; -#endif - -/* Creates a thread - * @param handle pointer to a thread handle, which is the output of this - * function. Must not be NULL. - * @param func A function pointer for the thread's entry point. Must not be - * NULL. - * @param args Pointer that is passed as an argument to @func. May be NULL. - * @return 0 if @handle and @func are valid pointers and a thread is successfuly - * created. - */ -int argon2_thread_create(argon2_thread_handle_t *handle, - argon2_thread_func_t func, void *args); - -/* Waits for a thread to terminate - * @param handle Handle to a thread created with argon2_thread_create. - * @return 0 if @handle is a valid handle, and joining completed successfully. -*/ -int argon2_thread_join(argon2_thread_handle_t handle); - -/* Terminate the current thread. Must be run inside a thread created by - * argon2_thread_create. -*/ -void argon2_thread_exit(void); - -#endif diff --git a/stratum/algos/argon2a.c b/stratum/algos/argon2a.c index f099694..eab0d81 100644 --- a/stratum/algos/argon2a.c +++ b/stratum/algos/argon2a.c @@ -7,7 +7,7 @@ #include "ar2/argon2.h" #include "ar2/cores.h" -#include "ar2/scrypt-jane.h" +#include "ar2/ar2-scrypt-jane.h" #define _ALIGN(x) __attribute__ ((aligned(x))) @@ -16,7 +16,7 @@ #define MASK 8 #define ZERO 0 -static void argon_call(void *out, void *in, void *salt, int type) +inline void argon_call(void *out, void *in, void *salt, int type) { argon2_context context = { 0 }; @@ -27,50 +27,18 @@ static void argon_call(void *out, void *in, void *salt, int type) argon2_core(&context, type); } -static void bin2hex(char *s, const unsigned char *p, size_t len) -{ - for (size_t i = 0; i < len; i++) - sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); -} - -static char *abin2hex(const unsigned char *p, size_t len) -{ - char *s = (char*) malloc((len * 2) + 1); - if (!s) - return NULL; - bin2hex(s, p, len); - return s; -} - -static void applog_data(void *pdata) -{ - char* hex = abin2hex((unsigned char*)pdata, 80); - fprintf(stderr, "%s\n", hex); - free(hex); -} - void argon2_hash(const char* input, char* output, uint32_t len) { - // these uint512 in the c++ source of the client are backed by an array of uint32 - uint32_t _ALIGN(32) hashA[8], hashB[8], hashC[8]; - uint32_t _ALIGN(32) endian[20], *in; + uint32_t _ALIGN(32) hashA[8], hashB[8]; - in = (uint32_t*) input; - for (int i=0; i