argon2: clean stratum code

remove dead code detected after cpuminer implementation
Tanguy Pruvot 2016-01-25 02:25:22 +01:00
parent 52580a5636
commit 9d1c34f894
13 changed files with 10 additions and 1482 deletions

View file

@@ -12,7 +12,7 @@ extern "C" {
 #endif
 #endif
-#include "scrypt-jane.h"
+#include "ar2-scrypt-jane.h"
 #include "sj/scrypt-jane-portable.h"
 #include "sj/scrypt-jane-hash.h"

View file

@@ -1,381 +0,0 @@
/* x86 */
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_AVX
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
a1(push ebx)
a1(push edi)
a1(push esi)
a1(push ebp)
a2(mov ebp,esp)
a2(mov edi,[ebp+20])
a2(mov esi,[ebp+24])
a2(mov eax,[ebp+28])
a2(mov ebx,[ebp+32])
a2(sub esp,32)
a2(and esp,~63)
a2(lea edx,[ebx*2])
a2(shl edx,6)
a2(lea ecx,[edx-64])
a2(and eax, eax)
a2(movdqa xmm0,[ecx+esi+0])
a2(movdqa xmm1,[ecx+esi+16])
a2(movdqa xmm2,[ecx+esi+32])
a2(movdqa xmm3,[ecx+esi+48])
aj(jz scrypt_ChunkMix_avx_no_xor1)
a3(vpxor xmm0,xmm0,[ecx+eax+0])
a3(vpxor xmm1,xmm1,[ecx+eax+16])
a3(vpxor xmm2,xmm2,[ecx+eax+32])
a3(vpxor xmm3,xmm3,[ecx+eax+48])
a1(scrypt_ChunkMix_avx_no_xor1:)
a2(xor ecx,ecx)
a2(xor ebx,ebx)
a1(scrypt_ChunkMix_avx_loop:)
a2(and eax, eax)
a3(vpxor xmm0,xmm0,[esi+ecx+0])
a3(vpxor xmm1,xmm1,[esi+ecx+16])
a3(vpxor xmm2,xmm2,[esi+ecx+32])
a3(vpxor xmm3,xmm3,[esi+ecx+48])
aj(jz scrypt_ChunkMix_avx_no_xor2)
a3(vpxor xmm0,xmm0,[eax+ecx+0])
a3(vpxor xmm1,xmm1,[eax+ecx+16])
a3(vpxor xmm2,xmm2,[eax+ecx+32])
a3(vpxor xmm3,xmm3,[eax+ecx+48])
a1(scrypt_ChunkMix_avx_no_xor2:)
a2(vmovdqa [esp+0],xmm0)
a2(vmovdqa [esp+16],xmm1)
a2(vmovdqa xmm6,xmm2)
a2(vmovdqa xmm7,xmm3)
a2(mov eax,8)
a1(scrypt_salsa_avx_loop: )
a3(vpaddd xmm4, xmm1, xmm0)
a3(vpsrld xmm5, xmm4, 25)
a3(vpslld xmm4, xmm4, 7)
a3(vpxor xmm3, xmm3, xmm5)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm0, xmm3)
a3(vpsrld xmm5, xmm4, 23)
a3(vpslld xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm5)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm3, xmm2)
a3(vpsrld xmm5, xmm4, 19)
a3(vpslld xmm4, xmm4, 13)
a3(vpxor xmm1, xmm1, xmm5)
a3(vpshufd xmm3, xmm3, 0x93)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm2, xmm1)
a3(vpsrld xmm5, xmm4, 14)
a3(vpslld xmm4, xmm4, 18)
a3(vpxor xmm0, xmm0, xmm5)
a3(vpshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(vpaddd xmm4, xmm3, xmm0)
a3(vpshufd xmm1, xmm1, 0x39)
a3(vpsrld xmm5, xmm4, 25)
a3(vpslld xmm4, xmm4, 7)
a3(vpxor xmm1, xmm1, xmm5)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm0, xmm1)
a3(vpsrld xmm5, xmm4, 23)
a3(vpslld xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm5)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm1, xmm2)
a3(vpsrld xmm5, xmm4, 19)
a3(vpslld xmm4, xmm4, 13)
a3(vpxor xmm3, xmm3, xmm5)
a3(vpshufd xmm1, xmm1, 0x93)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm2, xmm3)
a3(vpsrld xmm5, xmm4, 14)
a3(vpslld xmm4, xmm4, 18)
a3(vpxor xmm0, xmm0, xmm5)
a3(vpshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(vpshufd xmm3, xmm3, 0x39)
a2(sub eax, 2)
aj(ja scrypt_salsa_avx_loop)
a3(vpaddd xmm0,xmm0,[esp+0])
a3(vpaddd xmm1,xmm1,[esp+16])
a3(vpaddd xmm2,xmm2,xmm6)
a3(vpaddd xmm3,xmm3,xmm7)
a2(lea eax,[ebx+ecx])
a2(xor ebx,edx)
a2(and eax,~0x7f)
a2(add ecx,64)
a2(shr eax,1)
a2(add eax, edi)
a2(cmp ecx,edx)
a2(vmovdqa [eax+0],xmm0)
a2(vmovdqa [eax+16],xmm1)
a2(vmovdqa [eax+32],xmm2)
a2(vmovdqa [eax+48],xmm3)
a2(mov eax,[ebp+28])
aj(jne scrypt_ChunkMix_avx_loop)
a2(mov esp,ebp)
a1(pop ebp)
a1(pop esi)
a1(pop edi)
a1(pop ebx)
aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)
#endif
/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_AVX
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
a2(shl rcx,6)
a2(lea r9,[rcx-64])
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
a2(vmovdqa xmm0,[rax+0])
a2(vmovdqa xmm1,[rax+16])
a2(vmovdqa xmm2,[rax+32])
a2(vmovdqa xmm3,[rax+48])
aj(jz scrypt_ChunkMix_avx_no_xor1)
a3(vpxor xmm0,xmm0,[r9+0])
a3(vpxor xmm1,xmm1,[r9+16])
a3(vpxor xmm2,xmm2,[r9+32])
a3(vpxor xmm3,xmm3,[r9+48])
a1(scrypt_ChunkMix_avx_no_xor1:)
a2(xor r9,r9)
a2(xor r8,r8)
a1(scrypt_ChunkMix_avx_loop:)
a2(and rdx, rdx)
a3(vpxor xmm0,xmm0,[rsi+r9+0])
a3(vpxor xmm1,xmm1,[rsi+r9+16])
a3(vpxor xmm2,xmm2,[rsi+r9+32])
a3(vpxor xmm3,xmm3,[rsi+r9+48])
aj(jz scrypt_ChunkMix_avx_no_xor2)
a3(vpxor xmm0,xmm0,[rdx+r9+0])
a3(vpxor xmm1,xmm1,[rdx+r9+16])
a3(vpxor xmm2,xmm2,[rdx+r9+32])
a3(vpxor xmm3,xmm3,[rdx+r9+48])
a1(scrypt_ChunkMix_avx_no_xor2:)
a2(vmovdqa xmm8,xmm0)
a2(vmovdqa xmm9,xmm1)
a2(vmovdqa xmm10,xmm2)
a2(vmovdqa xmm11,xmm3)
a2(mov rax,8)
a1(scrypt_salsa_avx_loop: )
a3(vpaddd xmm4, xmm1, xmm0)
a3(vpsrld xmm5, xmm4, 25)
a3(vpslld xmm4, xmm4, 7)
a3(vpxor xmm3, xmm3, xmm5)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm0, xmm3)
a3(vpsrld xmm5, xmm4, 23)
a3(vpslld xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm5)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm3, xmm2)
a3(vpsrld xmm5, xmm4, 19)
a3(vpslld xmm4, xmm4, 13)
a3(vpxor xmm1, xmm1, xmm5)
a3(vpshufd xmm3, xmm3, 0x93)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm2, xmm1)
a3(vpsrld xmm5, xmm4, 14)
a3(vpslld xmm4, xmm4, 18)
a3(vpxor xmm0, xmm0, xmm5)
a3(vpshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(vpaddd xmm4, xmm3, xmm0)
a3(vpshufd xmm1, xmm1, 0x39)
a3(vpsrld xmm5, xmm4, 25)
a3(vpslld xmm4, xmm4, 7)
a3(vpxor xmm1, xmm1, xmm5)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm0, xmm1)
a3(vpsrld xmm5, xmm4, 23)
a3(vpslld xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm5)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm1, xmm2)
a3(vpsrld xmm5, xmm4, 19)
a3(vpslld xmm4, xmm4, 13)
a3(vpxor xmm3, xmm3, xmm5)
a3(vpshufd xmm1, xmm1, 0x93)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm2, xmm3)
a3(vpsrld xmm5, xmm4, 14)
a3(vpslld xmm4, xmm4, 18)
a3(vpxor xmm0, xmm0, xmm5)
a3(vpshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(vpshufd xmm3, xmm3, 0x39)
a2(sub rax, 2)
aj(ja scrypt_salsa_avx_loop)
a3(vpaddd xmm0,xmm0,xmm8)
a3(vpaddd xmm1,xmm1,xmm9)
a3(vpaddd xmm2,xmm2,xmm10)
a3(vpaddd xmm3,xmm3,xmm11)
a2(lea rax,[r8+r9])
a2(xor r8,rcx)
a2(and rax,~0x7f)
a2(add r9,64)
a2(shr rax,1)
a2(add rax, rdi)
a2(cmp r9,rcx)
a2(vmovdqa [rax+0],xmm0)
a2(vmovdqa [rax+16],xmm1)
a2(vmovdqa [rax+32],xmm2)
a2(vmovdqa [rax+48],xmm3)
aj(jne scrypt_ChunkMix_avx_loop)
a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)
#endif
/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
#define SCRYPT_SALSA_AVX
static void asm_calling_convention NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
uint32_t i, blocksPerChunk = r * 2, half = 0;
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
size_t rounds;
/* 1: X = B_{2r - 1} */
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
x0 = xmmp[0];
x1 = xmmp[1];
x2 = xmmp[2];
x3 = xmmp[3];
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
/* 3: X = H(X ^ B_i) */
xmmp = (xmmi *)scrypt_block(Bin, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
t0 = x0;
t1 = x1;
t2 = x2;
t3 = x3;
for (rounds = 8; rounds; rounds -= 2) {
x4 = x1;
x4 = _mm_add_epi32(x4, x0);
x5 = x4;
x4 = _mm_slli_epi32(x4, 7);
x5 = _mm_srli_epi32(x5, 25);
x3 = _mm_xor_si128(x3, x4);
x4 = x0;
x3 = _mm_xor_si128(x3, x5);
x4 = _mm_add_epi32(x4, x3);
x5 = x4;
x4 = _mm_slli_epi32(x4, 9);
x5 = _mm_srli_epi32(x5, 23);
x2 = _mm_xor_si128(x2, x4);
x4 = x3;
x2 = _mm_xor_si128(x2, x5);
x3 = _mm_shuffle_epi32(x3, 0x93);
x4 = _mm_add_epi32(x4, x2);
x5 = x4;
x4 = _mm_slli_epi32(x4, 13);
x5 = _mm_srli_epi32(x5, 19);
x1 = _mm_xor_si128(x1, x4);
x4 = x2;
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x4 = _mm_add_epi32(x4, x1);
x5 = x4;
x4 = _mm_slli_epi32(x4, 18);
x5 = _mm_srli_epi32(x5, 14);
x0 = _mm_xor_si128(x0, x4);
x4 = x3;
x0 = _mm_xor_si128(x0, x5);
x1 = _mm_shuffle_epi32(x1, 0x39);
x4 = _mm_add_epi32(x4, x0);
x5 = x4;
x4 = _mm_slli_epi32(x4, 7);
x5 = _mm_srli_epi32(x5, 25);
x1 = _mm_xor_si128(x1, x4);
x4 = x0;
x1 = _mm_xor_si128(x1, x5);
x4 = _mm_add_epi32(x4, x1);
x5 = x4;
x4 = _mm_slli_epi32(x4, 9);
x5 = _mm_srli_epi32(x5, 23);
x2 = _mm_xor_si128(x2, x4);
x4 = x1;
x2 = _mm_xor_si128(x2, x5);
x1 = _mm_shuffle_epi32(x1, 0x93);
x4 = _mm_add_epi32(x4, x2);
x5 = x4;
x4 = _mm_slli_epi32(x4, 13);
x5 = _mm_srli_epi32(x5, 19);
x3 = _mm_xor_si128(x3, x4);
x4 = x2;
x3 = _mm_xor_si128(x3, x5);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x4 = _mm_add_epi32(x4, x3);
x5 = x4;
x4 = _mm_slli_epi32(x4, 18);
x5 = _mm_srli_epi32(x5, 14);
x0 = _mm_xor_si128(x0, x4);
x3 = _mm_shuffle_epi32(x3, 0x39);
x0 = _mm_xor_si128(x0, x5);
}
x0 = _mm_add_epi32(x0, t0);
x1 = _mm_add_epi32(x1, t1);
x2 = _mm_add_epi32(x2, t2);
x3 = _mm_add_epi32(x3, t3);
/* 4: Y_i = X */
/* 6: B'[0..r-1] = Y_even */
/* 6: B'[r..2r-1] = Y_odd */
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
xmmp[0] = x0;
xmmp[1] = x1;
xmmp[2] = x2;
xmmp[3] = x3;
}
}
#endif
#if defined(SCRYPT_SALSA_AVX)
/* uses salsa_core_tangle_sse2 */
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa/8-AVX"
#undef SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_INCLUDED
#endif

View file

@@ -1,443 +0,0 @@
/* x86 */
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_SSE2
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
a1(push ebx)
a1(push edi)
a1(push esi)
a1(push ebp)
a2(mov ebp,esp)
a2(mov edi,[ebp+20])
a2(mov esi,[ebp+24])
a2(mov eax,[ebp+28])
a2(mov ebx,[ebp+32])
a2(sub esp,32)
a2(and esp,~63)
a2(lea edx,[ebx*2])
a2(shl edx,6)
a2(lea ecx,[edx-64])
a2(and eax, eax)
a2(movdqa xmm0,[ecx+esi+0])
a2(movdqa xmm1,[ecx+esi+16])
a2(movdqa xmm2,[ecx+esi+32])
a2(movdqa xmm3,[ecx+esi+48])
aj(jz scrypt_ChunkMix_sse2_no_xor1)
a2(pxor xmm0,[ecx+eax+0])
a2(pxor xmm1,[ecx+eax+16])
a2(pxor xmm2,[ecx+eax+32])
a2(pxor xmm3,[ecx+eax+48])
a1(scrypt_ChunkMix_sse2_no_xor1:)
a2(xor ecx,ecx)
a2(xor ebx,ebx)
a1(scrypt_ChunkMix_sse2_loop:)
a2(and eax, eax)
a2(pxor xmm0,[esi+ecx+0])
a2(pxor xmm1,[esi+ecx+16])
a2(pxor xmm2,[esi+ecx+32])
a2(pxor xmm3,[esi+ecx+48])
aj(jz scrypt_ChunkMix_sse2_no_xor2)
a2(pxor xmm0,[eax+ecx+0])
a2(pxor xmm1,[eax+ecx+16])
a2(pxor xmm2,[eax+ecx+32])
a2(pxor xmm3,[eax+ecx+48])
a1(scrypt_ChunkMix_sse2_no_xor2:)
a2(movdqa [esp+0],xmm0)
a2(movdqa [esp+16],xmm1)
a2(movdqa xmm6,xmm2)
a2(movdqa xmm7,xmm3)
a2(mov eax,8)
a1(scrypt_salsa_sse2_loop: )
a2(movdqa xmm4, xmm1)
a2(paddd xmm4, xmm0)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 7)
a2(psrld xmm5, 25)
a2(pxor xmm3, xmm4)
a2(movdqa xmm4, xmm0)
a2(pxor xmm3, xmm5)
a2(paddd xmm4, xmm3)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 9)
a2(psrld xmm5, 23)
a2(pxor xmm2, xmm4)
a2(movdqa xmm4, xmm3)
a2(pxor xmm2, xmm5)
a3(pshufd xmm3, xmm3, 0x93)
a2(paddd xmm4, xmm2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 13)
a2(psrld xmm5, 19)
a2(pxor xmm1, xmm4)
a2(movdqa xmm4, xmm2)
a2(pxor xmm1, xmm5)
a3(pshufd xmm2, xmm2, 0x4e)
a2(paddd xmm4, xmm1)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 18)
a2(psrld xmm5, 14)
a2(pxor xmm0, xmm4)
a2(movdqa xmm4, xmm3)
a2(pxor xmm0, xmm5)
a3(pshufd xmm1, xmm1, 0x39)
a2(paddd xmm4, xmm0)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 7)
a2(psrld xmm5, 25)
a2(pxor xmm1, xmm4)
a2(movdqa xmm4, xmm0)
a2(pxor xmm1, xmm5)
a2(paddd xmm4, xmm1)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 9)
a2(psrld xmm5, 23)
a2(pxor xmm2, xmm4)
a2(movdqa xmm4, xmm1)
a2(pxor xmm2, xmm5)
a3(pshufd xmm1, xmm1, 0x93)
a2(paddd xmm4, xmm2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 13)
a2(psrld xmm5, 19)
a2(pxor xmm3, xmm4)
a2(movdqa xmm4, xmm2)
a2(pxor xmm3, xmm5)
a3(pshufd xmm2, xmm2, 0x4e)
a2(paddd xmm4, xmm3)
a2(sub eax, 2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 18)
a2(psrld xmm5, 14)
a2(pxor xmm0, xmm4)
a3(pshufd xmm3, xmm3, 0x39)
a2(pxor xmm0, xmm5)
aj(ja scrypt_salsa_sse2_loop)
a2(paddd xmm0,[esp+0])
a2(paddd xmm1,[esp+16])
a2(paddd xmm2,xmm6)
a2(paddd xmm3,xmm7)
a2(lea eax,[ebx+ecx])
a2(xor ebx,edx)
a2(and eax,~0x7f)
a2(add ecx,64)
a2(shr eax,1)
a2(add eax, edi)
a2(cmp ecx,edx)
a2(movdqa [eax+0],xmm0)
a2(movdqa [eax+16],xmm1)
a2(movdqa [eax+32],xmm2)
a2(movdqa [eax+48],xmm3)
a2(mov eax,[ebp+28])
aj(jne scrypt_ChunkMix_sse2_loop)
a2(mov esp,ebp)
a1(pop ebp)
a1(pop esi)
a1(pop edi)
a1(pop ebx)
aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
#endif
/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_SSE2
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
a2(shl rcx,6)
a2(lea r9,[rcx-64])
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
a2(movdqa xmm0,[rax+0])
a2(movdqa xmm1,[rax+16])
a2(movdqa xmm2,[rax+32])
a2(movdqa xmm3,[rax+48])
aj(jz scrypt_ChunkMix_sse2_no_xor1)
a2(pxor xmm0,[r9+0])
a2(pxor xmm1,[r9+16])
a2(pxor xmm2,[r9+32])
a2(pxor xmm3,[r9+48])
a1(scrypt_ChunkMix_sse2_no_xor1:)
a2(xor r9,r9)
a2(xor r8,r8)
a1(scrypt_ChunkMix_sse2_loop:)
a2(and rdx, rdx)
a2(pxor xmm0,[rsi+r9+0])
a2(pxor xmm1,[rsi+r9+16])
a2(pxor xmm2,[rsi+r9+32])
a2(pxor xmm3,[rsi+r9+48])
aj(jz scrypt_ChunkMix_sse2_no_xor2)
a2(pxor xmm0,[rdx+r9+0])
a2(pxor xmm1,[rdx+r9+16])
a2(pxor xmm2,[rdx+r9+32])
a2(pxor xmm3,[rdx+r9+48])
a1(scrypt_ChunkMix_sse2_no_xor2:)
a2(movdqa xmm8,xmm0)
a2(movdqa xmm9,xmm1)
a2(movdqa xmm10,xmm2)
a2(movdqa xmm11,xmm3)
a2(mov rax,8)
a1(scrypt_salsa_sse2_loop: )
a2(movdqa xmm4, xmm1)
a2(paddd xmm4, xmm0)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 7)
a2(psrld xmm5, 25)
a2(pxor xmm3, xmm4)
a2(movdqa xmm4, xmm0)
a2(pxor xmm3, xmm5)
a2(paddd xmm4, xmm3)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 9)
a2(psrld xmm5, 23)
a2(pxor xmm2, xmm4)
a2(movdqa xmm4, xmm3)
a2(pxor xmm2, xmm5)
a3(pshufd xmm3, xmm3, 0x93)
a2(paddd xmm4, xmm2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 13)
a2(psrld xmm5, 19)
a2(pxor xmm1, xmm4)
a2(movdqa xmm4, xmm2)
a2(pxor xmm1, xmm5)
a3(pshufd xmm2, xmm2, 0x4e)
a2(paddd xmm4, xmm1)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 18)
a2(psrld xmm5, 14)
a2(pxor xmm0, xmm4)
a2(movdqa xmm4, xmm3)
a2(pxor xmm0, xmm5)
a3(pshufd xmm1, xmm1, 0x39)
a2(paddd xmm4, xmm0)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 7)
a2(psrld xmm5, 25)
a2(pxor xmm1, xmm4)
a2(movdqa xmm4, xmm0)
a2(pxor xmm1, xmm5)
a2(paddd xmm4, xmm1)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 9)
a2(psrld xmm5, 23)
a2(pxor xmm2, xmm4)
a2(movdqa xmm4, xmm1)
a2(pxor xmm2, xmm5)
a3(pshufd xmm1, xmm1, 0x93)
a2(paddd xmm4, xmm2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 13)
a2(psrld xmm5, 19)
a2(pxor xmm3, xmm4)
a2(movdqa xmm4, xmm2)
a2(pxor xmm3, xmm5)
a3(pshufd xmm2, xmm2, 0x4e)
a2(paddd xmm4, xmm3)
a2(sub rax, 2)
a2(movdqa xmm5, xmm4)
a2(pslld xmm4, 18)
a2(psrld xmm5, 14)
a2(pxor xmm0, xmm4)
a3(pshufd xmm3, xmm3, 0x39)
a2(pxor xmm0, xmm5)
aj(ja scrypt_salsa_sse2_loop)
a2(paddd xmm0,xmm8)
a2(paddd xmm1,xmm9)
a2(paddd xmm2,xmm10)
a2(paddd xmm3,xmm11)
a2(lea rax,[r8+r9])
a2(xor r8,rcx)
a2(and rax,~0x7f)
a2(add r9,64)
a2(shr rax,1)
a2(add rax, rdi)
a2(cmp r9,rcx)
a2(movdqa [rax+0],xmm0)
a2(movdqa [rax+16],xmm1)
a2(movdqa [rax+32],xmm2)
a2(movdqa [rax+48],xmm3)
aj(jne scrypt_ChunkMix_sse2_loop)
a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
#endif
/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
#define SCRYPT_SALSA_SSE2
static void NOINLINE asm_calling_convention
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
uint32_t i, blocksPerChunk = r * 2, half = 0;
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
size_t rounds;
/* 1: X = B_{2r - 1} */
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
x0 = xmmp[0];
x1 = xmmp[1];
x2 = xmmp[2];
x3 = xmmp[3];
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
/* 3: X = H(X ^ B_i) */
xmmp = (xmmi *)scrypt_block(Bin, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
t0 = x0;
t1 = x1;
t2 = x2;
t3 = x3;
for (rounds = 8; rounds; rounds -= 2) {
x4 = x1;
x4 = _mm_add_epi32(x4, x0);
x5 = x4;
x4 = _mm_slli_epi32(x4, 7);
x5 = _mm_srli_epi32(x5, 25);
x3 = _mm_xor_si128(x3, x4);
x4 = x0;
x3 = _mm_xor_si128(x3, x5);
x4 = _mm_add_epi32(x4, x3);
x5 = x4;
x4 = _mm_slli_epi32(x4, 9);
x5 = _mm_srli_epi32(x5, 23);
x2 = _mm_xor_si128(x2, x4);
x4 = x3;
x2 = _mm_xor_si128(x2, x5);
x3 = _mm_shuffle_epi32(x3, 0x93);
x4 = _mm_add_epi32(x4, x2);
x5 = x4;
x4 = _mm_slli_epi32(x4, 13);
x5 = _mm_srli_epi32(x5, 19);
x1 = _mm_xor_si128(x1, x4);
x4 = x2;
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x4 = _mm_add_epi32(x4, x1);
x5 = x4;
x4 = _mm_slli_epi32(x4, 18);
x5 = _mm_srli_epi32(x5, 14);
x0 = _mm_xor_si128(x0, x4);
x4 = x3;
x0 = _mm_xor_si128(x0, x5);
x1 = _mm_shuffle_epi32(x1, 0x39);
x4 = _mm_add_epi32(x4, x0);
x5 = x4;
x4 = _mm_slli_epi32(x4, 7);
x5 = _mm_srli_epi32(x5, 25);
x1 = _mm_xor_si128(x1, x4);
x4 = x0;
x1 = _mm_xor_si128(x1, x5);
x4 = _mm_add_epi32(x4, x1);
x5 = x4;
x4 = _mm_slli_epi32(x4, 9);
x5 = _mm_srli_epi32(x5, 23);
x2 = _mm_xor_si128(x2, x4);
x4 = x1;
x2 = _mm_xor_si128(x2, x5);
x1 = _mm_shuffle_epi32(x1, 0x93);
x4 = _mm_add_epi32(x4, x2);
x5 = x4;
x4 = _mm_slli_epi32(x4, 13);
x5 = _mm_srli_epi32(x5, 19);
x3 = _mm_xor_si128(x3, x4);
x4 = x2;
x3 = _mm_xor_si128(x3, x5);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x4 = _mm_add_epi32(x4, x3);
x5 = x4;
x4 = _mm_slli_epi32(x4, 18);
x5 = _mm_srli_epi32(x5, 14);
x0 = _mm_xor_si128(x0, x4);
x3 = _mm_shuffle_epi32(x3, 0x39);
x0 = _mm_xor_si128(x0, x5);
}
x0 = _mm_add_epi32(x0, t0);
x1 = _mm_add_epi32(x1, t1);
x2 = _mm_add_epi32(x2, t2);
x3 = _mm_add_epi32(x3, t3);
/* 4: Y_i = X */
/* 6: B'[0..r-1] = Y_even */
/* 6: B'[r..2r-1] = Y_odd */
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
xmmp[0] = x0;
xmmp[1] = x1;
xmmp[2] = x2;
xmmp[3] = x3;
}
}
#endif
#if defined(SCRYPT_SALSA_SSE2)
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa/8-SSE2"
#undef SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_INCLUDED
#endif
/* used by avx,etc as well */
#if defined(SCRYPT_SALSA_INCLUDED)
/*
Default layout:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
SSE2 layout:
0 5 10 15
12 1 6 11
8 13 2 7
4 9 14 3
*/
static void asm_calling_convention
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
uint32_t t;
while (count--) {
t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
blocks += 16;
}
}
#endif

View file

@@ -1,317 +0,0 @@
/* x86 */
#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_XOP
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
a1(push ebx)
a1(push edi)
a1(push esi)
a1(push ebp)
a2(mov ebp,esp)
a2(mov edi,[ebp+20])
a2(mov esi,[ebp+24])
a2(mov eax,[ebp+28])
a2(mov ebx,[ebp+32])
a2(sub esp,32)
a2(and esp,~63)
a2(lea edx,[ebx*2])
a2(shl edx,6)
a2(lea ecx,[edx-64])
a2(and eax, eax)
a2(movdqa xmm0,[ecx+esi+0])
a2(movdqa xmm1,[ecx+esi+16])
a2(movdqa xmm2,[ecx+esi+32])
a2(movdqa xmm3,[ecx+esi+48])
aj(jz scrypt_ChunkMix_xop_no_xor1)
a3(vpxor xmm0,xmm0,[ecx+eax+0])
a3(vpxor xmm1,xmm1,[ecx+eax+16])
a3(vpxor xmm2,xmm2,[ecx+eax+32])
a3(vpxor xmm3,xmm3,[ecx+eax+48])
a1(scrypt_ChunkMix_xop_no_xor1:)
a2(xor ecx,ecx)
a2(xor ebx,ebx)
a1(scrypt_ChunkMix_xop_loop:)
a2(and eax, eax)
a3(vpxor xmm0,xmm0,[esi+ecx+0])
a3(vpxor xmm1,xmm1,[esi+ecx+16])
a3(vpxor xmm2,xmm2,[esi+ecx+32])
a3(vpxor xmm3,xmm3,[esi+ecx+48])
aj(jz scrypt_ChunkMix_xop_no_xor2)
a3(vpxor xmm0,xmm0,[eax+ecx+0])
a3(vpxor xmm1,xmm1,[eax+ecx+16])
a3(vpxor xmm2,xmm2,[eax+ecx+32])
a3(vpxor xmm3,xmm3,[eax+ecx+48])
a1(scrypt_ChunkMix_xop_no_xor2:)
a2(vmovdqa [esp+0],xmm0)
a2(vmovdqa [esp+16],xmm1)
a2(vmovdqa xmm6,xmm2)
a2(vmovdqa xmm7,xmm3)
a2(mov eax,8)
a1(scrypt_salsa_xop_loop: )
a3(vpaddd xmm4, xmm1, xmm0)
a3(vprotd xmm4, xmm4, 7)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm0, xmm3)
a3(vprotd xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm3, xmm2)
a3(vprotd xmm4, xmm4, 13)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm2, xmm1)
a3(pshufd xmm3, xmm3, 0x93)
a3(vprotd xmm4, xmm4, 18)
a3(pshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(pshufd xmm1, xmm1, 0x39)
a3(vpaddd xmm4, xmm3, xmm0)
a3(vprotd xmm4, xmm4, 7)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm0, xmm1)
a3(vprotd xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm1, xmm2)
a3(vprotd xmm4, xmm4, 13)
a3(vpxor xmm3, xmm3, xmm4)
a3(pshufd xmm1, xmm1, 0x93)
a3(vpaddd xmm4, xmm2, xmm3)
a3(pshufd xmm2, xmm2, 0x4e)
a3(vprotd xmm4, xmm4, 18)
a3(pshufd xmm3, xmm3, 0x39)
a3(vpxor xmm0, xmm0, xmm4)
a2(sub eax, 2)
aj(ja scrypt_salsa_xop_loop)
a3(vpaddd xmm0,xmm0,[esp+0])
a3(vpaddd xmm1,xmm1,[esp+16])
a3(vpaddd xmm2,xmm2,xmm6)
a3(vpaddd xmm3,xmm3,xmm7)
a2(lea eax,[ebx+ecx])
a2(xor ebx,edx)
a2(and eax,~0x7f)
a2(add ecx,64)
a2(shr eax,1)
a2(add eax, edi)
a2(cmp ecx,edx)
a2(vmovdqa [eax+0],xmm0)
a2(vmovdqa [eax+16],xmm1)
a2(vmovdqa [eax+32],xmm2)
a2(vmovdqa [eax+48],xmm3)
a2(mov eax,[ebp+28])
aj(jne scrypt_ChunkMix_xop_loop)
a2(mov esp,ebp)
a1(pop ebp)
a1(pop esi)
a1(pop edi)
a1(pop ebx)
aret(16)
asm_naked_fn_end(scrypt_ChunkMix_xop)
#endif
/* x64 */
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
#define SCRYPT_SALSA_XOP
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
a2(shl rcx,6)
a2(lea r9,[rcx-64])
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
a2(vmovdqa xmm0,[rax+0])
a2(vmovdqa xmm1,[rax+16])
a2(vmovdqa xmm2,[rax+32])
a2(vmovdqa xmm3,[rax+48])
aj(jz scrypt_ChunkMix_xop_no_xor1)
a3(vpxor xmm0,xmm0,[r9+0])
a3(vpxor xmm1,xmm1,[r9+16])
a3(vpxor xmm2,xmm2,[r9+32])
a3(vpxor xmm3,xmm3,[r9+48])
a1(scrypt_ChunkMix_xop_no_xor1:)
a2(xor r9,r9)
a2(xor r8,r8)
a1(scrypt_ChunkMix_xop_loop:)
a2(and rdx, rdx)
a3(vpxor xmm0,xmm0,[rsi+r9+0])
a3(vpxor xmm1,xmm1,[rsi+r9+16])
a3(vpxor xmm2,xmm2,[rsi+r9+32])
a3(vpxor xmm3,xmm3,[rsi+r9+48])
aj(jz scrypt_ChunkMix_xop_no_xor2)
a3(vpxor xmm0,xmm0,[rdx+r9+0])
a3(vpxor xmm1,xmm1,[rdx+r9+16])
a3(vpxor xmm2,xmm2,[rdx+r9+32])
a3(vpxor xmm3,xmm3,[rdx+r9+48])
a1(scrypt_ChunkMix_xop_no_xor2:)
a2(vmovdqa xmm8,xmm0)
a2(vmovdqa xmm9,xmm1)
a2(vmovdqa xmm10,xmm2)
a2(vmovdqa xmm11,xmm3)
a2(mov rax,8)
a1(scrypt_salsa_xop_loop: )
a3(vpaddd xmm4, xmm1, xmm0)
a3(vprotd xmm4, xmm4, 7)
a3(vpxor xmm3, xmm3, xmm4)
a3(vpaddd xmm4, xmm0, xmm3)
a3(vprotd xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm3, xmm2)
a3(vprotd xmm4, xmm4, 13)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm2, xmm1)
a3(pshufd xmm3, xmm3, 0x93)
a3(vprotd xmm4, xmm4, 18)
a3(pshufd xmm2, xmm2, 0x4e)
a3(vpxor xmm0, xmm0, xmm4)
a3(pshufd xmm1, xmm1, 0x39)
a3(vpaddd xmm4, xmm3, xmm0)
a3(vprotd xmm4, xmm4, 7)
a3(vpxor xmm1, xmm1, xmm4)
a3(vpaddd xmm4, xmm0, xmm1)
a3(vprotd xmm4, xmm4, 9)
a3(vpxor xmm2, xmm2, xmm4)
a3(vpaddd xmm4, xmm1, xmm2)
a3(vprotd xmm4, xmm4, 13)
a3(vpxor xmm3, xmm3, xmm4)
a3(pshufd xmm1, xmm1, 0x93)
a3(vpaddd xmm4, xmm2, xmm3)
a3(pshufd xmm2, xmm2, 0x4e)
a3(vprotd xmm4, xmm4, 18)
a3(pshufd xmm3, xmm3, 0x39)
a3(vpxor xmm0, xmm0, xmm4)
a2(sub rax, 2)
aj(ja scrypt_salsa_xop_loop)
a3(vpaddd xmm0,xmm0,xmm8)
a3(vpaddd xmm1,xmm1,xmm9)
a3(vpaddd xmm2,xmm2,xmm10)
a3(vpaddd xmm3,xmm3,xmm11)
a2(lea rax,[r8+r9])
a2(xor r8,rcx)
a2(and rax,~0x7f)
a2(add r9,64)
a2(shr rax,1)
a2(add rax, rdi)
a2(cmp r9,rcx)
a2(vmovdqa [rax+0],xmm0)
a2(vmovdqa [rax+16],xmm1)
a2(vmovdqa [rax+32],xmm2)
a2(vmovdqa [rax+48],xmm3)
aj(jne scrypt_ChunkMix_xop_loop)
a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_xop)
#endif
/* intrinsic */
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
#define SCRYPT_SALSA_XOP
static void asm_calling_convention NOINLINE
scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
uint32_t i, blocksPerChunk = r * 2, half = 0;
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
size_t rounds;
/* 1: X = B_{2r - 1} */
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
x0 = xmmp[0];
x1 = xmmp[1];
x2 = xmmp[2];
x3 = xmmp[3];
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
/* 3: X = H(X ^ B_i) */
xmmp = (xmmi *)scrypt_block(Bin, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
if (Bxor) {
xmmp = (xmmi *)scrypt_block(Bxor, i);
x0 = _mm_xor_si128(x0, xmmp[0]);
x1 = _mm_xor_si128(x1, xmmp[1]);
x2 = _mm_xor_si128(x2, xmmp[2]);
x3 = _mm_xor_si128(x3, xmmp[3]);
}
t0 = x0;
t1 = x1;
t2 = x2;
t3 = x3;
for (rounds = 8; rounds; rounds -= 2) {
x4 = _mm_add_epi32(x1, x0);
x4 = _mm_roti_epi32(x4, 7);
x3 = _mm_xor_si128(x3, x4);
x4 = _mm_add_epi32(x0, x3);
x4 = _mm_roti_epi32(x4, 9);
x2 = _mm_xor_si128(x2, x4);
x4 = _mm_add_epi32(x3, x2);
x4 = _mm_roti_epi32(x4, 13);
x1 = _mm_xor_si128(x1, x4);
x4 = _mm_add_epi32(x2, x1);
x4 = _mm_roti_epi32(x4, 18);
x0 = _mm_xor_si128(x0, x4);
x3 = _mm_shuffle_epi32(x3, 0x93);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x1 = _mm_shuffle_epi32(x1, 0x39);
x4 = _mm_add_epi32(x3, x0);
x4 = _mm_roti_epi32(x4, 7);
x1 = _mm_xor_si128(x1, x4);
x4 = _mm_add_epi32(x0, x1);
x4 = _mm_roti_epi32(x4, 9);
x2 = _mm_xor_si128(x2, x4);
x4 = _mm_add_epi32(x1, x2);
x4 = _mm_roti_epi32(x4, 13);
x3 = _mm_xor_si128(x3, x4);
x4 = _mm_add_epi32(x2, x3);
x4 = _mm_roti_epi32(x4, 18);
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_shuffle_epi32(x1, 0x93);
x2 = _mm_shuffle_epi32(x2, 0x4e);
x3 = _mm_shuffle_epi32(x3, 0x39);
}
x0 = _mm_add_epi32(x0, t0);
x1 = _mm_add_epi32(x1, t1);
x2 = _mm_add_epi32(x2, t2);
x3 = _mm_add_epi32(x3, t3);
/* 4: Y_i = X */
/* 6: B'[0..r-1] = Y_even */
/* 6: B'[r..2r-1] = Y_odd */
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
xmmp[0] = x0;
xmmp[1] = x1;
xmmp[2] = x2;
xmmp[3] = x3;
}
}
#endif
#if defined(SCRYPT_SALSA_XOP)
/* uses salsa_core_tangle_sse2 */
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa/8-XOP"
#undef SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_INCLUDED
#endif

View file

@@ -1,70 +0,0 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa20/8 Ref"
#undef SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_BASIC
static void
salsa_core_basic(uint32_t state[16]) {
size_t rounds = 8;
uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;
x0 = state[0];
x1 = state[1];
x2 = state[2];
x3 = state[3];
x4 = state[4];
x5 = state[5];
x6 = state[6];
x7 = state[7];
x8 = state[8];
x9 = state[9];
x10 = state[10];
x11 = state[11];
x12 = state[12];
x13 = state[13];
x14 = state[14];
x15 = state[15];
#define quarter(a,b,c,d) \
t = a+d; t = ROTL32(t, 7); b ^= t; \
t = b+a; t = ROTL32(t, 9); c ^= t; \
t = c+b; t = ROTL32(t, 13); d ^= t; \
t = a+d; t = ROTL32(t, 18); a ^= t;
for (; rounds; rounds -= 2) {
quarter( x0, x4, x8,x12)
quarter( x5, x9,x13, x1)
quarter(x10,x14, x2, x6)
quarter(x15, x3, x7,x11)
quarter( x0, x1, x2, x3)
quarter( x5, x6, x7, x4)
quarter(x10,x11, x8, x9)
quarter(x15,x12,x13,x14)
}
state[0] += x0;
state[1] += x1;
state[2] += x2;
state[3] += x3;
state[4] += x4;
state[5] += x5;
state[6] += x6;
state[7] += x7;
state[8] += x8;
state[9] += x9;
state[10] += x10;
state[11] += x11;
state[12] += x12;
state[13] += x13;
state[14] += x14;
state[15] += x15;
#undef quarter
}
#endif

View file

@@ -1,6 +1,4 @@
-#if defined(SCRYPT_SALSA)
-#include "scrypt-jane-salsa.h"
-#elif defined(SCRYPT_SALSA64)
+#ifdef SCRYPT_SALSA64
 #include "scrypt-jane-salsa64.h"
 #else
 #define SCRYPT_MIX_BASE "ERROR"

View file

@@ -1,134 +0,0 @@
#define SCRYPT_MIX_BASE "Salsa20/8"
typedef uint32_t scrypt_mix_word_t;
#define SCRYPT_WORDTO8_LE U32TO8_LE
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
#define SCRYPT_BLOCK_BYTES 64
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
/* must have these here in case block bytes is ever != 64 */
#include "scrypt-jane-romix-basic.h"
#include "scrypt-jane-mix_salsa-xop.h"
#include "scrypt-jane-mix_salsa-avx.h"
#include "scrypt-jane-mix_salsa-sse2.h"
#include "scrypt-jane-mix_salsa.h"
#if defined(SCRYPT_SALSA_XOP)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
#include "scrypt-jane-romix-template.h"
#endif
#if defined(SCRYPT_SALSA_AVX)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
#include "scrypt-jane-romix-template.h"
#endif
#if defined(SCRYPT_SALSA_SSE2)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
#define SCRYPT_MIX_FN salsa_core_sse2
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
#include "scrypt-jane-romix-template.h"
#endif
/* cpu agnostic */
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
#define SCRYPT_MIX_FN salsa_core_basic
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
#include "scrypt-jane-romix-template.h"
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
static scrypt_ROMixfn
scrypt_getROMix(void) {
size_t cpuflags = detect_cpu();
#if defined(SCRYPT_SALSA_XOP)
if (cpuflags & cpu_xop)
return scrypt_ROMix_xop;
else
#endif
#if defined(SCRYPT_SALSA_AVX)
if (cpuflags & cpu_avx)
return scrypt_ROMix_avx;
else
#endif
#if defined(SCRYPT_SALSA_SSE2)
if (cpuflags & cpu_sse2)
return scrypt_ROMix_sse2;
else
#endif
return scrypt_ROMix_basic;
}
#endif
#if defined(SCRYPT_TEST_SPEED)
static size_t
available_implementations(void) {
size_t cpuflags = detect_cpu();
size_t flags = 0;
#if defined(SCRYPT_SALSA_XOP)
if (cpuflags & cpu_xop)
flags |= cpu_xop;
#endif
#if defined(SCRYPT_SALSA_AVX)
if (cpuflags & cpu_avx)
flags |= cpu_avx;
#endif
#if defined(SCRYPT_SALSA_SSE2)
if (cpuflags & cpu_sse2)
flags |= cpu_sse2;
#endif
return flags;
}
#endif
static int
scrypt_test_mix(void) {
static const uint8_t expected[16] = {
0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
};
int ret = 1;
size_t cpuflags = detect_cpu();
#if defined(SCRYPT_SALSA_XOP)
if (cpuflags & cpu_xop)
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
#endif
#if defined(SCRYPT_SALSA_AVX)
if (cpuflags & cpu_avx)
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
#endif
#if defined(SCRYPT_SALSA_SSE2)
if (cpuflags & cpu_sse2)
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
#endif
#if defined(SCRYPT_SALSA_BASIC)
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif
return ret;
}

View file

@@ -10,18 +10,7 @@ static const scrypt_test_setting post_settings[] = {
 };
 #if defined(SCRYPT_SKEIN512)
-#if defined(SCRYPT_SALSA)
-static const uint8_t post_vectors[][64] = {
-	{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
-	 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
-	 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
-	 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
-	{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
-	 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
-	 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
-	 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
-};
-#elif defined(SCRYPT_SALSA64)
+#ifdef SCRYPT_SALSA64
 static const uint8_t post_vectors[][64] = {
 	{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
 	 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,

View file

@@ -1,36 +0,0 @@
#include "thread.h"
#if defined(_WIN32)
#include <Windows.h>
#endif
int argon2_thread_create(argon2_thread_handle_t *handle,
argon2_thread_func_t func, void *args) {
if (NULL == handle || func == NULL) {
return -1;
}
#if defined(_WIN32)
*handle = _beginthreadex(NULL, 0, func, args, 0, NULL);
return *handle != 0 ? 0 : -1;
#else
return pthread_create(handle, NULL, func, args);
#endif
}
int argon2_thread_join(argon2_thread_handle_t handle) {
#if defined(_WIN32)
if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) {
return CloseHandle((HANDLE)handle) != 0 ? 0 : -1;
}
return -1;
#else
return pthread_join(handle, NULL);
#endif
}
void argon2_thread_exit(void) {
#if defined(_WIN32)
_endthreadex(0);
#else
pthread_exit(NULL);
#endif
}

View file

@@ -1,46 +0,0 @@
#ifndef ARGON2_THREAD_H
#define ARGON2_THREAD_H
/*
Here we implement an abstraction layer for the simple requirements
of the Argon2 code. We only require 3 primitives---thread creation,
joining, and termination---so full emulation of the pthreads API
is unwarranted. Currently we wrap pthreads and Win32 threads.
The API defines 2 types: the function pointer type,
argon2_thread_func_t,
and the type of the thread handle---argon2_thread_handle_t.
*/
#if defined(_WIN32)
#include <process.h>
typedef unsigned(__stdcall *argon2_thread_func_t)(void *);
typedef uintptr_t argon2_thread_handle_t;
#else
#include <pthread.h>
typedef void *(*argon2_thread_func_t)(void *);
typedef pthread_t argon2_thread_handle_t;
#endif
/* Creates a thread
* @param handle pointer to a thread handle, which is the output of this
* function. Must not be NULL.
* @param func A function pointer for the thread's entry point. Must not be
* NULL.
* @param args Pointer that is passed as an argument to @func. May be NULL.
* @return 0 if @handle and @func are valid pointers and a thread is successfully
* created.
*/
int argon2_thread_create(argon2_thread_handle_t *handle,
argon2_thread_func_t func, void *args);
/* Waits for a thread to terminate
* @param handle Handle to a thread created with argon2_thread_create.
* @return 0 if @handle is a valid handle, and joining completed successfully.
*/
int argon2_thread_join(argon2_thread_handle_t handle);
/* Terminate the current thread. Must be run inside a thread created by
* argon2_thread_create.
*/
void argon2_thread_exit(void);
#endif
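
Since the removed thread.h above is self-contained, its usage pattern is easy to reconstruct. A minimal sketch follows; worker, worker_args_t and run_two_lanes are illustrative names rather than part of the API, and only the argon2_thread_* declarations come from the header:

#include "thread.h"

typedef struct { int lane; } worker_args_t;   /* hypothetical per-thread payload */

#ifdef _WIN32
static unsigned __stdcall worker(void *arg)   /* Win32 entry-point signature */
#else
static void *worker(void *arg)                /* pthreads entry-point signature */
#endif
{
    worker_args_t *wa = (worker_args_t *)arg;
    (void)wa;                                 /* ... process one lane here ... */
    return 0;
}

static int run_two_lanes(void)
{
    argon2_thread_handle_t t[2];
    worker_args_t args[2] = { {0}, {1} };
    int i;
    /* spawn one thread per lane, then wait for both to finish */
    for (i = 0; i < 2; i++)
        if (argon2_thread_create(&t[i], worker, &args[i]) != 0)
            return -1;
    for (i = 0; i < 2; i++)
        if (argon2_thread_join(t[i]) != 0)
            return -1;
    return 0;
}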

View file

@@ -7,7 +7,7 @@
 #include "ar2/argon2.h"
 #include "ar2/cores.h"
-#include "ar2/scrypt-jane.h"
+#include "ar2/ar2-scrypt-jane.h"
 #define _ALIGN(x) __attribute__ ((aligned(x)))
@@ -16,7 +16,7 @@
 #define MASK 8
 #define ZERO 0
-static void argon_call(void *out, void *in, void *salt, int type)
+inline void argon_call(void *out, void *in, void *salt, int type)
 {
 	argon2_context context = { 0 };
@@ -27,50 +27,18 @@ static void argon_call(void *out, void *in, void *salt, int type)
 	argon2_core(&context, type);
 }
-static void bin2hex(char *s, const unsigned char *p, size_t len)
-{
-	for (size_t i = 0; i < len; i++)
-		sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
-}
-static char *abin2hex(const unsigned char *p, size_t len)
-{
-	char *s = (char*) malloc((len * 2) + 1);
-	if (!s)
-		return NULL;
-	bin2hex(s, p, len);
-	return s;
-}
-static void applog_data(void *pdata)
-{
-	char* hex = abin2hex((unsigned char*)pdata, 80);
-	fprintf(stderr, "%s\n", hex);
-	free(hex);
-}
 void argon2_hash(const char* input, char* output, uint32_t len)
 {
-	// these uint512 in the c++ source of the client are backed by an array of uint32
-	uint32_t _ALIGN(32) hashA[8], hashB[8], hashC[8];
-	uint32_t _ALIGN(32) endian[20], *in;
-	in = (uint32_t*) input;
-	for (int i=0; i<len/4; i++)
-		endian[i] = in[i];
-		// be32enc(&endian[i], in[i]);
-	//applog_data((void*) endian);
-	my_scrypt((unsigned char *)endian, len,
-	          (unsigned char *)endian, len,
+	uint32_t _ALIGN(32) hashA[8], hashB[8];
+	my_scrypt((unsigned char *)input, len,
+	          (unsigned char *)input, len,
 	          (unsigned char *)hashA);
 	argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);
 	my_scrypt((const unsigned char *)hashB, 32,
 	          (const unsigned char *)hashB, 32,
-	          (unsigned char *)hashC);
-	memcpy(output, hashC, 32);
+	          (unsigned char *)output);
 }
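
Reassembled from the + lines above for readability, argon2_hash after this commit is a plain three-step pipeline; the step comments here are added annotations, everything else comes straight from the diff:

void argon2_hash(const char* input, char* output, uint32_t len)
{
	uint32_t _ALIGN(32) hashA[8], hashB[8];

	/* 1) scrypt-jane over the raw input (the 80-byte block header), salted with itself */
	my_scrypt((unsigned char *)input, len,
	          (unsigned char *)input, len,
	          (unsigned char *)hashA);

	/* 2) bit 3 of the first hash word (MASK == 8) selects the Argon2 variant */
	argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);

	/* 3) scrypt-jane again, writing the 32-byte result directly into output */
	my_scrypt((const unsigned char *)hashB, 32,
	          (const unsigned char *)hashB, 32,
	          (unsigned char *)output);
}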

View file

@@ -12,7 +12,7 @@ SOURCES=lyra2re.c lyra2v2.c Lyra2.c Sponge.c blake.c scrypt.c c11.c x11.c x13.c
 	skein2.c zr5.c bmw.c luffa.c pentablake.c whirlpool.c whirlpoolx.c blakecoin.c \
 	yescrypt.c yescrypt-opt.c sha256_Y.c \
 	m7m.c magimath.cpp velvet.c \
-	argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/thread.c ar2/scrypt-jane.c \
+	argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/ar2-scrypt-jane.c \
 	hive.c pomelo.c \
 	sib.c gost.c