argon2: clean stratum code

remove dead code detected after cpuminer implementation
2025-09-29 23:00:33 +00:00 · 2016-01-25 02:25:22 +01:00 · 2016-01-25 02:25:22 +01:00 · 9d1c34f894
commit 9d1c34f894
parent 52580a5636
13 changed files with 10 additions and 1482 deletions
--- a/stratum/algos/ar2/ar2-scrypt-jane.c
+++ b/stratum/algos/ar2/ar2-scrypt-jane.c
@@ -12,7 +12,7 @@ extern "C" {
 #endif
 #endif

-#include "scrypt-jane.h"
+#include "ar2-scrypt-jane.h"

 #include "sj/scrypt-jane-portable.h"
 #include "sj/scrypt-jane-hash.h"
--- a/stratum/algos/ar2/ar2-scrypt-jane.h
+++ b/stratum/algos/ar2/ar2-scrypt-jane.h
--- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-avx.h
@@ -1,381 +0,0 @@
-/* x86 */
-#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_AVX
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_avx)
-	a1(push ebx)
-	a1(push edi)
-	a1(push esi)
-	a1(push ebp)
-	a2(mov ebp,esp)
-	a2(mov edi,[ebp+20])
-	a2(mov esi,[ebp+24])
-	a2(mov eax,[ebp+28])
-	a2(mov ebx,[ebp+32])
-	a2(sub esp,32)
-	a2(and esp,~63)
-	a2(lea edx,[ebx*2])
-	a2(shl edx,6)
-	a2(lea ecx,[edx-64])
-	a2(and eax, eax)
-	a2(movdqa xmm0,[ecx+esi+0])
-	a2(movdqa xmm1,[ecx+esi+16])
-	a2(movdqa xmm2,[ecx+esi+32])
-	a2(movdqa xmm3,[ecx+esi+48])
-	aj(jz scrypt_ChunkMix_avx_no_xor1)
-	a3(vpxor xmm0,xmm0,[ecx+eax+0])
-	a3(vpxor xmm1,xmm1,[ecx+eax+16])
-	a3(vpxor xmm2,xmm2,[ecx+eax+32])
-	a3(vpxor xmm3,xmm3,[ecx+eax+48])
-	a1(scrypt_ChunkMix_avx_no_xor1:)
-	a2(xor ecx,ecx)
-	a2(xor ebx,ebx)
-	a1(scrypt_ChunkMix_avx_loop:)
-		a2(and eax, eax)
-		a3(vpxor xmm0,xmm0,[esi+ecx+0])
-		a3(vpxor xmm1,xmm1,[esi+ecx+16])
-		a3(vpxor xmm2,xmm2,[esi+ecx+32])
-		a3(vpxor xmm3,xmm3,[esi+ecx+48])
-		aj(jz scrypt_ChunkMix_avx_no_xor2)
-		a3(vpxor xmm0,xmm0,[eax+ecx+0])
-		a3(vpxor xmm1,xmm1,[eax+ecx+16])
-		a3(vpxor xmm2,xmm2,[eax+ecx+32])
-		a3(vpxor xmm3,xmm3,[eax+ecx+48])
-		a1(scrypt_ChunkMix_avx_no_xor2:)
-		a2(vmovdqa [esp+0],xmm0)
-		a2(vmovdqa [esp+16],xmm1)
-		a2(vmovdqa xmm6,xmm2)
-		a2(vmovdqa xmm7,xmm3)
-		a2(mov eax,8)
-		a1(scrypt_salsa_avx_loop: )
-			a3(vpaddd xmm4, xmm1, xmm0)
-			a3(vpsrld xmm5, xmm4, 25)
-			a3(vpslld xmm4, xmm4, 7)
-			a3(vpxor xmm3, xmm3, xmm5)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm3)
-			a3(vpsrld xmm5, xmm4, 23)
-			a3(vpslld xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm5)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm2)
-			a3(vpsrld xmm5, xmm4, 19)
-			a3(vpslld xmm4, xmm4, 13)
-			a3(vpxor xmm1, xmm1, xmm5)
-			a3(vpshufd xmm3, xmm3, 0x93)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm1)
-			a3(vpsrld xmm5, xmm4, 14)
-			a3(vpslld xmm4, xmm4, 18)
-			a3(vpxor xmm0, xmm0, xmm5)
-			a3(vpshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm0)
-			a3(vpshufd xmm1, xmm1, 0x39)
-			a3(vpsrld xmm5, xmm4, 25)
-			a3(vpslld xmm4, xmm4, 7)
-			a3(vpxor xmm1, xmm1, xmm5)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm1)
-			a3(vpsrld xmm5, xmm4, 23)
-			a3(vpslld xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm5)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm1, xmm2)
-			a3(vpsrld xmm5, xmm4, 19)
-			a3(vpslld xmm4, xmm4, 13)
-			a3(vpxor xmm3, xmm3, xmm5)
-			a3(vpshufd xmm1, xmm1, 0x93)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm3)
-			a3(vpsrld xmm5, xmm4, 14)
-			a3(vpslld xmm4, xmm4, 18)
-			a3(vpxor xmm0, xmm0, xmm5)
-			a3(vpshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(vpshufd xmm3, xmm3, 0x39)
-			a2(sub eax, 2)
-			aj(ja scrypt_salsa_avx_loop)
-		a3(vpaddd xmm0,xmm0,[esp+0])
-		a3(vpaddd xmm1,xmm1,[esp+16])
-		a3(vpaddd xmm2,xmm2,xmm6)
-		a3(vpaddd xmm3,xmm3,xmm7)
-		a2(lea eax,[ebx+ecx])
-		a2(xor ebx,edx)
-		a2(and eax,~0x7f)
-		a2(add ecx,64)
-		a2(shr eax,1)
-		a2(add eax, edi)
-		a2(cmp ecx,edx)
-		a2(vmovdqa [eax+0],xmm0)
-		a2(vmovdqa [eax+16],xmm1)
-		a2(vmovdqa [eax+32],xmm2)
-		a2(vmovdqa [eax+48],xmm3)
-		a2(mov eax,[ebp+28])
-		aj(jne scrypt_ChunkMix_avx_loop)
-	a2(mov esp,ebp)
-	a1(pop ebp)
-	a1(pop esi)
-	a1(pop edi)
-	a1(pop ebx)
-	aret(16)
-asm_naked_fn_end(scrypt_ChunkMix_avx)
-
-#endif
-
-
-
-/* x64 */
-#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_AVX
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_avx)
-	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
-	a2(shl rcx,6)
-	a2(lea r9,[rcx-64])
-	a2(lea rax,[rsi+r9])
-	a2(lea r9,[rdx+r9])
-	a2(and rdx, rdx)
-	a2(vmovdqa xmm0,[rax+0])
-	a2(vmovdqa xmm1,[rax+16])
-	a2(vmovdqa xmm2,[rax+32])
-	a2(vmovdqa xmm3,[rax+48])
-	aj(jz scrypt_ChunkMix_avx_no_xor1)
-	a3(vpxor xmm0,xmm0,[r9+0])
-	a3(vpxor xmm1,xmm1,[r9+16])
-	a3(vpxor xmm2,xmm2,[r9+32])
-	a3(vpxor xmm3,xmm3,[r9+48])
-	a1(scrypt_ChunkMix_avx_no_xor1:)
-	a2(xor r9,r9)
-	a2(xor r8,r8)
-	a1(scrypt_ChunkMix_avx_loop:)
-		a2(and rdx, rdx)
-		a3(vpxor xmm0,xmm0,[rsi+r9+0])
-		a3(vpxor xmm1,xmm1,[rsi+r9+16])
-		a3(vpxor xmm2,xmm2,[rsi+r9+32])
-		a3(vpxor xmm3,xmm3,[rsi+r9+48])
-		aj(jz scrypt_ChunkMix_avx_no_xor2)
-		a3(vpxor xmm0,xmm0,[rdx+r9+0])
-		a3(vpxor xmm1,xmm1,[rdx+r9+16])
-		a3(vpxor xmm2,xmm2,[rdx+r9+32])
-		a3(vpxor xmm3,xmm3,[rdx+r9+48])
-		a1(scrypt_ChunkMix_avx_no_xor2:)
-		a2(vmovdqa xmm8,xmm0)
-		a2(vmovdqa xmm9,xmm1)
-		a2(vmovdqa xmm10,xmm2)
-		a2(vmovdqa xmm11,xmm3)
-		a2(mov rax,8)
-		a1(scrypt_salsa_avx_loop: )
-			a3(vpaddd xmm4, xmm1, xmm0)
-			a3(vpsrld xmm5, xmm4, 25)
-			a3(vpslld xmm4, xmm4, 7)
-			a3(vpxor xmm3, xmm3, xmm5)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm3)
-			a3(vpsrld xmm5, xmm4, 23)
-			a3(vpslld xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm5)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm2)
-			a3(vpsrld xmm5, xmm4, 19)
-			a3(vpslld xmm4, xmm4, 13)
-			a3(vpxor xmm1, xmm1, xmm5)
-			a3(vpshufd xmm3, xmm3, 0x93)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm1)
-			a3(vpsrld xmm5, xmm4, 14)
-			a3(vpslld xmm4, xmm4, 18)
-			a3(vpxor xmm0, xmm0, xmm5)
-			a3(vpshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm0)
-			a3(vpshufd xmm1, xmm1, 0x39)
-			a3(vpsrld xmm5, xmm4, 25)
-			a3(vpslld xmm4, xmm4, 7)
-			a3(vpxor xmm1, xmm1, xmm5)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm1)
-			a3(vpsrld xmm5, xmm4, 23)
-			a3(vpslld xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm5)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm1, xmm2)
-			a3(vpsrld xmm5, xmm4, 19)
-			a3(vpslld xmm4, xmm4, 13)
-			a3(vpxor xmm3, xmm3, xmm5)
-			a3(vpshufd xmm1, xmm1, 0x93)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm3)
-			a3(vpsrld xmm5, xmm4, 14)
-			a3(vpslld xmm4, xmm4, 18)
-			a3(vpxor xmm0, xmm0, xmm5)
-			a3(vpshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(vpshufd xmm3, xmm3, 0x39)
-			a2(sub rax, 2)
-			aj(ja scrypt_salsa_avx_loop)
-		a3(vpaddd xmm0,xmm0,xmm8)
-		a3(vpaddd xmm1,xmm1,xmm9)
-		a3(vpaddd xmm2,xmm2,xmm10)
-		a3(vpaddd xmm3,xmm3,xmm11)
-		a2(lea rax,[r8+r9])
-		a2(xor r8,rcx)
-		a2(and rax,~0x7f)
-		a2(add r9,64)
-		a2(shr rax,1)
-		a2(add rax, rdi)
-		a2(cmp r9,rcx)
-		a2(vmovdqa [rax+0],xmm0)
-		a2(vmovdqa [rax+16],xmm1)
-		a2(vmovdqa [rax+32],xmm2)
-		a2(vmovdqa [rax+48],xmm3)
-		aj(jne scrypt_ChunkMix_avx_loop)
-	a1(ret)
-asm_naked_fn_end(scrypt_ChunkMix_avx)
-
-#endif
-
-
-/* intrinsic */
-#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
-
-#define SCRYPT_SALSA_AVX
-
-static void asm_calling_convention NOINLINE
-scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
-	uint32_t i, blocksPerChunk = r * 2, half = 0;
-	xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
-	size_t rounds;
-
-	/* 1: X = B_{2r - 1} */
-	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
-	x0 = xmmp[0];
-	x1 = xmmp[1];
-	x2 = xmmp[2];
-	x3 = xmmp[3];
-
-	if (Bxor) {
-		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-	}
-
-	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
-		/* 3: X = H(X ^ B_i) */
-		xmmp = (xmmi *)scrypt_block(Bin, i);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-
-		if (Bxor) {
-			xmmp = (xmmi *)scrypt_block(Bxor, i);
-			x0 = _mm_xor_si128(x0, xmmp[0]);
-			x1 = _mm_xor_si128(x1, xmmp[1]);
-			x2 = _mm_xor_si128(x2, xmmp[2]);
-			x3 = _mm_xor_si128(x3, xmmp[3]);
-		}
-
-		t0 = x0;
-		t1 = x1;
-		t2 = x2;
-		t3 = x3;
-
-		for (rounds = 8; rounds; rounds -= 2) {
-			x4 = x1;
-			x4 = _mm_add_epi32(x4, x0);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 7);
-			x5 = _mm_srli_epi32(x5, 25);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = x0;
-			x3 = _mm_xor_si128(x3, x5);
-			x4 = _mm_add_epi32(x4, x3);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 9);
-			x5 = _mm_srli_epi32(x5, 23);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = x3;
-			x2 = _mm_xor_si128(x2, x5);
-			x3 = _mm_shuffle_epi32(x3, 0x93);
-			x4 = _mm_add_epi32(x4, x2);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 13);
-			x5 = _mm_srli_epi32(x5, 19);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = x2;
-			x1 = _mm_xor_si128(x1, x5);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x4 = _mm_add_epi32(x4, x1);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 18);
-			x5 = _mm_srli_epi32(x5, 14);
-			x0 = _mm_xor_si128(x0, x4);
-			x4 = x3;
-			x0 = _mm_xor_si128(x0, x5);
-			x1 = _mm_shuffle_epi32(x1, 0x39);
-			x4 = _mm_add_epi32(x4, x0);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 7);
-			x5 = _mm_srli_epi32(x5, 25);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = x0;
-			x1 = _mm_xor_si128(x1, x5);
-			x4 = _mm_add_epi32(x4, x1);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 9);
-			x5 = _mm_srli_epi32(x5, 23);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = x1;
-			x2 = _mm_xor_si128(x2, x5);
-			x1 = _mm_shuffle_epi32(x1, 0x93);
-			x4 = _mm_add_epi32(x4, x2);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 13);
-			x5 = _mm_srli_epi32(x5, 19);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = x2;
-			x3 = _mm_xor_si128(x3, x5);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x4 = _mm_add_epi32(x4, x3);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 18);
-			x5 = _mm_srli_epi32(x5, 14);
-			x0 = _mm_xor_si128(x0, x4);
-			x3 = _mm_shuffle_epi32(x3, 0x39);
-			x0 = _mm_xor_si128(x0, x5);
-		}
-
-		x0 = _mm_add_epi32(x0, t0);
-		x1 = _mm_add_epi32(x1, t1);
-		x2 = _mm_add_epi32(x2, t2);
-		x3 = _mm_add_epi32(x3, t3);
-
-		/* 4: Y_i = X */
-		/* 6: B'[0..r-1] = Y_even */
-		/* 6: B'[r..2r-1] = Y_odd */
-		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
-		xmmp[0] = x0;
-		xmmp[1] = x1;
-		xmmp[2] = x2;
-		xmmp[3] = x3;
-	}
-}
-
-#endif
-
-#if defined(SCRYPT_SALSA_AVX)
-	/* uses salsa_core_tangle_sse2 */
-
-	#undef SCRYPT_MIX
-	#define SCRYPT_MIX "Salsa/8-AVX"
-	#undef SCRYPT_SALSA_INCLUDED
-	#define SCRYPT_SALSA_INCLUDED
-#endif
--- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-sse2.h
@@ -1,443 +0,0 @@
-/* x86 */
-#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_SSE2
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_sse2)
-	a1(push ebx)
-	a1(push edi)
-	a1(push esi)
-	a1(push ebp)
-	a2(mov ebp,esp)
-	a2(mov edi,[ebp+20])
-	a2(mov esi,[ebp+24])
-	a2(mov eax,[ebp+28])
-	a2(mov ebx,[ebp+32])
-	a2(sub esp,32)
-	a2(and esp,~63)
-	a2(lea edx,[ebx*2])
-	a2(shl edx,6)
-	a2(lea ecx,[edx-64])
-	a2(and eax, eax)
-	a2(movdqa xmm0,[ecx+esi+0])
-	a2(movdqa xmm1,[ecx+esi+16])
-	a2(movdqa xmm2,[ecx+esi+32])
-	a2(movdqa xmm3,[ecx+esi+48])
-	aj(jz scrypt_ChunkMix_sse2_no_xor1)
-	a2(pxor xmm0,[ecx+eax+0])
-	a2(pxor xmm1,[ecx+eax+16])
-	a2(pxor xmm2,[ecx+eax+32])
-	a2(pxor xmm3,[ecx+eax+48])
-	a1(scrypt_ChunkMix_sse2_no_xor1:)
-	a2(xor ecx,ecx)
-	a2(xor ebx,ebx)
-	a1(scrypt_ChunkMix_sse2_loop:)
-		a2(and eax, eax)
-		a2(pxor xmm0,[esi+ecx+0])
-		a2(pxor xmm1,[esi+ecx+16])
-		a2(pxor xmm2,[esi+ecx+32])
-		a2(pxor xmm3,[esi+ecx+48])
-		aj(jz scrypt_ChunkMix_sse2_no_xor2)
-		a2(pxor xmm0,[eax+ecx+0])
-		a2(pxor xmm1,[eax+ecx+16])
-		a2(pxor xmm2,[eax+ecx+32])
-		a2(pxor xmm3,[eax+ecx+48])
-		a1(scrypt_ChunkMix_sse2_no_xor2:)
-		a2(movdqa [esp+0],xmm0)
-		a2(movdqa [esp+16],xmm1)
-		a2(movdqa xmm6,xmm2)
-		a2(movdqa xmm7,xmm3)
-		a2(mov eax,8)
-		a1(scrypt_salsa_sse2_loop: )
-			a2(movdqa xmm4, xmm1)
-			a2(paddd xmm4, xmm0)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 7)
-			a2(psrld xmm5, 25)
-			a2(pxor xmm3, xmm4)
-			a2(movdqa xmm4, xmm0)
-			a2(pxor xmm3, xmm5)
-			a2(paddd xmm4, xmm3)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 9)
-			a2(psrld xmm5, 23)
-			a2(pxor xmm2, xmm4)
-			a2(movdqa xmm4, xmm3)
-			a2(pxor xmm2, xmm5)
-			a3(pshufd xmm3, xmm3, 0x93)
-			a2(paddd xmm4, xmm2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 13)
-			a2(psrld xmm5, 19)
-			a2(pxor xmm1, xmm4)
-			a2(movdqa xmm4, xmm2)
-			a2(pxor xmm1, xmm5)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a2(paddd xmm4, xmm1)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 18)
-			a2(psrld xmm5, 14)
-			a2(pxor xmm0, xmm4)
-			a2(movdqa xmm4, xmm3)
-			a2(pxor xmm0, xmm5)
-			a3(pshufd xmm1, xmm1, 0x39)
-			a2(paddd xmm4, xmm0)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 7)
-			a2(psrld xmm5, 25)
-			a2(pxor xmm1, xmm4)
-			a2(movdqa xmm4, xmm0)
-			a2(pxor xmm1, xmm5)
-			a2(paddd xmm4, xmm1)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 9)
-			a2(psrld xmm5, 23)
-			a2(pxor xmm2, xmm4)
-			a2(movdqa xmm4, xmm1)
-			a2(pxor xmm2, xmm5)
-			a3(pshufd xmm1, xmm1, 0x93)
-			a2(paddd xmm4, xmm2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 13)
-			a2(psrld xmm5, 19)
-			a2(pxor xmm3, xmm4)
-			a2(movdqa xmm4, xmm2)
-			a2(pxor xmm3, xmm5)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a2(paddd xmm4, xmm3)
-			a2(sub eax, 2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 18)
-			a2(psrld xmm5, 14)
-			a2(pxor xmm0, xmm4)
-			a3(pshufd xmm3, xmm3, 0x39)
-			a2(pxor xmm0, xmm5)
-			aj(ja scrypt_salsa_sse2_loop)
-		a2(paddd xmm0,[esp+0])
-		a2(paddd xmm1,[esp+16])
-		a2(paddd xmm2,xmm6)
-		a2(paddd xmm3,xmm7)
-		a2(lea eax,[ebx+ecx])
-		a2(xor ebx,edx)
-		a2(and eax,~0x7f)
-		a2(add ecx,64)
-		a2(shr eax,1)
-		a2(add eax, edi)
-		a2(cmp ecx,edx)
-		a2(movdqa [eax+0],xmm0)
-		a2(movdqa [eax+16],xmm1)
-		a2(movdqa [eax+32],xmm2)
-		a2(movdqa [eax+48],xmm3)
-		a2(mov eax,[ebp+28])
-		aj(jne scrypt_ChunkMix_sse2_loop)
-	a2(mov esp,ebp)
-	a1(pop ebp)
-	a1(pop esi)
-	a1(pop edi)
-	a1(pop ebx)
-	aret(16)
-asm_naked_fn_end(scrypt_ChunkMix_sse2)
-
-#endif
-
-
-
-/* x64 */
-#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_SSE2
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_sse2)
-	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
-	a2(shl rcx,6)
-	a2(lea r9,[rcx-64])
-	a2(lea rax,[rsi+r9])
-	a2(lea r9,[rdx+r9])
-	a2(and rdx, rdx)
-	a2(movdqa xmm0,[rax+0])
-	a2(movdqa xmm1,[rax+16])
-	a2(movdqa xmm2,[rax+32])
-	a2(movdqa xmm3,[rax+48])
-	aj(jz scrypt_ChunkMix_sse2_no_xor1)
-	a2(pxor xmm0,[r9+0])
-	a2(pxor xmm1,[r9+16])
-	a2(pxor xmm2,[r9+32])
-	a2(pxor xmm3,[r9+48])
-	a1(scrypt_ChunkMix_sse2_no_xor1:)
-	a2(xor r9,r9)
-	a2(xor r8,r8)
-	a1(scrypt_ChunkMix_sse2_loop:)
-		a2(and rdx, rdx)
-		a2(pxor xmm0,[rsi+r9+0])
-		a2(pxor xmm1,[rsi+r9+16])
-		a2(pxor xmm2,[rsi+r9+32])
-		a2(pxor xmm3,[rsi+r9+48])
-		aj(jz scrypt_ChunkMix_sse2_no_xor2)
-		a2(pxor xmm0,[rdx+r9+0])
-		a2(pxor xmm1,[rdx+r9+16])
-		a2(pxor xmm2,[rdx+r9+32])
-		a2(pxor xmm3,[rdx+r9+48])
-		a1(scrypt_ChunkMix_sse2_no_xor2:)
-		a2(movdqa xmm8,xmm0)
-		a2(movdqa xmm9,xmm1)
-		a2(movdqa xmm10,xmm2)
-		a2(movdqa xmm11,xmm3)
-		a2(mov rax,8)
-		a1(scrypt_salsa_sse2_loop: )
-			a2(movdqa xmm4, xmm1)
-			a2(paddd xmm4, xmm0)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 7)
-			a2(psrld xmm5, 25)
-			a2(pxor xmm3, xmm4)
-			a2(movdqa xmm4, xmm0)
-			a2(pxor xmm3, xmm5)
-			a2(paddd xmm4, xmm3)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 9)
-			a2(psrld xmm5, 23)
-			a2(pxor xmm2, xmm4)
-			a2(movdqa xmm4, xmm3)
-			a2(pxor xmm2, xmm5)
-			a3(pshufd xmm3, xmm3, 0x93)
-			a2(paddd xmm4, xmm2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 13)
-			a2(psrld xmm5, 19)
-			a2(pxor xmm1, xmm4)
-			a2(movdqa xmm4, xmm2)
-			a2(pxor xmm1, xmm5)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a2(paddd xmm4, xmm1)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 18)
-			a2(psrld xmm5, 14)
-			a2(pxor xmm0, xmm4)
-			a2(movdqa xmm4, xmm3)
-			a2(pxor xmm0, xmm5)
-			a3(pshufd xmm1, xmm1, 0x39)
-			a2(paddd xmm4, xmm0)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 7)
-			a2(psrld xmm5, 25)
-			a2(pxor xmm1, xmm4)
-			a2(movdqa xmm4, xmm0)
-			a2(pxor xmm1, xmm5)
-			a2(paddd xmm4, xmm1)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 9)
-			a2(psrld xmm5, 23)
-			a2(pxor xmm2, xmm4)
-			a2(movdqa xmm4, xmm1)
-			a2(pxor xmm2, xmm5)
-			a3(pshufd xmm1, xmm1, 0x93)
-			a2(paddd xmm4, xmm2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 13)
-			a2(psrld xmm5, 19)
-			a2(pxor xmm3, xmm4)
-			a2(movdqa xmm4, xmm2)
-			a2(pxor xmm3, xmm5)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a2(paddd xmm4, xmm3)
-			a2(sub rax, 2)
-			a2(movdqa xmm5, xmm4)
-			a2(pslld xmm4, 18)
-			a2(psrld xmm5, 14)
-			a2(pxor xmm0, xmm4)
-			a3(pshufd xmm3, xmm3, 0x39)
-			a2(pxor xmm0, xmm5)
-			aj(ja scrypt_salsa_sse2_loop)
-		a2(paddd xmm0,xmm8)
-		a2(paddd xmm1,xmm9)
-		a2(paddd xmm2,xmm10)
-		a2(paddd xmm3,xmm11)
-		a2(lea rax,[r8+r9])
-		a2(xor r8,rcx)
-		a2(and rax,~0x7f)
-		a2(add r9,64)
-		a2(shr rax,1)
-		a2(add rax, rdi)
-		a2(cmp r9,rcx)
-		a2(movdqa [rax+0],xmm0)
-		a2(movdqa [rax+16],xmm1)
-		a2(movdqa [rax+32],xmm2)
-		a2(movdqa [rax+48],xmm3)
-		aj(jne scrypt_ChunkMix_sse2_loop)
-	a1(ret)
-asm_naked_fn_end(scrypt_ChunkMix_sse2)
-
-#endif
-
-
-/* intrinsic */
-#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
-
-#define SCRYPT_SALSA_SSE2
-
-static void NOINLINE asm_calling_convention
-scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
-	uint32_t i, blocksPerChunk = r * 2, half = 0;
-	xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
-	size_t rounds;
-
-	/* 1: X = B_{2r - 1} */
-	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
-	x0 = xmmp[0];
-	x1 = xmmp[1];
-	x2 = xmmp[2];
-	x3 = xmmp[3];
-
-	if (Bxor) {
-		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-	}
-
-	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
-		/* 3: X = H(X ^ B_i) */
-		xmmp = (xmmi *)scrypt_block(Bin, i);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-
-		if (Bxor) {
-			xmmp = (xmmi *)scrypt_block(Bxor, i);
-			x0 = _mm_xor_si128(x0, xmmp[0]);
-			x1 = _mm_xor_si128(x1, xmmp[1]);
-			x2 = _mm_xor_si128(x2, xmmp[2]);
-			x3 = _mm_xor_si128(x3, xmmp[3]);
-		}
-
-		t0 = x0;
-		t1 = x1;
-		t2 = x2;
-		t3 = x3;
-
-		for (rounds = 8; rounds; rounds -= 2) {
-			x4 = x1;
-			x4 = _mm_add_epi32(x4, x0);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 7);
-			x5 = _mm_srli_epi32(x5, 25);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = x0;
-			x3 = _mm_xor_si128(x3, x5);
-			x4 = _mm_add_epi32(x4, x3);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 9);
-			x5 = _mm_srli_epi32(x5, 23);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = x3;
-			x2 = _mm_xor_si128(x2, x5);
-			x3 = _mm_shuffle_epi32(x3, 0x93);
-			x4 = _mm_add_epi32(x4, x2);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 13);
-			x5 = _mm_srli_epi32(x5, 19);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = x2;
-			x1 = _mm_xor_si128(x1, x5);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x4 = _mm_add_epi32(x4, x1);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 18);
-			x5 = _mm_srli_epi32(x5, 14);
-			x0 = _mm_xor_si128(x0, x4);
-			x4 = x3;
-			x0 = _mm_xor_si128(x0, x5);
-			x1 = _mm_shuffle_epi32(x1, 0x39);
-			x4 = _mm_add_epi32(x4, x0);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 7);
-			x5 = _mm_srli_epi32(x5, 25);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = x0;
-			x1 = _mm_xor_si128(x1, x5);
-			x4 = _mm_add_epi32(x4, x1);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 9);
-			x5 = _mm_srli_epi32(x5, 23);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = x1;
-			x2 = _mm_xor_si128(x2, x5);
-			x1 = _mm_shuffle_epi32(x1, 0x93);
-			x4 = _mm_add_epi32(x4, x2);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 13);
-			x5 = _mm_srli_epi32(x5, 19);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = x2;
-			x3 = _mm_xor_si128(x3, x5);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x4 = _mm_add_epi32(x4, x3);
-			x5 = x4;
-			x4 = _mm_slli_epi32(x4, 18);
-			x5 = _mm_srli_epi32(x5, 14);
-			x0 = _mm_xor_si128(x0, x4);
-			x3 = _mm_shuffle_epi32(x3, 0x39);
-			x0 = _mm_xor_si128(x0, x5);
-		}
-
-		x0 = _mm_add_epi32(x0, t0);
-		x1 = _mm_add_epi32(x1, t1);
-		x2 = _mm_add_epi32(x2, t2);
-		x3 = _mm_add_epi32(x3, t3);
-
-		/* 4: Y_i = X */
-		/* 6: B'[0..r-1] = Y_even */
-		/* 6: B'[r..2r-1] = Y_odd */
-		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
-		xmmp[0] = x0;
-		xmmp[1] = x1;
-		xmmp[2] = x2;
-		xmmp[3] = x3;
-	}
-}
-
-#endif
-
-#if defined(SCRYPT_SALSA_SSE2)
-	#undef SCRYPT_MIX
-	#define SCRYPT_MIX "Salsa/8-SSE2"
-	#undef SCRYPT_SALSA_INCLUDED
-	#define SCRYPT_SALSA_INCLUDED
-#endif
-
-/* used by avx,etc as well */
-#if defined(SCRYPT_SALSA_INCLUDED)
-	/*
-		Default layout:
-		 0  1  2  3
-		 4  5  6  7
-		 8  9 10 11
-		12 13 14 15
-
-		SSE2 layout:
-		 0  5 10 15
-		12  1  6 11
-		 8 13  2  7
-		 4  9 14  3
-	*/
-
-	static void asm_calling_convention
-	salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
-		uint32_t t;
-		while (count--) {
-			t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
-			t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
-			t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
-			t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
-			t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
-			t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
-			blocks += 16;
-		}
-	}
-#endif
-
--- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa-xop.h
@@ -1,317 +0,0 @@
-/* x86 */
-#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_XOP
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_xop)
-	a1(push ebx)
-	a1(push edi)
-	a1(push esi)
-	a1(push ebp)
-	a2(mov ebp,esp)
-	a2(mov edi,[ebp+20])
-	a2(mov esi,[ebp+24])
-	a2(mov eax,[ebp+28])
-	a2(mov ebx,[ebp+32])
-	a2(sub esp,32)
-	a2(and esp,~63)
-	a2(lea edx,[ebx*2])
-	a2(shl edx,6)
-	a2(lea ecx,[edx-64])
-	a2(and eax, eax)
-	a2(movdqa xmm0,[ecx+esi+0])
-	a2(movdqa xmm1,[ecx+esi+16])
-	a2(movdqa xmm2,[ecx+esi+32])
-	a2(movdqa xmm3,[ecx+esi+48])
-	aj(jz scrypt_ChunkMix_xop_no_xor1)
-	a3(vpxor xmm0,xmm0,[ecx+eax+0])
-	a3(vpxor xmm1,xmm1,[ecx+eax+16])
-	a3(vpxor xmm2,xmm2,[ecx+eax+32])
-	a3(vpxor xmm3,xmm3,[ecx+eax+48])
-	a1(scrypt_ChunkMix_xop_no_xor1:)
-	a2(xor ecx,ecx)
-	a2(xor ebx,ebx)
-	a1(scrypt_ChunkMix_xop_loop:)
-		a2(and eax, eax)
-		a3(vpxor xmm0,xmm0,[esi+ecx+0])
-		a3(vpxor xmm1,xmm1,[esi+ecx+16])
-		a3(vpxor xmm2,xmm2,[esi+ecx+32])
-		a3(vpxor xmm3,xmm3,[esi+ecx+48])
-		aj(jz scrypt_ChunkMix_xop_no_xor2)
-		a3(vpxor xmm0,xmm0,[eax+ecx+0])
-		a3(vpxor xmm1,xmm1,[eax+ecx+16])
-		a3(vpxor xmm2,xmm2,[eax+ecx+32])
-		a3(vpxor xmm3,xmm3,[eax+ecx+48])
-		a1(scrypt_ChunkMix_xop_no_xor2:)
-		a2(vmovdqa [esp+0],xmm0)
-		a2(vmovdqa [esp+16],xmm1)
-		a2(vmovdqa xmm6,xmm2)
-		a2(vmovdqa xmm7,xmm3)
-		a2(mov eax,8)
-		a1(scrypt_salsa_xop_loop: )
-			a3(vpaddd xmm4, xmm1, xmm0)
-			a3(vprotd xmm4, xmm4, 7)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm3)
-			a3(vprotd xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm2)
-			a3(vprotd xmm4, xmm4, 13)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm1)
-			a3(pshufd xmm3, xmm3, 0x93)
-			a3(vprotd xmm4, xmm4, 18)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(pshufd xmm1, xmm1, 0x39)
-			a3(vpaddd xmm4, xmm3, xmm0)
-			a3(vprotd xmm4, xmm4, 7)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm1)
-			a3(vprotd xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm1, xmm2)
-			a3(vprotd xmm4, xmm4, 13)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(pshufd xmm1, xmm1, 0x93)
-			a3(vpaddd xmm4, xmm2, xmm3)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a3(vprotd xmm4, xmm4, 18)
-			a3(pshufd xmm3, xmm3, 0x39)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a2(sub eax, 2)
-			aj(ja scrypt_salsa_xop_loop)
-		a3(vpaddd xmm0,xmm0,[esp+0])
-		a3(vpaddd xmm1,xmm1,[esp+16])
-		a3(vpaddd xmm2,xmm2,xmm6)
-		a3(vpaddd xmm3,xmm3,xmm7)
-		a2(lea eax,[ebx+ecx])
-		a2(xor ebx,edx)
-		a2(and eax,~0x7f)
-		a2(add ecx,64)
-		a2(shr eax,1)
-		a2(add eax, edi)
-		a2(cmp ecx,edx)
-		a2(vmovdqa [eax+0],xmm0)
-		a2(vmovdqa [eax+16],xmm1)
-		a2(vmovdqa [eax+32],xmm2)
-		a2(vmovdqa [eax+48],xmm3)
-		a2(mov eax,[ebp+28])
-		aj(jne scrypt_ChunkMix_xop_loop)
-	a2(mov esp,ebp)
-	a1(pop ebp)
-	a1(pop esi)
-	a1(pop edi)
-	a1(pop ebx)
-	aret(16)
-asm_naked_fn_end(scrypt_ChunkMix_xop)
-
-#endif
-
-
-
-/* x64 */
-#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
-
-#define SCRYPT_SALSA_XOP
-
-asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
-asm_naked_fn(scrypt_ChunkMix_xop)
-	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
-	a2(shl rcx,6)
-	a2(lea r9,[rcx-64])
-	a2(lea rax,[rsi+r9])
-	a2(lea r9,[rdx+r9])
-	a2(and rdx, rdx)
-	a2(vmovdqa xmm0,[rax+0])
-	a2(vmovdqa xmm1,[rax+16])
-	a2(vmovdqa xmm2,[rax+32])
-	a2(vmovdqa xmm3,[rax+48])
-	aj(jz scrypt_ChunkMix_xop_no_xor1)
-	a3(vpxor xmm0,xmm0,[r9+0])
-	a3(vpxor xmm1,xmm1,[r9+16])
-	a3(vpxor xmm2,xmm2,[r9+32])
-	a3(vpxor xmm3,xmm3,[r9+48])
-	a1(scrypt_ChunkMix_xop_no_xor1:)
-	a2(xor r9,r9)
-	a2(xor r8,r8)
-	a1(scrypt_ChunkMix_xop_loop:)
-		a2(and rdx, rdx)
-		a3(vpxor xmm0,xmm0,[rsi+r9+0])
-		a3(vpxor xmm1,xmm1,[rsi+r9+16])
-		a3(vpxor xmm2,xmm2,[rsi+r9+32])
-		a3(vpxor xmm3,xmm3,[rsi+r9+48])
-		aj(jz scrypt_ChunkMix_xop_no_xor2)
-		a3(vpxor xmm0,xmm0,[rdx+r9+0])
-		a3(vpxor xmm1,xmm1,[rdx+r9+16])
-		a3(vpxor xmm2,xmm2,[rdx+r9+32])
-		a3(vpxor xmm3,xmm3,[rdx+r9+48])
-		a1(scrypt_ChunkMix_xop_no_xor2:)
-		a2(vmovdqa xmm8,xmm0)
-		a2(vmovdqa xmm9,xmm1)
-		a2(vmovdqa xmm10,xmm2)
-		a2(vmovdqa xmm11,xmm3)
-		a2(mov rax,8)
-		a1(scrypt_salsa_xop_loop: )
-			a3(vpaddd xmm4, xmm1, xmm0)
-			a3(vprotd xmm4, xmm4, 7)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm3)
-			a3(vprotd xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm3, xmm2)
-			a3(vprotd xmm4, xmm4, 13)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm2, xmm1)
-			a3(pshufd xmm3, xmm3, 0x93)
-			a3(vprotd xmm4, xmm4, 18)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a3(pshufd xmm1, xmm1, 0x39)
-			a3(vpaddd xmm4, xmm3, xmm0)
-			a3(vprotd xmm4, xmm4, 7)
-			a3(vpxor xmm1, xmm1, xmm4)
-			a3(vpaddd xmm4, xmm0, xmm1)
-			a3(vprotd xmm4, xmm4, 9)
-			a3(vpxor xmm2, xmm2, xmm4)
-			a3(vpaddd xmm4, xmm1, xmm2)
-			a3(vprotd xmm4, xmm4, 13)
-			a3(vpxor xmm3, xmm3, xmm4)
-			a3(pshufd xmm1, xmm1, 0x93)
-			a3(vpaddd xmm4, xmm2, xmm3)
-			a3(pshufd xmm2, xmm2, 0x4e)
-			a3(vprotd xmm4, xmm4, 18)
-			a3(pshufd xmm3, xmm3, 0x39)
-			a3(vpxor xmm0, xmm0, xmm4)
-			a2(sub rax, 2)
-			aj(ja scrypt_salsa_xop_loop)
-		a3(vpaddd xmm0,xmm0,xmm8)
-		a3(vpaddd xmm1,xmm1,xmm9)
-		a3(vpaddd xmm2,xmm2,xmm10)
-		a3(vpaddd xmm3,xmm3,xmm11)
-		a2(lea rax,[r8+r9])
-		a2(xor r8,rcx)
-		a2(and rax,~0x7f)
-		a2(add r9,64)
-		a2(shr rax,1)
-		a2(add rax, rdi)
-		a2(cmp r9,rcx)
-		a2(vmovdqa [rax+0],xmm0)
-		a2(vmovdqa [rax+16],xmm1)
-		a2(vmovdqa [rax+32],xmm2)
-		a2(vmovdqa [rax+48],xmm3)
-		aj(jne scrypt_ChunkMix_xop_loop)
-	a1(ret)
-asm_naked_fn_end(scrypt_ChunkMix_xop)
-
-#endif
-
-
-/* intrinsic */
-#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
-
-#define SCRYPT_SALSA_XOP
-
-static void asm_calling_convention NOINLINE
-scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
-	uint32_t i, blocksPerChunk = r * 2, half = 0;
-	xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
-	size_t rounds;
-
-	/* 1: X = B_{2r - 1} */
-	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
-	x0 = xmmp[0];
-	x1 = xmmp[1];
-	x2 = xmmp[2];
-	x3 = xmmp[3];
-
-	if (Bxor) {
-		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-	}
-
-	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
-		/* 3: X = H(X ^ B_i) */
-		xmmp = (xmmi *)scrypt_block(Bin, i);
-		x0 = _mm_xor_si128(x0, xmmp[0]);
-		x1 = _mm_xor_si128(x1, xmmp[1]);
-		x2 = _mm_xor_si128(x2, xmmp[2]);
-		x3 = _mm_xor_si128(x3, xmmp[3]);
-
-		if (Bxor) {
-			xmmp = (xmmi *)scrypt_block(Bxor, i);
-			x0 = _mm_xor_si128(x0, xmmp[0]);
-			x1 = _mm_xor_si128(x1, xmmp[1]);
-			x2 = _mm_xor_si128(x2, xmmp[2]);
-			x3 = _mm_xor_si128(x3, xmmp[3]);
-		}
-
-		t0 = x0;
-		t1 = x1;
-		t2 = x2;
-		t3 = x3;
-
-		for (rounds = 8; rounds; rounds -= 2) {
-			x4 = _mm_add_epi32(x1, x0);
-			x4 = _mm_roti_epi32(x4, 7);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = _mm_add_epi32(x0, x3);
-			x4 = _mm_roti_epi32(x4, 9);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = _mm_add_epi32(x3, x2);
-			x4 = _mm_roti_epi32(x4, 13);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = _mm_add_epi32(x2, x1);
-			x4 = _mm_roti_epi32(x4, 18);
-			x0 = _mm_xor_si128(x0, x4);
-			x3 = _mm_shuffle_epi32(x3, 0x93);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x1 = _mm_shuffle_epi32(x1, 0x39);
-			x4 = _mm_add_epi32(x3, x0);
-			x4 = _mm_roti_epi32(x4, 7);
-			x1 = _mm_xor_si128(x1, x4);
-			x4 = _mm_add_epi32(x0, x1);
-			x4 = _mm_roti_epi32(x4, 9);
-			x2 = _mm_xor_si128(x2, x4);
-			x4 = _mm_add_epi32(x1, x2);
-			x4 = _mm_roti_epi32(x4, 13);
-			x3 = _mm_xor_si128(x3, x4);
-			x4 = _mm_add_epi32(x2, x3);
-			x4 = _mm_roti_epi32(x4, 18);
-			x0 = _mm_xor_si128(x0, x4);
-			x1 = _mm_shuffle_epi32(x1, 0x93);
-			x2 = _mm_shuffle_epi32(x2, 0x4e);
-			x3 = _mm_shuffle_epi32(x3, 0x39);
-		}
-
-		x0 = _mm_add_epi32(x0, t0);
-		x1 = _mm_add_epi32(x1, t1);
-		x2 = _mm_add_epi32(x2, t2);
-		x3 = _mm_add_epi32(x3, t3);
-
-		/* 4: Y_i = X */
-		/* 6: B'[0..r-1] = Y_even */
-		/* 6: B'[r..2r-1] = Y_odd */
-		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
-		xmmp[0] = x0;
-		xmmp[1] = x1;
-		xmmp[2] = x2;
-		xmmp[3] = x3;
-	}
-}
-
-#endif
-
-#if defined(SCRYPT_SALSA_XOP)
-	/* uses salsa_core_tangle_sse2 */
-
-	#undef SCRYPT_MIX
-	#define SCRYPT_MIX "Salsa/8-XOP"
-	#undef SCRYPT_SALSA_INCLUDED
-	#define SCRYPT_SALSA_INCLUDED
-#endif
--- a/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-mix_salsa.h
@ -1,70 +0,0 @@
-#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)
-
-#undef SCRYPT_MIX
-#define SCRYPT_MIX "Salsa20/8 Ref"
-
-#undef SCRYPT_SALSA_INCLUDED
-#define SCRYPT_SALSA_INCLUDED
-#define SCRYPT_SALSA_BASIC
-
-static void
-salsa_core_basic(uint32_t state[16]) {
-	size_t rounds = 8;
-	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;
-
-	x0 = state[0];
-	x1 = state[1];
-	x2 = state[2];
-	x3 = state[3];
-	x4 = state[4];
-	x5 = state[5];
-	x6 = state[6];
-	x7 = state[7];
-	x8 = state[8];
-	x9 = state[9];
-	x10 = state[10];
-	x11 = state[11];
-	x12 = state[12];
-	x13 = state[13];
-	x14 = state[14];
-	x15 = state[15];
-
-	#define quarter(a,b,c,d) \
-		t = a+d; t = ROTL32(t,  7); b ^= t; \
-		t = b+a; t = ROTL32(t,  9); c ^= t; \
-		t = c+b; t = ROTL32(t, 13); d ^= t; \
-		t = d+c; t = ROTL32(t, 18); a ^= t; \
-
-	for (; rounds; rounds -= 2) {
-		quarter( x0, x4, x8,x12)
-		quarter( x5, x9,x13, x1)
-		quarter(x10,x14, x2, x6)
-		quarter(x15, x3, x7,x11)
-		quarter( x0, x1, x2, x3)
-		quarter( x5, x6, x7, x4)
-		quarter(x10,x11, x8, x9)
-		quarter(x15,x12,x13,x14)
-	}
-
-	state[0] += x0;
-	state[1] += x1;
-	state[2] += x2;
-	state[3] += x3;
-	state[4] += x4;
-	state[5] += x5;
-	state[6] += x6;
-	state[7] += x7;
-	state[8] += x8;
-	state[9] += x9;
-	state[10] += x10;
-	state[11] += x11;
-	state[12] += x12;
-	state[13] += x13;
-	state[14] += x14;
-	state[15] += x15;
-
-	#undef quarter
-}
-
-#endif
-
--- a/stratum/algos/ar2/sj/scrypt-jane-romix.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-romix.h
@ -1,6 +1,4 @@
-#if defined(SCRYPT_SALSA)
-#include "scrypt-jane-salsa.h"
-#elif defined(SCRYPT_SALSA64)
+#ifdef SCRYPT_SALSA64
 #include "scrypt-jane-salsa64.h"
 #else
 	#define SCRYPT_MIX_BASE "ERROR"
--- a/stratum/algos/ar2/sj/scrypt-jane-salsa.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-salsa.h
@ -1,134 +0,0 @@
-#define SCRYPT_MIX_BASE "Salsa20/8"
-
-typedef uint32_t scrypt_mix_word_t;
-
-#define SCRYPT_WORDTO8_LE U32TO8_LE
-#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
-
-#define SCRYPT_BLOCK_BYTES 64
-#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
-
-/* must have these here in case block bytes is ever != 64 */
-#include "scrypt-jane-romix-basic.h"
-
-#include "scrypt-jane-mix_salsa-xop.h"
-#include "scrypt-jane-mix_salsa-avx.h"
-#include "scrypt-jane-mix_salsa-sse2.h"
-#include "scrypt-jane-mix_salsa.h"
-
-#if defined(SCRYPT_SALSA_XOP)
-	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
-	#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
-	#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
-	#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
-	#include "scrypt-jane-romix-template.h"
-#endif
-
-#if defined(SCRYPT_SALSA_AVX)
-	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
-	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
-	#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
-	#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
-	#include "scrypt-jane-romix-template.h"
-#endif
-
-#if defined(SCRYPT_SALSA_SSE2)
-	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
-	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
-	#define SCRYPT_MIX_FN salsa_core_sse2
-	#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
-	#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
-	#include "scrypt-jane-romix-template.h"
-#endif
-
-/* cpu agnostic */
-#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
-#define SCRYPT_MIX_FN salsa_core_basic
-#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
-#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
-#include "scrypt-jane-romix-template.h"
-
-#if !defined(SCRYPT_CHOOSE_COMPILETIME)
-static scrypt_ROMixfn
-scrypt_getROMix(void) {
-	size_t cpuflags = detect_cpu();
-
-#if defined(SCRYPT_SALSA_XOP)
-	if (cpuflags & cpu_xop)
-		return scrypt_ROMix_xop;
-	else
-#endif
-
-#if defined(SCRYPT_SALSA_AVX)
-	if (cpuflags & cpu_avx)
-		return scrypt_ROMix_avx;
-	else
-#endif
-
-#if defined(SCRYPT_SALSA_SSE2)
-	if (cpuflags & cpu_sse2)
-		return scrypt_ROMix_sse2;
-	else
-#endif
-
-	return scrypt_ROMix_basic;
-}
-#endif
-
-
-#if defined(SCRYPT_TEST_SPEED)
-static size_t
-available_implementations(void) {
-	size_t cpuflags = detect_cpu();
-	size_t flags = 0;
-
-#if defined(SCRYPT_SALSA_XOP)
-	if (cpuflags & cpu_xop)
-		flags |= cpu_xop;
-#endif
-
-#if defined(SCRYPT_SALSA_AVX)
-	if (cpuflags & cpu_avx)
-		flags |= cpu_avx;
-#endif
-
-#if defined(SCRYPT_SALSA_SSE2)
-	if (cpuflags & cpu_sse2)
-		flags |= cpu_sse2;
-#endif
-
-	return flags;
-}
-#endif
-
-
-static int
-scrypt_test_mix(void) {
-	static const uint8_t expected[16] = {
-		0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
-	};
-
-	int ret = 1;
-	size_t cpuflags = detect_cpu();
-
-#if defined(SCRYPT_SALSA_XOP)
-	if (cpuflags & cpu_xop)
-		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
-#endif
-
-#if defined(SCRYPT_SALSA_AVX)
-	if (cpuflags & cpu_avx)
-		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
-#endif
-
-#if defined(SCRYPT_SALSA_SSE2)
-	if (cpuflags & cpu_sse2)
-		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
-#endif
-
-#if defined(SCRYPT_SALSA_BASIC)
-	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
-#endif
-
-	return ret;
-}
--- a/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h
+++ b/stratum/algos/ar2/sj/scrypt-jane-test-vectors.h
@ -10,18 +10,7 @@ static const scrypt_test_setting post_settings[] = {
 };

 #if defined(SCRYPT_SKEIN512)
-	#if defined(SCRYPT_SALSA)
-		static const uint8_t post_vectors[][64] = {
-			{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
-			 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
-			 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
-			 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
-			{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
-			 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
-			 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
-			 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
-		};
-	#elif defined(SCRYPT_SALSA64)
+	#ifdef SCRYPT_SALSA64
 		static const uint8_t post_vectors[][64] = {
 			{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
 			 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
--- a/stratum/algos/ar2/thread.c
+++ b/stratum/algos/ar2/thread.c
@ -1,36 +0,0 @@
-#include "thread.h"
-#if defined(_WIN32)
-#include <Windows.h>
-#endif
-
-int argon2_thread_create(argon2_thread_handle_t *handle,
-                         argon2_thread_func_t func, void *args) {
-    if (NULL == handle || func == NULL) {
-        return -1;
-    }
-#if defined(_WIN32)
-    *handle = _beginthreadex(NULL, 0, func, args, 0, NULL);
-    return *handle != 0 ? 0 : -1;
-#else
-    return pthread_create(handle, NULL, func, args);
-#endif
-}
-
-int argon2_thread_join(argon2_thread_handle_t handle) {
-#if defined(_WIN32)
-    if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) {
-        return CloseHandle((HANDLE)handle) != 0 ? 0 : -1;
-    }
-    return -1;
-#else
-    return pthread_join(handle, NULL);
-#endif
-}
-
-void argon2_thread_exit(void) {
-#if defined(_WIN32)
-    _endthreadex(0);
-#else
-    pthread_exit(NULL);
-#endif
-}
--- a/stratum/algos/ar2/thread.h
+++ b/stratum/algos/ar2/thread.h
@ -1,46 +0,0 @@
-#ifndef ARGON2_THREAD_H
-#define ARGON2_THREAD_H
-/*
-        Here we implement an abstraction layer for the simple requirements
-        of the Argon2 code. We only require 3 primitives---thread creation,
-        joining, and termination---so full emulation of the pthreads API
-        is unwarranted. Currently we wrap pthreads and Win32 threads.
-
-        The API defines 2 types: the function pointer type,
-        argon2_thread_func_t, and the type of the thread
-        handle---argon2_thread_handle_t.
-*/
-#if defined(_WIN32)
-#include <process.h>
-typedef unsigned(__stdcall *argon2_thread_func_t)(void *);
-typedef uintptr_t argon2_thread_handle_t;
-#else
-#include <pthread.h>
-typedef void *(*argon2_thread_func_t)(void *);
-typedef pthread_t argon2_thread_handle_t;
-#endif
-
-/* Creates a thread
- * @param handle pointer to a thread handle, which is the output of this
- * function. Must not be NULL.
- * @param func A function pointer for the thread's entry point. Must not be
- * NULL.
- * @param args Pointer that is passed as an argument to @func. May be NULL.
- * @return 0 if @handle and @func are valid pointers and a thread is successfully
- * created.
- */
-int argon2_thread_create(argon2_thread_handle_t *handle,
-                         argon2_thread_func_t func, void *args);
-
-/* Waits for a thread to terminate
- * @param handle Handle to a thread created with argon2_thread_create.
- * @return 0 if @handle is a valid handle, and joining completed successfully.
-*/
-int argon2_thread_join(argon2_thread_handle_t handle);
-
-/* Terminate the current thread. Must be run inside a thread created by
- * argon2_thread_create.
-*/
-void argon2_thread_exit(void);
-
-#endif
--- a/stratum/algos/argon2a.c
+++ b/stratum/algos/argon2a.c
@ -7,7 +7,7 @@

 #include "ar2/argon2.h"
 #include "ar2/cores.h"
-#include "ar2/scrypt-jane.h"
+#include "ar2/ar2-scrypt-jane.h"

 #define _ALIGN(x) __attribute__ ((aligned(x)))

@ -16,7 +16,7 @@
 #define MASK 8
 #define ZERO 0

-static void argon_call(void *out, void *in, void *salt, int type)
+inline void argon_call(void *out, void *in, void *salt, int type)
 {
 	argon2_context context = { 0 };

@ -27,50 +27,18 @@ static void argon_call(void *out, void *in, void *salt, int type)
 	argon2_core(&context, type);
 }

-static void bin2hex(char *s, const unsigned char *p, size_t len)
-{
-        for (size_t i = 0; i < len; i++)
-                sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
-}
-
-static char *abin2hex(const unsigned char *p, size_t len)
-{
-        char *s = (char*) malloc((len * 2) + 1);
-        if (!s)
-                return NULL;
-        bin2hex(s, p, len);
-        return s;
-}
-
-static void applog_data(void *pdata)
-{
-        char* hex = abin2hex((unsigned char*)pdata, 80);
-        fprintf(stderr, "%s\n", hex);
-        free(hex);
-}
-
 void argon2_hash(const char* input, char* output, uint32_t len)
 {
-	// these uint512 in the c++ source of the client are backed by an array of uint32
-	uint32_t _ALIGN(32) hashA[8], hashB[8], hashC[8];
-	uint32_t _ALIGN(32) endian[20], *in;
+	uint32_t _ALIGN(32) hashA[8], hashB[8];

-	in = (uint32_t*) input;
-	for (int i=0; i<len/4; i++)
-		endian[i] = in[i];
-	//	be32enc(&endian[i], in[i]);
-	//applog_data((void*) endian);
-
-	my_scrypt((unsigned char *)endian, len,
-		(unsigned char *)endian, len,
+	my_scrypt((unsigned char *)input, len,
+		(unsigned char *)input, len,
 		(unsigned char *)hashA);

 	argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);

 	my_scrypt((const unsigned char *)hashB, 32,
 		(const unsigned char *)hashB, 32,
-		(unsigned char *)hashC);
-
-	memcpy(output, hashC, 32);
+		(unsigned char *)output);
 }

--- a/stratum/algos/makefile
+++ b/stratum/algos/makefile
@ -12,7 +12,7 @@ SOURCES=lyra2re.c lyra2v2.c Lyra2.c Sponge.c blake.c scrypt.c c11.c x11.c x13.c
 	skein2.c zr5.c bmw.c luffa.c pentablake.c whirlpool.c whirlpoolx.c blakecoin.c \
 	yescrypt.c yescrypt-opt.c sha256_Y.c \
 	m7m.c magimath.cpp velvet.c \
-	argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/thread.c ar2/scrypt-jane.c \
+	argon2a.c ar2/blake2b.c ar2/argon2.c ar2/ref.c ar2/cores.c ar2/ar2-scrypt-jane.c \
 	hive.c pomelo.c \
 	sib.c gost.c