
Add an AVX2 code path.

It's far from optimal because of the way the abstraction is done: the
YMM state is saved and restored on entry to and exit from every
"low-level operation".

That said, it's a significant improvement over the reference code, and
is probably maintainable despite being a gigantic wall of assembly
language.
Yawning Angel · 1 year ago · commit 147f7126bc
6 changed files with 604 additions and 34 deletions:
  1. hwaccel.go (+1, -4)
  2. hwaccel_amd64.go (+160, -0)
  3. hwaccel_amd64.s (+427, -0)
  4. hwaccel_ref.go (+14, -0)
  5. norx_ref.go (+1, -4)
  6. norx_test.go (+1, -26)

hwaccel.go (+1, -4)

@@ -18,7 +18,6 @@ var (
 		encryptDataFn: encryptDataRef,
 		decryptDataFn: decryptDataRef,
 		finalizeFn:    finalizeRef,
-		permuteFn:     permuteRef,
 	}
 )
 
@@ -29,7 +28,6 @@ type hwaccelImpl struct {
 	encryptDataFn func(*state, []byte, []byte)
 	decryptDataFn func(*state, []byte, []byte)
 	finalizeFn    func(*state, []byte, []byte)
-	permuteFn     func(*state, int)
 }
 
 func forceDisableHardwareAcceleration() {
@@ -44,6 +42,5 @@ func IsHardwareAccelerated() bool {
 }
 
 func init() {
-	// NOTYET
-	// initHardwareAcceleration()
+	initHardwareAcceleration()
 }
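
For context, the dispatch pattern this file implements reduces to roughly the following condensed sketch. The struct fields and the initHardwareAcceleration/hardwareAccelImpl/implAVX2 names appear in the diffs of this commit; implRef is a placeholder for the reference-implementation table whose name is not visible in the hunk above. Each function pointer is one of the "low-level operations" from the commit message: the AVX2 entry points load the state into YMM registers on entry and write it back on exit, which is the save/restore overhead being described.

// Condensed sketch of the implementation-selection pattern.
type hwaccelImpl struct {
	name          string
	initFn        func(*state, []byte, []byte) // initRef / initYMM
	absorbDataFn  func(*state, []byte, uint64) // absorbDataRef / absorbDataYMM
	encryptDataFn func(*state, []byte, []byte) // encryptDataRef / encryptDataYMM
	decryptDataFn func(*state, []byte, []byte) // decryptDataRef / decryptDataYMM
	finalizeFn    func(*state, []byte, []byte) // finalizeRef / finalizeYMM
}

// implRef stands in for the reference table shown partially above.
var hardwareAccelImpl = implRef

func init() {
	// Replaces hardwareAccelImpl with implAVX2 when supportsAVX2() reports true.
	initHardwareAcceleration()
}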

hwaccel_amd64.go (+160, -0)

@@ -0,0 +1,160 @@
+// hwaccel_amd64.go - AMD64 optimized routines
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build amd64,!gccgo,!noasm,go1.10
+
+package norx
+
+//go:noescape
+func cpuidAmd64(cpuidParams *uint32)
+
+//go:noescape
+func xgetbv0Amd64(xcrVec *uint32)
+
+//go:noescape
+func initAVX2(s *uint64, key, nonce *byte, initConsts, instConsts *uint64)
+
+//go:noescape
+func absorbBlocksAVX2(s *uint64, in *byte, rounds, blocks uint64, tag *uint64)
+
+//go:noescape
+func encryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
+
+//go:noescape
+func decryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
+
+//go:noescape
+func decryptLastBlockAVX2(s *uint64, out, in *byte, rounds, inLen uint64)
+
+//go:noescape
+func finalizeAVX2(s *uint64, out, key *byte, rounds uint64)
+
+func supportsAVX2() bool {
+	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+	const (
+		osXsaveBit = 1 << 27
+		avx2Bit    = 1 << 5
+	)
+
+	// Check to see if CPUID actually supports the leaf that indicates AVX2.
+	// CPUID.(EAX=0H, ECX=0H) >= 7
+	regs := [4]uint32{0x00}
+	cpuidAmd64(&regs[0])
+	if regs[0] < 7 {
+		return false
+	}
+
+	// Check to see if the OS knows how to save/restore XMM/YMM state.
+	// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
+	regs = [4]uint32{0x01}
+	cpuidAmd64(&regs[0])
+	if regs[2]&osXsaveBit == 0 {
+		return false
+	}
+	xcrRegs := [2]uint32{}
+	xgetbv0Amd64(&xcrRegs[0])
+	if xcrRegs[0]&6 != 6 {
+		return false
+	}
+
+	// Check for AVX2 support.
+	// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
+	regs = [4]uint32{0x07}
+	cpuidAmd64(&regs[0])
+	return regs[1]&avx2Bit != 0
+}
+
+var implAVX2 = &hwaccelImpl{
+	name:          "AVX2",
+	initFn:        initYMM,
+	absorbDataFn:  absorbDataYMM,
+	encryptDataFn: encryptDataYMM,
+	decryptDataFn: decryptDataYMM,
+	finalizeFn:    finalizeYMM,
+}
+
+func initYMM(s *state, key, nonce []byte) {
+	var instConsts = [4]uint64{paramW, uint64(s.rounds), paramP, paramT}
+	initAVX2(&s.s[0], &key[0], &nonce[0], &initializationConstants[8], &instConsts[0])
+}
+
+func absorbDataYMM(s *state, in []byte, tag uint64) {
+	inLen := len(in)
+	if inLen == 0 {
+		return
+	}
+
+	var tagVec = [4]uint64{0, 0, 0, tag}
+	var off int
+	if inBlocks := inLen / bytesR; inBlocks > 0 {
+		absorbBlocksAVX2(&s.s[0], &in[0], uint64(s.rounds), uint64(inBlocks), &tagVec[0])
+		off += inBlocks * bytesR
+	}
+	in = in[off:]
+
+	var lastBlock [bytesR]byte
+	padRef(&lastBlock, in)
+	absorbBlocksAVX2(&s.s[0], &lastBlock[0], uint64(s.rounds), 1, &tagVec[0])
+}
+
+func encryptDataYMM(s *state, out, in []byte) {
+	inLen := len(in)
+	if inLen == 0 {
+		return
+	}
+
+	var off int
+	if inBlocks := inLen / bytesR; inBlocks > 0 {
+		encryptBlocksAVX2(&s.s[0], &out[0], &in[0], uint64(s.rounds), uint64(inBlocks))
+		off += inBlocks * bytesR
+	}
+	out, in = out[off:], in[off:]
+
+	var lastBlock [bytesR]byte
+	padRef(&lastBlock, in)
+	encryptBlocksAVX2(&s.s[0], &lastBlock[0], &lastBlock[0], uint64(s.rounds), 1)
+	copy(out, lastBlock[:len(in)])
+}
+
+func decryptDataYMM(s *state, out, in []byte) {
+	inLen := len(in)
+	if inLen == 0 {
+		return
+	}
+
+	var off int
+	if inBlocks := inLen / bytesR; inBlocks > 0 {
+		decryptBlocksAVX2(&s.s[0], &out[0], &in[0], uint64(s.rounds), uint64(inBlocks))
+		off += inBlocks * bytesR
+	}
+	out, in = out[off:], in[off:]
+
+	var lastBlock [bytesR]byte
+	var inPtr *byte
+	if len(in) != 0 {
+		inPtr = &in[0]
+	}
+	decryptLastBlockAVX2(&s.s[0], &lastBlock[0], inPtr, uint64(s.rounds), uint64(len(in)))
+	copy(out, lastBlock[:len(in)])
+	burnBytes(lastBlock[:])
+}
+
+func finalizeYMM(s *state, tag, key []byte) {
+	var lastBlock [bytesC]byte
+
+	finalizeAVX2(&s.s[0], &lastBlock[0], &key[0], uint64(s.rounds))
+	copy(tag, lastBlock[:bytesT])
+	burnBytes(lastBlock[:]) // burn buffer
+	burnUint64s(s.s[:])     // at this point we can also burn the state
+}
+
+func initHardwareAcceleration() {
+	if supportsAVX2() {
+		isHardwareAccelerated = true
+		hardwareAccelImpl = implAVX2
+	}
+}
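
The three data wrappers above (absorbDataYMM, encryptDataYMM, decryptDataYMM) share the same shape: hand every full rate-sized block to a single assembly call, then pad the tail into a stack buffer and process it as one final block. A generic sketch of that shape follows; bulk and last are stand-ins for the AVX2 entry points and are not names from this commit, while state, bytesR, and padRef are the package's own identifiers.

// processData sketches the bulk-then-tail pattern shared by the wrappers.
// bytesR is the rate in bytes (96, matching the 96-byte strides in the
// assembly); padRef applies the multi-rate padding used by the reference code.
func processData(s *state, in []byte, bulk func(s *state, in []byte, blocks int), last func(s *state, block []byte)) {
	if len(in) == 0 {
		return
	}
	if blocks := len(in) / bytesR; blocks > 0 {
		bulk(s, in, blocks) // one assembly call covers every full block
		in = in[blocks*bytesR:]
	}
	var lastBlock [bytesR]byte
	padRef(&lastBlock, in) // padding: 0x01 after the data, 0x80 in byte 95
	last(s, lastBlock[:])  // the padded final block is always processed
}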

hwaccel_amd64.s (+427, -0)

@@ -0,0 +1,427 @@
+// +build !noasm,go1.10
+// hwaccel_amd64.s - AMD64 optimized routines
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+#include "textflag.h"
+
+// func cpuidAmd64(cpuidParams *uint32)
+TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
+	MOVQ cpuidParams+0(FP), R15
+	MOVL 0(R15), AX
+	MOVL 8(R15), CX
+	CPUID
+	MOVL AX, 0(R15)
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbv0Amd64(xcrVec *uint32)
+TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
+	MOVQ xcrVec+0(FP), BX
+	XORL CX, CX
+	XGETBV
+	MOVL AX, 0(BX)
+	MOVL DX, 4(BX)
+	RET
+
+// Based heavily on the `ymm` reference implementation, but using assembly
+// language instead of using intrinsics like in a sane language.
+//
+// The TWEAK_LOW_LATENCY variant is used for the permutation.
+
+DATA ·vpshufb_idx_r0<>+0x00(SB)/8, $0x0007060504030201
+DATA ·vpshufb_idx_r0<>+0x08(SB)/8, $0x080f0e0d0c0b0a09
+DATA ·vpshufb_idx_r0<>+0x10(SB)/8, $0x0007060504030201
+DATA ·vpshufb_idx_r0<>+0x18(SB)/8, $0x080f0e0d0c0b0a09
+GLOBL ·vpshufb_idx_r0<>(SB), (NOPTR+RODATA), $32
+
+DATA ·vpshufb_idx_r2<>+0x00(SB)/8, $0x0403020100070605
+DATA ·vpshufb_idx_r2<>+0x08(SB)/8, $0x0c0b0a09080f0e0d
+DATA ·vpshufb_idx_r2<>+0x10(SB)/8, $0x0403020100070605
+DATA ·vpshufb_idx_r2<>+0x18(SB)/8, $0x0c0b0a09080f0e0d
+GLOBL ·vpshufb_idx_r2<>(SB), (NOPTR+RODATA), $32
+
+DATA ·tag_payload<>+0x00(SB)/8, $0x0000000000000000
+DATA ·tag_payload<>+0x08(SB)/8, $0x0000000000000000
+DATA ·tag_payload<>+0x10(SB)/8, $0x0000000000000000
+DATA ·tag_payload<>+0x18(SB)/8, $0x0000000000000002
+GLOBL ·tag_payload<>(SB), (NOPTR+RODATA), $32
+
+DATA ·tag_final<>+0x00(SB)/8, $0x0000000000000000
+DATA ·tag_final<>+0x08(SB)/8, $0x0000000000000000
+DATA ·tag_final<>+0x10(SB)/8, $0x0000000000000000
+DATA ·tag_final<>+0x18(SB)/8, $0x0000000000000008
+GLOBL ·tag_final<>(SB), (NOPTR+RODATA), $32
+
+#define G(A, B, C, D, T0, T1, R0, R2) \
+	VPXOR   A, B, T0   \
+	VPAND   A, B, T1   \
+	VPADDQ  T1, T1, T1 \
+	VPXOR   T0, T1, A  \
+	VPXOR   D, T0, D   \
+	VPXOR   D, T1, D   \
+	VPSHUFB R0, D, D   \
+	                   \
+	VPXOR   C, D, T0   \
+	VPAND   C, D, T1   \
+	VPADDQ  T1, T1, T1 \
+	VPXOR   T0, T1, C  \
+	VPXOR   B, T0, B   \
+	VPXOR   B, T1, B   \
+	VPSRLQ  $19, B, T0 \
+	VPSLLQ  $45, B, T1 \
+	VPOR    T0, T1, B  \
+	                   \
+	VPXOR   A, B, T0   \
+	VPAND   A, B, T1   \
+	VPADDQ  T1, T1, T1 \
+	VPXOR   T0, T1, A  \
+	VPXOR   D, T0, D   \
+	VPXOR   D, T1, D   \
+	VPSHUFB R2, D, D   \
+	                   \
+	VPXOR   C, D, T0   \
+	VPAND   C, D, T1   \
+	VPADDQ  T1, T1, T1 \
+	VPXOR   T0, T1, C  \
+	VPXOR   B, T0, B   \
+	VPXOR   B, T1, B   \
+	VPADDQ  B, B, T0   \
+	VPSRLQ  $63, B, T1 \
+	VPOR    T0, T1, B
+
+// -109 -> 147 (See: https://github.com/golang/go/issues/24378)
+#define DIAGONALIZE(A, B, C, D) \
+	VPERMQ $-109, D, D \
+	VPERMQ $78, C, C   \
+	VPERMQ $57, B, B
+
+#define UNDIAGONALIZE(A, B, C, D) \
+	VPERMQ $57, D, D   \
+	VPERMQ $78, C, C   \
+	VPERMQ $-109, B, B
+
+// func initAVX2(s *uint64, key, nonce *byte, initConsts, instConsts *uint64)
+TEXT ·initAVX2(SB), NOSPLIT, $0-40
+	MOVQ s+0(FP), R8
+	MOVQ key+8(FP), R9
+	MOVQ nonce+16(FP), R10
+	MOVQ initConsts+24(FP), R11
+	MOVQ instConsts+32(FP), R12
+	MOVQ 8(R12), AX
+
+	VMOVDQU (R10), Y0
+	VMOVDQU (R9), Y1
+	VMOVDQU (R11), Y2
+	VMOVDQU 32(R11), Y3
+
+	VMOVDQU (R12), Y4
+	VMOVDQA Y1, Y5
+
+	VPXOR Y3, Y4, Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VPXOR Y3, Y5, Y3
+
+	VMOVDQU Y0, (R8)
+	VMOVDQU Y1, 32(R8)
+	VMOVDQU Y2, 64(R8)
+	VMOVDQU Y3, 96(R8)
+
+	VZEROUPPER
+	RET
+
+// func absorbBlocksAVX2(s *uint64, in *byte, rounds, blocks uint64, tag *uint64)
+TEXT ·absorbBlocksAVX2(SB), NOSPLIT, $0-40
+	MOVQ s+0(FP), R8
+	MOVQ in+8(FP), R10
+	MOVQ rounds+16(FP), R11
+	MOVQ blocks+24(FP), R12
+	MOVQ tag+32(FP), R13
+
+	VMOVDQU (R8), Y0
+	VMOVDQU 32(R8), Y1
+	VMOVDQU 64(R8), Y2
+	VMOVDQU 96(R8), Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+	VMOVDQU (R13), Y11
+
+loopblocks:
+	VPXOR Y3, Y11, Y3
+
+	MOVQ R11, AX
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VMOVDQU (R10), Y4
+	VMOVDQU 32(R10), Y5
+	VMOVDQU 64(R10), Y6
+
+	VPXOR Y0, Y4, Y0
+	VPXOR Y1, Y5, Y1
+	VPXOR Y2, Y6, Y2
+
+	VMOVDQU Y0, (R8)
+	VMOVDQU Y1, 32(R8)
+	VMOVDQU Y2, 64(R8)
+
+	ADDQ $96, R10
+
+	SUBQ $1, R12
+	JNZ  loopblocks
+
+	VMOVDQU Y3, 96(R8)
+
+	VZEROUPPER
+	RET
+
+// func encryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
+TEXT ·encryptBlocksAVX2(SB), NOSPLIT, $0-40
+	MOVQ s+0(FP), R8
+	MOVQ out+8(FP), R9
+	MOVQ in+16(FP), R10
+	MOVQ rounds+24(FP), R11
+	MOVQ blocks+32(FP), R12
+
+	VMOVDQU (R8), Y0
+	VMOVDQU 32(R8), Y1
+	VMOVDQU 64(R8), Y2
+	VMOVDQU 96(R8), Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+	VMOVDQU ·tag_payload<>(SB), Y11
+
+loopblocks:
+	VPXOR Y3, Y11, Y3
+
+	MOVQ R11, AX
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VMOVDQU (R10), Y4
+	VMOVDQU 32(R10), Y5
+	VMOVDQU 64(R10), Y6
+
+	VPXOR Y0, Y4, Y0
+	VPXOR Y1, Y5, Y1
+	VPXOR Y2, Y6, Y2
+
+	VMOVDQU Y0, (R9)
+	VMOVDQU Y1, 32(R9)
+	VMOVDQU Y2, 64(R9)
+
+	ADDQ $96, R9
+	ADDQ $96, R10
+
+	SUBQ $1, R12
+	JNZ  loopblocks
+
+	VMOVDQU Y0, (R8)
+	VMOVDQU Y1, 32(R8)
+	VMOVDQU Y2, 64(R8)
+	VMOVDQU Y3, 96(R8)
+
+	VZEROUPPER
+	RET
+
+// func decryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
+TEXT ·decryptBlocksAVX2(SB), NOSPLIT, $0-40
+	MOVQ s+0(FP), R8
+	MOVQ out+8(FP), R9
+	MOVQ in+16(FP), R10
+	MOVQ rounds+24(FP), R11
+	MOVQ blocks+32(FP), R12
+
+	VMOVDQU (R8), Y0
+	VMOVDQU 32(R8), Y1
+	VMOVDQU 64(R8), Y2
+	VMOVDQU 96(R8), Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+	VMOVDQU ·tag_payload<>(SB), Y11
+
+loopblocks:
+	VPXOR Y3, Y11, Y3
+
+	MOVQ R11, AX
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VMOVDQU (R10), Y4
+	VMOVDQU 32(R10), Y5
+	VMOVDQU 64(R10), Y6
+
+	VPXOR Y0, Y4, Y0
+	VPXOR Y1, Y5, Y1
+	VPXOR Y2, Y6, Y2
+
+	VMOVDQU Y0, (R9)
+	VMOVDQU Y1, 32(R9)
+	VMOVDQU Y2, 64(R9)
+
+	VMOVDQA Y4, Y0
+	VMOVDQA Y5, Y1
+	VMOVDQA Y6, Y2
+
+	ADDQ $96, R9
+	ADDQ $96, R10
+
+	SUBQ $1, R12
+	JNZ  loopblocks
+
+	VMOVDQU Y0, (R8)
+	VMOVDQU Y1, 32(R8)
+	VMOVDQU Y2, 64(R8)
+	VMOVDQU Y3, 96(R8)
+
+	VZEROUPPER
+	RET
+
+// func decryptLastBlockAVX2(s *uint64, out, in *byte, rounds, inLen uint64)
+TEXT ·decryptLastBlockAVX2(SB), NOSPLIT, $0-40
+	MOVQ s+0(FP), R8
+	MOVQ out+8(FP), R9
+	MOVQ in+16(FP), R10
+	MOVQ rounds+24(FP), AX
+	MOVQ inLen+32(FP), R12
+
+	VMOVDQU (R8), Y0
+	VMOVDQU 32(R8), Y1
+	VMOVDQU 64(R8), Y2
+	VMOVDQU 96(R8), Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+	VMOVDQU ·tag_payload<>(SB), Y11
+
+	VPXOR Y3, Y11, Y3
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VMOVDQU Y0, (R9)
+	VMOVDQU Y1, 32(R9)
+	VMOVDQU Y2, 64(R9)
+
+	CMPQ R12, $0
+	JEQ  skipcopy
+	XORQ AX, AX
+
+loopcopy:
+	MOVB (R10)(AX*1), BX
+	MOVB BX, (R9)(AX*1)
+	ADDQ $1, AX
+	CMPQ AX, R12
+	JNE  loopcopy
+
+skipcopy:
+
+	XORB $0x01, (R9)(R12*1)
+	XORB $0x80, 95(R9)
+
+	VMOVDQU (R9), Y4
+	VMOVDQU 32(R9), Y5
+	VMOVDQU 64(R9), Y6
+
+	VPXOR Y0, Y4, Y0
+	VPXOR Y1, Y5, Y1
+	VPXOR Y2, Y6, Y2
+
+	VMOVDQU Y0, (R9)
+	VMOVDQU Y1, 32(R9)
+	VMOVDQU Y2, 64(R9)
+
+	VMOVDQU Y4, (R8)
+	VMOVDQU Y5, 32(R8)
+	VMOVDQU Y6, 64(R8)
+	VMOVDQU Y3, 96(R8)
+
+	VZEROUPPER
+	RET
+
+// func finalizeAVX2(s *uint64, out, key *byte, rounds uint64)
+TEXT ·finalizeAVX2(SB), NOSPLIT, $0-32
+	MOVQ s+0(FP), R8
+	MOVQ out+8(FP), R9
+	MOVQ key+16(FP), R10
+	MOVQ rounds+24(FP), R11
+
+	VMOVDQU (R8), Y0
+	VMOVDQU 32(R8), Y1
+	VMOVDQU 64(R8), Y2
+	VMOVDQU 96(R8), Y3
+
+	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
+	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
+	VMOVDQU ·tag_final<>(SB), Y11
+
+	VPXOR Y3, Y11, Y3
+
+	MOVQ R11, AX
+
+looprounds:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, AX
+	JNZ  looprounds
+
+	VMOVDQU (R10), Y11
+	VPXOR   Y3, Y11, Y3
+
+looprounds2:
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	DIAGONALIZE(Y0, Y1, Y2, Y3)
+	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
+	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
+	SUBQ $1, R11
+	JNZ  looprounds2
+
+	VPXOR   Y3, Y11, Y3
+	VMOVDQU Y3, (R9)
+
+	VZEROUPPER
+	RET
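
For reference, the G macro at the top of this file is the NORX64 quarter-round. The VPXOR/VPAND/VPADDQ triples compute the non-linear H(x, y) = (x ^ y) ^ ((x & y) << 1), the two VPSHUFB index tables implement byte-wise right rotations by 8 and 40 bits, and the shift/OR pairs implement right rotations by 19 and 63 bits. One 64-bit lane of the macro corresponds to roughly the following scalar Go, included as a readability aid rather than code added by this commit.

import "math/bits"

// h is NORX's non-linear approximation of integer addition.
func h(x, y uint64) uint64 { return (x ^ y) ^ ((x & y) << 1) }

// g is one lane of the G macro; the rotation amounts 8, 19, 40, and 63
// match the vpshufb index tables and the VPSRLQ/VPSLLQ shift pairs above.
func g(a, b, c, d uint64) (uint64, uint64, uint64, uint64) {
	a = h(a, b)
	d = bits.RotateLeft64(d^a, -8) // rotr 8: ·vpshufb_idx_r0
	c = h(c, d)
	b = bits.RotateLeft64(b^c, -19) // rotr 19: VPSRLQ $19 / VPSLLQ $45
	a = h(a, b)
	d = bits.RotateLeft64(d^a, -40) // rotr 40: ·vpshufb_idx_r2
	c = h(c, d)
	b = bits.RotateLeft64(b^c, -63) // rotr 63: VPADDQ / VPSRLQ $63
	return a, b, c, d
}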

hwaccel_ref.go (+14, -0)

@@ -0,0 +1,14 @@
+// hwaccel_ref.go - Unaccelerated stubs
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build !amd64 gccgo noasm !go1.10
+
+package norx
+
+func initHardwareAcceleration() {
+	forceDisableHardwareAcceleration()
+}

norx_ref.go (+1, -4)

@@ -219,9 +219,6 @@ func decryptLastBlockRef(s *state, out, in []byte) {
 }
 
 func initRef(s *state, key, nonce []byte) {
-	// Note: Ensuring a correctly sized key/nonce is the caller's
-	// responsibility.
-
 	for i := 0; i < 4; i++ {
 		s.s[i] = binary.LittleEndian.Uint64(nonce[i*bytesW:])
 		s.s[i+4] = binary.LittleEndian.Uint64(key[i*bytesW:])
@@ -279,7 +276,7 @@ func decryptDataRef(s *state, out, in []byte) {
 	decryptLastBlockRef(s, out[off:], in[off:])
 }
 
-func finalizeRef(s *state, tag []byte, key []byte) {
+func finalizeRef(s *state, tag, key []byte) {
 	var lastBlock [bytesC]byte
 
 	s.s[15] ^= tagFinal

norx_test.go (+1, -26)

@@ -29,37 +29,12 @@ var (
 )
 
 func mustInitHardwareAcceleration() {
-	// initHardwareAcceleration()
+	initHardwareAcceleration()
 	if !IsHardwareAccelerated() {
 		panic("initHardwareAcceleration() failed")
 	}
 }
 
-func TestF(t *testing.T) {
-	forceDisableHardwareAcceleration()
-	doTestF(t)
-
-	if !canAccelerate {
-		t.Log("Hardware acceleration not supported on this host.")
-		return
-	}
-	mustInitHardwareAcceleration()
-	doTestF(t)
-}
-
-func doTestF(t *testing.T) {
-	impl := "_" + hardwareAccelImpl.name
-	t.Run("F"+impl, func(t *testing.T) {
-		require := require.New(t)
-		s := &state{}
-		for i := range s.s {
-			s.s[i] = uint64(i)
-		}
-		hardwareAccelImpl.permuteFn(s, 2)
-		require.Equal(initializationConstants, s.s, "pre-generated vs calculated")
-	})
-}
-
 func TestKAT(t *testing.T) {
 	forceDisableHardwareAcceleration()
 	doTestKAT(t)
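
With TestF gone (it exercised the removed permuteFn), coverage of the AVX2 path relies on running the remaining tests under both implementations, as TestKAT does above: once after forceDisableHardwareAcceleration and once after mustInitHardwareAcceleration. A consistency check in the same style might look like the sketch below; sealAll is a hypothetical helper, not part of this commit.

func TestCrossImpl(t *testing.T) {
	forceDisableHardwareAcceleration()
	refOut := sealAll() // hypothetical: encrypt fixed test inputs with the reference code

	if !canAccelerate {
		t.Log("Hardware acceleration not supported on this host.")
		return
	}
	mustInitHardwareAcceleration()
	avxOut := sealAll() // same inputs, AVX2 code path

	require.New(t).Equal(refOut, avxOut, "reference vs AVX2 output")
}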