
Initial AVX2 support.

SSE2:
BenchmarkChaCha20_16-4 	20000000	        61.7 ns/op	 259.32 MB/s
BenchmarkChaCha20_64-4 	10000000	       141 ns/op	 451.88 MB/s
BenchmarkChaCha20_128-4	10000000	       170 ns/op	 751.35 MB/s
BenchmarkChaCha20_192-4	 5000000	       299 ns/op	 641.24 MB/s
BenchmarkChaCha20_256-4	 5000000	       266 ns/op	 961.58 MB/s
BenchmarkChaCha20_512-4	 3000000	       517 ns/op	 988.80 MB/s
BenchmarkChaCha20_1k-4 	 1000000	      1018 ns/op	1004.91 MB/s
BenchmarkChaCha20_64k-4	   20000	     64380 ns/op	1017.94 MB/s

AVX2:
BenchmarkChaCha20_16-4 	20000000	        62.1 ns/op	 257.64 MB/s
BenchmarkChaCha20_64-4 	10000000	       141 ns/op	 451.69 MB/s
BenchmarkChaCha20_128-4	10000000	       153 ns/op	 836.54 MB/s
BenchmarkChaCha20_192-4	 5000000	       281 ns/op	 681.77 MB/s
BenchmarkChaCha20_256-4	10000000	       177 ns/op	1441.17 MB/s
BenchmarkChaCha20_512-4	 5000000	       376 ns/op	1361.41 MB/s
BenchmarkChaCha20_1k-4 	 2000000	       610 ns/op	1676.70 MB/s
BenchmarkChaCha20_64k-4	   50000	     37274 ns/op	1758.21 MB/s

The AVX2 path can likely be improved by going from 6 blocks at a time
to 8 at its widest, but 6 is significantly easier to write for decent
gains on supported hardware.
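
A back-of-the-envelope sketch of the ymm register budget behind that trade-off, assuming the 2-blocks-per-ymm layout used in chacha20_amd64.py below (an illustrative sketch, not part of the commit):

// Rough ymm register accounting for the widest AVX2 loop, assuming one
// 16-byte state row from each of two consecutive blocks per ymm register.
const (
	ymmRegisters = 16 // ymm0..ymm15 are available in 64-bit mode
	rowsPerBlock = 4  // a ChaCha block is 4 rows of 4 uint32
	blocksPerYmm = 2  // one row from each of two blocks per ymm

	ymmFor6Wide = (6 / blocksPerYmm) * rowsPerBlock // 12: leaves 4 ymm for temporaries and cached state rows
	ymmFor8Wide = (8 / blocksPerYmm) * rowsPerBlock // 16: nothing left without spilling to the stack
)

That is consistent with the 6-wide loop in the generator below, which uses ymm_v0..ymm_v11 for the working state, keeps ymm_s1..ymm_s3 and one temporary in the remaining four registers, and already has to re-broadcast x_s0 from memory each iteration.
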
Yawning Angel committed 3 years ago (commit 8be2ad5dc9)

5 changed files with 1523 additions and 192 deletions:

  1. README.md (+1 -1)
  2. chacha20_amd64.go (+48 -1)
  3. chacha20_amd64.py (+744 -77)
  4. chacha20_amd64.s (+660 -106)
  5. chacha20_test.go (+70 -7)

+ 1 - 1
README.md

@@ -9,6 +9,6 @@ Features:
  * 20 round, 256 bit key only.  Everything else is pointless and stupid.
  * IETF 96 bit nonce variant.
  * XChaCha 24 byte nonce variant.
- * SSE2 support on amd64 targets.
+ * SSE2 and AVX2 support on amd64 targets.
  * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20.
 

+ 48 - 1
chacha20_amd64.go

@@ -13,9 +13,22 @@ import (
 	"math"
 )
 
+var usingAVX2 = false
+
 func blocksAmd64SSE2(x *uint32, in, out *byte, nrBlocks uint)
 
+func blocksAmd64AVX2(x *uint32, in, out *byte, nrBlocks uint)
+
+func cpuidAmd64(cpuidParams *uint32)
+
+func xgetbvAmd64(xcrIndex uint32, xcrVec *uint32)
+
 func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+	// Probably unneeded, but stating this explicitly simplifies the assembly.
+	if nrBlocks == 0 {
+		return
+	}
+
 	if isIetf {
 		var totalBlocks uint64
 		totalBlocks = uint64(x[8]) + uint64(nrBlocks)
@@ -31,10 +44,44 @@ func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIe
 		in = out
 	}
 
-	blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks))
+	// Pointless to call the AVX2 code for just a single block, since half of
+	// the output gets discarded...
+	if usingAVX2 && nrBlocks > 1 {
+		blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks))
+	} else {
+		blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks))
+	}
+}
+
+func supportsAVX2() bool {
+	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+	const (
+		osXsaveBit = 1 << 27
+		avx2Bit    = 1 << 5
+	)
+
+	// Check to see if the OS knows how to save/restore XMM/YMM state.
+	// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
+	regs := [4]uint32{0x01}
+	cpuidAmd64(&regs[0])
+	if regs[2]&osXsaveBit == 0 {
+		return false
+	}
+	xcrRegs := [2]uint32{}
+	xgetbvAmd64(0, &xcrRegs[0])
+	if xcrRegs[0]&6 != 6 {
+		return false
+	}
+
+	// Check for AVX2 support.
+	// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
+	regs = [4]uint32{0x07}
+	cpuidAmd64(&regs[0])
+	return regs[1]&avx2Bit != 0
 }
 
 func init() {
 	blocksFn = blocksAmd64
 	usingVectors = true
+	usingAVX2 = supportsAVX2()
 }
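
Since the commit introduces a runtime choice between two back ends, one way to sanity-check the dispatch is to run the same starting state through both paths and compare the keystreams. A minimal sketch of such a check, assuming it lives inside the package; the helper name crossCheckPaths and the package name are assumptions, while blocksAmd64, usingAVX2, supportsAVX2, and stateSize come from the code above:

package chacha20

import "bytes"

// crossCheckPaths is a hypothetical helper: it generates nrBlocks blocks of
// keystream (zero input XORed with the cipher output) via the SSE2 path and
// the AVX2 path from the same starting state and reports whether they agree.
// It toggles the package-level usingAVX2 flag, so it is not concurrency-safe.
func crossCheckPaths(state [stateSize]uint32, nrBlocks int) bool {
	if !supportsAVX2() || nrBlocks < 2 {
		// Nothing to compare: either AVX2 is unavailable, or the dispatcher
		// would pick the SSE2 path for a single block anyway.
		return true
	}

	run := func(useAVX2 bool) []byte {
		s := state // Copy, since blocksAmd64 advances the counter in place.
		in := make([]byte, nrBlocks*64)
		out := make([]byte, nrBlocks*64)
		saved := usingAVX2
		usingAVX2 = useAVX2
		blocksAmd64(&s, in, out, nrBlocks, false)
		usingAVX2 = saved
		return out
	}

	return bytes.Equal(run(false), run(true))
}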

+ 744 - 77
chacha20_amd64.py

@@ -83,31 +83,30 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
     reg_inp = GeneralPurposeRegister64()
     reg_outp = GeneralPurposeRegister64()
     reg_blocks = GeneralPurposeRegister64()
+    reg_sp_save = GeneralPurposeRegister64()
 
     LOAD.ARGUMENT(reg_x, x)
     LOAD.ARGUMENT(reg_inp, inp)
     LOAD.ARGUMENT(reg_outp, outp)
     LOAD.ARGUMENT(reg_blocks, nrBlocks)
 
-    # Align the stack to a 16 byte boundary.
-    reg_align_tmp = GeneralPurposeRegister64()
-    MOV(reg_align_tmp, registers.rsp)
-    AND(reg_align_tmp, 0x0f)
+    # Align the stack to a 32 byte boundary.
     reg_align = GeneralPurposeRegister64()
-    MOV(reg_align, 0x10)
-    SUB(reg_align, reg_align_tmp)
-    SUB(registers.rsp, reg_align)
+    MOV(reg_sp_save, registers.rsp)
+    MOV(reg_align, 0x1f)
+    NOT(reg_align)
+    AND(registers.rsp, reg_align)
+    SUB(registers.rsp, 0x20)
 
     # Build the counter increment vector on the stack, and allocate the scratch
     # space
+    xmm_v0 = XMMRegister()
+    PXOR(xmm_v0, xmm_v0)
     SUB(registers.rsp, 16+16)
+    MOVDQA([registers.rsp], xmm_v0)
     reg_tmp = GeneralPurposeRegister32()
     MOV(reg_tmp, 0x00000001)
     MOV([registers.rsp], reg_tmp)
-    MOV(reg_tmp, 0x00000000)
-    MOV([registers.rsp+4], reg_tmp)
-    MOV([registers.rsp+8], reg_tmp)
-    MOV([registers.rsp+12], reg_tmp)
     mem_one = [registers.rsp]     # (Stack) Counter increment vector
     mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
 
@@ -116,7 +115,7 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
     mem_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
     mem_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
 
-    xmm_v0 = XMMRegister()
+    # xmm_v0 allocated above...
     xmm_v1 = XMMRegister()
     xmm_v2 = XMMRegister()
     xmm_v3 = XMMRegister()
@@ -138,6 +137,10 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
 
     xmm_tmp = xmm_v12
 
+    #
+    # 4 blocks at a time.
+    #
+
     vector_loop4 = Loop()
     SUB(reg_blocks, 4)
     JB(vector_loop4.end)
@@ -167,8 +170,8 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
 
         reg_rounds = GeneralPurposeRegister64()
         MOV(reg_rounds, 20)
-        rounds_loop = Loop()
-        with rounds_loop:
+        rounds_loop4 = Loop()
+        with rounds_loop4:
             # a += b; d ^= a; d = ROTW16(d);
             PADDD(xmm_v0, xmm_v1)
             PADDD(xmm_v4, xmm_v5)
@@ -330,7 +333,7 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
             MOVDQA(xmm_tmp, mem_tmp0) # Restore
 
             SUB(reg_rounds, 2)
-            JNZ(rounds_loop.begin)
+            JNZ(rounds_loop4.begin)
 
         MOVDQA(mem_tmp0, xmm_tmp)
 
@@ -391,6 +394,10 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
     MOVDQA(xmm_s3, mem_s3)
     MOVDQA(xmm_one, mem_one)
 
+    #
+    # 2 blocks at a time.
+    #
+
     SUB(reg_blocks, 2)
     vector_loop2 = Loop()
     JB(vector_loop2.end)
@@ -408,8 +415,8 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
 
         reg_rounds = GeneralPurposeRegister64()
         MOV(reg_rounds, 20)
-        rounds_loop = Loop()
-        with rounds_loop:
+        rounds_loop2 = Loop()
+        with rounds_loop2:
             # a += b; d ^= a; d = ROTW16(d);
             PADDD(xmm_v0, xmm_v1)
             PADDD(xmm_v4, xmm_v5)
@@ -491,7 +498,7 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
             PSHUFD(xmm_v7, xmm_v7, 0x39)
 
             SUB(reg_rounds, 2)
-            JNZ(rounds_loop.begin)
+            JNZ(rounds_loop2.begin)
 
         PADDD(xmm_v0, xmm_s0)
         PADDD(xmm_v1, xmm_s1)
@@ -514,96 +521,756 @@ with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
         JAE(vector_loop2.begin)
 
     ADD(reg_blocks, 2)
-    serial_loop = Loop()
-    JZ(serial_loop.end)
+    out_serial = Label()
+    JZ(out_serial)
+
+    #
+    # 1 block at a time.  Only executed once, because if there were more than
+    # one block left, the parallel code above would have already processed them.
+    #
+
+    MOVDQA(xmm_v0, xmm_s0)
+    MOVDQA(xmm_v1, xmm_s1)
+    MOVDQA(xmm_v2, xmm_s2)
+    MOVDQA(xmm_v3, xmm_s3)
+
+    reg_rounds = GeneralPurposeRegister64()
+    MOV(reg_rounds, 20)
+    rounds_loop1 = Loop()
+    with rounds_loop1:
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x39)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x93)
+
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x93)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x39)
+
+        SUB(reg_rounds, 2)
+        JNZ(rounds_loop1.begin)
+
+    PADDD(xmm_v0, xmm_s0)
+    PADDD(xmm_v1, xmm_s1)
+    PADDD(xmm_v2, xmm_s2)
+    PADDD(xmm_v3, xmm_s3)
+    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+    PADDQ(xmm_s3, xmm_one)
+
+    LABEL(out_serial)
 
-    with serial_loop:
-        MOVDQA(xmm_v0, xmm_s0)
-        MOVDQA(xmm_v1, xmm_s1)
-        MOVDQA(xmm_v2, xmm_s2)
-        MOVDQA(xmm_v3, xmm_s3)
+    # Write back the updated counter.  Stopping at 2^70 bytes is the user's
+    # problem, not mine.  (Skipped if there's exactly a multiple of 4 blocks
+    # because the counter is incremented in memory while looping.)
+    MOVDQA(mem_s3, xmm_s3)
+
+    LABEL(out)
+
+    # Paranoia, cleanse the scratch space.
+    PXOR(xmm_v0, xmm_v0)
+    MOVDQA(mem_tmp0, xmm_v0)
+
+    # Remove our stack allocation.
+    MOV(registers.rsp, reg_sp_save)
+
+    RETURN()
+
+#
+# AVX2 helpers.  Like the SSE2 equivalents, the scratch register is explicit,
+# and more helpers are used to increase readability for destructive operations.
+#
+# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
+#
+
+def ADD_avx2(dst, src):
+    VPADDD(dst, dst, src)
+
+def XOR_avx2(dst, src):
+    VPXOR(dst, dst, src)
+
+def ROTW16_avx2(tmp, d):
+    VPSLLD(tmp, d, 16)
+    VPSRLD(d, d, 16)
+    XOR_avx2(d, tmp)
+
+def ROTW12_avx2(tmp, b):
+    VPSLLD(tmp, b, 12)
+    VPSRLD(b, b, 20)
+    XOR_avx2(b, tmp)
+
+def ROTW8_avx2(tmp, d):
+    VPSLLD(tmp, d, 8)
+    VPSRLD(d, d, 24)
+    XOR_avx2(d, tmp)
+
+def ROTW7_avx2(tmp, b):
+    VPSLLD(tmp, b, 7)
+    VPSRLD(b, b, 25)
+    XOR_avx2(b, tmp)
+
+def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
+    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+    VPERM2I128(tmp, v0, v1, 0x20)
+    VPXOR(tmp, tmp, [inp+d])
+    VMOVDQU([outp+d], tmp)
+
+    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+    VPERM2I128(tmp, v2, v3, 0x20)
+    VPXOR(tmp, tmp, [inp+d+32])
+    VMOVDQU([outp+d+32], tmp)
+
+    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
+    VPERM2I128(tmp, v0, v1, 0x31)
+    VPXOR(tmp, tmp, [inp+d+64])
+    VMOVDQU([outp+d+64], tmp)
+
+    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
+    VPERM2I128(tmp, v2, v3, 0x31)
+    VPXOR(tmp, tmp, [inp+d+96])
+    VMOVDQU([outp+d+96], tmp)
+
+# AVX2 ChaCha20 (aka avx2).  Does not handle partial blocks, will process
+# 6/4/2/1 blocks at a time.  No particular alignment of in/out is assumed.
+with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
+    reg_x = GeneralPurposeRegister64()
+    reg_inp = GeneralPurposeRegister64()
+    reg_outp = GeneralPurposeRegister64()
+    reg_blocks = GeneralPurposeRegister64()
+    reg_sp_save = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(reg_x, x)
+    LOAD.ARGUMENT(reg_inp, inp)
+    LOAD.ARGUMENT(reg_outp, outp)
+    LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+    # Align the stack to a 32 byte boundary.
+    reg_align = GeneralPurposeRegister64()
+    MOV(reg_sp_save, registers.rsp)
+    MOV(reg_align, 0x1f)
+    NOT(reg_align)
+    AND(registers.rsp, reg_align)
+    SUB(registers.rsp, 0x20)
+
+    x_s0 = [reg_x]           # (Memory) Cipher state [0..3]
+    x_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
+    x_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
+    x_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
+
+    ymm_tmp0 = YMMRegister()
+    ymm_s1 = YMMRegister()
+    ymm_s2 = YMMRegister()
+    ymm_s3 = YMMRegister()
+
+    ymm_v0 = YMMRegister()
+    ymm_v1 = YMMRegister()
+    ymm_v2 = YMMRegister()
+    ymm_v3 = YMMRegister()
+
+    ymm_v4 = YMMRegister()
+    ymm_v5 = YMMRegister()
+    ymm_v6 = YMMRegister()
+    ymm_v7 = YMMRegister()
+
+    ymm_v8 = YMMRegister()
+    ymm_v9 = YMMRegister()
+    ymm_v10 = YMMRegister()
+    ymm_v11 = YMMRegister()
+
+#   VBROADCASTI128(ymm_s0, x_s0)  Don't have the register for this.
+    VBROADCASTI128(ymm_s1, x_s1)
+    VBROADCASTI128(ymm_s2, x_s2)
+    VBROADCASTI128(ymm_s3, x_s3)
+
+    # Allocate the necessary stack space for the counter vector and two ymm
+    # registers that we will spill.
+    SUB(registers.rsp, 32)
+    mem_inc = [registers.rsp]      # (Stack) Counter increment vector.
+
+    # Increment the counter for one side of the state vector (ymm3).
+    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
+    VMOVDQU(mem_inc, ymm_tmp0)
+    reg_tmp = GeneralPurposeRegister32()
+    MOV(reg_tmp, 0x00000001)
+    MOV([registers.rsp+16], reg_tmp)
+    VPADDQ(ymm_s3, ymm_s3, [registers.rsp])
+
+    # Since we process 2xN blocks at a time, the counter increment for both
+    # sides of the state vector is 2.
+    MOV(reg_tmp, 0x00000002)
+    MOV([registers.rsp], reg_tmp)
+    MOV([registers.rsp+16], reg_tmp)
+
+    #
+    # 6 blocks at a time, like the avx2 code.
+    #
+    # XXX/Performance: It's highly likely that it's worth going to 8x.
+    #
+
+    vector_loop6 = Loop()
+    SUB(reg_blocks, 6)
+    JB(vector_loop6.end)
+    with vector_loop6:
+        VBROADCASTI128(ymm_v0, x_s0)
+        VMOVDQA(ymm_v1, ymm_s1)
+        VMOVDQA(ymm_v2, ymm_s2)
+        VMOVDQA(ymm_v3, ymm_s3)
+
+        VMOVDQA(ymm_v4, ymm_v0)
+        VMOVDQA(ymm_v5, ymm_v1)
+        VMOVDQA(ymm_v6, ymm_v2)
+        VPADDQ(ymm_v7, ymm_v3, mem_inc)
+
+        VMOVDQA(ymm_v8, ymm_v0)
+        VMOVDQA(ymm_v9, ymm_v1)
+        VMOVDQA(ymm_v10, ymm_v2)
+        VPADDQ(ymm_v11, ymm_v7, mem_inc)
 
         reg_rounds = GeneralPurposeRegister64()
         MOV(reg_rounds, 20)
-        rounds_loop = Loop()
-        with rounds_loop:
+        rounds_loop6 = Loop()
+        with rounds_loop6:
             # a += b; d ^= a; d = ROTW16(d);
-            PADDD(xmm_v0, xmm_v1)
-            PXOR(xmm_v3, xmm_v0)
-            ROTW16_sse2(xmm_tmp, xmm_v3)
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
+            ROTW16_avx2(ymm_tmp0, ymm_v11)
 
             # c += d; b ^= c; b = ROTW12(b);
-            PADDD(xmm_v2, xmm_v3)
-            PXOR(xmm_v1, xmm_v2)
-            ROTW12_sse2(xmm_tmp, xmm_v1)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
+            ROTW12_avx2(ymm_tmp0, ymm_v9)
 
             # a += b; d ^= a; d = ROTW8(d);
-            PADDD(xmm_v0, xmm_v1)
-            PXOR(xmm_v3, xmm_v0)
-            ROTW8_sse2(xmm_tmp, xmm_v3)
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
+            ROTW8_avx2(ymm_tmp0, ymm_v11)
 
             # c += d; b ^= c; b = ROTW7(b)
-            PADDD(xmm_v2, xmm_v3)
-            PXOR(xmm_v1, xmm_v2)
-            ROTW7_sse2(xmm_tmp, xmm_v1)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+            ROTW7_avx2(ymm_tmp0, ymm_v9)
 
             # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            PSHUFD(xmm_v1, xmm_v1, 0x39)
-            PSHUFD(xmm_v2, xmm_v2, 0x4e)
-            PSHUFD(xmm_v3, xmm_v3, 0x93)
+            VPSHUFD(ymm_v1, ymm_v1, 0x39)
+            VPSHUFD(ymm_v5, ymm_v5, 0x39)
+            VPSHUFD(ymm_v9, ymm_v9, 0x39)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x93)
+            VPSHUFD(ymm_v7, ymm_v7, 0x93)
+            VPSHUFD(ymm_v11, ymm_v11, 0x93)
 
             # a += b; d ^= a; d = ROTW16(d);
-            PADDD(xmm_v0, xmm_v1)
-            PXOR(xmm_v3, xmm_v0)
-            ROTW16_sse2(xmm_tmp, xmm_v3)
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
+            ROTW16_avx2(ymm_tmp0, ymm_v11)
 
             # c += d; b ^= c; b = ROTW12(b);
-            PADDD(xmm_v2, xmm_v3)
-            PXOR(xmm_v1, xmm_v2)
-            ROTW12_sse2(xmm_tmp, xmm_v1)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
+            ROTW12_avx2(ymm_tmp0, ymm_v9)
 
             # a += b; d ^= a; d = ROTW8(d);
-            PADDD(xmm_v0, xmm_v1)
-            PXOR(xmm_v3, xmm_v0)
-            ROTW8_sse2(xmm_tmp, xmm_v3)
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
+            ROTW8_avx2(ymm_tmp0, ymm_v11)
 
             # c += d; b ^= c; b = ROTW7(b)
-            PADDD(xmm_v2, xmm_v3)
-            PXOR(xmm_v1, xmm_v2)
-            ROTW7_sse2(xmm_tmp, xmm_v1)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+            ROTW7_avx2(ymm_tmp0, ymm_v9)
 
             # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            PSHUFD(xmm_v1, xmm_v1, 0x93)
-            PSHUFD(xmm_v2, xmm_v2, 0x4e)
-            PSHUFD(xmm_v3, xmm_v3, 0x39)
+            VPSHUFD(ymm_v1, ymm_v1, 0x93)
+            VPSHUFD(ymm_v5, ymm_v5, 0x93)
+            VPSHUFD(ymm_v9, ymm_v9, 0x93)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x39)
+            VPSHUFD(ymm_v7, ymm_v7, 0x39)
+            VPSHUFD(ymm_v11, ymm_v11, 0x39)
 
             SUB(reg_rounds, 2)
-            JNZ(rounds_loop.begin)
+            JNZ(rounds_loop6.begin)
+
+        VBROADCASTI128(ymm_tmp0, x_s0)
+        ADD_avx2(ymm_v0, ymm_tmp0)
+        ADD_avx2(ymm_v4, ymm_tmp0)
+        ADD_avx2(ymm_v8, ymm_tmp0)
+
+        # ADD_avx2(ymm_v0, ymm_s0)
+        ADD_avx2(ymm_v1, ymm_s1)
+        ADD_avx2(ymm_v2, ymm_s2)
+        ADD_avx2(ymm_v3, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+        ADD_avx2(ymm_s3, mem_inc)
+
+        # ADD_avx2(ymm_v4, ymm_s0)
+        ADD_avx2(ymm_v5, ymm_s1)
+        ADD_avx2(ymm_v6, ymm_s2)
+        ADD_avx2(ymm_v7, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+        ADD_avx2(ymm_s3, mem_inc)
+
+        # ADD_avx2(ymm_v8, ymm_s0)
+        ADD_avx2(ymm_v9, ymm_s1)
+        ADD_avx2(ymm_v10, ymm_s2)
+        ADD_avx2(ymm_v11, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
+        ADD_avx2(ymm_s3, mem_inc)
+
+        ADD(reg_inp, 6 * 64)
+        ADD(reg_outp, 6 * 64)
+
+        SUB(reg_blocks, 6)
+        JAE(vector_loop6.begin)
+
+    ADD(reg_blocks, 6)
+
+    # We now actually can do everything in registers.
+    ymm_s0 = ymm_v8
+    VBROADCASTI128(ymm_s0, x_s0)
+    ymm_inc = ymm_v9
+    VMOVDQA(ymm_inc, mem_inc)
+    ymm_tmp0 = ymm_v10
+
+    #
+    # 4 blocks at a time.
+    #
 
-        PADDD(xmm_v0, xmm_s0)
-        PADDD(xmm_v1, xmm_s1)
-        PADDD(xmm_v2, xmm_s2)
-        PADDD(xmm_v3, xmm_s3)
-        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
-        PADDQ(xmm_s3, xmm_one)
+    SUB(reg_blocks, 4)
+    vector_loop4 = Loop()
+    JB(vector_loop4.end)
+    with vector_loop4:
+        VMOVDQA(ymm_v0, ymm_s0)
+        VMOVDQA(ymm_v1, ymm_s1)
+        VMOVDQA(ymm_v2, ymm_s2)
+        VMOVDQA(ymm_v3, ymm_s3)
 
-        ADD(reg_inp, 64)
-        ADD(reg_outp, 64)
+        VMOVDQA(ymm_v4, ymm_v0)
+        VMOVDQA(ymm_v5, ymm_v1)
+        VMOVDQA(ymm_v6, ymm_v2)
+        VPADDQ(ymm_v7, ymm_v3, ymm_inc)
 
-        SUB(reg_blocks, 1)
-        JNZ(serial_loop.begin)
+        reg_rounds = GeneralPurposeRegister64()
+        MOV(reg_rounds, 20)
+        rounds_loop4 = Loop()
+        with rounds_loop4:
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
 
-    # Write back the updated counter.  Stoping at 2^70 bytes is the user's
-    # problem, not mine.
-    MOVDQA(mem_s3, xmm_s3)
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
 
-    LABEL(out)
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
 
-    # Paranoia, cleanse the scratch space.
-    PXOR(xmm_v0, xmm_v0)
-    MOVDQA(mem_tmp0, xmm_v0)
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x39)
+            VPSHUFD(ymm_v5, ymm_v5, 0x39)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x93)
+            VPSHUFD(ymm_v7, ymm_v7, 0x93)
+
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x93)
+            VPSHUFD(ymm_v5, ymm_v5, 0x93)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x39)
+            VPSHUFD(ymm_v7, ymm_v7, 0x39)
+
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop4.begin)
+
+        ADD_avx2(ymm_v0, ymm_s0)
+        ADD_avx2(ymm_v1, ymm_s1)
+        ADD_avx2(ymm_v2, ymm_s2)
+        ADD_avx2(ymm_v3, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+        ADD_avx2(ymm_s3, ymm_inc)
+
+        ADD_avx2(ymm_v4, ymm_s0)
+        ADD_avx2(ymm_v5, ymm_s1)
+        ADD_avx2(ymm_v6, ymm_s2)
+        ADD_avx2(ymm_v7, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+        ADD_avx2(ymm_s3, ymm_inc)
+
+        ADD(reg_inp, 4 * 64)
+        ADD(reg_outp, 4 * 64)
+
+        SUB(reg_blocks, 4)
+        JAE(vector_loop4.begin)
+
+    ADD(reg_blocks, 4)
+
+    #
+    # 2 blocks at a time.
+    #
+
+    SUB(reg_blocks, 2)
+    vector_loop2 = Loop()
+    JB(vector_loop2.end)
+    with vector_loop2:
+        VMOVDQA(ymm_v0, ymm_s0)
+        VMOVDQA(ymm_v1, ymm_s1)
+        VMOVDQA(ymm_v2, ymm_s2)
+        VMOVDQA(ymm_v3, ymm_s3)
+
+        reg_rounds = GeneralPurposeRegister64()
+        MOV(reg_rounds, 20)
+        rounds_loop2 = Loop()
+        with rounds_loop2:
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x39)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x93)
+
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x93)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x39)
+
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop2.begin)
+
+        ADD_avx2(ymm_v0, ymm_s0)
+        ADD_avx2(ymm_v1, ymm_s1)
+        ADD_avx2(ymm_v2, ymm_s2)
+        ADD_avx2(ymm_v3, ymm_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+        ADD_avx2(ymm_s3, ymm_inc)
+
+        ADD(reg_inp, 2 * 64)
+        ADD(reg_outp, 2 * 64)
+
+        SUB(reg_blocks, 2)
+        JAE(vector_loop2.begin)
+
+    ADD(reg_blocks, 2)
+    VMOVDQA(x_s3, ymm_s3.as_xmm) # Write back the low lane of ymm_s3 to x_s3
+    SUB(reg_blocks, 1)
+    out_serial = Label()
+    JB(out_serial)
+
+    #
+    # 1 block at a time.  Only executed once, because if there were more than
+    # one block left, the parallel code above would have already processed them.
+    #
+
+    VMOVDQA(ymm_v0, ymm_s0)
+    VMOVDQA(ymm_v1, ymm_s1)
+    VMOVDQA(ymm_v2, ymm_s2)
+    VMOVDQA(ymm_v3, ymm_s3)
+
+    reg_rounds = GeneralPurposeRegister64()
+    MOV(reg_rounds, 20)
+    rounds_loop1 = Loop()
+    with rounds_loop1:
+        # a += b; d ^= a; d = ROTW16(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        XOR_avx2(ymm_v3, ymm_v0)
+        ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        ADD_avx2(ymm_v2, ymm_v3)
+        XOR_avx2(ymm_v1, ymm_v2)
+        ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        XOR_avx2(ymm_v3, ymm_v0)
+        ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        ADD_avx2(ymm_v2, ymm_v3)
+        XOR_avx2(ymm_v1, ymm_v2)
+        ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        VPSHUFD(ymm_v1, ymm_v1, 0x39)
+        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+        VPSHUFD(ymm_v3, ymm_v3, 0x93)
+
+        # a += b; d ^= a; d = ROTW16(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        XOR_avx2(ymm_v3, ymm_v0)
+        ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        ADD_avx2(ymm_v2, ymm_v3)
+        XOR_avx2(ymm_v1, ymm_v2)
+        ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        XOR_avx2(ymm_v3, ymm_v0)
+        ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        ADD_avx2(ymm_v2, ymm_v3)
+        XOR_avx2(ymm_v1, ymm_v2)
+        ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        VPSHUFD(ymm_v1, ymm_v1, 0x93)
+        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+        VPSHUFD(ymm_v3, ymm_v3, 0x39)
+
+        SUB(reg_rounds, 2)
+        JNZ(rounds_loop1.begin)
+
+    ADD_avx2(ymm_v0, ymm_s0)
+    ADD_avx2(ymm_v1, ymm_s1)
+    ADD_avx2(ymm_v2, ymm_s2)
+    ADD_avx2(ymm_v3, ymm_s3)
+
+    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+    VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
+    VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
+    VMOVDQU([reg_outp], ymm_tmp0)
+
+    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+    VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
+    VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
+    VMOVDQU([reg_outp+32], ymm_tmp0)
+
+    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01)
+    VMOVDQA(x_s3, ymm_s3.as_xmm) # Write back the low lane of ymm_s3 to x_s3
+
+    LABEL(out_serial)
+
+    # Remove our stack allocation.
+    MOV(registers.rsp, reg_sp_save)
+
+    RETURN()
+
+#
+# CPUID
+#
+
+cpuidParams = Argument(ptr(uint32_t))
+
+with Function("cpuidAmd64", (cpuidParams,)):
+    reg_params = registers.r15
+    LOAD.ARGUMENT(reg_params, cpuidParams)
+
+    MOV(registers.eax, [reg_params])
+    MOV(registers.ecx, [reg_params+4])
+
+    CPUID()
+
+    MOV([reg_params], registers.eax)
+    MOV([reg_params+4], registers.ebx)
+    MOV([reg_params+8], registers.ecx)
+    MOV([reg_params+12], registers.edx)
+
+    RETURN()
+
+#
+# XGETBV
+#
+
+xcrIndex = Argument(uint32_t)
+xcrVec = Argument(ptr(uint32_t))
+
+with Function("xgetbvAmd64", (xcrIndex, xcrVec)):
+    reg_vec = GeneralPurposeRegister64()
+    reg_idx = registers.ecx
+
+    LOAD.ARGUMENT(reg_idx, xcrIndex)
+    LOAD.ARGUMENT(reg_vec, xcrVec)
+
+    XGETBV()
 
-    ADD(registers.rsp, 16+16)
-    ADD(registers.rsp, reg_align)
+    MOV([reg_vec], registers.eax)
+    MOV([reg_vec+4], registers.edx)
 
     RETURN()
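
All of the round loops generated above, SSE2 and AVX2 alike, implement the quarter-round pattern from the "a += b; d ^= a; ..." comments on whole state rows at once, with the PSHUFL/VPSHUFD lane rotations (0x39/0x4e/0x93) standing in for the diagonal indexing; in the AVX2 path each ymm additionally carries the same row of two consecutive blocks. For reference, a scalar Go sketch of the double round that one pass of the "SUB(reg_rounds, 2)" loops corresponds to (a reference sketch, not code from this commit; the package name is assumed):

package chacha20

import "math/bits"

// quarterRound is the scalar ChaCha quarter round that the SIMD loops apply
// to four (or, with ymm registers, eight) lanes in parallel.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
	return a, b, c, d
}

// doubleRound is one trip through the generated rounds loops: a column round
// followed by a diagonal round.  The vector code reaches the diagonals by
// rotating the b/c/d rows with PSHUFD 0x39/0x4e/0x93 and rotating them back
// afterwards, rather than re-indexing the state.
func doubleRound(x *[16]uint32) {
	// Column round.
	x[0], x[4], x[8], x[12] = quarterRound(x[0], x[4], x[8], x[12])
	x[1], x[5], x[9], x[13] = quarterRound(x[1], x[5], x[9], x[13])
	x[2], x[6], x[10], x[14] = quarterRound(x[2], x[6], x[10], x[14])
	x[3], x[7], x[11], x[15] = quarterRound(x[3], x[7], x[11], x[15])
	// Diagonal round.
	x[0], x[5], x[10], x[15] = quarterRound(x[0], x[5], x[10], x[15])
	x[1], x[6], x[11], x[12] = quarterRound(x[1], x[6], x[11], x[12])
	x[2], x[7], x[8], x[13] = quarterRound(x[2], x[7], x[8], x[13])
	x[3], x[4], x[9], x[14] = quarterRound(x[3], x[4], x[9], x[14])
}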

+ 660 - 106
chacha20_amd64.s

@@ -8,17 +8,15 @@ TEXT ·blocksAmd64SSE2(SB),4,$0-32
 	MOVQ outp+16(FP), CX
 	MOVQ nrBlocks+24(FP), DX
 	MOVQ SP, DI
-	ANDQ $15, DI
-	MOVQ $16, SI
-	SUBQ DI, SI
-	SUBQ SI, SP
+	MOVQ $31, SI
+	NOTQ SI
+	ANDQ SI, SP
 	SUBQ $32, SP
-	MOVL $1, DI
-	MOVL DI, 0(SP)
-	MOVL $0, DI
-	MOVL DI, 4(SP)
-	MOVL DI, 8(SP)
-	MOVL DI, 12(SP)
+	PXOR X0, X0
+	SUBQ $32, SP
+	MOVO X0, 0(SP)
+	MOVL $1, SI
+	MOVL SI, 0(SP)
 	SUBQ $4, DX
 	JCS vector_loop4_end
 vector_loop4_begin:
@@ -41,8 +39,8 @@ vector_loop4_begin:
 		MOVO X2, X14
 		MOVO X11, X15
 		PADDQ 0(SP), X15
-		MOVQ $20, DI
-rounds_loop1_begin:
+		MOVQ $20, SI
+rounds_loop4_begin:
 			PADDL X1, X0
 			PADDL X5, X4
 			PADDL X9, X8
@@ -267,8 +265,8 @@ rounds_loop1_begin:
 			PSHUFL $57, X11, X11
 			PSHUFL $57, X15, X15
 			MOVO 16(SP), X12
-			SUBQ $2, DI
-			JNE rounds_loop1_begin
+			SUBQ $2, SI
+			JNE rounds_loop4_begin
 		MOVO X12, 16(SP)
 		PADDL 0(AX), X0
 		PADDL 16(AX), X1
@@ -365,8 +363,8 @@ vector_loop2_begin:
 		MOVO X2, X6
 		MOVO X3, X7
 		PADDQ X13, X7
-		MOVQ $20, DI
-rounds_loop0_begin:
+		MOVQ $20, SI
+rounds_loop2_begin:
 			PADDL X1, X0
 			PADDL X5, X4
 			PXOR X0, X3
@@ -475,8 +473,8 @@ rounds_loop0_begin:
 			PSHUFL $78, X6, X6
 			PSHUFL $57, X3, X3
 			PSHUFL $57, X7, X7
-			SUBQ $2, DI
-			JNE rounds_loop0_begin
+			SUBQ $2, SI
+			JNE rounds_loop2_begin
 		PADDL X8, X0
 		PADDL X9, X1
 		PADDL X10, X2
@@ -517,96 +515,652 @@ rounds_loop0_begin:
 		JCC vector_loop2_begin
 vector_loop2_end:
 	ADDQ $2, DX
-	JEQ serial_loop_end
-serial_loop_begin:
-		MOVO X8, X0
-		MOVO X9, X1
-		MOVO X10, X2
-		MOVO X11, X3
-		MOVQ $20, DI
-rounds_loop2_begin:
-			PADDL X1, X0
-			PXOR X0, X3
-			MOVO X3, X12
-			PSLLL $16, X12
-			PSRLL $16, X3
-			PXOR X12, X3
-			PADDL X3, X2
-			PXOR X2, X1
-			MOVO X1, X12
-			PSLLL $12, X12
-			PSRLL $20, X1
-			PXOR X12, X1
-			PADDL X1, X0
-			PXOR X0, X3
-			MOVO X3, X12
-			PSLLL $8, X12
-			PSRLL $24, X3
-			PXOR X12, X3
-			PADDL X3, X2
-			PXOR X2, X1
-			MOVO X1, X12
-			PSLLL $7, X12
-			PSRLL $25, X1
-			PXOR X12, X1
-			PSHUFL $57, X1, X1
-			PSHUFL $78, X2, X2
-			PSHUFL $147, X3, X3
-			PADDL X1, X0
-			PXOR X0, X3
-			MOVO X3, X12
-			PSLLL $16, X12
-			PSRLL $16, X3
-			PXOR X12, X3
-			PADDL X3, X2
-			PXOR X2, X1
-			MOVO X1, X12
-			PSLLL $12, X12
-			PSRLL $20, X1
-			PXOR X12, X1
-			PADDL X1, X0
-			PXOR X0, X3
-			MOVO X3, X12
-			PSLLL $8, X12
-			PSRLL $24, X3
-			PXOR X12, X3
-			PADDL X3, X2
-			PXOR X2, X1
-			MOVO X1, X12
-			PSLLL $7, X12
-			PSRLL $25, X1
-			PXOR X12, X1
-			PSHUFL $147, X1, X1
-			PSHUFL $78, X2, X2
-			PSHUFL $57, X3, X3
-			SUBQ $2, DI
-			JNE rounds_loop2_begin
-		PADDL X8, X0
-		PADDL X9, X1
-		PADDL X10, X2
-		PADDL X11, X3
-		MOVOU 0(BX), X12
-		PXOR X0, X12
-		MOVOU X12, 0(CX)
-		MOVOU 16(BX), X12
-		PXOR X1, X12
-		MOVOU X12, 16(CX)
-		MOVOU 32(BX), X12
-		PXOR X2, X12
-		MOVOU X12, 32(CX)
-		MOVOU 48(BX), X12
-		PXOR X3, X12
-		MOVOU X12, 48(CX)
-		PADDQ X13, X11
-		ADDQ $64, BX
-		ADDQ $64, CX
-		SUBQ $1, DX
-		JNE serial_loop_begin
-serial_loop_end:
+	JEQ out_serial
+	MOVO X8, X0
+	MOVO X9, X1
+	MOVO X10, X2
+	MOVO X11, X3
+	MOVQ $20, DX
+rounds_loop1_begin:
+		PADDL X1, X0
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		PADDL X1, X0
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		PSHUFL $57, X1, X1
+		PSHUFL $78, X2, X2
+		PSHUFL $147, X3, X3
+		PADDL X1, X0
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		PADDL X1, X0
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		PSHUFL $147, X1, X1
+		PSHUFL $78, X2, X2
+		PSHUFL $57, X3, X3
+		SUBQ $2, DX
+		JNE rounds_loop1_begin
+	PADDL X8, X0
+	PADDL X9, X1
+	PADDL X10, X2
+	PADDL X11, X3
+	MOVOU 0(BX), X12
+	PXOR X0, X12
+	MOVOU X12, 0(CX)
+	MOVOU 16(BX), X12
+	PXOR X1, X12
+	MOVOU X12, 16(CX)
+	MOVOU 32(BX), X12
+	PXOR X2, X12
+	MOVOU X12, 32(CX)
+	MOVOU 48(BX), X12
+	PXOR X3, X12
+	MOVOU X12, 48(CX)
+	PADDQ X13, X11
+out_serial:
 	MOVO X11, 48(AX)
 out:
 	PXOR X0, X0
 	MOVO X0, 16(SP)
-	ADDQ $32, SP
-	ADDQ SI, SP
+	MOVQ DI, SP
+	RET
+
+// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks uint)
+TEXT ·blocksAmd64AVX2(SB),4,$0-32
+	MOVQ x+0(FP), AX
+	MOVQ inp+8(FP), BX
+	MOVQ outp+16(FP), CX
+	MOVQ nrBlocks+24(FP), DX
+	MOVQ SP, DI
+	MOVQ $31, SI
+	NOTQ SI
+	ANDQ SI, SP
+	SUBQ $32, SP
+	BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x10 // VBROADCASTI128 ymm1, [rax + 16]
+	BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x20 // VBROADCASTI128 ymm2, [rax + 32]
+	BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x30 // VBROADCASTI128 ymm3, [rax + 48]
+	SUBQ $32, SP
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0
+	MOVL $1, SI
+	MOVL SI, 16(SP)
+	BYTE $0xC5; BYTE $0xE5; BYTE $0xD4; BYTE $0x1C; BYTE $0x24 // VPADDQ ymm3, ymm3, [rsp]
+	MOVL $2, SI
+	MOVL SI, 0(SP)
+	MOVL SI, 16(SP)
+	SUBQ $6, DX
+	JCS vector_loop6_end
+vector_loop6_begin:
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x20 // VBROADCASTI128 ymm4, [rax]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xE9 // VMOVDQA ymm5, ymm1
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF2 // VMOVDQA ymm6, ymm2
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFB // VMOVDQA ymm7, ymm3
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xC4 // VMOVDQA ymm8, ymm4
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCD // VMOVDQA ymm9, ymm5
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD6 // VMOVDQA ymm10, ymm6
+		BYTE $0xC5; BYTE $0x45; BYTE $0xD4; BYTE $0x1C; BYTE $0x24 // VPADDQ ymm11, ymm7, [rsp]
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE4 // VMOVDQA ymm12, ymm4
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xED // VMOVDQA ymm13, ymm5
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF6 // VMOVDQA ymm14, ymm6
+		BYTE $0xC5; BYTE $0x25; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm11, [rsp]
+		MOVQ $20, SI
+rounds_loop6_begin:
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm12, ymm12, ymm13
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x05; BYTE $0xEF; BYTE $0xFC // VPXOR ymm15, ymm15, ymm12
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm7, ymm7, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF3; BYTE $0x10 // VPSLLD ymm0, ymm11, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x10 // VPSRLD ymm11, ymm11, 16
+			BYTE $0xC5; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm5, ymm5, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF1; BYTE $0x0C // VPSLLD ymm0, ymm9, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x14 // VPSRLD ymm9, ymm9, 20
+			BYTE $0xC5; BYTE $0x35; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm9, ymm9, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm12, ymm12, ymm13
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x05; BYTE $0xEF; BYTE $0xFC // VPXOR ymm15, ymm15, ymm12
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm7, ymm7, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF3; BYTE $0x08 // VPSLLD ymm0, ymm11, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x18 // VPSRLD ymm11, ymm11, 24
+			BYTE $0xC5; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm5, ymm5, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF1; BYTE $0x07 // VPSLLD ymm0, ymm9, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x19 // VPSRLD ymm9, ymm9, 25
+			BYTE $0xC5; BYTE $0x35; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm9, ymm9, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm5, ymm5, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm9, ymm9, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x4E // VPSHUFD ymm10, ymm10, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm7, ymm7, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm11, ymm11, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm12, ymm12, ymm13
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x05; BYTE $0xEF; BYTE $0xFC // VPXOR ymm15, ymm15, ymm12
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm7, ymm7, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF3; BYTE $0x10 // VPSLLD ymm0, ymm11, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x10 // VPSRLD ymm11, ymm11, 16
+			BYTE $0xC5; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm5, ymm5, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF1; BYTE $0x0C // VPSLLD ymm0, ymm9, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x14 // VPSRLD ymm9, ymm9, 20
+			BYTE $0xC5; BYTE $0x35; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm9, ymm9, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm12, ymm12, ymm13
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x05; BYTE $0xEF; BYTE $0xFC // VPXOR ymm15, ymm15, ymm12
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm7, ymm7, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF3; BYTE $0x08 // VPSLLD ymm0, ymm11, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x18 // VPSRLD ymm11, ymm11, 24
+			BYTE $0xC5; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm5, ymm5, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF1; BYTE $0x07 // VPSLLD ymm0, ymm9, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x19 // VPSRLD ymm9, ymm9, 25
+			BYTE $0xC5; BYTE $0x35; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm9, ymm9, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm5, ymm5, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm9, ymm9, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x4E // VPSHUFD ymm10, ymm10, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm7, ymm7, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm11, ymm11, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57
+			SUBQ $2, SI
+			JNE rounds_loop6_begin
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax]
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC0 // VPADDD ymm8, ymm8, ymm0
+		BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm12, ymm12, ymm0
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm1
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm2
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm3
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC5; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm5, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x4D; BYTE $0x46; BYTE $0xC7; BYTE $0x20 // VPERM2I128 ymm0, ymm6, ymm7, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC5; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm5, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x4D; BYTE $0x46; BYTE $0xC7; BYTE $0x31 // VPERM2I128 ymm0, ymm6, ymm7, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0x1C; BYTE $0x24 // VPADDD ymm3, ymm3, [rsp]
+		BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC9 // VPADDD ymm9, ymm9, ymm1
+		BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD2 // VPADDD ymm10, ymm10, ymm2
+		BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xDB // VPADDD ymm11, ymm11, ymm3
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x3D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm8, ymm9, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x2D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm10, ymm11, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x3D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm8, ymm9, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x2D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm10, ymm11, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0x1C; BYTE $0x24 // VPADDD ymm3, ymm3, [rsp]
+		BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm13, ymm13, ymm1
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm14, ymm14, ymm2
+		BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xFB // VPADDD ymm15, ymm15, ymm3
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x1D; BYTE $0x46; BYTE $0xC5; BYTE $0x20 // VPERM2I128 ymm0, ymm12, ymm13, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xC7; BYTE $0x20 // VPERM2I128 ymm0, ymm14, ymm15, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x1D; BYTE $0x46; BYTE $0xC5; BYTE $0x31 // VPERM2I128 ymm0, ymm12, ymm13, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xC7; BYTE $0x31 // VPERM2I128 ymm0, ymm14, ymm15, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0x1C; BYTE $0x24 // VPADDD ymm3, ymm3, [rsp]
+		ADDQ $384, BX
+		ADDQ $384, CX
+		SUBQ $6, DX
+		JCC vector_loop6_begin
+vector_loop6_end:
+	ADDQ $6, DX
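+	// Fewer than 6 blocks remain.  Reload ymm12 (a 128 bit broadcast of
+	// the first 16 bytes of the state, i.e. the ChaCha constants) and
+	// ymm13 (the block counter increment kept at [rsp]); both were used
+	// as scratch by the 6 block loop.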
+	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x20 // VBROADCASTI128 ymm12, [rax]
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x2C; BYTE $0x24 // VMOVDQA ymm13, [rsp]
+	SUBQ $4, DX
+	JCS vector_loop4_end
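+	// Four blocks per iteration: two interleaved two block wide states,
+	// ymm4..ymm7 holding rows 0..3 of blocks n and n+1 (one block per
+	// 128 bit lane) and ymm8..ymm11 the same rows of blocks n+2 and n+3.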
+vector_loop4_begin:
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE4 // VMOVDQA ymm4, ymm12
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xE9 // VMOVDQA ymm5, ymm1
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF2 // VMOVDQA ymm6, ymm2
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFB // VMOVDQA ymm7, ymm3
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xC4 // VMOVDQA ymm8, ymm4
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCD // VMOVDQA ymm9, ymm5
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD6 // VMOVDQA ymm10, ymm6
+		BYTE $0xC4; BYTE $0x41; BYTE $0x45; BYTE $0xD4; BYTE $0xDD // VPADDQ ymm11, ymm7, ymm13
+		MOVQ $20, SI
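+		// 20 rounds, two (a column round and a diagonal round) per
+		// iteration; the VPSHUFDs rotate rows 1..3 of each block into
+		// and back out of diagonal form.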
+rounds_loop4_begin:
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF3; BYTE $0x10 // VPSLLD ymm14, ymm11, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x10 // VPSRLD ymm11, ymm11, 16
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xDE // VPXOR ymm11, ymm11, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF1; BYTE $0x0C // VPSLLD ymm14, ymm9, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x14 // VPSRLD ymm9, ymm9, 20
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCE // VPXOR ymm9, ymm9, ymm14
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF3; BYTE $0x08 // VPSLLD ymm14, ymm11, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x18 // VPSRLD ymm11, ymm11, 24
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xDE // VPXOR ymm11, ymm11, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF1; BYTE $0x07 // VPSLLD ymm14, ymm9, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x19 // VPSRLD ymm9, ymm9, 25
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCE // VPXOR ymm9, ymm9, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm5, ymm5, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm9, ymm9, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x4E // VPSHUFD ymm10, ymm10, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm7, ymm7, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm11, ymm11, 147
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF3; BYTE $0x10 // VPSLLD ymm14, ymm11, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x10 // VPSRLD ymm11, ymm11, 16
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xDE // VPXOR ymm11, ymm11, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF1; BYTE $0x0C // VPSLLD ymm14, ymm9, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x14 // VPSRLD ymm9, ymm9, 20
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCE // VPXOR ymm9, ymm9, ymm14
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm9
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm11, ymm11, ymm8
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF3; BYTE $0x08 // VPSLLD ymm14, ymm11, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x25; BYTE $0x72; BYTE $0xD3; BYTE $0x18 // VPSRLD ymm11, ymm11, 24
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xEF; BYTE $0xDE // VPXOR ymm11, ymm11, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm10, ymm10, ymm11
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCA // VPXOR ymm9, ymm9, ymm10
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x0D; BYTE $0x72; BYTE $0xF1; BYTE $0x07 // VPSLLD ymm14, ymm9, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x35; BYTE $0x72; BYTE $0xD1; BYTE $0x19 // VPSRLD ymm9, ymm9, 25
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xEF; BYTE $0xCE // VPXOR ymm9, ymm9, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm5, ymm5, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm9, ymm9, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x4E // VPSHUFD ymm10, ymm10, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm7, ymm7, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm11, ymm11, 57
+			SUBQ $2, SI
+			JNE rounds_loop4_begin
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE4 // VPADDD ymm4, ymm4, ymm12
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm1
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm2
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm3
+		BYTE $0xC4; BYTE $0x63; BYTE $0x5D; BYTE $0x46; BYTE $0xF5; BYTE $0x20 // VPERM2I128 ymm14, ymm4, ymm5, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x33 // VPXOR ymm14, ymm14, [rbx]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x31 // VMOVDQU [rcx], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x4D; BYTE $0x46; BYTE $0xF7; BYTE $0x20 // VPERM2I128 ymm14, ymm6, ymm7, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x20 // VPXOR ymm14, ymm14, [rbx + 32]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x20 // VMOVDQU [rcx + 32], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x5D; BYTE $0x46; BYTE $0xF5; BYTE $0x31 // VPERM2I128 ymm14, ymm4, ymm5, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x40 // VPXOR ymm14, ymm14, [rbx + 64]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x40 // VMOVDQU [rcx + 64], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x4D; BYTE $0x46; BYTE $0xF7; BYTE $0x31 // VPERM2I128 ymm14, ymm6, ymm7, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x60 // VPXOR ymm14, ymm14, [rbx + 96]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x60 // VMOVDQU [rcx + 96], ymm14
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDD // VPADDD ymm3, ymm3, ymm13
+		BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12
+		BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC9 // VPADDD ymm9, ymm9, ymm1
+		BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD2 // VPADDD ymm10, ymm10, ymm2
+		BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xDB // VPADDD ymm11, ymm11, ymm3
+		BYTE $0xC4; BYTE $0x43; BYTE $0x3D; BYTE $0x46; BYTE $0xF1; BYTE $0x20 // VPERM2I128 ymm14, ymm8, ymm9, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0xB3; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm14, ymm14, [rbx + 128]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0xB1; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm14
+		BYTE $0xC4; BYTE $0x43; BYTE $0x2D; BYTE $0x46; BYTE $0xF3; BYTE $0x20 // VPERM2I128 ymm14, ymm10, ymm11, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0xB3; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm14, ymm14, [rbx + 160]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0xB1; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm14
+		BYTE $0xC4; BYTE $0x43; BYTE $0x3D; BYTE $0x46; BYTE $0xF1; BYTE $0x31 // VPERM2I128 ymm14, ymm8, ymm9, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0xB3; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm14, ymm14, [rbx + 192]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0xB1; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm14
+		BYTE $0xC4; BYTE $0x43; BYTE $0x2D; BYTE $0x46; BYTE $0xF3; BYTE $0x31 // VPERM2I128 ymm14, ymm10, ymm11, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0xB3; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm14, ymm14, [rbx + 224]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0xB1; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm14
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDD // VPADDD ymm3, ymm3, ymm13
+		ADDQ $256, BX
+		ADDQ $256, CX
+		SUBQ $4, DX
+		JCC vector_loop4_begin
+vector_loop4_end:
+	ADDQ $4, DX
+	SUBQ $2, DX
+	JCS vector_loop2_end
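+	// Two blocks per iteration: a single two block wide state in
+	// ymm4..ymm7, one block per 128 bit lane.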
+vector_loop2_begin:
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE4 // VMOVDQA ymm4, ymm12
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xE9 // VMOVDQA ymm5, ymm1
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF2 // VMOVDQA ymm6, ymm2
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFB // VMOVDQA ymm7, ymm3
+		MOVQ $20, SI
+rounds_loop2_begin:
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm5, ymm5, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm7, ymm7, 147
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+			BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+			BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+			BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+			BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm5, ymm5, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm7, ymm7, 57
+			SUBQ $2, SI
+			JNE rounds_loop2_begin
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE4 // VPADDD ymm4, ymm4, ymm12
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm1
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm2
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm3
+		BYTE $0xC4; BYTE $0x63; BYTE $0x5D; BYTE $0x46; BYTE $0xF5; BYTE $0x20 // VPERM2I128 ymm14, ymm4, ymm5, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x33 // VPXOR ymm14, ymm14, [rbx]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x31 // VMOVDQU [rcx], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x4D; BYTE $0x46; BYTE $0xF7; BYTE $0x20 // VPERM2I128 ymm14, ymm6, ymm7, 32
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x20 // VPXOR ymm14, ymm14, [rbx + 32]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x20 // VMOVDQU [rcx + 32], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x5D; BYTE $0x46; BYTE $0xF5; BYTE $0x31 // VPERM2I128 ymm14, ymm4, ymm5, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x40 // VPXOR ymm14, ymm14, [rbx + 64]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x40 // VMOVDQU [rcx + 64], ymm14
+		BYTE $0xC4; BYTE $0x63; BYTE $0x4D; BYTE $0x46; BYTE $0xF7; BYTE $0x31 // VPERM2I128 ymm14, ymm6, ymm7, 49
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x60 // VPXOR ymm14, ymm14, [rbx + 96]
+		BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x60 // VMOVDQU [rcx + 96], ymm14
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDD // VPADDD ymm3, ymm3, ymm13
+		ADDQ $128, BX
+		ADDQ $128, CX
+		SUBQ $2, DX
+		JCC vector_loop2_begin
+vector_loop2_end:
+	ADDQ $2, DX
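+	// Save the updated counter/nonce row back into the state in case no
+	// odd block remains.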
+	BYTE $0xC5; BYTE $0xF9; BYTE $0x7F; BYTE $0x58; BYTE $0x30 // VMOVDQA [rax + 48], xmm3
+	SUBQ $1, DX
+	JCS out_serial
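+	// One final block: run a two block wide state anyway and keep only
+	// the low lane of the output; the high lane's keystream is discarded.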
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE4 // VMOVDQA ymm4, ymm12
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xE9 // VMOVDQA ymm5, ymm1
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF2 // VMOVDQA ymm6, ymm2
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFB // VMOVDQA ymm7, ymm3
+	MOVQ $20, DX
+rounds_loop1_begin:
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+		BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+		BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+		BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm5, ymm5, 57
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm7, ymm7, 147
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm14, ymm7, 16
+		BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm7, ymm7, 16
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm14, ymm5, 12
+		BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm5, ymm5, 20
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE5 // VPADDD ymm4, ymm4, ymm5
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xEF; BYTE $0xFC // VPXOR ymm7, ymm7, ymm4
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm14, ymm7, 8
+		BYTE $0xC5; BYTE $0xC5; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm7, ymm7, 24
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xEF; BYTE $0xFE // VPXOR ymm7, ymm7, ymm14
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0x8D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm14, ymm5, 7
+		BYTE $0xC5; BYTE $0xD5; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm5, ymm5, 25
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xEF; BYTE $0xEE // VPXOR ymm5, ymm5, ymm14
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm5, ymm5, 147
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm6, ymm6, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm7, ymm7, 57
+		SUBQ $2, DX
+		JNE rounds_loop1_begin
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE4 // VPADDD ymm4, ymm4, ymm12
+	BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm1
+	BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm2
+	BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm3
+	BYTE $0xC4; BYTE $0x63; BYTE $0x5D; BYTE $0x46; BYTE $0xF5; BYTE $0x20 // VPERM2I128 ymm14, ymm4, ymm5, 32
+	BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x33 // VPXOR ymm14, ymm14, [rbx]
+	BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x31 // VMOVDQU [rcx], ymm14
+	BYTE $0xC4; BYTE $0x63; BYTE $0x4D; BYTE $0x46; BYTE $0xF7; BYTE $0x20 // VPERM2I128 ymm14, ymm6, ymm7, 32
+	BYTE $0xC5; BYTE $0x0D; BYTE $0xEF; BYTE $0x73; BYTE $0x20 // VPXOR ymm14, ymm14, [rbx + 32]
+	BYTE $0xC5; BYTE $0x7E; BYTE $0x7F; BYTE $0x71; BYTE $0x20 // VMOVDQU [rcx + 32], ymm14
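+	// Only the low lane block was emitted, so swap ymm3's lanes to bring
+	// the next (unused) counter value into the low lane before saving it.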
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x65; BYTE $0x46; BYTE $0xDB; BYTE $0x01 // VPERM2I128 ymm3, ymm3, ymm3, 1
+	BYTE $0xC5; BYTE $0xF9; BYTE $0x7F; BYTE $0x58; BYTE $0x30 // VMOVDQA [rax + 48], xmm3
+out_serial:
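+	// Restore the original stack pointer (kept in DI) and clear the upper
+	// ymm state to avoid AVX/SSE transition penalties.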
+	MOVQ DI, SP
+	BYTE $0xC5; BYTE $0xF8; BYTE $0x77 // VZEROUPPER
+	RET
+
+// func cpuidAmd64(cpuidParams *uint32)
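+// Executes CPUID with EAX and ECX loaded from cpuidParams[0] and
+// cpuidParams[1], then stores EAX, EBX, ECX and EDX back into
+// cpuidParams[0..3].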
+TEXT ·cpuidAmd64(SB),4,$0-8
+	MOVQ cpuidParams+0(FP), R15
+	MOVL 0(R15), AX
+	MOVL 4(R15), CX
+	CPUID
+	MOVL AX, 0(R15)
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbvAmd64(xcrIndex uint32, xcrVec *uint32)
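+// Executes XGETBV for the XCR selected by xcrIndex and stores the
+// resulting EDX:EAX pair into xcrVec[0..1] (EAX first).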
+TEXT ·xgetbvAmd64(SB),4,$0-12
+	MOVL xcrIndex+0(FP), CX
+	MOVQ xcrVec+8(FP), BX
+	BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV
+	MOVL AX, 0(BX)
+	MOVL DX, 4(BX)
 	RET

+ 70 - 7
chacha20_test.go

@@ -337,7 +337,7 @@ func TestChaCha20(t *testing.T) {
 
 func TestChaCha20Vectorized(t *testing.T) {
 	if !usingVectors {
-		t.Skip("vectorized ChaCha20 support not compiled in")
+		t.Skip("vectorized ChaCha20 support not enabled")
 	}
 
 	// Save the batch blocks processing routine so we can mess with it, and
@@ -373,15 +373,15 @@ func TestChaCha20Vectorized(t *testing.T) {
 		}
 		blocksFn = blocksRef
 		c.XORKeyStream(refOut[:], input[:i])
-		if !bytes.Equal(refOut[:], vecOut[:]) {
-			for i, v := range refOut {
-				if vecOut[i] != v {
-					t.Errorf("mismatch at offset: %d %x != %x", i, vecOut[i], v)
+		if !bytes.Equal(refOut[:i], vecOut[:i]) {
+			for j, v := range refOut {
+				if vecOut[j] != v {
+					t.Errorf("[%d] mismatch at offset: %d %x != %x", i, j, vecOut[j], v)
 					break
 				}
 			}
-			t.Errorf("ref: %s", hex.Dump(refOut[:]))
-			t.Errorf("vec: %s", hex.Dump(vecOut[:]))
+			t.Errorf("ref: %s", hex.Dump(refOut[:i]))
+			t.Errorf("vec: %s", hex.Dump(vecOut[:i]))
 			t.Errorf("refOut != vecOut")
 			break
 		}
@@ -389,6 +389,69 @@ func TestChaCha20Vectorized(t *testing.T) {
 	}
 }
 
+func TestChaCha20VectorizedIncremental(t *testing.T) {
+	if !usingVectors {
+		t.Skip("vectorized ChaCha20 support not enabled")
+	}
+
+	// Save the batch blocks processing routine so we can mess with it, and
+	// restore it when we're done.
+	oldBlocksFn := blocksFn
+	defer func() {
+		blocksFn = oldBlocksFn
+	}()
+
+	const (
+		maxBlocks = 128
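+		// Encrypting 0, 1, ..., maxBlocks blocks in successive calls
+		// processes maxBlocks*(maxBlocks+1)/2 blocks in total.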
+		testSz    = (maxBlocks * (maxBlocks + 1) / 2) * BlockSize
+	)
+
+	// Generate a random key, nonce and input.
+	var key [KeySize]byte
+	var nonce [NonceSize]byte
+	var input [testSz]byte
+	var vecOut [testSz]byte
+	var refOut [testSz]byte
+	rand.Read(key[:])
+	rand.Read(nonce[:])
+	rand.Read(input[:])
+
+	// Using the vectorized version, encrypt an ever-increasing number of
+	// blocks at a time.
+	c, err := NewCipher(key[:], nonce[:])
+	if err != nil {
+		t.Fatal(err)
+	}
+	off := 0
+	for nrBlocks := 0; nrBlocks <= maxBlocks; nrBlocks++ {
+		cnt := nrBlocks * BlockSize
+		c.XORKeyStream(vecOut[off:off+cnt], input[off:off+cnt])
+		off += cnt
+	}
+
+	// Encrypt an equivalent amount of data with a one-shot call to the
+	// reference implementation.
+	c, err = NewCipher(key[:], nonce[:])
+	if err != nil {
+		t.Fatal(err)
+	}
+	blocksFn = blocksRef
+	c.XORKeyStream(refOut[:], input[:])
+
+	// And compare the output.
+	if !bytes.Equal(refOut[:], vecOut[:]) {
+		for j, v := range refOut {
+			if vecOut[j] != v {
+				t.Errorf("incremental mismatch at offset: %d %x != %x", j, vecOut[j], v)
+				break
+			}
+		}
+		// t.Errorf("ref: %s", hex.Dump(refOut[:]))
+		// t.Errorf("vec: %s", hex.Dump(vecOut[:]))
+		t.Errorf("refOut != vecOut")
+	}
+}
+
 func doBenchN(b *testing.B, n int) {
 	var key [KeySize]byte
 	var nonce [NonceSize]byte