
Get rid of the stupid sigma parameter to the AMD64 SSE2 code.

Yawning Angel committed 3 years ago
commit a6e78418d9
5 changed files with 155 additions and 152 deletions
  1. chacha20.go        +24 -20
  2. chacha20_amd64.go   +2 -7
  3. chacha20_amd64.py  +11 -13
  4. chacha20_amd64.s   +62 -63
  5. chacha20_ref.go    +56 -49
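
For orientation before the per-file diffs: the commit folds the sigma constants into the state array itself, so the assembly needs only the single x pointer. A minimal sketch of the resulting 16-word layout (the helper name and signature are illustrative; sigma0..sigma3 are the package's existing constants):

	func newStateSketch(key [8]uint32, counter uint32, nonce [3]uint32) [16]uint32 {
		var s [16]uint32
		s[0], s[1], s[2], s[3] = sigma0, sigma1, sigma2, sigma3 // "expand 32-byte k"
		copy(s[4:12], key[:])    // 256-bit key as eight little-endian words
		s[12] = counter          // block counter (low word)
		copy(s[13:16], nonce[:]) // IETF nonce; the original variant instead uses
		                         // s[13] as the counter high word
		return s
	}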

chacha20.go  (+24 -20)

@@ -35,7 +35,7 @@ const (
 	// BlockSize is the ChaCha20 block size in bytes.
 	BlockSize = 64
 
-	stateSize    = 16 - 4
+	stateSize    = 16
 	chachaRounds = 20
 
 	// The constant "expand 32-byte k" as little endian uint32s.
@@ -190,24 +190,28 @@ func (c *Cipher) ReKey(key, nonce []byte) error {
 	}
 
 	c.Reset()
-	c.state[0] = binary.LittleEndian.Uint32(key[0:4])
-	c.state[1] = binary.LittleEndian.Uint32(key[4:8])
-	c.state[2] = binary.LittleEndian.Uint32(key[8:12])
-	c.state[3] = binary.LittleEndian.Uint32(key[12:16])
-	c.state[4] = binary.LittleEndian.Uint32(key[16:20])
-	c.state[5] = binary.LittleEndian.Uint32(key[20:24])
-	c.state[6] = binary.LittleEndian.Uint32(key[24:28])
-	c.state[7] = binary.LittleEndian.Uint32(key[28:32])
-	c.state[8] = 0
+	c.state[0] = sigma0
+	c.state[1] = sigma1
+	c.state[2] = sigma2
+	c.state[3] = sigma3
+	c.state[4] = binary.LittleEndian.Uint32(key[0:4])
+	c.state[5] = binary.LittleEndian.Uint32(key[4:8])
+	c.state[6] = binary.LittleEndian.Uint32(key[8:12])
+	c.state[7] = binary.LittleEndian.Uint32(key[12:16])
+	c.state[8] = binary.LittleEndian.Uint32(key[16:20])
+	c.state[9] = binary.LittleEndian.Uint32(key[20:24])
+	c.state[10] = binary.LittleEndian.Uint32(key[24:28])
+	c.state[11] = binary.LittleEndian.Uint32(key[28:32])
+	c.state[12] = 0
 	if len(nonce) == INonceSize {
-		c.state[9] = binary.LittleEndian.Uint32(nonce[0:4])
-		c.state[10] = binary.LittleEndian.Uint32(nonce[4:8])
-		c.state[11] = binary.LittleEndian.Uint32(nonce[8:12])
+		c.state[13] = binary.LittleEndian.Uint32(nonce[0:4])
+		c.state[14] = binary.LittleEndian.Uint32(nonce[4:8])
+		c.state[15] = binary.LittleEndian.Uint32(nonce[8:12])
 		c.ietf = true
 	} else {
-		c.state[9] = 0
-		c.state[10] = binary.LittleEndian.Uint32(nonce[0:4])
-		c.state[11] = binary.LittleEndian.Uint32(nonce[4:8])
+		c.state[13] = 0
+		c.state[14] = binary.LittleEndian.Uint32(nonce[0:4])
+		c.state[15] = binary.LittleEndian.Uint32(nonce[4:8])
 		c.ietf = false
 	}
 	c.off = BlockSize
@@ -221,10 +225,10 @@ func (c *Cipher) Seek(blockCounter uint64) error {
 		if blockCounter > math.MaxUint32 {
 			return ErrInvalidCounter
 		}
-		c.state[8] = uint32(blockCounter)
+		c.state[12] = uint32(blockCounter)
 	} else {
-		c.state[8] = uint32(blockCounter)
-		c.state[9] = uint32(blockCounter >> 32)
+		c.state[12] = uint32(blockCounter)
+		c.state[13] = uint32(blockCounter >> 32)
 	}
 	c.off = BlockSize
 	return nil
@@ -241,7 +245,7 @@ func NewCipher(key, nonce []byte) (*Cipher, error) {
 
 // HChaCha is the HChaCha20 hash function used to make XChaCha.
 func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) {
-	var x [stateSize]uint32
+	var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded.
 	x[0] = binary.LittleEndian.Uint32(key[0:4])
 	x[1] = binary.LittleEndian.Uint32(key[4:8])
 	x[2] = binary.LittleEndian.Uint32(key[8:12])
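
The hunk above initializes state[0..3] from sigma0..sigma3, the constant "expand 32-byte k" as little-endian uint32s per the context comment. For reference, those values (reproduced from the ChaCha specification, not shown in this diff) are:

	const (
		sigma0 = 0x61707865 // "expa"
		sigma1 = 0x3320646e // "nd 3"
		sigma2 = 0x79622d32 // "2-by"
		sigma3 = 0x6b206574 // "te k"
	)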

chacha20_amd64.go  (+2 -7)

@@ -13,12 +13,7 @@ import (
 	"math"
 )
 
-func blocksAmd64SSE2(sigma, x *uint32, in, out *byte, nrBlocks uint)
-
-// One day this won't be parameters when PeachPy fixes issue #11.  Since the
-// SSE2 code uses all the registers anyway, there isn't a huge gain from
-// chaging this anyway.
-var sigma = [4]uint32{sigma0, sigma1, sigma2, sigma3}
+func blocksAmd64SSE2(x *uint32, in, out *byte, nrBlocks uint)
 
 func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
 	if isIetf {
@@ -36,7 +31,7 @@ func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIe
 		in = out
 	}
 
-	blocksAmd64SSE2(&sigma[0], &x[0], &in[0], &out[0], uint(nrBlocks))
+	blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks))
 }
 
 func init() {
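
Dropping the sigma argument leaves the assembly stub with four machine-word arguments, which is why the TEXT frame annotation in chacha20_amd64.s below shrinks from $0-40 to $0-32 (five vs. four 8-byte arguments on amd64). A sketch restating the declaration with its argument-frame offsets annotated (the offset comments are inferred from the generated assembly, not part of this file):

	// x+0(FP)          state: sigma, key, counter, nonce (16 uint32s)
	// inp+8(FP)        input to XOR with the key stream (may alias outp)
	// outp+16(FP)      output
	// nrBlocks+24(FP)  number of 64-byte blocks to produce
	func blocksAmd64SSE2(x *uint32, in, out *byte, nrBlocks uint)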

chacha20_amd64.py  (+11 -13)

@@ -26,7 +26,6 @@
 from peachpy import *
 from peachpy.x86_64 import *
 
-sigma = Argument(ptr(const_uint32_t))
 x = Argument(ptr(uint32_t))
 inp = Argument(ptr(const_uint8_t))
 outp = Argument(ptr(uint8_t))
@@ -130,19 +129,18 @@ def WriteXor(tmp, inp, outp, d, v0, v1, v2, v3):
     PXOR(tmp, v3)
     MOVDQU([outp+d+48], tmp)
 
-with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
-    reg_sigma = GeneralPurposeRegister64()
+with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
     reg_x = GeneralPurposeRegister64()
     reg_inp = GeneralPurposeRegister64()
     reg_outp = GeneralPurposeRegister64()
     reg_blocks = GeneralPurposeRegister64()
 
-    LOAD.ARGUMENT(reg_sigma, sigma)
     LOAD.ARGUMENT(reg_x, x)
     LOAD.ARGUMENT(reg_inp, inp)
     LOAD.ARGUMENT(reg_outp, outp)
     LOAD.ARGUMENT(reg_blocks, nrBlocks)
 
+    # Build the counter increment vector on the stack.
     SUB(registers.rsp, 16)
     reg_tmp = GeneralPurposeRegister32()
     MOV(reg_tmp, 0x00000001)
@@ -154,11 +152,11 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
 
     xmm_tmp = XMMRegister()
     xmm_s1 = XMMRegister()
-    MOVDQU(xmm_s1, [reg_x])
+    MOVDQU(xmm_s1, [reg_x+16])
     xmm_s2 = XMMRegister()
-    MOVDQU(xmm_s2, [reg_x+16])
+    MOVDQU(xmm_s2, [reg_x+32])
     xmm_s3 = XMMRegister()
-    MOVDQU(xmm_s3, [reg_x+32])
+    MOVDQU(xmm_s3, [reg_x+48])
 
     vector_loop = Loop()
     serial_loop = Loop()
@@ -181,7 +179,7 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
     SUB(reg_blocks, 3)
     JB(vector_loop.end)
     with vector_loop:
-        MOVDQU(xmm_v0, [reg_sigma])
+        MOVDQU(xmm_v0, [reg_x]) # <- sigma
         MOVDQA(xmm_v1, xmm_s1)
         MOVDQA(xmm_v2, xmm_s2)
         MOVDQA(xmm_v3, xmm_s3)
@@ -209,7 +207,7 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
             SUB(reg_rounds, 2)
             JNZ(rounds_loop.begin)
 
-        PADDD(xmm_v0, [reg_sigma])
+        PADDD(xmm_v0, [reg_x])
         PADDD(xmm_v1, xmm_s1)
         PADDD(xmm_v2, xmm_s2)
         PADDD(xmm_v3, xmm_s3)
@@ -217,14 +215,14 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
         MOVDQU(xmm_v0, [registers.rsp])
         PADDQ(xmm_s3, xmm_v0) # + counter
 
-        PADDD(xmm_v4, [reg_sigma])
+        PADDD(xmm_v4, [reg_x])
         PADDD(xmm_v5, xmm_s1)
         PADDD(xmm_v6, xmm_s2)
         PADDD(xmm_v7, xmm_s3)
         WriteXor(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
         PADDQ(xmm_s3, xmm_v0) # +counter
 
-        PADDD(xmm_v8, [reg_sigma])
+        PADDD(xmm_v8, [reg_x])
         PADDD(xmm_v9, xmm_s1)
         PADDD(xmm_v10, xmm_s2)
         PADDD(xmm_v11, xmm_s3)
@@ -244,7 +242,7 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
     #    xmm_v4 = 1, 0, 0, 0
     #    xmm_v5 = sigma
     MOVDQU(xmm_v4, [registers.rsp])
-    MOVDQU(xmm_v5, [reg_sigma])
+    MOVDQU(xmm_v5, [reg_x])
     with serial_loop:
         MOVDQA(xmm_v0, xmm_v5)
         MOVDQA(xmm_v1, xmm_s1)
@@ -274,7 +272,7 @@ with Function("blocksAmd64SSE2", (sigma, x, inp, outp, nrBlocks)):
 
     # Write back the updated counter.  Stoping at 2^70 bytes is the user's
     # problem, not mine.
-    MOVDQU([reg_x+32], xmm_s3)
+    MOVDQU([reg_x+48], xmm_s3)
 
     ADD(registers.rsp, 16)
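
With sigma living in the state, the PeachPy code addresses everything off the single x pointer: sigma at byte offset 0, the two key halves at 16 and 32, and the counter/nonce lane at 48 (which is also where the updated counter is written back). A sketch of those offsets, with illustrative constant names:

	const (
		offSigma    = 0  // x[0..3]:   MOVDQU(xmm_v0, [reg_x]) each block
		offKeyLo    = 16 // x[4..7]:   xmm_s1
		offKeyHi    = 32 // x[8..11]:  xmm_s2
		offCtrNonce = 48 // x[12..15]: xmm_s3, stored back via MOVDQU([reg_x+48], xmm_s3)
	)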
 

chacha20_amd64.s  (+62 -63)

@@ -1,24 +1,23 @@
 // Generated by PeachPy 0.2.0 from chacha20_amd64.py
 
 
-// func blocksAmd64SSE2(sigma *uint32, x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
-TEXT ·blocksAmd64SSE2(SB),4,$0-40
-	MOVQ sigma+0(FP), AX
-	MOVQ x+8(FP), BX
-	MOVQ inp+16(FP), CX
-	MOVQ outp+24(FP), DX
-	MOVQ nrBlocks+32(FP), DI
+// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64SSE2(SB),4,$0-32
+	MOVQ x+0(FP), AX
+	MOVQ inp+8(FP), BX
+	MOVQ outp+16(FP), CX
+	MOVQ nrBlocks+24(FP), DX
 	SUBQ $16, SP
-	MOVL $1, SI
-	MOVL SI, 0(SP)
-	MOVL $0, SI
-	MOVL SI, 4(SP)
-	MOVL SI, 8(SP)
-	MOVL SI, 12(SP)
-	MOVOU 0(BX), X1
-	MOVOU 16(BX), X2
-	MOVOU 32(BX), X3
-	SUBQ $3, DI
+	MOVL $1, DI
+	MOVL DI, 0(SP)
+	MOVL $0, DI
+	MOVL DI, 4(SP)
+	MOVL DI, 8(SP)
+	MOVL DI, 12(SP)
+	MOVOU 16(AX), X1
+	MOVOU 32(AX), X2
+	MOVOU 48(AX), X3
+	SUBQ $3, DX
 	JCS vector_loop_end
 vector_loop_begin:
 		MOVOU 0(AX), X4
@@ -36,8 +35,8 @@ vector_loop_begin:
 		MOVO X2, X14
 		MOVO X11, X15
 		PADDQ X0, X15
-		MOVQ $20, SI
-rounds_loop1_begin:
+		MOVQ $20, DI
+rounds_loop0_begin:
 			PADDL X5, X4
 			PXOR X4, X7
 			MOVO X7, X0
@@ -200,66 +199,66 @@ rounds_loop1_begin:
 			PSHUFL $147, X13, X13
 			PSHUFL $78, X14, X14
 			PSHUFL $57, X15, X15
-			SUBQ $2, SI
-			JNE rounds_loop1_begin
+			SUBQ $2, DI
+			JNE rounds_loop0_begin
 		PADDL 0(AX), X4
 		PADDL X1, X5
 		PADDL X2, X6
 		PADDL X3, X7
-		MOVOU 0(CX), X0
+		MOVOU 0(BX), X0
 		PXOR X4, X0
-		MOVOU X0, 0(DX)
-		MOVOU 16(CX), X0
+		MOVOU X0, 0(CX)
+		MOVOU 16(BX), X0
 		PXOR X5, X0
-		MOVOU X0, 16(DX)
-		MOVOU 32(CX), X0
+		MOVOU X0, 16(CX)
+		MOVOU 32(BX), X0
 		PXOR X6, X0
-		MOVOU X0, 32(DX)
-		MOVOU 48(CX), X0
+		MOVOU X0, 32(CX)
+		MOVOU 48(BX), X0
 		PXOR X7, X0
-		MOVOU X0, 48(DX)
+		MOVOU X0, 48(CX)
 		MOVOU 0(SP), X4
 		PADDQ X4, X3
 		PADDL 0(AX), X8
 		PADDL X1, X9
 		PADDL X2, X10
 		PADDL X3, X11
-		MOVOU 64(CX), X0
+		MOVOU 64(BX), X0
 		PXOR X8, X0
-		MOVOU X0, 64(DX)
-		MOVOU 80(CX), X0
+		MOVOU X0, 64(CX)
+		MOVOU 80(BX), X0
 		PXOR X9, X0
-		MOVOU X0, 80(DX)
-		MOVOU 96(CX), X0
+		MOVOU X0, 80(CX)
+		MOVOU 96(BX), X0
 		PXOR X10, X0
-		MOVOU X0, 96(DX)
-		MOVOU 112(CX), X0
+		MOVOU X0, 96(CX)
+		MOVOU 112(BX), X0
 		PXOR X11, X0
-		MOVOU X0, 112(DX)
+		MOVOU X0, 112(CX)
 		PADDQ X4, X3
 		PADDL 0(AX), X12
 		PADDL X1, X13
 		PADDL X2, X14
 		PADDL X3, X15
-		MOVOU 128(CX), X0
+		MOVOU 128(BX), X0
 		PXOR X12, X0
-		MOVOU X0, 128(DX)
-		MOVOU 144(CX), X0
+		MOVOU X0, 128(CX)
+		MOVOU 144(BX), X0
 		PXOR X13, X0
-		MOVOU X0, 144(DX)
-		MOVOU 160(CX), X0
+		MOVOU X0, 144(CX)
+		MOVOU 160(BX), X0
 		PXOR X14, X0
-		MOVOU X0, 160(DX)
-		MOVOU 176(CX), X0
+		MOVOU X0, 160(CX)
+		MOVOU 176(BX), X0
 		PXOR X15, X0
-		MOVOU X0, 176(DX)
+		MOVOU X0, 176(CX)
 		PADDQ X4, X3
+		ADDQ $192, BX
 		ADDQ $192, CX
-		ADDQ $192, DX
-		SUBQ $3, DI
+		SUBQ $3, DX
 		JCC vector_loop_begin
 vector_loop_end:
-	ADDQ $3, DI
+	ADDQ $3, DX
 	JEQ serial_loop_end
 	MOVOU 0(SP), X8
 	MOVOU 0(AX), X9
@@ -268,8 +267,8 @@ serial_loop_begin:
 		MOVO X1, X5
 		MOVO X2, X6
 		MOVO X3, X7
-		MOVQ $20, AX
-rounds_loop0_begin:
+		MOVQ $20, DI
+rounds_loop1_begin:
 			PADDL X5, X4
 			PXOR X4, X7
 			MOVO X7, X0
@@ -324,30 +323,30 @@ rounds_loop0_begin:
 			PSHUFL $147, X5, X5
 			PSHUFL $78, X6, X6
 			PSHUFL $57, X7, X7
-			SUBQ $2, AX
-			JNE rounds_loop0_begin
+			SUBQ $2, DI
+			JNE rounds_loop1_begin
 		PADDL X9, X4
 		PADDL X1, X5
 		PADDL X2, X6
 		PADDL X3, X7
-		MOVOU 0(CX), X0
+		MOVOU 0(BX), X0
 		PXOR X4, X0
-		MOVOU X0, 0(DX)
-		MOVOU 16(CX), X0
+		MOVOU X0, 0(CX)
+		MOVOU 16(BX), X0
 		PXOR X5, X0
-		MOVOU X0, 16(DX)
-		MOVOU 32(CX), X0
+		MOVOU X0, 16(CX)
+		MOVOU 32(BX), X0
 		PXOR X6, X0
-		MOVOU X0, 32(DX)
-		MOVOU 48(CX), X0
+		MOVOU X0, 32(CX)
+		MOVOU 48(BX), X0
 		PXOR X7, X0
-		MOVOU X0, 48(DX)
+		MOVOU X0, 48(CX)
 		PADDQ X8, X3
+		ADDQ $64, BX
 		ADDQ $64, CX
-		ADDQ $64, DX
-		SUBQ $1, DI
+		SUBQ $1, DX
 		JNE serial_loop_begin
 serial_loop_end:
-	MOVOU X3, 32(BX)
+	MOVOU X3, 48(AX)
 	ADDQ $16, SP
 	RET
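
The epilogue's MOVOU X3, 48(AX) stores the advanced counter/nonce lane back into x[12..15], so the caller's state reflects the blocks that were produced. A rough Go equivalent of that write-back (the helper name is illustrative, not from the package):

	func writeBackCtrLane(x *[stateSize]uint32, lane [4]uint32) {
		// Mirrors MOVOU X3, 48(AX): byte offset 48 is x[12].
		copy(x[12:16], lane[:])
	}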

chacha20_ref.go  (+56 -49)

@@ -22,9 +22,12 @@ func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf
 		}
 	}
 
+	// This routine ignores x[0]...x[3] in favor of the const values since
+	// it's ever so slightly faster.
+
 	for n := 0; n < nrBlocks; n++ {
 		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
 
 		for i := chachaRounds; i > 0; i -= 2 {
 			// quarterround(x, 0, 4, 8, 12)
@@ -149,56 +152,60 @@ func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf
 				outArr[1] = inArr[1] ^ (x1 + sigma1)
 				outArr[2] = inArr[2] ^ (x2 + sigma2)
 				outArr[3] = inArr[3] ^ (x3 + sigma3)
-				outArr[4] = inArr[4] ^ (x4 + x[0])
-				outArr[5] = inArr[5] ^ (x5 + x[1])
-				outArr[6] = inArr[6] ^ (x6 + x[2])
-				outArr[7] = inArr[7] ^ (x7 + x[3])
-				outArr[8] = inArr[8] ^ (x8 + x[4])
-				outArr[9] = inArr[9] ^ (x9 + x[5])
-				outArr[10] = inArr[10] ^ (x10 + x[6])
-				outArr[11] = inArr[11] ^ (x11 + x[7])
-				outArr[12] = inArr[12] ^ (x12 + x[8])
-				outArr[13] = inArr[13] ^ (x13 + x[9])
-				outArr[14] = inArr[14] ^ (x14 + x[10])
-				outArr[15] = inArr[15] ^ (x15 + x[11])
+				outArr[4] = inArr[4] ^ (x4 + x[4])
+				outArr[5] = inArr[5] ^ (x5 + x[5])
+				outArr[6] = inArr[6] ^ (x6 + x[6])
+				outArr[7] = inArr[7] ^ (x7 + x[7])
+				outArr[8] = inArr[8] ^ (x8 + x[8])
+				outArr[9] = inArr[9] ^ (x9 + x[9])
+				outArr[10] = inArr[10] ^ (x10 + x[10])
+				outArr[11] = inArr[11] ^ (x11 + x[11])
+				outArr[12] = inArr[12] ^ (x12 + x[12])
+				outArr[13] = inArr[13] ^ (x13 + x[13])
+				outArr[14] = inArr[14] ^ (x14 + x[14])
+				outArr[15] = inArr[15] ^ (x15 + x[15])
 			} else {
 				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
 				outArr[0] = x0 + sigma0
 				outArr[1] = x1 + sigma1
 				outArr[2] = x2 + sigma2
 				outArr[3] = x3 + sigma3
-				outArr[4] = x4 + x[0]
-				outArr[5] = x5 + x[1]
-				outArr[6] = x6 + x[2]
-				outArr[7] = x7 + x[3]
-				outArr[8] = x8 + x[4]
-				outArr[9] = x9 + x[5]
-				outArr[10] = x10 + x[6]
-				outArr[11] = x11 + x[7]
-				outArr[12] = x12 + x[8]
-				outArr[13] = x13 + x[9]
-				outArr[14] = x14 + x[10]
-				outArr[15] = x15 + x[11]
+				outArr[4] = x4 + x[4]
+				outArr[5] = x5 + x[5]
+				outArr[6] = x6 + x[6]
+				outArr[7] = x7 + x[7]
+				outArr[8] = x8 + x[8]
+				outArr[9] = x9 + x[9]
+				outArr[10] = x10 + x[10]
+				outArr[11] = x11 + x[11]
+				outArr[12] = x12 + x[12]
+				outArr[13] = x13 + x[13]
+				outArr[14] = x14 + x[14]
+				outArr[15] = x15 + x[15]
 			}
 		} else {
-			// Slow path, either the architecture cares about alignment, or is not litte endian.
-			x4 += x[0]
-			x5 += x[1]
-			x6 += x[2]
-			x7 += x[3]
-			x8 += x[4]
-			x9 += x[5]
-			x10 += x[6]
-			x11 += x[7]
-			x12 += x[8]
-			x13 += x[9]
-			x14 += x[10]
-			x15 += x[11]
+			// Slow path, either the architecture cares about alignment, or is not little endian.
+			x0 += sigma0
+			x1 += sigma1
+			x2 += sigma2
+			x3 += sigma3
+			x4 += x[4]
+			x5 += x[5]
+			x6 += x[6]
+			x7 += x[7]
+			x8 += x[8]
+			x9 += x[9]
+			x10 += x[10]
+			x11 += x[11]
+			x12 += x[12]
+			x13 += x[13]
+			x14 += x[14]
+			x15 += x[15]
 			if in != nil {
-				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^(x0+sigma0))
-				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^(x1+sigma1))
-				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^(x2+sigma2))
-				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^(x3+sigma3))
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
 				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
 				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
 				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
@@ -213,10 +220,10 @@ func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf
 				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
 				in = in[BlockSize:]
 			} else {
-				binary.LittleEndian.PutUint32(out[0:4], x0+sigma0)
-				binary.LittleEndian.PutUint32(out[4:8], x1+sigma1)
-				binary.LittleEndian.PutUint32(out[8:12], x2+sigma2)
-				binary.LittleEndian.PutUint32(out[12:16], x3+sigma3)
+				binary.LittleEndian.PutUint32(out[0:4], x0)
+				binary.LittleEndian.PutUint32(out[4:8], x1)
+				binary.LittleEndian.PutUint32(out[8:12], x2)
+				binary.LittleEndian.PutUint32(out[12:16], x3)
 				binary.LittleEndian.PutUint32(out[16:20], x4)
 				binary.LittleEndian.PutUint32(out[20:24], x5)
 				binary.LittleEndian.PutUint32(out[24:28], x6)
@@ -234,9 +241,9 @@ func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf
 		}
 
 		// Stoping at 2^70 bytes per nonce is the user's responsibility.
-		ctr := uint64(x[9])<<32 | uint64(x[8])
+		ctr := uint64(x[13])<<32 | uint64(x[12])
 		ctr++
-		x[8] = uint32(ctr)
-		x[9] = uint32(ctr >> 32)
+		x[12] = uint32(ctr)
+		x[13] = uint32(ctr >> 32)
 	}
 }
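
For completeness: the "// quarterround(x, 0, 4, 8, 12)" comments in blocksRef refer to the standard ChaCha quarter-round, which the reference code unrolls by hand. A compact sketch of that primitive (not part of the diff; bits.RotateLeft32 stands in for the explicit rotates in the unrolled code):

	import "math/bits"

	// quarterRound applies the ChaCha quarter-round with the standard
	// rotation amounts 16, 12, 8 and 7.
	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
		return a, b, c, d
	}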