Browse Source

Make the code ugly, because Go's escape analysis sucks.

This improves performance significantly (how much depends on the message
size), because it avoids heap allocations caused by indirect function calls.
Yawning Angel 3 years ago
parent
commit
052616c46f
8 changed files with 46 additions and 35 deletions
  1. 4 1
      chacha20.go
  2. 3 3
      chacha20_test.go
  3. 4 4
      hs1siv.go
  4. 3 3
      hs1siv_test.go
  5. 2 12
      hwaccel.go
  6. 15 5
      hwaccel_amd64.go
  7. 7 7
      hwaccel_amd64.s
  8. 8 0
      hwaccel_ref.go

+ 4 - 1
chacha20.go

@@ -34,6 +34,9 @@ func chacha20(key, nonce, in, out []byte, initialCounter uint32) {
 	if len(nonce) != chachaNonceSize {
 		panic("hs1siv: invalid chacha nonce size")
 	}
+	if len(in) == 0 {
+		return
+	}
 
 	if len(out) < len(in) {
 		in = in[:len(out)]
@@ -56,7 +59,7 @@ func chacha20(key, nonce, in, out []byte, initialCounter uint32) {
 		binary.LittleEndian.Uint32(nonce[8:12]),
 	}
 
-	hardwareAccelImpl.chachaXORKeyStreamFn(s, in, out)
+	chachaXORKeyStream(s, in, out)
 
 	// Purge the state off the stack.
 	burnUint32s(s[:])

+ 3 - 3
chacha20_test.go

@@ -16,7 +16,7 @@ import (
 
 func TestChaCha20(t *testing.T) {
 	forceDisableHardwareAcceleration()
-	impl := "_" + hardwareAccelImpl.name
+	impl := "_" + hardwareAccelImpl
 	t.Run("ChaCha20"+impl, func(t *testing.T) { doTestChaCha20(t) })
 
 	if !canAccelerate {
@@ -24,7 +24,7 @@ func TestChaCha20(t *testing.T) {
 		return
 	}
 	mustInitHardwareAcceleration()
-	impl = "_" + hardwareAccelImpl.name
+	impl = "_" + hardwareAccelImpl
 	t.Run("ChaCha20"+impl, func(t *testing.T) { doTestChaCha20(t) })
 }
 
@@ -254,7 +254,7 @@ func BenchmarkChaCha20(b *testing.B) {
 
 func doBenchmarkChaCha20(b *testing.B) {
 	benchSizes := []int{8, 32, 64, 576, 1536, 4096, 1024768}
-	impl := "_" + hardwareAccelImpl.name
+	impl := "_" + hardwareAccelImpl
 
 	for _, sz := range benchSizes {
 		bn := "ChaCha20" + impl + "_"

+ 4 - 4
hs1siv.go

@@ -204,11 +204,11 @@ func (ctx *aeadCtx) sivHashAD(a []byte) {
 
 	// Hash associated data.
 	nhMultiple := aBytes & ^(hs1NHLen - 1)
-	hardwareAccelImpl.hashStepFn(&ctx.hashCtx, a[:nhMultiple], &ctx.sivAccum)
+	hashStep(&ctx.hashCtx, a[:nhMultiple], &ctx.sivAccum)
 	if nhMultiple < aBytes {
 		var buf [hs1NHLen]byte
 		copy(buf[:], a[nhMultiple:])
-		hardwareAccelImpl.hashStepFn(&ctx.hashCtx, buf[:], &ctx.sivAccum)
+		hashStep(&ctx.hashCtx, buf[:], &ctx.sivAccum)
 	}
 }
 
@@ -218,13 +218,13 @@ func (ctx *aeadCtx) sivGenerate(m, n, siv []byte) {
 	// Hash message data.
 	var chachaKey [chachaKeySize]byte
 	nhMultiple := mBytes & ^(hs1NHLen - 1)
-	hardwareAccelImpl.hashStepFn(&ctx.hashCtx, m[:nhMultiple], &ctx.sivAccum)
+	hashStep(&ctx.hashCtx, m[:nhMultiple], &ctx.sivAccum)
 	mBytes = mBytes - nhMultiple
 	mBytesWithPadding := (mBytes + 15) & ^15
 	if mBytesWithPadding == hs1NHLen {
 		var buf [hs1NHLen]byte
 		copy(buf[:], m[nhMultiple:])
-		hardwareAccelImpl.hashStepFn(&ctx.hashCtx, buf[:], &ctx.sivAccum)
+		hashStep(&ctx.hashCtx, buf[:], &ctx.sivAccum)
 		hashFinalizeRef(&ctx.hashCtx, ctx.sivLenBuf[:], &ctx.sivAccum, chachaKey[:])
 	} else {
 		var buf [hs1NHLen]byte

+ 3 - 3
hs1siv_test.go

@@ -27,7 +27,7 @@ func mustInitHardwareAcceleration() {
 
 func TestKAT(t *testing.T) {
 	forceDisableHardwareAcceleration()
-	impl := "_" + hardwareAccelImpl.name
+	impl := "_" + hardwareAccelImpl
 	t.Run("HS1-SIV_KAT"+impl, func(t *testing.T) { doTestKAT(t) })
 
 	if !canAccelerate {
@@ -35,7 +35,7 @@ func TestKAT(t *testing.T) {
 		return
 	}
 	mustInitHardwareAcceleration()
-	impl = "_" + hardwareAccelImpl.name
+	impl = "_" + hardwareAccelImpl
 	t.Run("HS1-SIV_KAT"+impl, func(t *testing.T) { doTestKAT(t) })
 }
 
@@ -117,7 +117,7 @@ func BenchmarkHS1SIV(b *testing.B) {
 
 func doBenchmarkHS1SIV(b *testing.B) {
 	benchSizes := []int{8, 32, 64, 576, 1536, 4096, 1024768}
-	impl := "_" + hardwareAccelImpl.name
+	impl := "_" + hardwareAccelImpl
 
 	for _, sz := range benchSizes {
 		bn := "HS1-SIV" + impl + "_"

+ 2 - 12
hwaccel.go

@@ -7,23 +7,13 @@
 
 package hs1siv
 
+const implReference = "Reference"
+
 var (
 	isHardwareAccelerated = false
 	hardwareAccelImpl     = implReference
-
-	implReference = &hwaccelImpl{
-		name:                 "Reference",
-		chachaXORKeyStreamFn: chachaXORKeyStreamRef,
-		hashStepFn:           hashStepRef,
-	}
 )
 
-type hwaccelImpl struct {
-	name                 string
-	chachaXORKeyStreamFn func(*chachaState, []byte, []byte)
-	hashStepFn           func(*hs1Ctx, []byte, *[hs1HashRounds]uint64)
-}
-
 func forceDisableHardwareAcceleration() {
 	isHardwareAccelerated = false
 	hardwareAccelImpl = implReference

+ 15 - 5
hwaccel_amd64.go

@@ -63,15 +63,25 @@ func supportsAVX2BMI2() bool {
 	return regs[1]&avx2Bit != 0 && regs[1]&bmi2Bit != 0
 }
 
-var implAVX2BMI2 = &hwaccelImpl{
-	name:                 "AVX2",
-	chachaXORKeyStreamFn: chachaXORKeyStreamAVX2,
-	hashStepFn:           hashStepAVX2,
+func hashStep(ctx *hs1Ctx, in []byte, accum *[hs1HashRounds]uint64) {
+	if isHardwareAccelerated {
+		hashStepAVX2(ctx, in, accum)
+		return
+	}
+	hashStepRef(ctx, in, accum)
+}
+
+func chachaXORKeyStream(s *chachaState, in, out []byte) {
+	if isHardwareAccelerated {
+		chachaXORKeyStreamAVX2(s, in, out)
+		return
+	}
+	chachaXORKeyStreamRef(s, in, out)
 }
 
 func initHardwareAcceleration() {
 	if supportsAVX2BMI2() {
-		hardwareAccelImpl = implAVX2BMI2
+		hardwareAccelImpl = "AVX2"
 		isHardwareAccelerated = true
 	}
 }

+ 7 - 7
hwaccel_amd64.s

@@ -63,12 +63,12 @@ TEXT ·chachaXORKeyStreamAVX2(SB), NOSPLIT, $544-56
 	//  * The number of rounds is always 20.
 	//  * %rbp is used instead of %rsp.
 	LEAQ    ·chacha_constants<>(SB), AX
-	VMOVDQA 0(AX), X8
-	VMOVDQA 16(AX), X6
-	VMOVDQA 32(AX), X7
-	VMOVDQA 0(DI), X9
-	VMOVDQA 16(DI), X10
-	VMOVDQA 32(DI), X11
+	VMOVDQU 0(AX), X8
+	VMOVDQU 16(AX), X6
+	VMOVDQU 32(AX), X7
+	VMOVDQU 0(DI), X9
+	VMOVDQU 16(DI), X10
+	VMOVDQU 32(DI), X11
 
 	// MOVQ 48(DI), AX
 	MOVQ    $1, R9
@@ -964,7 +964,7 @@ chacha_blocks_avx2_copyoutput:
 	JNZ  chacha_blocks_avx2_copyoutput
 
 chacha_blocks_avx2_done:
-	VMOVDQA X11, 32(DI)
+	VMOVDQU X11, 32(DI)
 
 	VZEROUPPER
 	RET

+ 8 - 0
hwaccel_ref.go

@@ -12,3 +12,11 @@ package hs1siv
 func initHardwareAcceleration() {
 	forceDisableHardwareAcceleration()
 }
+
+func hashStep(ctx *hs1Ctx, in []byte, accum *[hs1HashRounds]uint64) {
+	hashStepRef(ctx, in, accum)
+}
+
+func chachaXORKeyStream(s *chachaState, in, out []byte) {
+	chachaXORKeyStreamRef(s, in, out)
+}