
Even more vector optimizations.

I said I was going to hold off, but the eta=4 case for the centered
binomial distribution is nicely optimized in the reference code, and
trivially short.

Part of #1.
Yawning Angel, 1 year ago
commit 58e5961523
4 changed files with 74 additions and 0 deletions
  1. cbd.go (+4, -0)
  2. hwaccel.go (+2, -0)
  3. hwaccel_amd64.go (+13, -0)
  4. hwaccel_amd64.s (+55, -0)
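
For context before the diffs: with eta=4 each coefficient is sampled from a single input byte, as the popcount of the byte's low nibble minus the popcount of its high nibble, with q added to keep the value non-negative. A minimal scalar sketch of that computation (the kyberQ name and the one-byte-per-coefficient layout are assumptions inferred from the diffs below, not the package's actual reference code):

func cbdEta4Sketch(p *poly, buf []byte) {
	for i := 0; i < kyberN; i++ {
		// Spread and add the bits so that each nibble of d ends up holding
		// the popcount of the corresponding nibble of buf[i].
		d := uint32(buf[i])
		d = (d & 0x11) + ((d >> 1) & 0x11) + ((d >> 2) & 0x11) + ((d >> 3) & 0x11)
		a := d & 0xf        // sum of the 4 low bits
		b := (d >> 4) & 0xf // sum of the 4 high bits
		p.coeffs[i] = uint16(a + kyberQ - b) // a - b in [-4, 4], kept non-negative by adding q
	}
}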

cbd.go (+4, -0)

@@ -20,6 +20,10 @@ func loadLittleEndian(x []byte, bytes int) uint64 {
 // coefficients distributed according to a centered binomial distribution
 // with parameter eta.
 func (p *poly) cbd(buf []byte, eta int) {
+	hardwareAccelImpl.cbdFn(p, buf, eta)
+}
+
+func cbdRef(p *poly, buf []byte, eta int) {
 	switch eta {
 	case 3:
 		var a, b [4]uint32

hwaccel.go (+2, -0)

@@ -16,6 +16,7 @@ var (
 		nttFn:          nttRef,
 		invnttFn:       invnttRef,
 		pointwiseAccFn: pointwiseAccRef,
+		cbdFn:          cbdRef,
 	}
 )
 
@@ -24,6 +25,7 @@ type hwaccelImpl struct {
 	nttFn          func(*[kyberN]uint16)
 	invnttFn       func(*[kyberN]uint16)
 	pointwiseAccFn func(*poly, *polyVec, *polyVec)
+	cbdFn          func(*poly, []byte, int)
 }
 
 func forceDisableHardwareAcceleration() {

hwaccel_amd64.go (+13, -0)

@@ -162,6 +162,9 @@ func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
 //go:noescape
 func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
 
+//go:noescape
+func cbdEta4AVX2(dst *uint16, buf *byte)
+
 func supportsAVX2() bool {
 	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 	const (
@@ -202,6 +205,7 @@ var implAVX2 = &hwaccelImpl{
 	nttFn:          nttOpt,
 	invnttFn:       invnttOpt,
 	pointwiseAccFn: pointwiseAccOpt,
+	cbdFn:          cbdOpt,
 }
 
 func nttOpt(p *[kyberN]uint16) {
@@ -235,6 +239,15 @@ func pointwiseAccOpt(p *poly, a, b *polyVec) {
 	}
 }
 
+func cbdOpt(p *poly, buf []byte, eta int) {
+	switch eta {
+	case 4:
+		cbdEta4AVX2(&p.coeffs[0], &buf[0])
+	default:
+		cbdRef(p, buf, eta)
+	}
+}
+
 func initHardwareAcceleration() {
 	if supportsAVX2() {
 		isHardwareAccelerated = true
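
To compare the two code paths, a rough benchmark sketch; it assumes an in-package _test.go file (importing testing), a host that actually supports AVX2, and a buffer of kyberN bytes for eta=4, none of which is part of the commit itself:

func BenchmarkCBDEta4(b *testing.B) {
	var p poly
	buf := make([]byte, kyberN) // eta=4 consumes one byte per coefficient
	b.Run("AVX2", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			p.cbd(buf, 4) // dispatches to cbdEta4AVX2 via cbdOpt when accelerated
		}
	})
	forceDisableHardwareAcceleration()
	b.Run("Ref", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			p.cbd(buf, 4) // now routed to cbdRef
		}
	})
}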

hwaccel_amd64.s (+55, -0)

@@ -99,6 +99,18 @@ DATA ·montsq_x16<>+0x10(SB)/8, $0x15c115c115c115c1
 DATA ·montsq_x16<>+0x18(SB)/8, $0x15c115c115c115c1
 GLOBL ·montsq_x16<>(SB), (NOPTR+RODATA), $32
 
+DATA ·mask11<>+0x00(SB)/8, $0x1111111111111111
+DATA ·mask11<>+0x08(SB)/8, $0x1111111111111111
+DATA ·mask11<>+0x10(SB)/8, $0x1111111111111111
+DATA ·mask11<>+0x18(SB)/8, $0x1111111111111111
+GLOBL ·mask11<>(SB), (NOPTR+RODATA), $32
+
+DATA ·mask0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·mask0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·mask0f<>+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·mask0f<>+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
+GLOBL ·mask0f<>(SB), (NOPTR+RODATA), $32
+
 // func nttAVX2(inout, zetas *uint16)
 TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	MOVQ inout+0(FP), DI
@@ -2692,3 +2704,46 @@ looptop4:
 
 	VZEROUPPER
 	RET
+
+// func cbdEta4AVX2(dst *uint16, b *byte)
+TEXT ·cbdEta4AVX2(SB), NOSPLIT, $0-16
+	MOVQ dst+0(FP), DI
+	MOVQ b+8(FP), SI
+
+	VMOVDQU ·mask11<>(SB), Y0
+	VMOVDQU ·mask0f<>(SB), Y1
+	VMOVDQU ·q_x16<>(SB), Y2
+
+	MOVQ $256, DX
+
+looptop:
+	VMOVUPD    0(SI), Y3
+	VPAND      Y3, Y0, Y4
+	VPSRLW     $1, Y3, Y3
+	VPAND      Y3, Y0, Y5
+	VPADDB     Y5, Y4, Y4
+	VPSRLW     $1, Y3, Y3
+	VPAND      Y3, Y0, Y5
+	VPADDB     Y5, Y4, Y4
+	VPSRLW     $1, Y3, Y3
+	VPAND      Y3, Y0, Y3
+	VPADDB     Y3, Y4, Y3
+	VPSRLW     $4, Y3, Y4
+	VPAND      Y3, Y1, Y3
+	VPAND      Y4, Y1, Y4
+	VPSUBB     Y4, Y3, Y3
+	VPMOVSXBW  X3, Y4
+	VPADDW     Y2, Y4, Y4
+	VMOVUPD    Y4, 0(DI)
+	VPERM2F128 $0x21, Y3, Y3, Y3
+	VPMOVSXBW  X3, Y4
+	VPADDW     Y2, Y4, Y4
+	VMOVUPD    Y4, 32(DI)
+
+	ADDQ $64, DI
+	ADDQ $32, SI
+	SUBQ $32, DX
+	JA   looptop
+
+	VZEROUPPER
+	RET
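
Reading the loop: each looptop iteration consumes 32 input bytes and emits 32 coefficients. The three shift/mask/add rounds against the 0x11 mask accumulate per-nibble popcounts in parallel, the 4-bit shift plus the 0x0f masks separate the low- and high-nibble sums, and VPSUBB forms the signed per-byte difference. Each 16-byte half is then sign-extended to words with VPMOVSXBW, biased by q with VPADDW, and stored, so eight iterations cover all 256 coefficients from 256 bytes of input.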