
Low hanging fruit vector optimizations.

This adds support for using AVX2 on AMD64 platforms for the NTT and
inverse-NTT.

Part of #1.
Yawning Angel committed 1 year ago
commit 9872c9ab03
10 changed files with 2661 additions and 19 deletions
  1. hwaccel.go (+37 -0)
  2. hwaccel_amd64.go (+208 -0)
  3. hwaccel_amd64.s (+2307 -0)
  4. hwaccel_ref.go (+14 -0)
  5. kem_test.go (+51 -10)
  6. kem_vectors_test.go (+15 -2)
  7. kex_test.go (+15 -2)
  8. ntt.go (+2 -2)
  9. poly.go (+2 -2)
  10. polyvec.go (+10 -1)

hwaccel.go (+37 -0)

@@ -0,0 +1,37 @@
+// hwaccel.go - Hardware acceleration hooks.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+package kyber
+
+const implReference = "Reference"
+
+var (
+	isHardwareAccelerated = false
+	hardwareAccelImpl     = implReference
+
+	nttFn    = nttRef
+	invnttFn = invnttRef
+)
+
+func forceDisableHardwareAcceleration() {
+	// This is for the benefit of testing, so that it's possible to test
+	// all versions that are supported by the host.
+	isHardwareAccelerated = false
+	hardwareAccelImpl = implReference
+	nttFn = nttRef
+	invnttFn = invnttRef
+}
+
+// IsHardwareAccelerated returns true iff the Kyber implementation will use
+// hardware acceleration (e.g. AVX2).
+func IsHardwareAccelerated() bool {
+	return isHardwareAccelerated
+}
+
+func init() {
+	initHardwareAcceleration()
+}
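
The hooks above are consumed through the nttFn/invnttFn function pointers, and the test changes in this commit use forceDisableHardwareAcceleration to cover the reference path on accelerated hosts. A minimal, hypothetical usage sketch (the file and test names are illustrative and not part of this commit; kyberN is the polynomial length defined elsewhere in the package):

package kyber

import "testing"

func TestDispatchSketch(t *testing.T) {
	t.Logf("implementation: %s (accelerated: %v)",
		hardwareAccelImpl, IsHardwareAccelerated())

	var p [kyberN]uint16
	nttFn(&p)    // nttAVX2 (via nttOpt) on capable amd64 hosts, nttRef otherwise
	invnttFn(&p) // same backend as the forward transform

	// Pin the pure-Go path so it stays covered on AVX2-capable hosts.
	forceDisableHardwareAcceleration()
	nttFn(&p) // always nttRef from here on
}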

hwaccel_amd64.go (+208 -0)

@@ -0,0 +1,208 @@
+// hwaccel_amd64.go - AMD64 optimized routines.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build amd64,!gccgo,!noasm,go1.10
+
+package kyber
+
+const implAVX2 = "AVX2"
+
+var zetasExp = [752]uint16{
+	3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
+	3777, 3777, 3777, 3777, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
+	4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 3625, 3625, 3625, 3625,
+	3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625,
+	3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985,
+	3985, 3985, 3985, 3985, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581,
+	6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 2456, 2456, 2456, 2456,
+	2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456,
+	2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194,
+	2194, 2194, 2194, 2194, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+	121, 121, 121, 121, 121, 121, 121, 5431, 5431, 5431, 5431, 5431, 5431,
+	5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 834, 834,
+	834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834,
+	5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186,
+	5186, 5186, 5186, 5186, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362,
+	5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 2876, 2876, 2876, 2876,
+	2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876,
+	5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980,
+	5980, 5980, 5980, 5980, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
+	1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 2816, 2816, 2816, 2816,
+	2816, 2816, 2816, 2816, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5593,
+	5444, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 1986, 1986, 1986, 1986,
+	1986, 1986, 1986, 1986, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 6082,
+	1993, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 3706, 3706, 3706, 3706,
+	3706, 3706, 3706, 3706, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 5675,
+	6156, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 5124, 5124, 5124, 5124,
+	5124, 5124, 5124, 5124, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1296,
+	1483, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 4851, 4851, 4851, 4851,
+	4851, 4851, 4851, 4851, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 3364,
+	617, 617, 617, 617, 617, 617, 617, 617, 1921, 1921, 1921, 1921, 1921,
+	1921, 1921, 1921, 3992, 3992, 3992, 3992, 5943, 5943, 5943, 5943, 3266,
+	3266, 3266, 3266, 4081, 4081, 4081, 4081, 810, 810, 810, 810, 1887,
+	1887, 1887, 1887, 7043, 7043, 7043, 7043, 7674, 7674, 7674, 7674, 7243,
+	7243, 7243, 7243, 7002, 7002, 7002, 7002, 6376, 6376, 6376, 6376, 5921,
+	5921, 5921, 5921, 396, 396, 396, 396, 4507, 4507, 4507, 4507, 4126,
+	4126, 4126, 4126, 5800, 5800, 5800, 5800, 3772, 3772, 3772, 3772, 5146,
+	5146, 5146, 5146, 5241, 5241, 5241, 5241, 5126, 5126, 5126, 5126, 1535,
+	1535, 1535, 1535, 7132, 7132, 7132, 7132, 3153, 3153, 3153, 3153, 2310,
+	2310, 2310, 2310, 6282, 6282, 6282, 6282, 1321, 1321, 1321, 1321, 514,
+	514, 514, 514, 4725, 4725, 4725, 4725, 7578, 7578, 7578, 7578, 2804,
+	2804, 2804, 2804, 5638, 5638, 5638, 5638, 6250, 6250, 6250, 6250, 6627,
+	6627, 1698, 1698, 4225, 4225, 1166, 1166, 2426, 2426, 3831, 3831, 915,
+	915, 7679, 7679, 4264, 4264, 7487, 7487, 2919, 2919, 2789, 2789, 3405,
+	3405, 2385, 2385, 5568, 5568, 4949, 4949, 2175, 2175, 373, 373, 3692,
+	3692, 6951, 6951, 5925, 5925, 3135, 3135, 5290, 5290, 660, 660, 6184,
+	6184, 2572, 2572, 4536, 4536, 1350, 1350, 5457, 5457, 4093, 4093, 6000,
+	6000, 2883, 2883, 6291, 6291, 1598, 1598, 3750, 3750, 2762, 2762, 2835,
+	2835, 2764, 2764, 5448, 5448, 3816, 3816, 6148, 6148, 1464, 1464, 6954,
+	6954, 1521, 1521, 1386, 1386, 4253, 4253, 6760, 6760, 4938, 4938, 5521,
+	5521, 2649, 2649, 6822, 6822, 2579, 2579, 1532, 1532, 1919, 1919, 7195,
+	7195, 404, 404, 6625, 6625, 783, 783, 1799, 1799, 5016, 5016, 3480,
+	3480, 2133, 2133, 4371, 4371, 6513, 6513, 7664, 3744, 2422, 2001, 1278,
+	929, 6333, 5451, 7502, 6439, 5622, 6611, 2161, 1649, 2072, 3177, 5610,
+	1121, 7245, 236, 715, 670, 7023, 6205, 5303, 2767, 3542, 7455, 1203,
+	1181, 7530, 3887, 1712, 7459, 2786, 7230, 4134, 1779, 6530, 7247, 3568,
+	3988, 3581, 6095, 1509, 2918, 2339, 6274, 3434, 4131, 2340, 2891, 2998,
+	4367, 3461, 4962, 5434, 5092, 1144, 1072, 1295, 4866, 3911, 3450, 3781,
+	5423, 796, 3163, 4473, 7092, 2963, 7557, 3214, 3334, 4315, 3936, 3723,
+	1931, 7252, 7279, 4273, 83, 6155, 826, 6343, 2345, 5378, 2515, 7039,
+	5844, 4716, 6890, 370, 293, 3312, 2083, 5992, 6904, 2070, 2262, 6788,
+	2386, 7493, 6162, 4807, 6277, 1012, 2130, 1441, 2532, 4346, 6597, 4338,
+	2937, 509, 6278, 2812, 3763, 592, 2005, 3657, 2460, 4004, 3752, 692,
+	1669, 2167, 4394,
+}
+
+var zetasInvExp = [752]uint16{
+	3287, 5514, 6012, 6989, 3929, 3677, 5221, 4024, 5676, 7089, 3918, 4869,
+	1403, 7172, 4744, 3343, 1084, 3335, 5149, 6240, 5551, 6669, 1404, 2874,
+	1519, 188, 5295, 893, 5419, 5611, 777, 1689, 5598, 4369, 7388, 7311,
+	791, 2965, 1837, 642, 5166, 2303, 5336, 1338, 6855, 1526, 7598, 3408,
+	402, 429, 5750, 3958, 3745, 3366, 4347, 4467, 124, 4718, 589, 3208,
+	4518, 6885, 2258, 3900, 4231, 3770, 2815, 6386, 6609, 6537, 2589, 2247,
+	2719, 4220, 3314, 4683, 4790, 5341, 3550, 4247, 1407, 5342, 4763, 6172,
+	1586, 4100, 3693, 4113, 434, 1151, 5902, 3547, 451, 4895, 222, 5969,
+	3794, 151, 6500, 6478, 226, 4139, 4914, 2378, 1476, 658, 7011, 6966,
+	7445, 436, 6560, 2071, 4504, 5609, 6032, 5520, 1070, 2059, 1242, 179,
+	2230, 1348, 6752, 6403, 5680, 5259, 3937, 17, 1168, 1168, 3310, 3310,
+	5548, 5548, 4201, 4201, 2665, 2665, 5882, 5882, 6898, 6898, 1056, 1056,
+	7277, 7277, 486, 486, 5762, 5762, 6149, 6149, 5102, 5102, 859, 859,
+	5032, 5032, 2160, 2160, 2743, 2743, 921, 921, 3428, 3428, 6295, 6295,
+	6160, 6160, 727, 727, 6217, 6217, 1533, 1533, 3865, 3865, 2233, 2233,
+	4917, 4917, 4846, 4846, 4919, 4919, 3931, 3931, 6083, 6083, 1390, 1390,
+	4798, 4798, 1681, 1681, 3588, 3588, 2224, 2224, 6331, 6331, 3145, 3145,
+	5109, 5109, 1497, 1497, 7021, 7021, 2391, 2391, 4546, 4546, 1756, 1756,
+	730, 730, 3989, 3989, 7308, 7308, 5506, 5506, 2732, 2732, 2113, 2113,
+	5296, 5296, 4276, 4276, 4892, 4892, 4762, 4762, 194, 194, 3417, 3417, 2,
+	2, 6766, 6766, 3850, 3850, 5255, 5255, 6515, 6515, 3456, 3456, 5983,
+	5983, 1054, 1054, 1431, 1431, 1431, 1431, 2043, 2043, 2043, 2043, 4877,
+	4877, 4877, 4877, 103, 103, 103, 103, 2956, 2956, 2956, 2956, 7167,
+	7167, 7167, 7167, 6360, 6360, 6360, 6360, 1399, 1399, 1399, 1399, 5371,
+	5371, 5371, 5371, 4528, 4528, 4528, 4528, 549, 549, 549, 549, 6146,
+	6146, 6146, 6146, 2555, 2555, 2555, 2555, 2440, 2440, 2440, 2440, 2535,
+	2535, 2535, 2535, 3909, 3909, 3909, 3909, 1881, 1881, 1881, 1881, 3555,
+	3555, 3555, 3555, 3174, 3174, 3174, 3174, 7285, 7285, 7285, 7285, 1760,
+	1760, 1760, 1760, 1305, 1305, 1305, 1305, 679, 679, 679, 679, 438, 438,
+	438, 438, 7, 7, 7, 7, 638, 638, 638, 638, 5794, 5794, 5794, 5794, 6871,
+	6871, 6871, 6871, 3600, 3600, 3600, 3600, 4415, 4415, 4415, 4415, 1738,
+	1738, 1738, 1738, 3689, 3689, 3689, 3689, 5760, 5760, 5760, 5760, 5760,
+	5760, 5760, 5760, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 4317,
+	4317, 4317, 4317, 4317, 4317, 4317, 4317, 2830, 2830, 2830, 2830, 2830,
+	2830, 2830, 2830, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6385,
+	6385, 6385, 6385, 6385, 6385, 6385, 6385, 2557, 2557, 2557, 2557, 2557,
+	2557, 2557, 2557, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 2006,
+	2006, 2006, 2006, 2006, 2006, 2006, 2006, 3975, 3975, 3975, 3975, 3975,
+	3975, 3975, 3975, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 1599,
+	1599, 1599, 1599, 1599, 1599, 1599, 1599, 5695, 5695, 5695, 5695, 5695,
+	5695, 5695, 5695, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2088,
+	2088, 2088, 2088, 2088, 2088, 2088, 2088, 4865, 4865, 4865, 4865, 4865,
+	4865, 4865, 4865, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267,
+	6267, 6267, 6267, 6267, 6267, 6267, 6267, 1701, 1701, 1701, 1701, 1701,
+	1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 4805,
+	4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805,
+	4805, 4805, 4805, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319,
+	2319, 2319, 2319, 2319, 2319, 2319, 2319, 2495, 2495, 2495, 2495, 2495,
+	2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 6847,
+	6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847,
+	6847, 6847, 6847, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250,
+	2250, 2250, 2250, 2250, 2250, 2250, 2250, 7560, 7560, 7560, 7560, 7560,
+	7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 5487,
+	5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487,
+	5487, 5487, 5487, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225,
+	5225, 5225, 5225, 5225, 5225, 5225, 5225, 1100, 1100, 1100, 1100, 1100,
+	1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 3696,
+	3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696,
+	3696, 3696, 3696, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056,
+	4056, 4056, 4056, 4056, 4056, 4056, 4056, 3182, 3182, 3182, 3182, 3182,
+	3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 5776,
+	5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776,
+	5776, 5776, 5776,
+}
+
+//go:noescape
+func cpuidAmd64(cpuidParams *uint32)
+
+//go:noescape
+func xgetbv0Amd64(xcrVec *uint32)
+
+//go:noescape
+func nttAVX2(inout, zetas *uint16)
+
+//go:noescape
+func invnttAVX2(inout, omegas *uint16)
+
+func supportsAVX2() bool {
+	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+	const (
+		osXsaveBit = 1 << 27
+		avx2Bit    = 1 << 5
+	)
+
+	// Check to see if CPUID actually supports the leaf that indicates AVX2.
+	// CPUID.(EAX=0H, ECX=0H) >= 7
+	regs := [4]uint32{0x00}
+	cpuidAmd64(&regs[0])
+	if regs[0] < 7 {
+		return false
+	}
+
+	// Check to see if the OS knows how to save/restore XMM/YMM state.
+	// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
+	regs = [4]uint32{0x01}
+	cpuidAmd64(&regs[0])
+	if regs[2]&osXsaveBit == 0 {
+		return false
+	}
+	xcrRegs := [2]uint32{}
+	xgetbv0Amd64(&xcrRegs[0])
+	if xcrRegs[0]&6 != 6 {
+		return false
+	}
+
+	// Check for AVX2 support.
+	// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
+	regs = [4]uint32{0x07}
+	cpuidAmd64(&regs[0])
+	return regs[1]&avx2Bit != 0
+}
+
+func nttOpt(p *[kyberN]uint16) {
+	nttAVX2(&p[0], &zetasExp[0])
+}
+
+func invnttOpt(p *[kyberN]uint16) {
+	invnttAVX2(&p[0], &zetasInvExp[0])
+}
+
+func initHardwareAcceleration() {
+	if supportsAVX2() {
+		isHardwareAccelerated = true
+		hardwareAccelImpl = implAVX2
+		nttFn = nttOpt
+		invnttFn = invnttOpt
+	}
+}
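
The zetasExp and zetasInvExp tables above are expanded twiddle tables: each zeta is duplicated 16, 8, 4, 2 or 1 times, matching the butterfly distance of the NTT level it belongs to, so the assembly can load a full 16-lane vector of twiddles with a single VMOVDQU. A rough sketch of the expansion for one level (zetasBase ordering and the helper name are hypothetical; the committed tables are precomputed):

// expandLevel repeats each base twiddle 'repeat' times; the tables above are
// concatenations of such blocks, with repeat shrinking from 16 down to 1 for
// the forward NTT and growing back from 1 to 16 for the inverse.
func expandLevel(base []uint16, repeat int) []uint16 {
	out := make([]uint16, 0, len(base)*repeat)
	for _, z := range base {
		for i := 0; i < repeat; i++ {
			out = append(out, z)
		}
	}
	return out
}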

hwaccel_amd64.s (+2307 -0)

@@ -0,0 +1,2307 @@
+// +build !noasm,go1.10
+// hwaccel_amd64.s - AMD64 optimized routines.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+#include "textflag.h"
+
+// func cpuidAmd64(cpuidParams *uint32)
+TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
+	MOVQ cpuidParams+0(FP), R15
+	MOVL 0(R15), AX
+	MOVL 8(R15), CX
+	CPUID
+	MOVL AX, 0(R15)
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbv0Amd64(xcrVec *uint32)
+TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
+	MOVQ xcrVec+0(FP), BX
+	XORL CX, CX
+	XGETBV
+	MOVL AX, 0(BX)
+	MOVL DX, 4(BX)
+	RET
+
+// NTT and inverse-NTT taken from the `avx2` implementation, converted to Go's
+// assembly dialect.  I do this in lieu of cutting myself to see if I still
+// can feel pain.
+//
+// The conversion is mostly direct except:
+//  * Instead of aligned loads, unaligned loads are used, as there is no
+//    meaningful difference on modern Intel systems, and it's not immediately
+//    obvious to me how Go will align global data.
+//  * The constants are renamed slightly.
+//
+// Note:
+//  * These must be kept in sync with the values in params.go.
+//    Currently assumes Q = 7681, Q_INV = 57857.
+//  * Caution: little-endian, so the values will look different from avx2/consts.c.
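
For readers tracing the vector code, the recurring "mul" + "reduce" groups below perform a lane-wise signed Montgomery multiplication by a zeta. A one-lane scalar sketch (not part of this commit; it assumes the Q = 7681, Q_INV = 57857 values noted above, where Q*Q_INV ≡ 1 mod 2^16):

// montgomeryMul mirrors one lane of a VPMULLW/VPMULHW "mul" pair followed by
// the VPMULLW(qinv)/VPMULHW(q)/VPSUBW "reduce": the result is congruent to
// a*zeta*2^-16 (mod 7681) and small enough for the later lazy reductions.
func montgomeryMul(a, zeta int16) int16 {
	lo := a * zeta                              // VPMULLW: low 16 bits of the product
	hi := int16((int32(a) * int32(zeta)) >> 16) // VPMULHW: high 16 bits
	m := int16(int32(lo) * 57857)               // VPMULLW with qinv_x16 (mod 2^16)
	return hi - int16((int32(m)*7681)>>16)      // VPMULHW with q_x16, then VPSUBW
}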
+DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302
+DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
+DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
+DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32
+
+DATA ·low_mask<>+0x00(SB)/8, $0x1fff1fff1fff1fff
+DATA ·low_mask<>+0x08(SB)/8, $0x1fff1fff1fff1fff
+DATA ·low_mask<>+0x10(SB)/8, $0x1fff1fff1fff1fff
+DATA ·low_mask<>+0x18(SB)/8, $0x1fff1fff1fff1fff
+GLOBL ·low_mask<>(SB), (NOPTR+RODATA), $32
+
+DATA ·lowdword<>+0x00(SB)/8, $0x0000ffff0000ffff
+DATA ·lowdword<>+0x08(SB)/8, $0x0000ffff0000ffff
+DATA ·lowdword<>+0x10(SB)/8, $0x0000ffff0000ffff
+DATA ·lowdword<>+0x18(SB)/8, $0x0000ffff0000ffff
+GLOBL ·lowdword<>(SB), (NOPTR+RODATA), $32
+
+DATA ·q_x16<>+0x00(SB)/8, $0x1e011e011e011e01
+DATA ·q_x16<>+0x08(SB)/8, $0x1e011e011e011e01
+DATA ·q_x16<>+0x10(SB)/8, $0x1e011e011e011e01
+DATA ·q_x16<>+0x18(SB)/8, $0x1e011e011e011e01
+GLOBL ·q_x16<>(SB), (NOPTR+RODATA), $32
+
+DATA ·q2_x16<>+0x00(SB)/8, $0x3c023c023c023c02
+DATA ·q2_x16<>+0x08(SB)/8, $0x3c023c023c023c02
+DATA ·q2_x16<>+0x10(SB)/8, $0x3c023c023c023c02
+DATA ·q2_x16<>+0x18(SB)/8, $0x3c023c023c023c02
+GLOBL ·q2_x16<>(SB), (NOPTR+RODATA), $32
+
+DATA ·qinv_x16<>+0x00(SB)/8, $0xe201e201e201e201
+DATA ·qinv_x16<>+0x08(SB)/8, $0xe201e201e201e201
+DATA ·qinv_x16<>+0x10(SB)/8, $0xe201e201e201e201
+DATA ·qinv_x16<>+0x18(SB)/8, $0xe201e201e201e201
+GLOBL ·qinv_x16<>(SB), (NOPTR+RODATA), $32
+
+DATA ·f_x16<>+0x00(SB)/8, $0x0100010001000100
+DATA ·f_x16<>+0x08(SB)/8, $0x0100010001000100
+DATA ·f_x16<>+0x10(SB)/8, $0x0100010001000100
+DATA ·f_x16<>+0x18(SB)/8, $0x0100010001000100
+GLOBL ·f_x16<>(SB), (NOPTR+RODATA), $32
+
+DATA ·v_x16<>+0x00(SB)/8, $0x4442444244424442
+DATA ·v_x16<>+0x08(SB)/8, $0x4442444244424442
+DATA ·v_x16<>+0x10(SB)/8, $0x4442444244424442
+DATA ·v_x16<>+0x18(SB)/8, $0x4442444244424442
+GLOBL ·v_x16<>(SB), (NOPTR+RODATA), $32
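
For reference, the constants above decode as follows (each value is replicated across all 16 lanes of a YMM register; the Go names here are illustrative, and the canonical Q and Q_INV live in params.go):

const (
	asmQ       = 0x1e01 // 7681 = q
	asmQ2      = 0x3c02 // 15362 = 2*q
	asmQInv    = 0xe201 // 57857 = q^-1 mod 2^16
	asmLowMask = 0x1fff // 8191 = 2^13 - 1, for the shift-based reductions in nttAVX2
	asmF       = 0x0100 // 256, scaling factor applied in the final level of invnttAVX2
	asmV       = 0x4442 // 17474 = round(2^27/q), Barrett-style constant for invnttAVX2
)

The two remaining constants are masks: lowdword keeps the low 16-bit word of each 32-bit lane, and vpshufb_idx is a VPSHUFB index that swaps adjacent 16-bit words within each 32-bit lane.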
+
+// func nttAVX2(inout, zetas *uint16)
+TEXT ·nttAVX2(SB), NOSPLIT, $0-16
+	MOVQ inout+0(FP), DI
+	MOVQ zetas+8(FP), SI
+
+	VMOVDQU ·qinv_x16<>(SB), Y0
+	VMOVDQU ·q_x16<>(SB), Y1
+	VMOVDQU ·low_mask<>(SB), Y2
+
+	// zetas
+	VMOVDQU (SI), Y3
+
+	// first round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (256)(DI), Y8
+	VMOVDQU (288)(DI), Y9
+	VMOVDQU (320)(DI), Y10
+	VMOVDQU (352)(DI), Y11
+
+	// level 0
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULHW Y3, Y8, Y8
+	VPMULLW Y3, Y9, Y13
+	VPMULHW Y3, Y9, Y9
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y12
+	VPSUBW  Y13, Y9, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y8
+	VPSUBW Y13, Y5, Y9
+	VPSUBW Y14, Y6, Y10
+	VPSUBW Y15, Y7, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y6, Y6
+	VPADDW Y15, Y7, Y7
+
+	// store
+	VMOVDQU Y4, (DI)
+	VMOVDQU Y5, (32)(DI)
+	VMOVDQU Y6, (64)(DI)
+	VMOVDQU Y7, (96)(DI)
+	VMOVDQU Y8, (256)(DI)
+	VMOVDQU Y9, (288)(DI)
+	VMOVDQU Y10, (320)(DI)
+	VMOVDQU Y11, (352)(DI)
+
+	ADDQ $128, DI
+
+	// second round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (256)(DI), Y8
+	VMOVDQU (288)(DI), Y9
+	VMOVDQU (320)(DI), Y10
+	VMOVDQU (352)(DI), Y11
+
+	// level 0
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULHW Y3, Y8, Y8
+	VPMULLW Y3, Y9, Y13
+	VPMULHW Y3, Y9, Y9
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y12
+	VPSUBW  Y13, Y9, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y8
+	VPSUBW Y13, Y5, Y9
+	VPSUBW Y14, Y6, Y10
+	VPSUBW Y15, Y7, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y6, Y6
+	VPADDW Y15, Y7, Y7
+
+	// store
+	VMOVDQU Y4, (DI)
+	VMOVDQU Y5, (32)(DI)
+	VMOVDQU Y6, (64)(DI)
+	VMOVDQU Y7, (96)(DI)
+	VMOVDQU Y8, (256)(DI)
+	VMOVDQU Y9, (288)(DI)
+	VMOVDQU Y10, (320)(DI)
+	VMOVDQU Y11, (352)(DI)
+
+	SUBQ $128, DI
+
+	// first round
+	// zetas
+	VMOVDQU (32)(SI), Y3
+
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (128)(DI), Y8
+	VMOVDQU (160)(DI), Y9
+	VMOVDQU (192)(DI), Y10
+	VMOVDQU (224)(DI), Y11
+
+	// level 1
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULHW Y3, Y8, Y8
+	VPMULLW Y3, Y9, Y13
+	VPMULHW Y3, Y9, Y9
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y12
+	VPSUBW  Y13, Y9, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y8
+	VPSUBW Y13, Y5, Y9
+	VPSUBW Y14, Y6, Y10
+	VPSUBW Y15, Y7, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y6, Y6
+	VPADDW Y15, Y7, Y7
+
+	// level 2
+	// zetas
+	VMOVDQU (96)(SI), Y15
+	VMOVDQU (128)(SI), Y3
+
+	// mul
+	VPMULLW Y15, Y6, Y12
+	VPMULHW Y15, Y6, Y6
+	VPMULLW Y15, Y7, Y13
+	VPMULHW Y15, Y7, Y7
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y6, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y6
+	VPSUBW Y13, Y5, Y7
+	VPSUBW Y14, Y8, Y10
+	VPSUBW Y15, Y9, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y8, Y8
+	VPADDW Y15, Y9, Y9
+
+	// level 3
+	// zetas
+	VMOVDQU (224)(SI), Y13
+	VMOVDQU (256)(SI), Y14
+	VMOVDQU (288)(SI), Y15
+	VMOVDQU (320)(SI), Y3
+
+	// mul
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// reduce 2
+	VPSRAW $13, Y4, Y5
+	VPSRAW $13, Y6, Y7
+	VPSRAW $13, Y8, Y9
+	VPSRAW $13, Y10, Y11
+	VPAND  Y2, Y4, Y4
+	VPAND  Y2, Y6, Y6
+	VPAND  Y2, Y8, Y8
+	VPAND  Y2, Y10, Y10
+	VPSUBW Y5, Y4, Y4
+	VPSUBW Y7, Y6, Y6
+	VPSUBW Y9, Y8, Y8
+	VPSUBW Y11, Y10, Y10
+	VPSLLW $9, Y5, Y5
+	VPSLLW $9, Y7, Y7
+	VPSLLW $9, Y9, Y9
+	VPSLLW $9, Y11, Y11
+	VPADDW Y5, Y4, Y4
+	VPADDW Y7, Y6, Y6
+	VPADDW Y9, Y8, Y8
+	VPADDW Y11, Y10, Y10
+
+	// update
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPSUBW Y15, Y10, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+	VPADDW Y15, Y10, Y10
+
+	// level 4
+	// zetas
+	VMOVDQU (480)(SI), Y12
+	VMOVDQU (512)(SI), Y13
+	VMOVDQU (544)(SI), Y14
+	VMOVDQU (576)(SI), Y15
+
+	// shuffle
+	VPERM2I128 $0x02, Y4, Y5, Y3
+	VPERM2I128 $0x13, Y4, Y5, Y4
+	VPERM2I128 $0x02, Y6, Y7, Y5
+	VPERM2I128 $0x13, Y6, Y7, Y6
+	VPERM2I128 $0x02, Y8, Y9, Y7
+	VPERM2I128 $0x13, Y8, Y9, Y8
+	VPERM2I128 $0x02, Y10, Y11, Y9
+	VPERM2I128 $0x13, Y10, Y11, Y10
+
+	// mul
+	VPMULLW Y12, Y4, Y11
+	VPMULHW Y12, Y4, Y4
+	VPMULLW Y13, Y6, Y12
+	VPMULHW Y13, Y6, Y6
+	VPMULLW Y14, Y8, Y13
+	VPMULHW Y14, Y8, Y8
+	VPMULLW Y15, Y10, Y14
+	VPMULHW Y15, Y10, Y10
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y4, Y11
+	VPSUBW  Y12, Y6, Y12
+	VPSUBW  Y13, Y8, Y13
+	VPSUBW  Y14, Y10, Y14
+
+	// update
+	VPSUBW Y11, Y3, Y4
+	VPSUBW Y12, Y5, Y6
+	VPSUBW Y13, Y7, Y8
+	VPSUBW Y14, Y9, Y10
+	VPADDW Y11, Y3, Y3
+	VPADDW Y12, Y5, Y5
+	VPADDW Y13, Y7, Y7
+	VPADDW Y14, Y9, Y9
+
+	// level 5
+	// zetas
+	VMOVDQU (736)(SI), Y12
+	VMOVDQU (768)(SI), Y13
+	VMOVDQU (800)(SI), Y14
+	VMOVDQU (832)(SI), Y15
+
+	// shuffle
+	VSHUFPD $0x00, Y4, Y3, Y11
+	VSHUFPD $0x0F, Y4, Y3, Y3
+	VSHUFPD $0x00, Y6, Y5, Y4
+	VSHUFPD $0x0F, Y6, Y5, Y5
+	VSHUFPD $0x00, Y8, Y7, Y6
+	VSHUFPD $0x0F, Y8, Y7, Y7
+	VSHUFPD $0x00, Y10, Y9, Y8
+	VSHUFPD $0x0F, Y10, Y9, Y9
+
+	// mul
+	VPMULLW Y12, Y3, Y10
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y10, Y10
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y10, Y10
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y10, Y3, Y10
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// update
+	VPSUBW Y10, Y11, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y10, Y11, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// level 6
+	// shuffle
+	VPSHUFD  $0xB1, Y10, Y12
+	VPSHUFD  $0xB1, Y3, Y13
+	VPSHUFD  $0xB1, Y4, Y14
+	VPSHUFD  $0xB1, Y5, Y15
+	VPBLENDD $0x55, Y10, Y13, Y10
+	VPBLENDD $0xAA, Y3, Y12, Y3
+	VPBLENDD $0x55, Y4, Y15, Y4
+	VPBLENDD $0xAA, Y5, Y14, Y5
+	VPSHUFD  $0xB1, Y6, Y12
+	VPSHUFD  $0xB1, Y7, Y13
+	VPSHUFD  $0xB1, Y8, Y14
+	VPSHUFD  $0xB1, Y9, Y15
+	VPBLENDD $0x55, Y6, Y13, Y6
+	VPBLENDD $0xAA, Y7, Y12, Y7
+	VPBLENDD $0x55, Y8, Y15, Y8
+	VPBLENDD $0xAA, Y9, Y14, Y9
+
+	// zetas
+	VMOVDQU (992)(SI), Y12
+	VMOVDQU (1024)(SI), Y13
+	VMOVDQU (1056)(SI), Y14
+	VMOVDQU (1088)(SI), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y11
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// reduce 2
+	VPSRAW $13, Y10, Y3
+	VPSRAW $13, Y4, Y5
+	VPSRAW $13, Y6, Y7
+	VPSRAW $13, Y8, Y9
+	VPAND  Y2, Y10, Y10
+	VPAND  Y2, Y4, Y4
+	VPAND  Y2, Y6, Y6
+	VPAND  Y2, Y8, Y8
+	VPSUBW Y3, Y10, Y10
+	VPSUBW Y5, Y4, Y4
+	VPSUBW Y7, Y6, Y6
+	VPSUBW Y9, Y8, Y8
+	VPSLLW $9, Y3, Y3
+	VPSLLW $9, Y5, Y5
+	VPSLLW $9, Y7, Y7
+	VPSLLW $9, Y9, Y9
+	VPADDW Y3, Y10, Y10
+	VPADDW Y5, Y4, Y4
+	VPADDW Y7, Y6, Y6
+	VPADDW Y9, Y8, Y8
+
+	// update
+	VPSUBW Y11, Y10, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y11, Y10, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// level 7
+	// shuffle
+	VMOVDQU  ·vpshufb_idx<>(SB), Y15
+	VPSHUFB  Y15, Y10, Y11
+	VPSHUFB  Y15, Y3, Y12
+	VPSHUFB  Y15, Y4, Y13
+	VPSHUFB  Y15, Y5, Y14
+	VPBLENDW $0x55, Y10, Y12, Y10
+	VPBLENDW $0xAA, Y3, Y11, Y3
+	VPBLENDW $0x55, Y4, Y14, Y4
+	VPBLENDW $0xAA, Y5, Y13, Y5
+	VPSHUFB  Y15, Y6, Y11
+	VPSHUFB  Y15, Y7, Y12
+	VPSHUFB  Y15, Y8, Y13
+	VPSHUFB  Y15, Y9, Y14
+	VPBLENDW $0x55, Y6, Y12, Y6
+	VPBLENDW $0xAA, Y7, Y11, Y7
+	VPBLENDW $0x55, Y8, Y14, Y8
+	VPBLENDW $0xAA, Y9, Y13, Y9
+
+	// zetas
+	VMOVDQU (1248)(SI), Y12
+	VMOVDQU (1280)(SI), Y13
+	VMOVDQU (1312)(SI), Y14
+	VMOVDQU (1344)(SI), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y11
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// reduce 3
+	VMOVDQU ·q2_x16<>(SB), Y15
+	VPSRAW  $15, Y10, Y3
+	VPSRAW  $15, Y4, Y5
+	VPSRAW  $15, Y6, Y7
+	VPSRAW  $15, Y8, Y9
+	VPAND   Y15, Y3, Y3
+	VPAND   Y15, Y5, Y5
+	VPAND   Y15, Y7, Y7
+	VPAND   Y15, Y9, Y9
+	VPADDW  Y1, Y10, Y10
+	VPADDW  Y1, Y4, Y4
+	VPADDW  Y1, Y6, Y6
+	VPADDW  Y1, Y8, Y8
+	VPADDW  Y3, Y10, Y10
+	VPADDW  Y5, Y4, Y4
+	VPADDW  Y7, Y6, Y6
+	VPADDW  Y9, Y8, Y8
+
+	// update
+	VPSUBW Y11, Y10, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y11, Y10, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// reorder
+	VPUNPCKLWD Y3, Y10, Y12
+	VPUNPCKHWD Y3, Y10, Y13
+	VPUNPCKLWD Y5, Y4, Y14
+	VPUNPCKHWD Y5, Y4, Y15
+	VPUNPCKLWD Y7, Y6, Y3
+	VPUNPCKHWD Y7, Y6, Y4
+	VPUNPCKLWD Y9, Y8, Y5
+	VPUNPCKHWD Y9, Y8, Y6
+	VPERM2I128 $0x20, Y13, Y12, Y11
+	VPERM2I128 $0x31, Y13, Y12, Y12
+	VPERM2I128 $0x20, Y15, Y14, Y13
+	VPERM2I128 $0x31, Y15, Y14, Y14
+	VPERM2I128 $0x20, Y4, Y3, Y15
+	VPERM2I128 $0x31, Y4, Y3, Y3
+	VPERM2I128 $0x20, Y6, Y5, Y4
+	VPERM2I128 $0x31, Y6, Y5, Y5
+
+	// store
+	VMOVDQU Y11, (DI)
+	VMOVDQU Y12, (32)(DI)
+	VMOVDQU Y13, (64)(DI)
+	VMOVDQU Y14, (96)(DI)
+	VMOVDQU Y15, (128)(DI)
+	VMOVDQU Y3, (160)(DI)
+	VMOVDQU Y4, (192)(DI)
+	VMOVDQU Y5, (224)(DI)
+
+	ADDQ $256, DI
+
+	// second round
+	// zetas
+	VMOVDQU (64)(SI), Y3
+
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (128)(DI), Y8
+	VMOVDQU (160)(DI), Y9
+	VMOVDQU (192)(DI), Y10
+	VMOVDQU (224)(DI), Y11
+
+	// level 1
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULHW Y3, Y8, Y8
+	VPMULLW Y3, Y9, Y13
+	VPMULHW Y3, Y9, Y9
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y12
+	VPSUBW  Y13, Y9, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y8
+	VPSUBW Y13, Y5, Y9
+	VPSUBW Y14, Y6, Y10
+	VPSUBW Y15, Y7, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y6, Y6
+	VPADDW Y15, Y7, Y7
+
+	// level 2
+	// zetas
+	VMOVDQU (160)(SI), Y15
+	VMOVDQU (192)(SI), Y3
+
+	// mul
+	VPMULLW Y15, Y6, Y12
+	VPMULHW Y15, Y6, Y6
+	VPMULLW Y15, Y7, Y13
+	VPMULHW Y15, Y7, Y7
+	VPMULLW Y3, Y10, Y14
+	VPMULHW Y3, Y10, Y10
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y6, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y10, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// update
+	VPSUBW Y12, Y4, Y6
+	VPSUBW Y13, Y5, Y7
+	VPSUBW Y14, Y8, Y10
+	VPSUBW Y15, Y9, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y5, Y5
+	VPADDW Y14, Y8, Y8
+	VPADDW Y15, Y9, Y9
+
+	// level 3
+	// zetas
+	VMOVDQU (352)(SI), Y13
+	VMOVDQU (384)(SI), Y14
+	VMOVDQU (416)(SI), Y15
+	VMOVDQU (448)(SI), Y3
+
+	// mul
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+	VPSUBW  Y15, Y11, Y15
+
+	// reduce 2
+	VPSRAW $13, Y4, Y5
+	VPSRAW $13, Y6, Y7
+	VPSRAW $13, Y8, Y9
+	VPSRAW $13, Y10, Y11
+	VPAND  Y2, Y4, Y4
+	VPAND  Y2, Y6, Y6
+	VPAND  Y2, Y8, Y8
+	VPAND  Y2, Y10, Y10
+	VPSUBW Y5, Y4, Y4
+	VPSUBW Y7, Y6, Y6
+	VPSUBW Y9, Y8, Y8
+	VPSUBW Y11, Y10, Y10
+	VPSLLW $9, Y5, Y5
+	VPSLLW $9, Y7, Y7
+	VPSLLW $9, Y9, Y9
+	VPSLLW $9, Y11, Y11
+	VPADDW Y5, Y4, Y4
+	VPADDW Y7, Y6, Y6
+	VPADDW Y9, Y8, Y8
+	VPADDW Y11, Y10, Y10
+
+	// update
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPSUBW Y15, Y10, Y11
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+	VPADDW Y15, Y10, Y10
+
+	// level 4
+	// zetas
+	VMOVDQU (608)(SI), Y12
+	VMOVDQU (640)(SI), Y13
+	VMOVDQU (672)(SI), Y14
+	VMOVDQU (704)(SI), Y15
+
+	// shuffle
+	VPERM2I128 $0x02, Y4, Y5, Y3
+	VPERM2I128 $0x13, Y4, Y5, Y4
+	VPERM2I128 $0x02, Y6, Y7, Y5
+	VPERM2I128 $0x13, Y6, Y7, Y6
+	VPERM2I128 $0x02, Y8, Y9, Y7
+	VPERM2I128 $0x13, Y8, Y9, Y8
+	VPERM2I128 $0x02, Y10, Y11, Y9
+	VPERM2I128 $0x13, Y10, Y11, Y10
+
+	// mul
+	VPMULLW Y12, Y4, Y11
+	VPMULHW Y12, Y4, Y4
+	VPMULLW Y13, Y6, Y12
+	VPMULHW Y13, Y6, Y6
+	VPMULLW Y14, Y8, Y13
+	VPMULHW Y14, Y8, Y8
+	VPMULLW Y15, Y10, Y14
+	VPMULHW Y15, Y10, Y10
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y4, Y11
+	VPSUBW  Y12, Y6, Y12
+	VPSUBW  Y13, Y8, Y13
+	VPSUBW  Y14, Y10, Y14
+
+	// update
+	VPSUBW Y11, Y3, Y4
+	VPSUBW Y12, Y5, Y6
+	VPSUBW Y13, Y7, Y8
+	VPSUBW Y14, Y9, Y10
+	VPADDW Y11, Y3, Y3
+	VPADDW Y12, Y5, Y5
+	VPADDW Y13, Y7, Y7
+	VPADDW Y14, Y9, Y9
+
+	// level 5
+	// zetas
+	VMOVDQU (864)(SI), Y12
+	VMOVDQU (896)(SI), Y13
+	VMOVDQU (928)(SI), Y14
+	VMOVDQU (960)(SI), Y15
+
+	// shuffle
+	VSHUFPD $0x00, Y4, Y3, Y11
+	VSHUFPD $0x0F, Y4, Y3, Y3
+	VSHUFPD $0x00, Y6, Y5, Y4
+	VSHUFPD $0x0F, Y6, Y5, Y5
+	VSHUFPD $0x00, Y8, Y7, Y6
+	VSHUFPD $0x0F, Y8, Y7, Y7
+	VSHUFPD $0x00, Y10, Y9, Y8
+	VSHUFPD $0x0F, Y10, Y9, Y9
+
+	// mul
+	VPMULLW Y12, Y3, Y10
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y10, Y10
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y10, Y10
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y10, Y3, Y10
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// update
+	VPSUBW Y10, Y11, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y10, Y11, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// level 6
+	// shuffle
+	VPSHUFD  $0xB1, Y10, Y12
+	VPSHUFD  $0xB1, Y3, Y13
+	VPSHUFD  $0xB1, Y4, Y14
+	VPSHUFD  $0xB1, Y5, Y15
+	VPBLENDD $0x55, Y10, Y13, Y10
+	VPBLENDD $0xAA, Y3, Y12, Y3
+	VPBLENDD $0x55, Y4, Y15, Y4
+	VPBLENDD $0xAA, Y5, Y14, Y5
+	VPSHUFD  $0xB1, Y6, Y12
+	VPSHUFD  $0xB1, Y7, Y13
+	VPSHUFD  $0xB1, Y8, Y14
+	VPSHUFD  $0xB1, Y9, Y15
+	VPBLENDD $0x55, Y6, Y13, Y6
+	VPBLENDD $0xAA, Y7, Y12, Y7
+	VPBLENDD $0x55, Y8, Y15, Y8
+	VPBLENDD $0xAA, Y9, Y14, Y9
+
+	// zetas
+	VMOVDQU (1120)(SI), Y12
+	VMOVDQU (1152)(SI), Y13
+	VMOVDQU (1184)(SI), Y14
+	VMOVDQU (1216)(SI), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y11
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// reduce 2
+	VPSRAW $13, Y10, Y3
+	VPSRAW $13, Y4, Y5
+	VPSRAW $13, Y6, Y7
+	VPSRAW $13, Y8, Y9
+	VPAND  Y2, Y10, Y10
+	VPAND  Y2, Y4, Y4
+	VPAND  Y2, Y6, Y6
+	VPAND  Y2, Y8, Y8
+	VPSUBW Y3, Y10, Y10
+	VPSUBW Y5, Y4, Y4
+	VPSUBW Y7, Y6, Y6
+	VPSUBW Y9, Y8, Y8
+	VPSLLW $9, Y3, Y3
+	VPSLLW $9, Y5, Y5
+	VPSLLW $9, Y7, Y7
+	VPSLLW $9, Y9, Y9
+	VPADDW Y3, Y10, Y10
+	VPADDW Y5, Y4, Y4
+	VPADDW Y7, Y6, Y6
+	VPADDW Y9, Y8, Y8
+
+	// update
+	VPSUBW Y11, Y10, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y11, Y10, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// level 7
+	// shuffle
+	VMOVDQU  ·vpshufb_idx<>(SB), Y15
+	VPSHUFB  Y15, Y10, Y11
+	VPSHUFB  Y15, Y3, Y12
+	VPSHUFB  Y15, Y4, Y13
+	VPSHUFB  Y15, Y5, Y14
+	VPBLENDW $0x55, Y10, Y12, Y10
+	VPBLENDW $0xAA, Y3, Y11, Y3
+	VPBLENDW $0x55, Y4, Y14, Y4
+	VPBLENDW $0xAA, Y5, Y13, Y5
+	VPSHUFB  Y15, Y6, Y11
+	VPSHUFB  Y15, Y7, Y12
+	VPSHUFB  Y15, Y8, Y13
+	VPSHUFB  Y15, Y9, Y14
+	VPBLENDW $0x55, Y6, Y12, Y6
+	VPBLENDW $0xAA, Y7, Y11, Y7
+	VPBLENDW $0x55, Y8, Y14, Y8
+	VPBLENDW $0xAA, Y9, Y13, Y9
+
+	// zetas
+	VMOVDQU (1376)(SI), Y12
+	VMOVDQU (1408)(SI), Y13
+	VMOVDQU (1440)(SI), Y14
+	VMOVDQU (1472)(SI), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y11
+	VPSUBW  Y12, Y5, Y12
+	VPSUBW  Y13, Y7, Y13
+	VPSUBW  Y14, Y9, Y14
+
+	// reduce 3
+	VMOVDQU ·q2_x16<>(SB), Y15
+	VPSRAW  $15, Y10, Y3
+	VPSRAW  $15, Y4, Y5
+	VPSRAW  $15, Y6, Y7
+	VPSRAW  $15, Y8, Y9
+	VPAND   Y15, Y3, Y3
+	VPAND   Y15, Y5, Y5
+	VPAND   Y15, Y7, Y7
+	VPAND   Y15, Y9, Y9
+	VPADDW  Y1, Y10, Y10
+	VPADDW  Y1, Y4, Y4
+	VPADDW  Y1, Y6, Y6
+	VPADDW  Y1, Y8, Y8
+	VPADDW  Y3, Y10, Y10
+	VPADDW  Y5, Y4, Y4
+	VPADDW  Y7, Y6, Y6
+	VPADDW  Y9, Y8, Y8
+
+	// update
+	VPSUBW Y11, Y10, Y3
+	VPSUBW Y12, Y4, Y5
+	VPSUBW Y13, Y6, Y7
+	VPSUBW Y14, Y8, Y9
+	VPADDW Y11, Y10, Y10
+	VPADDW Y12, Y4, Y4
+	VPADDW Y13, Y6, Y6
+	VPADDW Y14, Y8, Y8
+
+	// reorder
+	VPUNPCKLWD Y3, Y10, Y12
+	VPUNPCKHWD Y3, Y10, Y13
+	VPUNPCKLWD Y5, Y4, Y14
+	VPUNPCKHWD Y5, Y4, Y15
+	VPUNPCKLWD Y7, Y6, Y3
+	VPUNPCKHWD Y7, Y6, Y4
+	VPUNPCKLWD Y9, Y8, Y5
+	VPUNPCKHWD Y9, Y8, Y6
+	VPERM2I128 $0x20, Y13, Y12, Y11
+	VPERM2I128 $0x31, Y13, Y12, Y12
+	VPERM2I128 $0x20, Y15, Y14, Y13
+	VPERM2I128 $0x31, Y15, Y14, Y14
+	VPERM2I128 $0x20, Y4, Y3, Y15
+	VPERM2I128 $0x31, Y4, Y3, Y3
+	VPERM2I128 $0x20, Y6, Y5, Y4
+	VPERM2I128 $0x31, Y6, Y5, Y5
+
+	// store
+	VMOVDQU Y11, (DI)
+	VMOVDQU Y12, (32)(DI)
+	VMOVDQU Y13, (64)(DI)
+	VMOVDQU Y14, (96)(DI)
+	VMOVDQU Y15, (128)(DI)
+	VMOVDQU Y3, (160)(DI)
+	VMOVDQU Y4, (192)(DI)
+	VMOVDQU Y5, (224)(DI)
+
+	VZEROUPPER
+	RET
+
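The "reduce 2" and "reduce 3" blocks in nttAVX2 above are lazy reductions that exploit q = 7681 = 2^13 - 2^9 + 1 (so 2^13 ≡ 2^9 - 1 mod q). One-lane scalar sketches, illustrative only:

// shortReduce mirrors a "reduce 2" block (VPSRAW $13 / VPAND low_mask /
// VPSUBW / VPSLLW $9 / VPADDW): a -> (a & 0x1fff) + 511*(a >> 13), which is
// congruent to a mod 7681 but much smaller in magnitude.
func shortReduce(a int16) int16 {
	hi := a >> 13
	lo := a & 0x1fff
	return lo - hi + hi<<9
}

// toPositive mirrors a "reduce 3" block (VPSRAW $15 / VPAND q2_x16 / two
// VPADDWs): add q, plus 2q when the value is negative, so the coefficients
// written back are non-negative.
func toPositive(a int16) int16 {
	t := (a >> 15) & (2 * 7681)
	return a + 7681 + t
}
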
+// For some inexplicable reason, Go's assembler pukes on VPERMQ, so do things
+// the hard way.  Maybe I'm doing something wrong, fuck if I know.
+#define VPERMQ_0xd8_Y4_Y4() \
+	BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y5_Y5() \
+	BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y6_Y6() \
+	BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xf6; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y7_Y7() \
+	BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xff; BYTE $0xd8
+
+#define VPERMQ_0xd8_top_half() \
+	VPERMQ_0xd8_Y4_Y4(); \
+	VPERMQ_0xd8_Y5_Y5(); \
+	VPERMQ_0xd8_Y6_Y6(); \
+	VPERMQ_0xd8_Y7_Y7()
+
+#define VPERMQ_0xd8_Y8_Y8() \
+	BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y9_Y9() \
+	BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y10_Y10() \
+	BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0xd8
+
+#define VPERMQ_0xd8_Y11_Y11() \
+	BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0xd8
+
+#define VPERMQ_0xd8_bottom_half() \
+	VPERMQ_0xd8_Y8_Y8();   \
+	VPERMQ_0xd8_Y9_Y9();   \
+	VPERMQ_0xd8_Y10_Y10(); \
+	VPERMQ_0xd8_Y11_Y11()
+
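(Each macro above hand-assembles VPERMQ $0xd8, Yn, Yn; imm8 0xd8 selects quadwords 0, 2, 1, 3, which appears to repair the per-128-bit-lane ordering left behind by VPACKUSDW.) The inverse NTT below keeps coefficients small with a Barrett-style "reduce 2" driven by v_x16, rather than the shift-based reduction used in nttAVX2. A one-lane scalar sketch, illustrative only, assuming v = 17474 ≈ 2^27/q:

// barrettReduce mirrors a "reduce 2" block in invnttAVX2 (VPMULHW v_x16 /
// VPSRAW $11 / VPMULLW q_x16 / VPSUBW): t = high16(a*v) >> 11 approximates
// a/q, and t*q is subtracted off.
func barrettReduce(a int16) int16 {
	t := int16((int32(a) * 17474) >> 16) // VPMULHW with v_x16
	t >>= 11                             // VPSRAW $11
	return a - t*7681                    // VPMULLW with q_x16, then VPSUBW
}
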
+// func invnttAVX2(inout, omegas *uint16)
+TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
+	MOVQ inout+0(FP), DI
+	MOVQ omegas+8(FP), SI
+
+	VMOVDQU ·qinv_x16<>(SB), Y0
+	VMOVDQU ·q_x16<>(SB), Y1
+	VMOVDQU ·v_x16<>(SB), Y2
+
+	MOVQ SI, R8
+
+	// first round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (128)(DI), Y8
+	VMOVDQU (160)(DI), Y9
+	VMOVDQU (192)(DI), Y10
+	VMOVDQU (224)(DI), Y11
+
+	// reorder
+	VMOVDQU   ·lowdword<>(SB), Y3
+	VPAND     Y3, Y4, Y12
+	VPAND     Y3, Y5, Y13
+	VPAND     Y3, Y6, Y14
+	VPAND     Y3, Y7, Y15
+	VPSRLD    $16, Y4, Y4
+	VPSRLD    $16, Y5, Y5
+	VPSRLD    $16, Y6, Y6
+	VPSRLD    $16, Y7, Y7
+	VPACKUSDW Y5, Y4, Y5
+	VPACKUSDW Y13, Y12, Y4
+	VPACKUSDW Y7, Y6, Y7
+	VPACKUSDW Y15, Y14, Y6
+	/*
+	 VPERMQ $0xd8,Y4,Y4
+	 VPERMQ $0xd8,Y5,Y5
+	 VPERMQ $0xd8,Y6,Y6
+	 VPERMQ $0xd8,Y7,Y7
+	*/
+	VPERMQ_0xd8_top_half()
+	VPAND     Y3, Y8, Y12
+	VPAND     Y3, Y9, Y13
+	VPAND     Y3, Y10, Y14
+	VPAND     Y3, Y11, Y15
+	VPSRLD    $16, Y8, Y8
+	VPSRLD    $16, Y9, Y9
+	VPSRLD    $16, Y10, Y10
+	VPSRLD    $16, Y11, Y11
+	VPACKUSDW Y9, Y8, Y9
+	VPACKUSDW Y13, Y12, Y8
+	VPACKUSDW Y11, Y10, Y11
+	VPACKUSDW Y15, Y14, Y10
+	/*
+	 VPERMQ $0xd8,Y8,Y8
+	 VPERMQ $0xd8,Y9,Y9
+	 VPERMQ $0xd8,Y10,Y10
+	 VPERMQ $0xd8,Y11,Y11
+	*/
+	VPERMQ_0xd8_bottom_half()
+
+	// level 0
+	// update
+	VPSUBW Y5, Y4, Y12
+	VPSUBW Y7, Y6, Y13
+	VPSUBW Y9, Y8, Y14
+	VPSUBW Y11, Y10, Y15
+	VPADDW Y4, Y5, Y4
+	VPADDW Y6, Y7, Y6
+	VPADDW Y8, Y9, Y8
+	VPADDW Y10, Y11, Y10
+
+	// zetas
+	VMOVDQU (R8), Y7
+	VMOVDQU (32)(R8), Y9
+	VMOVDQU (64)(R8), Y11
+	VMOVDQU (96)(R8), Y3
+
+	// mul
+	VPMULLW Y7, Y12, Y5
+	VPMULHW Y7, Y12, Y12
+	VPMULLW Y9, Y13, Y7
+	VPMULHW Y9, Y13, Y13
+	VPMULLW Y11, Y14, Y9
+	VPMULHW Y11, Y14, Y14
+	VPMULLW Y3, Y15, Y11
+	VPMULHW Y3, Y15, Y15
+
+	// reduce
+	VPMULLW Y0, Y5, Y5
+	VPMULLW Y0, Y7, Y7
+	VPMULLW Y0, Y9, Y9
+	VPMULLW Y0, Y11, Y11
+	VPMULHW Y1, Y5, Y5
+	VPMULHW Y1, Y7, Y7
+	VPMULHW Y1, Y9, Y9
+	VPMULHW Y1, Y11, Y11
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// level 1
+	// shuffle
+	VMOVDQU  ·vpshufb_idx<>(SB), Y3
+	VPSHUFB  Y3, Y4, Y12
+	VPSHUFB  Y3, Y5, Y13
+	VPSHUFB  Y3, Y6, Y14
+	VPSHUFB  Y3, Y7, Y15
+	VPBLENDW $0x55, Y4, Y13, Y4
+	VPBLENDW $0xAA, Y5, Y12, Y5
+	VPBLENDW $0x55, Y6, Y15, Y6
+	VPBLENDW $0xAA, Y7, Y14, Y7
+	VPSHUFB  Y3, Y8, Y12
+	VPSHUFB  Y3, Y9, Y13
+	VPSHUFB  Y3, Y10, Y14
+	VPSHUFB  Y3, Y11, Y15
+	VPBLENDW $0x55, Y8, Y13, Y8
+	VPBLENDW $0xAA, Y9, Y12, Y9
+	VPBLENDW $0x55, Y10, Y15, Y10
+	VPBLENDW $0xAA, Y11, Y14, Y11
+
+	// update
+	VPSUBW Y5, Y4, Y12
+	VPSUBW Y7, Y6, Y13
+	VPSUBW Y9, Y8, Y14
+	VPSUBW Y11, Y10, Y15
+	VPADDW Y4, Y5, Y4
+	VPADDW Y6, Y7, Y6
+	VPADDW Y8, Y9, Y8
+	VPADDW Y10, Y11, Y10
+
+	// zetas
+	VMOVDQU (256)(R8), Y7
+	VMOVDQU (288)(R8), Y9
+	VMOVDQU (320)(R8), Y11
+	VMOVDQU (352)(R8), Y3
+
+	// mul
+	VPMULLW Y7, Y12, Y5
+	VPMULHW Y7, Y12, Y12
+	VPMULLW Y9, Y13, Y7
+	VPMULHW Y9, Y13, Y13
+	VPMULLW Y11, Y14, Y9
+	VPMULHW Y11, Y14, Y14
+	VPMULLW Y3, Y15, Y11
+	VPMULHW Y3, Y15, Y15
+
+	// reduce
+	VPMULLW Y0, Y5, Y5
+	VPMULLW Y0, Y7, Y7
+	VPMULLW Y0, Y9, Y9
+	VPMULLW Y0, Y11, Y11
+	VPMULHW Y1, Y5, Y5
+	VPMULHW Y1, Y7, Y7
+	VPMULHW Y1, Y9, Y9
+	VPMULHW Y1, Y11, Y11
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// reduce 2
+	VPMULHW Y2, Y4, Y12
+	VPMULHW Y2, Y6, Y13
+	VPMULHW Y2, Y8, Y14
+	VPMULHW Y2, Y10, Y15
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPSRAW  $11, Y14, Y14
+	VPSRAW  $11, Y15, Y15
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPMULLW Y1, Y14, Y14
+	VPMULLW Y1, Y15, Y15
+	VPSUBW  Y12, Y4, Y4
+	VPSUBW  Y13, Y6, Y6
+	VPSUBW  Y14, Y8, Y8
+	VPSUBW  Y15, Y10, Y10
+
+	// level 2
+	// shuffle
+	VPSHUFD  $0xB1, Y4, Y12
+	VPSHUFD  $0xB1, Y5, Y13
+	VPSHUFD  $0xB1, Y6, Y14
+	VPSHUFD  $0xB1, Y7, Y15
+	VPBLENDD $0x55, Y4, Y13, Y4
+	VPBLENDD $0xAA, Y5, Y12, Y5
+	VPBLENDD $0x55, Y6, Y15, Y6
+	VPBLENDD $0xAA, Y7, Y14, Y7
+	VPSHUFD  $0xB1, Y8, Y12
+	VPSHUFD  $0xB1, Y9, Y13
+	VPSHUFD  $0xB1, Y10, Y14
+	VPSHUFD  $0xB1, Y11, Y15
+	VPBLENDD $0x55, Y8, Y13, Y8
+	VPBLENDD $0xAA, Y9, Y12, Y9
+	VPBLENDD $0x55, Y10, Y15, Y10
+	VPBLENDD $0xAA, Y11, Y14, Y11
+
+	// update
+	VPSUBW Y5, Y4, Y12
+	VPSUBW Y7, Y6, Y13
+	VPSUBW Y9, Y8, Y14
+	VPSUBW Y11, Y10, Y15
+	VPADDW Y4, Y5, Y4
+	VPADDW Y6, Y7, Y6
+	VPADDW Y8, Y9, Y8
+	VPADDW Y10, Y11, Y10
+
+	// zetas
+	VMOVDQU (512)(R8), Y7
+	VMOVDQU (544)(R8), Y9
+	VMOVDQU (576)(R8), Y11
+	VMOVDQU (608)(R8), Y3
+
+	// mul
+	VPMULLW Y7, Y12, Y5
+	VPMULHW Y7, Y12, Y12
+	VPMULLW Y9, Y13, Y7
+	VPMULHW Y9, Y13, Y13
+	VPMULLW Y11, Y14, Y9
+	VPMULHW Y11, Y14, Y14
+	VPMULLW Y3, Y15, Y11
+	VPMULHW Y3, Y15, Y15
+
+	// reduce
+	VPMULLW Y0, Y5, Y5
+	VPMULLW Y0, Y7, Y7
+	VPMULLW Y0, Y9, Y9
+	VPMULLW Y0, Y11, Y11
+	VPMULHW Y1, Y5, Y5
+	VPMULHW Y1, Y7, Y7
+	VPMULHW Y1, Y9, Y9
+	VPMULHW Y1, Y11, Y11
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// level 3
+	// shuffle
+	VSHUFPD $0x00, Y5, Y4, Y3
+	VSHUFPD $0x0F, Y5, Y4, Y4
+	VSHUFPD $0x00, Y7, Y6, Y5
+	VSHUFPD $0x0F, Y7, Y6, Y6
+	VSHUFPD $0x00, Y9, Y8, Y7
+	VSHUFPD $0x0F, Y9, Y8, Y8
+	VSHUFPD $0x00, Y11, Y10, Y9
+	VSHUFPD $0x0F, Y11, Y10, Y10
+
+	// update
+	VPSUBW Y4, Y3, Y12
+	VPSUBW Y6, Y5, Y13
+	VPSUBW Y8, Y7, Y14
+	VPSUBW Y10, Y9, Y15
+	VPADDW Y3, Y4, Y3
+	VPADDW Y5, Y6, Y5
+	VPADDW Y7, Y8, Y7
+	VPADDW Y9, Y10, Y9
+
+	// zetas
+	VMOVDQU (768)(R8), Y6
+	VMOVDQU (800)(R8), Y8
+	VMOVDQU (832)(R8), Y10
+	VMOVDQU (864)(R8), Y11
+
+	// mul
+	VPMULLW Y6, Y12, Y4
+	VPMULHW Y6, Y12, Y12
+	VPMULLW Y8, Y13, Y6
+	VPMULHW Y8, Y13, Y13
+	VPMULLW Y10, Y14, Y8
+	VPMULHW Y10, Y14, Y14
+	VPMULLW Y11, Y15, Y10
+	VPMULHW Y11, Y15, Y15
+
+	// reduce
+	VPMULLW Y0, Y4, Y4
+	VPMULLW Y0, Y6, Y6
+	VPMULLW Y0, Y8, Y8
+	VPMULLW Y0, Y10, Y10
+	VPMULHW Y1, Y4, Y4
+	VPMULHW Y1, Y6, Y6
+	VPMULHW Y1, Y8, Y8
+	VPMULHW Y1, Y10, Y10
+	VPSUBW  Y4, Y12, Y4
+	VPSUBW  Y6, Y13, Y6
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y10, Y15, Y10
+
+	// reduce 2
+	VPMULHW Y2, Y3, Y12
+	VPMULHW Y2, Y5, Y13
+	VPMULHW Y2, Y7, Y14
+	VPMULHW Y2, Y9, Y15
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPSRAW  $11, Y14, Y14
+	VPSRAW  $11, Y15, Y15
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPMULLW Y1, Y14, Y14
+	VPMULLW Y1, Y15, Y15
+	VPSUBW  Y12, Y3, Y3
+	VPSUBW  Y13, Y5, Y5
+	VPSUBW  Y14, Y7, Y7
+	VPSUBW  Y15, Y9, Y9
+
+	// level 4
+	// shuffle
+	VPERM2I128 $0x02, Y3, Y4, Y11
+	VPERM2I128 $0x13, Y3, Y4, Y3
+	VPERM2I128 $0x02, Y5, Y6, Y4
+	VPERM2I128 $0x13, Y5, Y6, Y5
+	VPERM2I128 $0x02, Y7, Y8, Y6
+	VPERM2I128 $0x13, Y7, Y8, Y7
+	VPERM2I128 $0x02, Y9, Y10, Y8
+	VPERM2I128 $0x13, Y9, Y10, Y9
+
+	// update
+	VMOVDQU Y11, Y12
+	VMOVDQU Y4, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y8, Y15
+	VPADDW  Y11, Y3, Y10
+	VPADDW  Y4, Y5, Y4
+	VPADDW  Y6, Y7, Y6
+	VPADDW  Y8, Y9, Y8
+	VPSUBW  Y3, Y12, Y3
+	VPSUBW  Y5, Y13, Y5
+	VPSUBW  Y7, Y14, Y7
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1024)(R8), Y12
+	VMOVDQU (1056)(R8), Y13
+	VMOVDQU (1088)(R8), Y14
+	VMOVDQU (1120)(R8), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y3
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y7, Y7
+	VPSUBW  Y14, Y9, Y9
+
+	// level 5
+	// update
+	VMOVDQU Y10, Y12
+	VMOVDQU Y3, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y7, Y15
+	VPADDW  Y10, Y4, Y10
+	VPADDW  Y3, Y5, Y3
+	VPADDW  Y6, Y8, Y6
+	VPADDW  Y7, Y9, Y7
+	VPSUBW  Y4, Y12, Y4
+	VPSUBW  Y5, Y13, Y5
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1280)(SI), Y14
+	VMOVDQU (1312)(SI), Y15
+
+	// mul
+	VPMULLW Y14, Y4, Y11
+	VPMULLW Y14, Y5, Y12
+	VPMULLW Y15, Y8, Y13
+	VPMULHW Y14, Y4, Y4
+	VPMULHW Y14, Y5, Y5
+	VPMULHW Y15, Y8, Y8
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y4, Y4
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y8, Y8
+	VPSUBW  Y14, Y9, Y9
+
+	// reduce 2
+	VPMULHW Y2, Y10, Y12
+	VPMULHW Y2, Y6, Y13
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPSUBW  Y12, Y10, Y10
+	VPSUBW  Y13, Y6, Y6
+
+	// level 6
+	// update
+	VMOVDQU Y10, Y12
+	VMOVDQU Y3, Y13
+	VMOVDQU Y4, Y14
+	VMOVDQU Y5, Y15
+	VPADDW  Y10, Y6, Y10
+	VPADDW  Y3, Y7, Y3
+	VPADDW  Y4, Y8, Y4
+	VPADDW  Y5, Y9, Y5
+	VPSUBW  Y6, Y12, Y6
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1408)(SI), Y15
+
+	// mul
+	VPMULLW Y15, Y6, Y11
+	VPMULLW Y15, Y7, Y12
+	VPMULLW Y15, Y8, Y13
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y6, Y6
+	VPMULHW Y15, Y7, Y7
+	VPMULHW Y15, Y8, Y8
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y6, Y6
+	VPSUBW  Y12, Y7, Y7
+	VPSUBW  Y13, Y8, Y8
+	VPSUBW  Y14, Y9, Y9
+
+	// reduce 2
+	VPMULHW Y2, Y3, Y12
+	VPSRAW  $11, Y12, Y12
+	VPMULLW Y1, Y12, Y12
+	VPSUBW  Y12, Y3, Y3
+
+	// store
+	VMOVDQU Y10, (DI)
+	VMOVDQU Y3, (32)(DI)
+	VMOVDQU Y4, (64)(DI)
+	VMOVDQU Y5, (96)(DI)
+	VMOVDQU Y6, (128)(DI)
+	VMOVDQU Y7, (160)(DI)
+	VMOVDQU Y8, (192)(DI)
+	VMOVDQU Y9, (224)(DI)
+
+	ADDQ $256, DI
+	ADDQ $128, R8
+
+	// second round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (128)(DI), Y8
+	VMOVDQU (160)(DI), Y9
+	VMOVDQU (192)(DI), Y10
+	VMOVDQU (224)(DI), Y11
+
+	// reorder
+	VMOVDQU   ·lowdword<>(SB), Y3
+	VPAND     Y3, Y4, Y12
+	VPAND     Y3, Y5, Y13
+	VPAND     Y3, Y6, Y14
+	VPAND     Y3, Y7, Y15
+	VPSRLD    $16, Y4, Y4
+	VPSRLD    $16, Y5, Y5
+	VPSRLD    $16, Y6, Y6
+	VPSRLD    $16, Y7, Y7
+	VPACKUSDW Y5, Y4, Y5
+	VPACKUSDW Y13, Y12, Y4
+	VPACKUSDW Y7, Y6, Y7
+	VPACKUSDW Y15, Y14, Y6
+	/*
+	 VPERMQ $0xd8,Y4,Y4
+	 VPERMQ $0xd8,Y5,Y5
+	 VPERMQ $0xd8,Y6,Y6
+	 VPERMQ $0xd8,Y7,Y7
+	 */
+	VPERMQ_0xd8_top_half()
+	VPAND     Y3, Y8, Y12
+	VPAND     Y3, Y9, Y13
+	VPAND     Y3, Y10, Y14
+	VPAND     Y3, Y11, Y15
+	VPSRLD    $16, Y8, Y8
+	VPSRLD    $16, Y9, Y9
+	VPSRLD    $16, Y10, Y10
+	VPSRLD    $16, Y11, Y11
+	VPACKUSDW Y9, Y8, Y9
+	VPACKUSDW Y13, Y12, Y8
+	VPACKUSDW Y11, Y10, Y11
+	VPACKUSDW Y15, Y14, Y10
+	/*
+	 VPERMQ $0xd8,Y8,Y8
+	 VPERMQ $0xd8,Y9,Y9
+	 VPERMQ $0xd8,Y10,Y10
+	 VPERMQ $0xd8,Y11,Y11
+	*/
+	VPERMQ_0xd8_bottom_half()
+
+	// level 0
+	// update
+	VMOVDQU Y4, Y12
+	VMOVDQU Y6, Y13
+	VMOVDQU Y8, Y14
+	VMOVDQU Y10, Y15
+	VPADDW  Y4, Y5, Y4
+	VPADDW  Y6, Y7, Y6
+	VPADDW  Y8, Y9, Y8
+	VPADDW  Y10, Y11, Y10
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// zetas
+	VMOVDQU (R8), Y13
+	VMOVDQU (32)(R8), Y14
+	VMOVDQU (64)(R8), Y15
+	VMOVDQU (96)(R8), Y3
+
+	// mul
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y7, Y7
+	VPSUBW  Y14, Y9, Y9
+	VPSUBW  Y15, Y11, Y11
+
+	// level 1
+	// shuffle
+	VMOVDQU  ·vpshufb_idx<>(SB), Y3
+	VPSHUFB  Y3, Y4, Y12
+	VPSHUFB  Y3, Y5, Y13
+	VPSHUFB  Y3, Y6, Y14
+	VPSHUFB  Y3, Y7, Y15
+	VPBLENDW $0x55, Y4, Y13, Y4
+	VPBLENDW $0xAA, Y5, Y12, Y5
+	VPBLENDW $0x55, Y6, Y15, Y6
+	VPBLENDW $0xAA, Y7, Y14, Y7
+	VPSHUFB  Y3, Y8, Y12
+	VPSHUFB  Y3, Y9, Y13
+	VPSHUFB  Y3, Y10, Y14
+	VPSHUFB  Y3, Y11, Y15
+	VPBLENDW $0x55, Y8, Y13, Y8
+	VPBLENDW $0xAA, Y9, Y12, Y9
+	VPBLENDW $0x55, Y10, Y15, Y10
+	VPBLENDW $0xAA, Y11, Y14, Y11
+
+	// update
+	VMOVDQU Y4, Y12
+	VMOVDQU Y6, Y13
+	VMOVDQU Y8, Y14
+	VMOVDQU Y10, Y15
+	VPADDW  Y4, Y5, Y4
+	VPADDW  Y6, Y7, Y6
+	VPADDW  Y8, Y9, Y8
+	VPADDW  Y10, Y11, Y10
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// zetas
+	VMOVDQU (256)(R8), Y13
+	VMOVDQU (288)(R8), Y14
+	VMOVDQU (320)(R8), Y15
+	VMOVDQU (352)(R8), Y3
+
+	// mul
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y7, Y7
+	VPSUBW  Y14, Y9, Y9
+	VPSUBW  Y15, Y11, Y11
+
+	// reduce 2
+	VPMULHW Y2, Y4, Y12
+	VPMULHW Y2, Y6, Y13
+	VPMULHW Y2, Y8, Y14
+	VPMULHW Y2, Y10, Y15
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPSRAW  $11, Y14, Y14
+	VPSRAW  $11, Y15, Y15
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPMULLW Y1, Y14, Y14
+	VPMULLW Y1, Y15, Y15
+	VPSUBW  Y12, Y4, Y4
+	VPSUBW  Y13, Y6, Y6
+	VPSUBW  Y14, Y8, Y8
+	VPSUBW  Y15, Y10, Y10
+
+	// level 2
+	// shuffle
+	VPSHUFD  $0xB1, Y4, Y12
+	VPSHUFD  $0xB1, Y5, Y13
+	VPSHUFD  $0xB1, Y6, Y14
+	VPSHUFD  $0xB1, Y7, Y15
+	VPBLENDD $0x55, Y4, Y13, Y4
+	VPBLENDD $0xAA, Y5, Y12, Y5
+	VPBLENDD $0x55, Y6, Y15, Y6
+	VPBLENDD $0xAA, Y7, Y14, Y7
+	VPSHUFD  $0xB1, Y8, Y12
+	VPSHUFD  $0xB1, Y9, Y13
+	VPSHUFD  $0xB1, Y10, Y14
+	VPSHUFD  $0xB1, Y11, Y15
+	VPBLENDD $0x55, Y8, Y13, Y8
+	VPBLENDD $0xAA, Y9, Y12, Y9
+	VPBLENDD $0x55, Y10, Y15, Y10
+	VPBLENDD $0xAA, Y11, Y14, Y11
+
+	// update
+	VMOVDQU Y4, Y12
+	VMOVDQU Y6, Y13
+	VMOVDQU Y8, Y14
+	VMOVDQU Y10, Y15
+	VPADDW  Y4, Y5, Y4
+	VPADDW  Y6, Y7, Y6
+	VPADDW  Y8, Y9, Y8
+	VPADDW  Y10, Y11, Y10
+	VPSUBW  Y5, Y12, Y5
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y9, Y14, Y9
+	VPSUBW  Y11, Y15, Y11
+
+	// zetas
+	VMOVDQU (512)(R8), Y13
+	VMOVDQU (544)(R8), Y14
+	VMOVDQU (576)(R8), Y15
+	VMOVDQU (608)(R8), Y3
+
+	// mul
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y7, Y7
+	VPSUBW  Y14, Y9, Y9
+	VPSUBW  Y15, Y11, Y11
+
+	// level 3
+	// shuffle
+	VSHUFPD $0x00, Y5, Y4, Y3
+	VSHUFPD $0x0F, Y5, Y4, Y4
+	VSHUFPD $0x00, Y7, Y6, Y5
+	VSHUFPD $0x0F, Y7, Y6, Y6
+	VSHUFPD $0x00, Y9, Y8, Y7
+	VSHUFPD $0x0F, Y9, Y8, Y8
+	VSHUFPD $0x00, Y11, Y10, Y9
+	VSHUFPD $0x0F, Y11, Y10, Y10
+
+	// update
+	VMOVDQU Y3, Y12
+	VMOVDQU Y5, Y13
+	VMOVDQU Y7, Y14
+	VMOVDQU Y9, Y15
+	VPADDW  Y3, Y4, Y3
+	VPADDW  Y5, Y6, Y5
+	VPADDW  Y7, Y8, Y7
+	VPADDW  Y9, Y10, Y9
+	VPSUBW  Y4, Y12, Y4
+	VPSUBW  Y6, Y13, Y6
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y10, Y15, Y10
+
+	// zetas
+	VMOVDQU (768)(R8), Y12
+	VMOVDQU (800)(R8), Y13
+	VMOVDQU (832)(R8), Y14
+	VMOVDQU (864)(R8), Y15
+
+	// mul
+	VPMULLW Y12, Y4, Y11
+	VPMULHW Y12, Y4, Y4
+	VPMULLW Y13, Y6, Y12
+	VPMULHW Y13, Y6, Y6
+	VPMULLW Y14, Y8, Y13
+	VPMULHW Y14, Y8, Y8
+	VPMULLW Y15, Y10, Y14
+	VPMULHW Y15, Y10, Y10
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y4, Y4
+	VPSUBW  Y12, Y6, Y6
+	VPSUBW  Y13, Y8, Y8
+	VPSUBW  Y14, Y10, Y10
+
+	// reduce 2
+	VPMULHW Y2, Y3, Y12
+	VPMULHW Y2, Y5, Y13
+	VPMULHW Y2, Y7, Y14
+	VPMULHW Y2, Y9, Y15
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPSRAW  $11, Y14, Y14
+	VPSRAW  $11, Y15, Y15
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPMULLW Y1, Y14, Y14
+	VPMULLW Y1, Y15, Y15
+	VPSUBW  Y12, Y3, Y3
+	VPSUBW  Y13, Y5, Y5
+	VPSUBW  Y14, Y7, Y7
+	VPSUBW  Y15, Y9, Y9
+
+	// level 4
+	// shuffle
+	VPERM2I128 $0x02, Y3, Y4, Y11
+	VPERM2I128 $0x13, Y3, Y4, Y3
+	VPERM2I128 $0x02, Y5, Y6, Y4
+	VPERM2I128 $0x13, Y5, Y6, Y5
+	VPERM2I128 $0x02, Y7, Y8, Y6
+	VPERM2I128 $0x13, Y7, Y8, Y7
+	VPERM2I128 $0x02, Y9, Y10, Y8
+	VPERM2I128 $0x13, Y9, Y10, Y9
+
+	// update
+	VMOVDQU Y11, Y12
+	VMOVDQU Y4, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y8, Y15
+	VPADDW  Y11, Y3, Y10
+	VPADDW  Y4, Y5, Y4
+	VPADDW  Y6, Y7, Y6
+	VPADDW  Y8, Y9, Y8
+	VPSUBW  Y3, Y12, Y3
+	VPSUBW  Y5, Y13, Y5
+	VPSUBW  Y7, Y14, Y7
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1024)(R8), Y12
+	VMOVDQU (1056)(R8), Y13
+	VMOVDQU (1088)(R8), Y14
+	VMOVDQU (1120)(R8), Y15
+
+	// mul
+	VPMULLW Y12, Y3, Y11
+	VPMULHW Y12, Y3, Y3
+	VPMULLW Y13, Y5, Y12
+	VPMULHW Y13, Y5, Y5
+	VPMULLW Y14, Y7, Y13
+	VPMULHW Y14, Y7, Y7
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y3, Y3
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y7, Y7
+	VPSUBW  Y14, Y9, Y9
+
+	// level 5
+	// update
+	VMOVDQU Y10, Y12
+	VMOVDQU Y3, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y7, Y15
+	VPADDW  Y10, Y4, Y10
+	VPADDW  Y3, Y5, Y3
+	VPADDW  Y6, Y8, Y6
+	VPADDW  Y7, Y9, Y7
+	VPSUBW  Y4, Y12, Y4
+	VPSUBW  Y5, Y13, Y5
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1344)(SI), Y14
+	VMOVDQU (1376)(SI), Y15
+
+	// mul
+	VPMULLW Y14, Y4, Y11
+	VPMULLW Y14, Y5, Y12
+	VPMULLW Y15, Y8, Y13
+	VPMULHW Y14, Y4, Y4
+	VPMULHW Y14, Y5, Y5
+	VPMULHW Y15, Y8, Y8
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y4, Y4
+	VPSUBW  Y12, Y5, Y5
+	VPSUBW  Y13, Y8, Y8
+	VPSUBW  Y14, Y9, Y9
+
+	// reduce 2
+	VPMULHW Y2, Y10, Y12
+	VPMULHW Y2, Y6, Y13
+	VPSRAW  $11, Y12, Y12
+	VPSRAW  $11, Y13, Y13
+	VPMULLW Y1, Y12, Y12
+	VPMULLW Y1, Y13, Y13
+	VPSUBW  Y12, Y10, Y10
+	VPSUBW  Y13, Y6, Y6
+
+	// level 6
+	// update
+	VMOVDQU Y10, Y12
+	VMOVDQU Y3, Y13
+	VMOVDQU Y4, Y14
+	VMOVDQU Y5, Y15
+	VPADDW  Y10, Y6, Y10
+	VPADDW  Y3, Y7, Y3
+	VPADDW  Y4, Y8, Y4
+	VPADDW  Y5, Y9, Y5
+	VPSUBW  Y6, Y12, Y6
+	VPSUBW  Y7, Y13, Y7
+	VPSUBW  Y8, Y14, Y8
+	VPSUBW  Y9, Y15, Y9
+
+	// zetas
+	VMOVDQU (1440)(SI), Y15
+
+	// mul
+	VPMULLW Y15, Y6, Y11
+	VPMULLW Y15, Y7, Y12
+	VPMULLW Y15, Y8, Y13
+	VPMULLW Y15, Y9, Y14
+	VPMULHW Y15, Y6, Y6
+	VPMULHW Y15, Y7, Y7
+	VPMULHW Y15, Y8, Y8
+	VPMULHW Y15, Y9, Y9
+
+	// reduce
+	VPMULLW Y0, Y11, Y11
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULHW Y1, Y11, Y11
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPSUBW  Y11, Y6, Y6
+	VPSUBW  Y12, Y7, Y7
+	VPSUBW  Y13, Y8, Y8
+	VPSUBW  Y14, Y9, Y9
+
+	// reduce 2
+	VPMULHW Y2, Y3, Y12
+	VPSRAW  $11, Y12, Y12
+	VPMULLW Y1, Y12, Y12
+	VPSUBW  Y12, Y3, Y3
+
+	// store
+	VMOVDQU Y10, (DI)
+	VMOVDQU Y3, (32)(DI)
+	VMOVDQU Y4, (64)(DI)
+	VMOVDQU Y5, (96)(DI)
+	VMOVDQU Y6, (128)(DI)
+	VMOVDQU Y7, (160)(DI)
+	VMOVDQU Y8, (192)(DI)
+	VMOVDQU Y9, (224)(DI)
+
+	SUBQ $256, DI
+
+	// f
+	VMOVDQU ·f_x16<>(SB), Y2
+
+	// first round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (256)(DI), Y8
+	VMOVDQU (288)(DI), Y9
+	VMOVDQU (320)(DI), Y10
+	VMOVDQU (352)(DI), Y11
+
+	// level 7
+	// update
+	VMOVDQU Y4, Y12
+	VMOVDQU Y5, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y7, Y15
+	VPADDW  Y4, Y8, Y4
+	VPADDW  Y5, Y9, Y5
+	VPADDW  Y6, Y10, Y6
+	VPADDW  Y7, Y11, Y7
+	VPSUBW  Y8, Y12, Y8
+	VPSUBW  Y9, Y13, Y9
+	VPSUBW  Y10, Y14, Y10
+	VPSUBW  Y11, Y15, Y11
+
+	// zeta
+	VMOVDQU (1472)(SI), Y3
+
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULLW Y3, Y9, Y13
+	VPMULLW Y3, Y10, Y14
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y8, Y8
+	VPMULHW Y3, Y9, Y9
+	VPMULHW Y3, Y10, Y10
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y8
+	VPSUBW  Y13, Y9, Y9
+	VPSUBW  Y14, Y10, Y10
+	VPSUBW  Y15, Y11, Y11
+	VPADDW  Y1, Y8, Y8
+	VPADDW  Y1, Y9, Y9
+	VPADDW  Y1, Y10, Y10
+	VPADDW  Y1, Y11, Y11
+
+	// mul
+	VPMULLW Y2, Y4, Y12
+	VPMULLW Y2, Y5, Y13
+	VPMULLW Y2, Y6, Y14
+	VPMULLW Y2, Y7, Y15
+	VPMULHW Y2, Y4, Y4
+	VPMULHW Y2, Y5, Y5
+	VPMULHW Y2, Y6, Y6
+	VPMULHW Y2, Y7, Y7
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y4, Y4
+	VPSUBW  Y13, Y5, Y5
+	VPSUBW  Y14, Y6, Y6
+	VPSUBW  Y15, Y7, Y7
+	VPADDW  Y1, Y4, Y4
+	VPADDW  Y1, Y5, Y5
+	VPADDW  Y1, Y6, Y6
+	VPADDW  Y1, Y7, Y7
+
+	// store
+	VMOVDQU Y4, (DI)
+	VMOVDQU Y5, (32)(DI)
+	VMOVDQU Y6, (64)(DI)
+	VMOVDQU Y7, (96)(DI)
+	VMOVDQU Y8, (256)(DI)
+	VMOVDQU Y9, (288)(DI)
+	VMOVDQU Y10, (320)(DI)
+	VMOVDQU Y11, (352)(DI)
+
+	ADDQ $128, DI
+
+	// second round
+	// load
+	VMOVDQU (DI), Y4
+	VMOVDQU (32)(DI), Y5
+	VMOVDQU (64)(DI), Y6
+	VMOVDQU (96)(DI), Y7
+	VMOVDQU (256)(DI), Y8
+	VMOVDQU (288)(DI), Y9
+	VMOVDQU (320)(DI), Y10
+	VMOVDQU (352)(DI), Y11
+
+	// zeta
+	VMOVDQU (1472)(SI), Y3
+
+	// level 7
+	// update
+	VMOVDQU Y4, Y12
+	VMOVDQU Y5, Y13
+	VMOVDQU Y6, Y14
+	VMOVDQU Y7, Y15
+	VPADDW  Y4, Y8, Y4
+	VPADDW  Y5, Y9, Y5
+	VPADDW  Y6, Y10, Y6
+	VPADDW  Y7, Y11, Y7
+	VPSUBW  Y8, Y12, Y8
+	VPSUBW  Y9, Y13, Y9
+	VPSUBW  Y10, Y14, Y10
+	VPSUBW  Y11, Y15, Y11
+
+	// mul
+	VPMULLW Y3, Y8, Y12
+	VPMULLW Y3, Y9, Y13
+	VPMULLW Y3, Y10, Y14
+	VPMULLW Y3, Y11, Y15
+	VPMULHW Y3, Y8, Y8
+	VPMULHW Y3, Y9, Y9
+	VPMULHW Y3, Y10, Y10
+	VPMULHW Y3, Y11, Y11
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y8, Y8
+	VPSUBW  Y13, Y9, Y9
+	VPSUBW  Y14, Y10, Y10
+	VPSUBW  Y15, Y11, Y11
+	VPADDW  Y1, Y8, Y8
+	VPADDW  Y1, Y9, Y9
+	VPADDW  Y1, Y10, Y10
+	VPADDW  Y1, Y11, Y11
+
+	// mul
+	VPMULLW Y2, Y4, Y12
+	VPMULLW Y2, Y5, Y13
+	VPMULLW Y2, Y6, Y14
+	VPMULLW Y2, Y7, Y15
+	VPMULHW Y2, Y4, Y4
+	VPMULHW Y2, Y5, Y5
+	VPMULHW Y2, Y6, Y6
+	VPMULHW Y2, Y7, Y7
+
+	// reduce
+	VPMULLW Y0, Y12, Y12
+	VPMULLW Y0, Y13, Y13
+	VPMULLW Y0, Y14, Y14
+	VPMULLW Y0, Y15, Y15
+	VPMULHW Y1, Y12, Y12
+	VPMULHW Y1, Y13, Y13
+	VPMULHW Y1, Y14, Y14
+	VPMULHW Y1, Y15, Y15
+	VPSUBW  Y12, Y4, Y4
+	VPSUBW  Y13, Y5, Y5
+	VPSUBW  Y14, Y6, Y6
+	VPSUBW  Y15, Y7, Y7
+	VPADDW  Y1, Y4, Y4
+	VPADDW  Y1, Y5, Y5
+	VPADDW  Y1, Y6, Y6
+	VPADDW  Y1, Y7, Y7
+
+	// store
+	VMOVDQU Y4, (DI)
+	VMOVDQU Y5, (32)(DI)
+	VMOVDQU Y6, (64)(DI)
+	VMOVDQU Y7, (96)(DI)
+	VMOVDQU Y8, (256)(DI)
+	VMOVDQU Y9, (288)(DI)
+	VMOVDQU Y10, (320)(DI)
+	VMOVDQU Y11, (352)(DI)
+
+	VZEROUPPER
+	RET
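
A note on the reduction idioms used throughout the routine above: each
"// reduce" block is a Montgomery-style reduction in 16-bit lanes (the low
half of a product is multiplied by what is presumably a precomputed inverse
of q held in Y0, the high half of that times q in Y1 is then subtracted),
and the "// reduce 2" blocks are a Barrett-style step that keeps the
intermediate coefficients bounded. The scalar montgomeryReduce used by the
pure-Go path works the same way with R = 2^18 (consistent with the constant
4613 = R^2 mod 7681 seen in pointwiseAcc below). A minimal scalar sketch,
assuming the round-1 Kyber constants q = 7681 and qInv = 7679, which the
package's own reduce.go may not match exactly:

// Sketch only, not part of this change; the constants are assumptions
// based on the round-1 Kyber parameter set.
const (
	qSketch    = 7681
	qInvSketch = 7679 // qSketch * qInvSketch == -1 (mod 2^18)
	rLogSketch = 18
)

// montgomeryReduceSketch returns a value congruent to a * R^-1 mod q for
// R = 2^18, given a < q*R.  The result fits in 16 bits but is only
// partially reduced; it can still be >= q.
func montgomeryReduceSketch(a uint32) uint16 {
	u := a * qInvSketch                  // m = a * (-q^-1); only the low bits matter
	u &= (1 << rLogSketch) - 1           // m mod R
	u *= qSketch                         // m * q
	return uint16((a + u) >> rLogSketch) // (a + m*q) / R
}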

+ 14 - 0
hwaccel_ref.go

@@ -0,0 +1,14 @@
+// hwaccel_ref.go - Unaccelerated stubs.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to the software, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build !amd64 gccgo noasm !go1.10
+
+package kyber
+
+func initHardwareAcceleration() {
+	forceDisableHardwareAcceleration()
+}
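
On any target that fails the build constraint above, this stub leaves the
pure-Go nttRef/invnttRef routines in place. On amd64 the matching
initHardwareAcceleration lives in hwaccel_amd64.go (only partially shown
here) and swaps the function pointers over after probing the CPU. A minimal
sketch of that shape, where supportsAVX2, nttAVX2 and invnttAVX2 are
hypothetical stand-ins for the real feature probe and assembly wrappers:

// Illustrative only; the actual hwaccel_amd64.go in this change does its
// own feature detection and wiring.  supportsAVX2, nttAVX2 and invnttAVX2
// are hypothetical placeholders, not identifiers from the diff.
func initHardwareAcceleration() {
	if supportsAVX2() {
		isHardwareAccelerated = true
		hardwareAccelImpl = implAVX2
		nttFn = nttAVX2
		invnttFn = invnttAVX2
		return
	}
	forceDisableHardwareAcceleration()
}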

+ 51 - 10
kem_test.go

@@ -17,17 +17,41 @@ import (
 
 const nTests = 100
 
-var allParams = []*ParameterSet{
-	Kyber512,
-	Kyber768,
-	Kyber1024,
+var (
+	allParams = []*ParameterSet{
+		Kyber512,
+		Kyber768,
+		Kyber1024,
+	}
+
+	canAccelerate bool
+)
+
+func mustInitHardwareAcceleration() {
+	initHardwareAcceleration()
+	if !IsHardwareAccelerated() {
+		panic("initHardwareAcceleration() failed")
+	}
 }
 
 func TestKEM(t *testing.T) {
+	forceDisableHardwareAcceleration()
+	doTestKEM(t)
+
+	if !canAccelerate {
+		t.Log("Hardware acceleration not supported on this host.")
+		return
+	}
+	mustInitHardwareAcceleration()
+	doTestKEM(t)
+}
+
+func doTestKEM(t *testing.T) {
+	impl := "_" + hardwareAccelImpl
 	for _, p := range allParams {
-		t.Run(p.Name()+"_Keys", func(t *testing.T) { doTestKEMKeys(t, p) })
-		t.Run(p.Name()+"_Invalid_SecretKey_A", func(t *testing.T) { doTestKEMInvalidSkA(t, p) })
-		t.Run(p.Name()+"_Invalid_CipherText", func(t *testing.T) { doTestKEMInvalidCipherText(t, p) })
+		t.Run(p.Name()+"_Keys"+impl, func(t *testing.T) { doTestKEMKeys(t, p) })
+		t.Run(p.Name()+"_Invalid_SecretKey_A"+impl, func(t *testing.T) { doTestKEMInvalidSkA(t, p) })
+		t.Run(p.Name()+"_Invalid_CipherText"+impl, func(t *testing.T) { doTestKEMInvalidCipherText(t, p) })
 	}
 }
 
@@ -132,10 +156,23 @@ func requirePublicKeyEqual(require *require.Assertions, a, b *PublicKey) {
 }
 
 func BenchmarkKEM(b *testing.B) {
+	forceDisableHardwareAcceleration()
+	doBenchmarkKEM(b)
+
+	if !canAccelerate {
+		b.Log("Hardware acceleration not supported on this host.")
+		return
+	}
+	mustInitHardwareAcceleration()
+	doBenchmarkKEM(b)
+}
+
+func doBenchmarkKEM(b *testing.B) {
+	impl := "_" + hardwareAccelImpl
 	for _, p := range allParams {
-		b.Run(p.Name()+"_GenerateKeyPair", func(b *testing.B) { doBenchKEMGenerateKeyPair(b, p) })
-		b.Run(p.Name()+"_KEMEncrypt", func(b *testing.B) { doBenchKEMEncDec(b, p, true) })
-		b.Run(p.Name()+"_KEMDecrypt", func(b *testing.B) { doBenchKEMEncDec(b, p, false) })
+		b.Run(p.Name()+"_GenerateKeyPair"+impl, func(b *testing.B) { doBenchKEMGenerateKeyPair(b, p) })
+		b.Run(p.Name()+"_KEMEncrypt"+impl, func(b *testing.B) { doBenchKEMEncDec(b, p, true) })
+		b.Run(p.Name()+"_KEMDecrypt"+impl, func(b *testing.B) { doBenchKEMEncDec(b, p, false) })
 	}
 }
 
@@ -183,3 +220,7 @@ func doBenchKEMEncDec(b *testing.B, p *ParameterSet, isEnc bool) {
 		}
 	}
 }
+
+func init() {
+	canAccelerate = IsHardwareAccelerated()
+}
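
The kem_vectors_test.go and kex_test.go changes below repeat the same
disable/run, re-enable/run sequence used here. If more tests grow this
shape, it could be factored into a small helper built from the functions
added above; an illustrative sketch (not part of this change):

// testBothImpls runs body once against the reference code and, when the
// host supports it, once more against the accelerated implementation.
func testBothImpls(t *testing.T, body func(t *testing.T)) {
	forceDisableHardwareAcceleration()
	body(t)

	if !canAccelerate {
		t.Log("Hardware acceleration not supported on this host.")
		return
	}
	mustInitHardwareAcceleration()
	body(t)
}

TestKEM, for example, would then reduce to testBothImpls(t, doTestKEM).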

+ 15 - 2
kem_vectors_test.go

@@ -30,12 +30,25 @@ func TestKEMVectors(t *testing.T) {
 		t.Fatalf("loadCompactTestVectors(): %v", err)
 	}
 
+	forceDisableHardwareAcceleration()
+	doTestKEMVectors(t)
+
+	if !canAccelerate {
+		t.Log("Hardware acceleration not supported on this host.")
+		return
+	}
+	mustInitHardwareAcceleration()
+	doTestKEMVectors(t)
+}
+
+func doTestKEMVectors(t *testing.T) {
+	impl := "_" + hardwareAccelImpl
 	for _, p := range allParams {
-		t.Run(p.Name(), func(t *testing.T) { doTestKEMVectors(t, p) })
+		t.Run(p.Name()+impl, func(t *testing.T) { doTestKEMVectorsPick(t, p) })
 	}
 }
 
-func doTestKEMVectors(t *testing.T, p *ParameterSet) {
+func doTestKEMVectorsPick(t *testing.T, p *ParameterSet) {
 	require := require.New(t)
 
 	// The full test vectors are gigantic, and aren't checked into the

+ 15 - 2
kex_test.go

@@ -15,9 +15,22 @@ import (
 )
 
 func TestAKE(t *testing.T) {
+	forceDisableHardwareAcceleration()
+	doTestKEX(t)
+
+	if !canAccelerate {
+		t.Log("Hardware acceleration not supported on this host.")
+		return
+	}
+	mustInitHardwareAcceleration()
+	doTestKEX(t)
+}
+
+func doTestKEX(t *testing.T) {
+	impl := "_" + hardwareAccelImpl
 	for _, p := range allParams {
-		t.Run(p.Name()+"_UAKE", func(t *testing.T) { doTestUAKE(t, p) })
-		t.Run(p.Name()+"_AKE", func(t *testing.T) { doTestAKE(t, p) })
+		t.Run(p.Name()+"_UAKE"+impl, func(t *testing.T) { doTestUAKE(t, p) })
+		t.Run(p.Name()+"_AKE"+impl, func(t *testing.T) { doTestAKE(t, p) })
 	}
 }
 

+ 2 - 2
ntt.go

@@ -10,7 +10,7 @@ package kyber
 // Computes negacyclic number-theoretic transform (NTT) of a polynomial (vector
 // of 256 coefficients) in place; inputs assumed to be in normal order, output
 // in bitreversed order.
-func ntt(p *[kyberN]uint16) {
+func nttRef(p *[kyberN]uint16) {
 	var j int
 	k := 1
 	for level := 7; level >= 0; level-- {
@@ -35,7 +35,7 @@ func ntt(p *[kyberN]uint16) {
 // Computes inverse of negacyclic number-theoretic transform (NTT) of a
 // polynomial (vector of 256 coefficients) in place; inputs assumed to be in
 // bitreversed order, output in normal order.
-func invntt(a *[kyberN]uint16) {
+func invnttRef(a *[kyberN]uint16) {
 	for level := 0; level < 8; level++ {
 		distance := 1 << uint(level)
 		for start := 0; start < distance; start++ {

+ 2 - 2
poly.go

@@ -121,14 +121,14 @@ func (p *poly) getNoise(seed []byte, nonce byte, eta int) {
 // Computes negacyclic number-theoretic transform (NTT) of a polynomial in
 // place; inputs assumed to be in normal order, output in bitreversed order.
 func (p *poly) ntt() {
-	ntt(&p.coeffs)
+	nttFn(&p.coeffs)
 }
 
 // Computes inverse of negacyclic number-theoretic transform (NTT) of a
 // polynomial in place; inputs assumed to be in bitreversed order, output in
 // normal order.
 func (p *poly) invntt() {
-	invntt(&p.coeffs)
+	invnttFn(&p.coeffs)
 }
 
 // Add two polynomials.

+ 10 - 1
polyvec.go

@@ -93,7 +93,16 @@ func (p *poly) pointwiseAcc(a, b *polyVec) {
 			t = montgomeryReduce(4613 * uint32(b.vec[i].coeffs[j]))
 			p.coeffs[j] += montgomeryReduce(uint32(a.vec[i].coeffs[j]) * uint32(t))
 		}
-		p.coeffs[j] = barrettReduce(p.coeffs[j])
+
+		// HACK HACK HACK:
+		//
+		// The AVX2 code assumes fully reduced coefficients.  Since it's
+		// the only acceleration target right now, just do this here.
+		if isHardwareAccelerated {
+			p.coeffs[j] = freeze(p.coeffs[j])
+		} else {
+			p.coeffs[j] = barrettReduce(p.coeffs[j])
+		}
 	}
 }
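
The hack above exists because barrettReduce only partially reduces a
coefficient: the result stays congruent mod q but can still carry one extra
multiple of q, while the AVX2 routines want their inputs already in the
canonical range [0, q). A scalar sketch of the two reductions, again
assuming the round-1 modulus q = 7681 (the package's actual reduce.go may
differ in detail):

// Sketch only, not part of this change; q = 7681 is an assumption,
// as in the earlier sketch.
const qSketch = 7681

// barrettReduceSketch returns a value congruent to a mod q; the result
// may still contain one extra multiple of q.
func barrettReduceSketch(a uint16) uint16 {
	u := uint32(a>>13) * qSketch
	return a - uint16(u)
}

// freezeSketch finishes the job with a constant-time conditional
// subtraction, returning a canonical value in [0, q) as the AVX2 code
// expects.
func freezeSketch(a uint16) uint16 {
	r := barrettReduceSketch(a)
	m := r - qSketch               // r - q, wraps around when r < q
	mask := uint16(int16(m) >> 15) // 0xffff iff r < q, else 0
	return m ^ ((r ^ m) & mask)    // r if r < q, else r - q
}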