123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256 |
- // hwaccel_amd64.go - AMD64 optimized routines.
- //
- // To the extent possible under law, Yawning Angel has waived all copyright
- // and related or neighboring rights to the software, using the Creative
- // Commons "CC0" public domain dedication. See LICENSE or
- // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
- // +build amd64,!gccgo,!noasm,go1.10
- package kyber
- var zetasExp = [752]uint16{
- 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
- 3777, 3777, 3777, 3777, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
- 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 3625, 3625, 3625, 3625,
- 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625,
- 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985,
- 3985, 3985, 3985, 3985, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581,
- 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 2456, 2456, 2456, 2456,
- 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456,
- 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194,
- 2194, 2194, 2194, 2194, 121, 121, 121, 121, 121, 121, 121, 121, 121,
- 121, 121, 121, 121, 121, 121, 121, 5431, 5431, 5431, 5431, 5431, 5431,
- 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 834, 834,
- 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834,
- 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186,
- 5186, 5186, 5186, 5186, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362,
- 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 2876, 2876, 2876, 2876,
- 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876,
- 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980,
- 5980, 5980, 5980, 5980, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
- 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 2816, 2816, 2816, 2816,
- 2816, 2816, 2816, 2816, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5593,
- 5444, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 1986, 1986, 1986, 1986,
- 1986, 1986, 1986, 1986, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 6082,
- 1993, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 3706, 3706, 3706, 3706,
- 3706, 3706, 3706, 3706, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 5675,
- 6156, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 5124, 5124, 5124, 5124,
- 5124, 5124, 5124, 5124, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1296,
- 1483, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 4851, 4851, 4851, 4851,
- 4851, 4851, 4851, 4851, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 3364,
- 617, 617, 617, 617, 617, 617, 617, 617, 1921, 1921, 1921, 1921, 1921,
- 1921, 1921, 1921, 3992, 3992, 3992, 3992, 5943, 5943, 5943, 5943, 3266,
- 3266, 3266, 3266, 4081, 4081, 4081, 4081, 810, 810, 810, 810, 1887,
- 1887, 1887, 1887, 7043, 7043, 7043, 7043, 7674, 7674, 7674, 7674, 7243,
- 7243, 7243, 7243, 7002, 7002, 7002, 7002, 6376, 6376, 6376, 6376, 5921,
- 5921, 5921, 5921, 396, 396, 396, 396, 4507, 4507, 4507, 4507, 4126,
- 4126, 4126, 4126, 5800, 5800, 5800, 5800, 3772, 3772, 3772, 3772, 5146,
- 5146, 5146, 5146, 5241, 5241, 5241, 5241, 5126, 5126, 5126, 5126, 1535,
- 1535, 1535, 1535, 7132, 7132, 7132, 7132, 3153, 3153, 3153, 3153, 2310,
- 2310, 2310, 2310, 6282, 6282, 6282, 6282, 1321, 1321, 1321, 1321, 514,
- 514, 514, 514, 4725, 4725, 4725, 4725, 7578, 7578, 7578, 7578, 2804,
- 2804, 2804, 2804, 5638, 5638, 5638, 5638, 6250, 6250, 6250, 6250, 6627,
- 6627, 1698, 1698, 4225, 4225, 1166, 1166, 2426, 2426, 3831, 3831, 915,
- 915, 7679, 7679, 4264, 4264, 7487, 7487, 2919, 2919, 2789, 2789, 3405,
- 3405, 2385, 2385, 5568, 5568, 4949, 4949, 2175, 2175, 373, 373, 3692,
- 3692, 6951, 6951, 5925, 5925, 3135, 3135, 5290, 5290, 660, 660, 6184,
- 6184, 2572, 2572, 4536, 4536, 1350, 1350, 5457, 5457, 4093, 4093, 6000,
- 6000, 2883, 2883, 6291, 6291, 1598, 1598, 3750, 3750, 2762, 2762, 2835,
- 2835, 2764, 2764, 5448, 5448, 3816, 3816, 6148, 6148, 1464, 1464, 6954,
- 6954, 1521, 1521, 1386, 1386, 4253, 4253, 6760, 6760, 4938, 4938, 5521,
- 5521, 2649, 2649, 6822, 6822, 2579, 2579, 1532, 1532, 1919, 1919, 7195,
- 7195, 404, 404, 6625, 6625, 783, 783, 1799, 1799, 5016, 5016, 3480,
- 3480, 2133, 2133, 4371, 4371, 6513, 6513, 7664, 3744, 2422, 2001, 1278,
- 929, 6333, 5451, 7502, 6439, 5622, 6611, 2161, 1649, 2072, 3177, 5610,
- 1121, 7245, 236, 715, 670, 7023, 6205, 5303, 2767, 3542, 7455, 1203,
- 1181, 7530, 3887, 1712, 7459, 2786, 7230, 4134, 1779, 6530, 7247, 3568,
- 3988, 3581, 6095, 1509, 2918, 2339, 6274, 3434, 4131, 2340, 2891, 2998,
- 4367, 3461, 4962, 5434, 5092, 1144, 1072, 1295, 4866, 3911, 3450, 3781,
- 5423, 796, 3163, 4473, 7092, 2963, 7557, 3214, 3334, 4315, 3936, 3723,
- 1931, 7252, 7279, 4273, 83, 6155, 826, 6343, 2345, 5378, 2515, 7039,
- 5844, 4716, 6890, 370, 293, 3312, 2083, 5992, 6904, 2070, 2262, 6788,
- 2386, 7493, 6162, 4807, 6277, 1012, 2130, 1441, 2532, 4346, 6597, 4338,
- 2937, 509, 6278, 2812, 3763, 592, 2005, 3657, 2460, 4004, 3752, 692,
- 1669, 2167, 4394,
- }
- var zetasInvExp = [752]uint16{
- 3287, 5514, 6012, 6989, 3929, 3677, 5221, 4024, 5676, 7089, 3918, 4869,
- 1403, 7172, 4744, 3343, 1084, 3335, 5149, 6240, 5551, 6669, 1404, 2874,
- 1519, 188, 5295, 893, 5419, 5611, 777, 1689, 5598, 4369, 7388, 7311,
- 791, 2965, 1837, 642, 5166, 2303, 5336, 1338, 6855, 1526, 7598, 3408,
- 402, 429, 5750, 3958, 3745, 3366, 4347, 4467, 124, 4718, 589, 3208,
- 4518, 6885, 2258, 3900, 4231, 3770, 2815, 6386, 6609, 6537, 2589, 2247,
- 2719, 4220, 3314, 4683, 4790, 5341, 3550, 4247, 1407, 5342, 4763, 6172,
- 1586, 4100, 3693, 4113, 434, 1151, 5902, 3547, 451, 4895, 222, 5969,
- 3794, 151, 6500, 6478, 226, 4139, 4914, 2378, 1476, 658, 7011, 6966,
- 7445, 436, 6560, 2071, 4504, 5609, 6032, 5520, 1070, 2059, 1242, 179,
- 2230, 1348, 6752, 6403, 5680, 5259, 3937, 17, 1168, 1168, 3310, 3310,
- 5548, 5548, 4201, 4201, 2665, 2665, 5882, 5882, 6898, 6898, 1056, 1056,
- 7277, 7277, 486, 486, 5762, 5762, 6149, 6149, 5102, 5102, 859, 859,
- 5032, 5032, 2160, 2160, 2743, 2743, 921, 921, 3428, 3428, 6295, 6295,
- 6160, 6160, 727, 727, 6217, 6217, 1533, 1533, 3865, 3865, 2233, 2233,
- 4917, 4917, 4846, 4846, 4919, 4919, 3931, 3931, 6083, 6083, 1390, 1390,
- 4798, 4798, 1681, 1681, 3588, 3588, 2224, 2224, 6331, 6331, 3145, 3145,
- 5109, 5109, 1497, 1497, 7021, 7021, 2391, 2391, 4546, 4546, 1756, 1756,
- 730, 730, 3989, 3989, 7308, 7308, 5506, 5506, 2732, 2732, 2113, 2113,
- 5296, 5296, 4276, 4276, 4892, 4892, 4762, 4762, 194, 194, 3417, 3417, 2,
- 2, 6766, 6766, 3850, 3850, 5255, 5255, 6515, 6515, 3456, 3456, 5983,
- 5983, 1054, 1054, 1431, 1431, 1431, 1431, 2043, 2043, 2043, 2043, 4877,
- 4877, 4877, 4877, 103, 103, 103, 103, 2956, 2956, 2956, 2956, 7167,
- 7167, 7167, 7167, 6360, 6360, 6360, 6360, 1399, 1399, 1399, 1399, 5371,
- 5371, 5371, 5371, 4528, 4528, 4528, 4528, 549, 549, 549, 549, 6146,
- 6146, 6146, 6146, 2555, 2555, 2555, 2555, 2440, 2440, 2440, 2440, 2535,
- 2535, 2535, 2535, 3909, 3909, 3909, 3909, 1881, 1881, 1881, 1881, 3555,
- 3555, 3555, 3555, 3174, 3174, 3174, 3174, 7285, 7285, 7285, 7285, 1760,
- 1760, 1760, 1760, 1305, 1305, 1305, 1305, 679, 679, 679, 679, 438, 438,
- 438, 438, 7, 7, 7, 7, 638, 638, 638, 638, 5794, 5794, 5794, 5794, 6871,
- 6871, 6871, 6871, 3600, 3600, 3600, 3600, 4415, 4415, 4415, 4415, 1738,
- 1738, 1738, 1738, 3689, 3689, 3689, 3689, 5760, 5760, 5760, 5760, 5760,
- 5760, 5760, 5760, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 4317,
- 4317, 4317, 4317, 4317, 4317, 4317, 4317, 2830, 2830, 2830, 2830, 2830,
- 2830, 2830, 2830, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6385,
- 6385, 6385, 6385, 6385, 6385, 6385, 6385, 2557, 2557, 2557, 2557, 2557,
- 2557, 2557, 2557, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 2006,
- 2006, 2006, 2006, 2006, 2006, 2006, 2006, 3975, 3975, 3975, 3975, 3975,
- 3975, 3975, 3975, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 1599,
- 1599, 1599, 1599, 1599, 1599, 1599, 1599, 5695, 5695, 5695, 5695, 5695,
- 5695, 5695, 5695, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2088,
- 2088, 2088, 2088, 2088, 2088, 2088, 2088, 4865, 4865, 4865, 4865, 4865,
- 4865, 4865, 4865, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267,
- 6267, 6267, 6267, 6267, 6267, 6267, 6267, 1701, 1701, 1701, 1701, 1701,
- 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 4805,
- 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805,
- 4805, 4805, 4805, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319,
- 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2495, 2495, 2495, 2495, 2495,
- 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 6847,
- 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847,
- 6847, 6847, 6847, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250,
- 2250, 2250, 2250, 2250, 2250, 2250, 2250, 7560, 7560, 7560, 7560, 7560,
- 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 5487,
- 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487,
- 5487, 5487, 5487, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225,
- 5225, 5225, 5225, 5225, 5225, 5225, 5225, 1100, 1100, 1100, 1100, 1100,
- 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 3696,
- 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696,
- 3696, 3696, 3696, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056,
- 4056, 4056, 4056, 4056, 4056, 4056, 4056, 3182, 3182, 3182, 3182, 3182,
- 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 5776,
- 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776,
- 5776, 5776, 5776,
- }
- //go:noescape
- func cpuidAmd64(cpuidParams *uint32)
- //go:noescape
- func xgetbv0Amd64(xcrVec *uint32)
- //go:noescape
- func nttAVX2(inout, zetas *uint16)
- //go:noescape
- func invnttAVX2(inout, omegas *uint16)
- //go:noescape
- func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
- //go:noescape
- func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
- //go:noescape
- func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
- //go:noescape
- func cbdEta4AVX2(dst *uint16, buf *byte)
- func supportsAVX2() bool {
- // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
- const (
- osXsaveBit = 1 << 27
- avx2Bit = 1 << 5
- )
- // Check to see if CPUID actually supports the leaf that indicates AVX2.
- // CPUID.(EAX=0H, ECX=0H) >= 7
- regs := [4]uint32{0x00}
- cpuidAmd64(®s[0])
- if regs[0] < 7 {
- return false
- }
- // Check to see if the OS knows how to save/restore XMM/YMM state.
- // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
- regs = [4]uint32{0x01}
- cpuidAmd64(®s[0])
- if regs[2]&osXsaveBit == 0 {
- return false
- }
- xcrRegs := [2]uint32{}
- xgetbv0Amd64(&xcrRegs[0])
- if xcrRegs[0]&6 != 6 {
- return false
- }
- // Check for AVX2 support.
- // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
- regs = [4]uint32{0x07}
- cpuidAmd64(®s[0])
- return regs[1]&avx2Bit != 0
- }
- var implAVX2 = &hwaccelImpl{
- name: "AVX2",
- nttFn: nttYMM,
- invnttFn: invnttYMM,
- pointwiseAccFn: pointwiseAccYMM,
- cbdFn: cbdYMM,
- }
- func nttYMM(p *[kyberN]uint16) {
- nttAVX2(&p[0], &zetasExp[0])
- }
- func invnttYMM(a *[kyberN]uint16) {
- invnttAVX2(&a[0], &zetasInvExp[0])
- }
- func pointwiseAccYMM(p *poly, a, b *polyVec) {
- // Unlike the C code, a polyVec won't have the polys in contigious
- // memory. So each assembly function takes vectors of pointers to
- // each polyvec's polys.
- //
- // Kind of ugly, but it's the price to pay for flexibility...
- var aVec, bVec [4]*uint16 // k is in {2,3,4}.
- for i := range a.vec {
- aVec[i] = &a.vec[i].coeffs[0]
- bVec[i] = &b.vec[i].coeffs[0]
- }
- switch len(a.vec) {
- case 2:
- pointwiseAccK2AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
- case 3:
- pointwiseAccK3AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
- case 4:
- pointwiseAccK4AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
- }
- }
- func cbdYMM(p *poly, buf []byte, eta int) {
- switch eta {
- case 4:
- cbdEta4AVX2(&p.coeffs[0], &buf[0])
- default:
- cbdRef(p, buf, eta)
- }
- }
- func initHardwareAcceleration() {
- if supportsAVX2() {
- isHardwareAccelerated = true
- hardwareAccelImpl = implAVX2
- }
- }
|