hwaccel_amd64.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. // hwaccel_amd64.go - AMD64 optimized routines.
  2. //
  3. // To the extent possible under law, Yawning Angel has waived all copyright
  4. // and related or neighboring rights to the software, using the Creative
  5. // Commons "CC0" public domain dedication. See LICENSE or
  6. // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
  7. // +build amd64,!gccgo,!noasm,go1.10
  8. package kyber
  // zetasExp is the 752-entry uint16 constant table whose first element's
  // address nttYMM passes to nttAVX2 as the `zetas` argument.
  //
  // The entries appear in runs of 16, then 8, then 4, then 2 identical values,
  // followed by single values.
  //
  // NOTE(review): that layout looks like forward-NTT twiddle factors that have
  // been pre-expanded for vector-width loads -- confirm against the assembly
  // implementation before editing or reordering any entry.
  9. var zetasExp = [752]uint16{
  10. 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
  11. 3777, 3777, 3777, 3777, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
  12. 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 3625, 3625, 3625, 3625,
  13. 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625,
  14. 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985,
  15. 3985, 3985, 3985, 3985, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581,
  16. 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 2456, 2456, 2456, 2456,
  17. 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456,
  18. 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194,
  19. 2194, 2194, 2194, 2194, 121, 121, 121, 121, 121, 121, 121, 121, 121,
  20. 121, 121, 121, 121, 121, 121, 121, 5431, 5431, 5431, 5431, 5431, 5431,
  21. 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 834, 834,
  22. 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834,
  23. 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186,
  24. 5186, 5186, 5186, 5186, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362,
  25. 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 2876, 2876, 2876, 2876,
  26. 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876,
  27. 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980,
  28. 5980, 5980, 5980, 5980, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
  29. 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 2816, 2816, 2816, 2816,
  30. 2816, 2816, 2816, 2816, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5593,
  31. 5444, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 1986, 1986, 1986, 1986,
  32. 1986, 1986, 1986, 1986, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 6082,
  33. 1993, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 3706, 3706, 3706, 3706,
  34. 3706, 3706, 3706, 3706, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 5675,
  35. 6156, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 5124, 5124, 5124, 5124,
  36. 5124, 5124, 5124, 5124, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1296,
  37. 1483, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 4851, 4851, 4851, 4851,
  38. 4851, 4851, 4851, 4851, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 3364,
  39. 617, 617, 617, 617, 617, 617, 617, 617, 1921, 1921, 1921, 1921, 1921,
  40. 1921, 1921, 1921, 3992, 3992, 3992, 3992, 5943, 5943, 5943, 5943, 3266,
  41. 3266, 3266, 3266, 4081, 4081, 4081, 4081, 810, 810, 810, 810, 1887,
  42. 1887, 1887, 1887, 7043, 7043, 7043, 7043, 7674, 7674, 7674, 7674, 7243,
  43. 7243, 7243, 7243, 7002, 7002, 7002, 7002, 6376, 6376, 6376, 6376, 5921,
  44. 5921, 5921, 5921, 396, 396, 396, 396, 4507, 4507, 4507, 4507, 4126,
  45. 4126, 4126, 4126, 5800, 5800, 5800, 5800, 3772, 3772, 3772, 3772, 5146,
  46. 5146, 5146, 5146, 5241, 5241, 5241, 5241, 5126, 5126, 5126, 5126, 1535,
  47. 1535, 1535, 1535, 7132, 7132, 7132, 7132, 3153, 3153, 3153, 3153, 2310,
  48. 2310, 2310, 2310, 6282, 6282, 6282, 6282, 1321, 1321, 1321, 1321, 514,
  49. 514, 514, 514, 4725, 4725, 4725, 4725, 7578, 7578, 7578, 7578, 2804,
  50. 2804, 2804, 2804, 5638, 5638, 5638, 5638, 6250, 6250, 6250, 6250, 6627,
  51. 6627, 1698, 1698, 4225, 4225, 1166, 1166, 2426, 2426, 3831, 3831, 915,
  52. 915, 7679, 7679, 4264, 4264, 7487, 7487, 2919, 2919, 2789, 2789, 3405,
  53. 3405, 2385, 2385, 5568, 5568, 4949, 4949, 2175, 2175, 373, 373, 3692,
  54. 3692, 6951, 6951, 5925, 5925, 3135, 3135, 5290, 5290, 660, 660, 6184,
  55. 6184, 2572, 2572, 4536, 4536, 1350, 1350, 5457, 5457, 4093, 4093, 6000,
  56. 6000, 2883, 2883, 6291, 6291, 1598, 1598, 3750, 3750, 2762, 2762, 2835,
  57. 2835, 2764, 2764, 5448, 5448, 3816, 3816, 6148, 6148, 1464, 1464, 6954,
  58. 6954, 1521, 1521, 1386, 1386, 4253, 4253, 6760, 6760, 4938, 4938, 5521,
  59. 5521, 2649, 2649, 6822, 6822, 2579, 2579, 1532, 1532, 1919, 1919, 7195,
  60. 7195, 404, 404, 6625, 6625, 783, 783, 1799, 1799, 5016, 5016, 3480,
  61. 3480, 2133, 2133, 4371, 4371, 6513, 6513, 7664, 3744, 2422, 2001, 1278,
  62. 929, 6333, 5451, 7502, 6439, 5622, 6611, 2161, 1649, 2072, 3177, 5610,
  63. 1121, 7245, 236, 715, 670, 7023, 6205, 5303, 2767, 3542, 7455, 1203,
  64. 1181, 7530, 3887, 1712, 7459, 2786, 7230, 4134, 1779, 6530, 7247, 3568,
  65. 3988, 3581, 6095, 1509, 2918, 2339, 6274, 3434, 4131, 2340, 2891, 2998,
  66. 4367, 3461, 4962, 5434, 5092, 1144, 1072, 1295, 4866, 3911, 3450, 3781,
  67. 5423, 796, 3163, 4473, 7092, 2963, 7557, 3214, 3334, 4315, 3936, 3723,
  68. 1931, 7252, 7279, 4273, 83, 6155, 826, 6343, 2345, 5378, 2515, 7039,
  69. 5844, 4716, 6890, 370, 293, 3312, 2083, 5992, 6904, 2070, 2262, 6788,
  70. 2386, 7493, 6162, 4807, 6277, 1012, 2130, 1441, 2532, 4346, 6597, 4338,
  71. 2937, 509, 6278, 2812, 3763, 592, 2005, 3657, 2460, 4004, 3752, 692,
  72. 1669, 2167, 4394,
  73. }
  // zetasInvExp is the 752-entry uint16 constant table whose first element's
  // address invnttYMM passes to invnttAVX2 as the `omegas` argument.
  //
  // The entries start as single values, then appear in runs of 2, 4, 8 and
  // finally 16 identical values (the reverse of the run pattern in zetasExp).
  //
  // NOTE(review): that layout looks like inverse-NTT twiddle factors that have
  // been pre-expanded for vector-width loads -- confirm against the assembly
  // implementation before editing or reordering any entry.
  74. var zetasInvExp = [752]uint16{
  75. 3287, 5514, 6012, 6989, 3929, 3677, 5221, 4024, 5676, 7089, 3918, 4869,
  76. 1403, 7172, 4744, 3343, 1084, 3335, 5149, 6240, 5551, 6669, 1404, 2874,
  77. 1519, 188, 5295, 893, 5419, 5611, 777, 1689, 5598, 4369, 7388, 7311,
  78. 791, 2965, 1837, 642, 5166, 2303, 5336, 1338, 6855, 1526, 7598, 3408,
  79. 402, 429, 5750, 3958, 3745, 3366, 4347, 4467, 124, 4718, 589, 3208,
  80. 4518, 6885, 2258, 3900, 4231, 3770, 2815, 6386, 6609, 6537, 2589, 2247,
  81. 2719, 4220, 3314, 4683, 4790, 5341, 3550, 4247, 1407, 5342, 4763, 6172,
  82. 1586, 4100, 3693, 4113, 434, 1151, 5902, 3547, 451, 4895, 222, 5969,
  83. 3794, 151, 6500, 6478, 226, 4139, 4914, 2378, 1476, 658, 7011, 6966,
  84. 7445, 436, 6560, 2071, 4504, 5609, 6032, 5520, 1070, 2059, 1242, 179,
  85. 2230, 1348, 6752, 6403, 5680, 5259, 3937, 17, 1168, 1168, 3310, 3310,
  86. 5548, 5548, 4201, 4201, 2665, 2665, 5882, 5882, 6898, 6898, 1056, 1056,
  87. 7277, 7277, 486, 486, 5762, 5762, 6149, 6149, 5102, 5102, 859, 859,
  88. 5032, 5032, 2160, 2160, 2743, 2743, 921, 921, 3428, 3428, 6295, 6295,
  89. 6160, 6160, 727, 727, 6217, 6217, 1533, 1533, 3865, 3865, 2233, 2233,
  90. 4917, 4917, 4846, 4846, 4919, 4919, 3931, 3931, 6083, 6083, 1390, 1390,
  91. 4798, 4798, 1681, 1681, 3588, 3588, 2224, 2224, 6331, 6331, 3145, 3145,
  92. 5109, 5109, 1497, 1497, 7021, 7021, 2391, 2391, 4546, 4546, 1756, 1756,
  93. 730, 730, 3989, 3989, 7308, 7308, 5506, 5506, 2732, 2732, 2113, 2113,
  94. 5296, 5296, 4276, 4276, 4892, 4892, 4762, 4762, 194, 194, 3417, 3417, 2,
  95. 2, 6766, 6766, 3850, 3850, 5255, 5255, 6515, 6515, 3456, 3456, 5983,
  96. 5983, 1054, 1054, 1431, 1431, 1431, 1431, 2043, 2043, 2043, 2043, 4877,
  97. 4877, 4877, 4877, 103, 103, 103, 103, 2956, 2956, 2956, 2956, 7167,
  98. 7167, 7167, 7167, 6360, 6360, 6360, 6360, 1399, 1399, 1399, 1399, 5371,
  99. 5371, 5371, 5371, 4528, 4528, 4528, 4528, 549, 549, 549, 549, 6146,
  100. 6146, 6146, 6146, 2555, 2555, 2555, 2555, 2440, 2440, 2440, 2440, 2535,
  101. 2535, 2535, 2535, 3909, 3909, 3909, 3909, 1881, 1881, 1881, 1881, 3555,
  102. 3555, 3555, 3555, 3174, 3174, 3174, 3174, 7285, 7285, 7285, 7285, 1760,
  103. 1760, 1760, 1760, 1305, 1305, 1305, 1305, 679, 679, 679, 679, 438, 438,
  104. 438, 438, 7, 7, 7, 7, 638, 638, 638, 638, 5794, 5794, 5794, 5794, 6871,
  105. 6871, 6871, 6871, 3600, 3600, 3600, 3600, 4415, 4415, 4415, 4415, 1738,
  106. 1738, 1738, 1738, 3689, 3689, 3689, 3689, 5760, 5760, 5760, 5760, 5760,
  107. 5760, 5760, 5760, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 4317,
  108. 4317, 4317, 4317, 4317, 4317, 4317, 4317, 2830, 2830, 2830, 2830, 2830,
  109. 2830, 2830, 2830, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6385,
  110. 6385, 6385, 6385, 6385, 6385, 6385, 6385, 2557, 2557, 2557, 2557, 2557,
  111. 2557, 2557, 2557, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 2006,
  112. 2006, 2006, 2006, 2006, 2006, 2006, 2006, 3975, 3975, 3975, 3975, 3975,
  113. 3975, 3975, 3975, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 1599,
  114. 1599, 1599, 1599, 1599, 1599, 1599, 1599, 5695, 5695, 5695, 5695, 5695,
  115. 5695, 5695, 5695, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2088,
  116. 2088, 2088, 2088, 2088, 2088, 2088, 2088, 4865, 4865, 4865, 4865, 4865,
  117. 4865, 4865, 4865, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267,
  118. 6267, 6267, 6267, 6267, 6267, 6267, 6267, 1701, 1701, 1701, 1701, 1701,
  119. 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 4805,
  120. 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805,
  121. 4805, 4805, 4805, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319,
  122. 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2495, 2495, 2495, 2495, 2495,
  123. 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 6847,
  124. 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847,
  125. 6847, 6847, 6847, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250,
  126. 2250, 2250, 2250, 2250, 2250, 2250, 2250, 7560, 7560, 7560, 7560, 7560,
  127. 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 5487,
  128. 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487,
  129. 5487, 5487, 5487, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225,
  130. 5225, 5225, 5225, 5225, 5225, 5225, 5225, 1100, 1100, 1100, 1100, 1100,
  131. 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 3696,
  132. 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696,
  133. 3696, 3696, 3696, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056,
  134. 4056, 4056, 4056, 4056, 4056, 4056, 4056, 3182, 3182, 3182, 3182, 3182,
  135. 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 5776,
  136. 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776,
  137. 5776, 5776, 5776,
  138. }
  // cpuidAmd64 is declared here without a body; its implementation lives
  // outside this file (presumably the package's amd64 assembly -- confirm).
  //
  // As used by supportsAVX2, cpuidParams points at the first element of a
  // [4]uint32 whose element 0 holds the requested CPUID leaf on entry; after
  // the call the same array is read back, with index 0 treated as EAX,
  // index 1 as EBX and index 2 as ECX.
  //
  // The go:noescape directive tells the compiler that the pointer argument
  // does not escape through this call.
  139. //go:noescape
  140. func cpuidAmd64(cpuidParams *uint32)
  // xgetbv0Amd64 is declared here without a body; its implementation lives
  // outside this file (presumably the package's amd64 assembly -- confirm).
  //
  // As used by supportsAVX2, xcrVec points at the first element of a
  // [2]uint32 that is filled in by the call; element 0 is then tested
  // against the mask 6.
  //
  // NOTE(review): the name suggests XGETBV with ECX=0 (i.e. XCR0), with the
  // low 32 bits stored in element 0 -- confirm against the implementation.
  141. //go:noescape
  142. func xgetbv0Amd64(xcrVec *uint32)
  // nttAVX2 is declared here without a body; its implementation lives
  // outside this file (presumably the package's amd64 assembly -- confirm).
  //
  // nttYMM calls it with inout pointing at the first coefficient of a
  // [kyberN]uint16 and zetas pointing at the first element of zetasExp.
  //
  // NOTE(review): the parameter name implies the transform is performed in
  // place on inout -- confirm against the implementation.
  143. //go:noescape
  144. func nttAVX2(inout, zetas *uint16)
  // invnttAVX2 is declared here without a body; its implementation lives
  // outside this file (presumably the package's amd64 assembly -- confirm).
  //
  // invnttYMM calls it with inout pointing at the first coefficient of a
  // [kyberN]uint16 and omegas pointing at the first element of zetasInvExp.
  //
  // NOTE(review): the parameter name implies the transform is performed in
  // place on inout -- confirm against the implementation.
  145. //go:noescape
  146. func invnttAVX2(inout, omegas *uint16)
  // pointwiseAccK2AVX2 is declared here without a body; its implementation
  // lives outside this file (presumably amd64 assembly -- confirm).
  //
  // pointwiseAccYMM calls it when len(a.vec) == 2. dst is the address of the
  // output polynomial's first coefficient; a and b each point at the first
  // slot of a [4]*uint16 array whose leading entries are the addresses of the
  // first coefficient of each input polynomial.
  147. //go:noescape
  148. func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
  // pointwiseAccK3AVX2 is declared here without a body; its implementation
  // lives outside this file (presumably amd64 assembly -- confirm).
  //
  // pointwiseAccYMM calls it when len(a.vec) == 3. dst is the address of the
  // output polynomial's first coefficient; a and b each point at the first
  // slot of a [4]*uint16 array whose leading entries are the addresses of the
  // first coefficient of each input polynomial.
  149. //go:noescape
  150. func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
  // pointwiseAccK4AVX2 is declared here without a body; its implementation
  // lives outside this file (presumably amd64 assembly -- confirm).
  //
  // pointwiseAccYMM calls it when len(a.vec) == 4. dst is the address of the
  // output polynomial's first coefficient; a and b each point at the first
  // slot of a [4]*uint16 array whose entries are the addresses of the first
  // coefficient of each input polynomial.
  151. //go:noescape
  152. func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
  // cbdEta4AVX2 is declared here without a body; its implementation lives
  // outside this file (presumably the package's amd64 assembly -- confirm).
  //
  // cbdYMM calls it only when eta == 4, passing the address of the output
  // polynomial's first coefficient as dst and the address of the first input
  // byte as buf.
  //
  // NOTE(review): no length travels with buf, so the implementation must
  // assume a fixed input size -- confirm how many bytes it reads.
  153. //go:noescape
  154. func cbdEta4AVX2(dst *uint16, buf *byte)
  155. func supportsAVX2() bool {
  156. // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  157. const (
  158. osXsaveBit = 1 << 27
  159. avx2Bit = 1 << 5
  160. )
  161. // Check to see if CPUID actually supports the leaf that indicates AVX2.
  162. // CPUID.(EAX=0H, ECX=0H) >= 7
  163. regs := [4]uint32{0x00}
  164. cpuidAmd64(&regs[0])
  165. if regs[0] < 7 {
  166. return false
  167. }
  168. // Check to see if the OS knows how to save/restore XMM/YMM state.
  169. // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
  170. regs = [4]uint32{0x01}
  171. cpuidAmd64(&regs[0])
  172. if regs[2]&osXsaveBit == 0 {
  173. return false
  174. }
  175. xcrRegs := [2]uint32{}
  176. xgetbv0Amd64(&xcrRegs[0])
  177. if xcrRegs[0]&6 != 6 {
  178. return false
  179. }
  180. // Check for AVX2 support.
  181. // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
  182. regs = [4]uint32{0x07}
  183. cpuidAmd64(&regs[0])
  184. return regs[1]&avx2Bit != 0
  185. }
  186. var implAVX2 = &hwaccelImpl{
  187. name: "AVX2",
  188. nttFn: nttYMM,
  189. invnttFn: invnttYMM,
  190. pointwiseAccFn: pointwiseAccYMM,
  191. cbdFn: cbdYMM,
  192. }
  193. func nttYMM(p *[kyberN]uint16) {
  194. nttAVX2(&p[0], &zetasExp[0])
  195. }
  196. func invnttYMM(a *[kyberN]uint16) {
  197. invnttAVX2(&a[0], &zetasInvExp[0])
  198. }
  199. func pointwiseAccYMM(p *poly, a, b *polyVec) {
  200. // Unlike the C code, a polyVec won't have the polys in contigious
  201. // memory. So each assembly function takes vectors of pointers to
  202. // each polyvec's polys.
  203. //
  204. // Kind of ugly, but it's the price to pay for flexibility...
  205. var aVec, bVec [4]*uint16 // k is in {2,3,4}.
  206. for i := range a.vec {
  207. aVec[i] = &a.vec[i].coeffs[0]
  208. bVec[i] = &b.vec[i].coeffs[0]
  209. }
  210. switch len(a.vec) {
  211. case 2:
  212. pointwiseAccK2AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
  213. case 3:
  214. pointwiseAccK3AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
  215. case 4:
  216. pointwiseAccK4AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
  217. }
  218. }
  219. func cbdYMM(p *poly, buf []byte, eta int) {
  220. switch eta {
  221. case 4:
  222. cbdEta4AVX2(&p.coeffs[0], &buf[0])
  223. default:
  224. cbdRef(p, buf, eta)
  225. }
  226. }
  227. func initHardwareAcceleration() {
  228. if supportsAVX2() {
  229. isHardwareAccelerated = true
  230. hardwareAccelImpl = implAVX2
  231. }
  232. }