hwaccel_amd64.s 52 KB


  1. // +build !noasm,go1.10
  2. // hwaccel_amd64.s - AMD64 optimized routines.
  3. //
  4. // To the extent possible under law, Yawning Angel has waived all copyright
  5. // and related or neighboring rights to the software, using the Creative
  6. // Commons "CC0" public domain dedication. See LICENSE or
  7. // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
  8. #include "textflag.h"
  9. // func cpuidAmd64(cpuidParams *uint32) - runs CPUID; cpuidParams is read and written as four uint32 {EAX, EBX, ECX, EDX}.
  10. TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
  11. MOVQ cpuidParams+0(FP), R8 // R8 = parameter block; CPUID does not write R8
  12. MOVL 8(R8), CX // input: sub-leaf selector (ECX)
  13. MOVL 0(R8), AX // input: leaf selector (EAX)
  14. CPUID
  15. MOVL DX, 12(R8) // outputs are written back over the same four slots
  16. MOVL CX, 8(R8)
  17. MOVL BX, 4(R8)
  18. MOVL AX, 0(R8)
  19. RET
  20. // func xgetbv0Amd64(xcrVec *uint32) - runs XGETBV with ECX = 0; stores EAX to xcrVec[0] and EDX to xcrVec[1].
  21. TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
  22. XORL CX, CX // ECX = 0 selects extended control register 0
  23. MOVQ xcrVec+0(FP), DI // DI = destination for the two result words
  24. XGETBV
  25. MOVL DX, 4(DI) // high 32 bits of the register value
  26. MOVL AX, 0(DI) // low 32 bits of the register value
  27. RET
  28. // Routines taken from the `avx2` implementation, converted to Go's assembly
  29. // dialect. I do this in lieu of cutting myself to see if I still can feel
  30. // pain.
  31. //
  32. // The conversion is mostly direct except:
  33. // * Instead of aligned loads, unaligned loads are used, as there is no
  34. // meaningful difference on modern Intel systems, and it's not immediately
  35. // obvious to me how Go will align global data.
  36. // * The polyvec_pointwise_acc family of routines take vectors of pointers
  37. // due to the different internal memory layout of a polyvec.
  38. // * The constants are renamed slightly.
  39. // Note (constant tables; each is 32 bytes = one YMM register, file-local and read-only):
  40. // * These must be kept in sync with the values in params.go.
  41. // Currently assumes Q = 7681, Q_INV = 57857.
  42. // * Caution: the quadwords are little endian, so things will look different from avx2/consts.c.
  43. DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302 // VPSHUFB byte indices (in memory: 02 03 00 01 06 07 04 05 ...) that swap each adjacent pair of 16-bit words
  44. DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
  45. DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
  46. DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
  47. GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32
  48. DATA ·low_mask<>+0x00(SB)/8, $0x1fff1fff1fff1fff // 0x1fff = 2^13 - 1 in every 16-bit lane
  49. DATA ·low_mask<>+0x08(SB)/8, $0x1fff1fff1fff1fff
  50. DATA ·low_mask<>+0x10(SB)/8, $0x1fff1fff1fff1fff
  51. DATA ·low_mask<>+0x18(SB)/8, $0x1fff1fff1fff1fff
  52. GLOBL ·low_mask<>(SB), (NOPTR+RODATA), $32
  53. DATA ·lowdword<>+0x00(SB)/8, $0x0000ffff0000ffff // mask keeping the low 16 bits of every 32-bit lane
  54. DATA ·lowdword<>+0x08(SB)/8, $0x0000ffff0000ffff
  55. DATA ·lowdword<>+0x10(SB)/8, $0x0000ffff0000ffff
  56. DATA ·lowdword<>+0x18(SB)/8, $0x0000ffff0000ffff
  57. GLOBL ·lowdword<>(SB), (NOPTR+RODATA), $32
  58. DATA ·q_x16<>+0x00(SB)/8, $0x1e011e011e011e01 // Q = 7681 (0x1e01) in every 16-bit lane
  59. DATA ·q_x16<>+0x08(SB)/8, $0x1e011e011e011e01
  60. DATA ·q_x16<>+0x10(SB)/8, $0x1e011e011e011e01
  61. DATA ·q_x16<>+0x18(SB)/8, $0x1e011e011e011e01
  62. GLOBL ·q_x16<>(SB), (NOPTR+RODATA), $32
  63. DATA ·q2_x16<>+0x00(SB)/8, $0x3c023c023c023c02 // 2 * Q = 15362 (0x3c02) in every 16-bit lane
  64. DATA ·q2_x16<>+0x08(SB)/8, $0x3c023c023c023c02
  65. DATA ·q2_x16<>+0x10(SB)/8, $0x3c023c023c023c02
  66. DATA ·q2_x16<>+0x18(SB)/8, $0x3c023c023c023c02
  67. GLOBL ·q2_x16<>(SB), (NOPTR+RODATA), $32
  68. DATA ·qinv_x16<>+0x00(SB)/8, $0xe201e201e201e201 // Q_INV = 57857 (0xe201) in every 16-bit lane
  69. DATA ·qinv_x16<>+0x08(SB)/8, $0xe201e201e201e201
  70. DATA ·qinv_x16<>+0x10(SB)/8, $0xe201e201e201e201
  71. DATA ·qinv_x16<>+0x18(SB)/8, $0xe201e201e201e201
  72. GLOBL ·qinv_x16<>(SB), (NOPTR+RODATA), $32
  73. DATA ·f_x16<>+0x00(SB)/8, $0x0100010001000100 // 256 (0x0100) in every 16-bit lane; NOTE(review): presumably a scaling factor for the inverse transform - confirm against its user
  74. DATA ·f_x16<>+0x08(SB)/8, $0x0100010001000100
  75. DATA ·f_x16<>+0x10(SB)/8, $0x0100010001000100
  76. DATA ·f_x16<>+0x18(SB)/8, $0x0100010001000100
  77. GLOBL ·f_x16<>(SB), (NOPTR+RODATA), $32
  78. DATA ·v_x16<>+0x00(SB)/8, $0x4442444244424442 // 17474 (0x4442), the integer nearest to 2^27 / Q; loaded into Y2 by invnttAVX2
  79. DATA ·v_x16<>+0x08(SB)/8, $0x4442444244424442
  80. DATA ·v_x16<>+0x10(SB)/8, $0x4442444244424442
  81. DATA ·v_x16<>+0x18(SB)/8, $0x4442444244424442
  82. GLOBL ·v_x16<>(SB), (NOPTR+RODATA), $32
  83. DATA ·montsq_x16<>+0x00(SB)/8, $0x15c115c115c115c1 // 5569 (0x15c1) = 2^32 mod Q in every 16-bit lane
  84. DATA ·montsq_x16<>+0x08(SB)/8, $0x15c115c115c115c1
  85. DATA ·montsq_x16<>+0x10(SB)/8, $0x15c115c115c115c1
  86. DATA ·montsq_x16<>+0x18(SB)/8, $0x15c115c115c115c1
  87. GLOBL ·montsq_x16<>(SB), (NOPTR+RODATA), $32
  88. // func nttAVX2(inout, zetas *uint16) - transforms the 512 bytes (256 uint16) at inout in place; reads 1504 bytes (752 uint16) of multipliers from zetas.
  89. TEXT ·nttAVX2(SB), NOSPLIT, $0-16
  90. MOVQ inout+0(FP), DI
  91. MOVQ zetas+8(FP), SI
  92. VMOVDQU ·qinv_x16<>(SB), Y0 // Y0 = Q_INV in every 16-bit lane, kept for the whole function
  93. VMOVDQU ·q_x16<>(SB), Y1 // Y1 = Q in every 16-bit lane, kept for the whole function
  94. VMOVDQU ·low_mask<>(SB), Y2 // Y2 = 0x1fff mask used by the "reduce 2" steps
  95. // zetas: the single level 0 vector, reused by both level 0 rounds
  96. VMOVDQU (SI), Y3
  97. // first round: level 0 on byte offsets 0-127 paired with 256-383
  98. // load
  99. VMOVDQU (DI), Y4
  100. VMOVDQU 32(DI), Y5
  101. VMOVDQU 64(DI), Y6
  102. VMOVDQU 96(DI), Y7
  103. VMOVDQU 256(DI), Y8
  104. VMOVDQU 288(DI), Y9
  105. VMOVDQU 320(DI), Y10
  106. VMOVDQU 352(DI), Y11
  107. // level 0
  108. // mul: low (VPMULLW) and high (VPMULHW) 16 bits of b * zeta
  109. VPMULLW Y3, Y8, Y12
  110. VPMULHW Y3, Y8, Y8
  111. VPMULLW Y3, Y9, Y13
  112. VPMULHW Y3, Y9, Y9
  113. VPMULLW Y3, Y10, Y14
  114. VPMULHW Y3, Y10, Y10
  115. VPMULLW Y3, Y11, Y15
  116. VPMULHW Y3, Y11, Y11
  117. // reduce: t = hi - mulhi(lo * qinv, q) (Montgomery-style reduction of the product)
  118. VPMULLW Y0, Y12, Y12
  119. VPMULLW Y0, Y13, Y13
  120. VPMULLW Y0, Y14, Y14
  121. VPMULLW Y0, Y15, Y15
  122. VPMULHW Y1, Y12, Y12
  123. VPMULHW Y1, Y13, Y13
  124. VPMULHW Y1, Y14, Y14
  125. VPMULHW Y1, Y15, Y15
  126. VPSUBW Y12, Y8, Y12
  127. VPSUBW Y13, Y9, Y13
  128. VPSUBW Y14, Y10, Y14
  129. VPSUBW Y15, Y11, Y15
  130. // update: butterfly, (a, b) -> (a + t, a - t)
  131. VPSUBW Y12, Y4, Y8
  132. VPSUBW Y13, Y5, Y9
  133. VPSUBW Y14, Y6, Y10
  134. VPSUBW Y15, Y7, Y11
  135. VPADDW Y12, Y4, Y4
  136. VPADDW Y13, Y5, Y5
  137. VPADDW Y14, Y6, Y6
  138. VPADDW Y15, Y7, Y7
  139. // store
  140. VMOVDQU Y4, (DI)
  141. VMOVDQU Y5, 32(DI)
  142. VMOVDQU Y6, 64(DI)
  143. VMOVDQU Y7, 96(DI)
  144. VMOVDQU Y8, 256(DI)
  145. VMOVDQU Y9, 288(DI)
  146. VMOVDQU Y10, 320(DI)
  147. VMOVDQU Y11, 352(DI)
  148. ADDQ $128, DI
  149. // second round: level 0 on byte offsets 128-255 paired with 384-511
  150. // load
  151. VMOVDQU (DI), Y4
  152. VMOVDQU 32(DI), Y5
  153. VMOVDQU 64(DI), Y6
  154. VMOVDQU 96(DI), Y7
  155. VMOVDQU 256(DI), Y8
  156. VMOVDQU 288(DI), Y9
  157. VMOVDQU 320(DI), Y10
  158. VMOVDQU 352(DI), Y11
  159. // level 0
  160. // mul
  161. VPMULLW Y3, Y8, Y12
  162. VPMULHW Y3, Y8, Y8
  163. VPMULLW Y3, Y9, Y13
  164. VPMULHW Y3, Y9, Y9
  165. VPMULLW Y3, Y10, Y14
  166. VPMULHW Y3, Y10, Y10
  167. VPMULLW Y3, Y11, Y15
  168. VPMULHW Y3, Y11, Y11
  169. // reduce
  170. VPMULLW Y0, Y12, Y12
  171. VPMULLW Y0, Y13, Y13
  172. VPMULLW Y0, Y14, Y14
  173. VPMULLW Y0, Y15, Y15
  174. VPMULHW Y1, Y12, Y12
  175. VPMULHW Y1, Y13, Y13
  176. VPMULHW Y1, Y14, Y14
  177. VPMULHW Y1, Y15, Y15
  178. VPSUBW Y12, Y8, Y12
  179. VPSUBW Y13, Y9, Y13
  180. VPSUBW Y14, Y10, Y14
  181. VPSUBW Y15, Y11, Y15
  182. // update
  183. VPSUBW Y12, Y4, Y8
  184. VPSUBW Y13, Y5, Y9
  185. VPSUBW Y14, Y6, Y10
  186. VPSUBW Y15, Y7, Y11
  187. VPADDW Y12, Y4, Y4
  188. VPADDW Y13, Y5, Y5
  189. VPADDW Y14, Y6, Y6
  190. VPADDW Y15, Y7, Y7
  191. // store
  192. VMOVDQU Y4, (DI)
  193. VMOVDQU Y5, 32(DI)
  194. VMOVDQU Y6, 64(DI)
  195. VMOVDQU Y7, 96(DI)
  196. VMOVDQU Y8, 256(DI)
  197. VMOVDQU Y9, 288(DI)
  198. VMOVDQU Y10, 320(DI)
  199. VMOVDQU Y11, 352(DI)
  200. SUBQ $128, DI
  201. // first round: levels 1-7 on the first 256 bytes, kept in registers until the final store
  202. // zetas
  203. VMOVDQU 32(SI), Y3
  204. // load
  205. VMOVDQU (DI), Y4
  206. VMOVDQU 32(DI), Y5
  207. VMOVDQU 64(DI), Y6
  208. VMOVDQU 96(DI), Y7
  209. VMOVDQU 128(DI), Y8
  210. VMOVDQU 160(DI), Y9
  211. VMOVDQU 192(DI), Y10
  212. VMOVDQU 224(DI), Y11
  213. // level 1
  214. // mul
  215. VPMULLW Y3, Y8, Y12
  216. VPMULHW Y3, Y8, Y8
  217. VPMULLW Y3, Y9, Y13
  218. VPMULHW Y3, Y9, Y9
  219. VPMULLW Y3, Y10, Y14
  220. VPMULHW Y3, Y10, Y10
  221. VPMULLW Y3, Y11, Y15
  222. VPMULHW Y3, Y11, Y11
  223. // reduce
  224. VPMULLW Y0, Y12, Y12
  225. VPMULLW Y0, Y13, Y13
  226. VPMULLW Y0, Y14, Y14
  227. VPMULLW Y0, Y15, Y15
  228. VPMULHW Y1, Y12, Y12
  229. VPMULHW Y1, Y13, Y13
  230. VPMULHW Y1, Y14, Y14
  231. VPMULHW Y1, Y15, Y15
  232. VPSUBW Y12, Y8, Y12
  233. VPSUBW Y13, Y9, Y13
  234. VPSUBW Y14, Y10, Y14
  235. VPSUBW Y15, Y11, Y15
  236. // update
  237. VPSUBW Y12, Y4, Y8
  238. VPSUBW Y13, Y5, Y9
  239. VPSUBW Y14, Y6, Y10
  240. VPSUBW Y15, Y7, Y11
  241. VPADDW Y12, Y4, Y4
  242. VPADDW Y13, Y5, Y5
  243. VPADDW Y14, Y6, Y6
  244. VPADDW Y15, Y7, Y7
  245. // level 2
  246. // zetas
  247. VMOVDQU 96(SI), Y15
  248. VMOVDQU 128(SI), Y3
  249. // mul
  250. VPMULLW Y15, Y6, Y12
  251. VPMULHW Y15, Y6, Y6
  252. VPMULLW Y15, Y7, Y13
  253. VPMULHW Y15, Y7, Y7
  254. VPMULLW Y3, Y10, Y14
  255. VPMULHW Y3, Y10, Y10
  256. VPMULLW Y3, Y11, Y15
  257. VPMULHW Y3, Y11, Y11
  258. // reduce
  259. VPMULLW Y0, Y12, Y12
  260. VPMULLW Y0, Y13, Y13
  261. VPMULLW Y0, Y14, Y14
  262. VPMULLW Y0, Y15, Y15
  263. VPMULHW Y1, Y12, Y12
  264. VPMULHW Y1, Y13, Y13
  265. VPMULHW Y1, Y14, Y14
  266. VPMULHW Y1, Y15, Y15
  267. VPSUBW Y12, Y6, Y12
  268. VPSUBW Y13, Y7, Y13
  269. VPSUBW Y14, Y10, Y14
  270. VPSUBW Y15, Y11, Y15
  271. // update
  272. VPSUBW Y12, Y4, Y6
  273. VPSUBW Y13, Y5, Y7
  274. VPSUBW Y14, Y8, Y10
  275. VPSUBW Y15, Y9, Y11
  276. VPADDW Y12, Y4, Y4
  277. VPADDW Y13, Y5, Y5
  278. VPADDW Y14, Y8, Y8
  279. VPADDW Y15, Y9, Y9
  280. // level 3
  281. // zetas
  282. VMOVDQU 224(SI), Y13
  283. VMOVDQU 256(SI), Y14
  284. VMOVDQU 288(SI), Y15
  285. VMOVDQU 320(SI), Y3
  286. // mul
  287. VPMULLW Y13, Y5, Y12
  288. VPMULHW Y13, Y5, Y5
  289. VPMULLW Y14, Y7, Y13
  290. VPMULHW Y14, Y7, Y7
  291. VPMULLW Y15, Y9, Y14
  292. VPMULHW Y15, Y9, Y9
  293. VPMULLW Y3, Y11, Y15
  294. VPMULHW Y3, Y11, Y11
  295. // reduce
  296. VPMULLW Y0, Y12, Y12
  297. VPMULLW Y0, Y13, Y13
  298. VPMULLW Y0, Y14, Y14
  299. VPMULLW Y0, Y15, Y15
  300. VPMULHW Y1, Y12, Y12
  301. VPMULHW Y1, Y13, Y13
  302. VPMULHW Y1, Y14, Y14
  303. VPMULHW Y1, Y15, Y15
  304. VPSUBW Y12, Y5, Y12
  305. VPSUBW Y13, Y7, Y13
  306. VPSUBW Y14, Y9, Y14
  307. VPSUBW Y15, Y11, Y15
  308. // reduce 2: a -> (a & 0x1fff) + 511 * (a >> 13), congruent to a mod q because 2^13 = q + 511
  309. VPSRAW $13, Y4, Y5
  310. VPSRAW $13, Y6, Y7
  311. VPSRAW $13, Y8, Y9
  312. VPSRAW $13, Y10, Y11
  313. VPAND Y2, Y4, Y4
  314. VPAND Y2, Y6, Y6
  315. VPAND Y2, Y8, Y8
  316. VPAND Y2, Y10, Y10
  317. VPSUBW Y5, Y4, Y4
  318. VPSUBW Y7, Y6, Y6
  319. VPSUBW Y9, Y8, Y8
  320. VPSUBW Y11, Y10, Y10
  321. VPSLLW $9, Y5, Y5
  322. VPSLLW $9, Y7, Y7
  323. VPSLLW $9, Y9, Y9
  324. VPSLLW $9, Y11, Y11
  325. VPADDW Y5, Y4, Y4
  326. VPADDW Y7, Y6, Y6
  327. VPADDW Y9, Y8, Y8
  328. VPADDW Y11, Y10, Y10
  329. // update
  330. VPSUBW Y12, Y4, Y5
  331. VPSUBW Y13, Y6, Y7
  332. VPSUBW Y14, Y8, Y9
  333. VPSUBW Y15, Y10, Y11
  334. VPADDW Y12, Y4, Y4
  335. VPADDW Y13, Y6, Y6
  336. VPADDW Y14, Y8, Y8
  337. VPADDW Y15, Y10, Y10
  338. // level 4
  339. // zetas
  340. VMOVDQU 480(SI), Y12
  341. VMOVDQU 512(SI), Y13
  342. VMOVDQU 544(SI), Y14
  343. VMOVDQU 576(SI), Y15
  344. // shuffle: regroup 128-bit lanes (low lanes of each register pair together, high lanes together)
  345. VPERM2I128 $0x02, Y4, Y5, Y3
  346. VPERM2I128 $0x13, Y4, Y5, Y4
  347. VPERM2I128 $0x02, Y6, Y7, Y5
  348. VPERM2I128 $0x13, Y6, Y7, Y6
  349. VPERM2I128 $0x02, Y8, Y9, Y7
  350. VPERM2I128 $0x13, Y8, Y9, Y8
  351. VPERM2I128 $0x02, Y10, Y11, Y9
  352. VPERM2I128 $0x13, Y10, Y11, Y10
  353. // mul
  354. VPMULLW Y12, Y4, Y11
  355. VPMULHW Y12, Y4, Y4
  356. VPMULLW Y13, Y6, Y12
  357. VPMULHW Y13, Y6, Y6
  358. VPMULLW Y14, Y8, Y13
  359. VPMULHW Y14, Y8, Y8
  360. VPMULLW Y15, Y10, Y14
  361. VPMULHW Y15, Y10, Y10
  362. // reduce
  363. VPMULLW Y0, Y11, Y11
  364. VPMULLW Y0, Y12, Y12
  365. VPMULLW Y0, Y13, Y13
  366. VPMULLW Y0, Y14, Y14
  367. VPMULHW Y1, Y11, Y11
  368. VPMULHW Y1, Y12, Y12
  369. VPMULHW Y1, Y13, Y13
  370. VPMULHW Y1, Y14, Y14
  371. VPSUBW Y11, Y4, Y11
  372. VPSUBW Y12, Y6, Y12
  373. VPSUBW Y13, Y8, Y13
  374. VPSUBW Y14, Y10, Y14
  375. // update
  376. VPSUBW Y11, Y3, Y4
  377. VPSUBW Y12, Y5, Y6
  378. VPSUBW Y13, Y7, Y8
  379. VPSUBW Y14, Y9, Y10
  380. VPADDW Y11, Y3, Y3
  381. VPADDW Y12, Y5, Y5
  382. VPADDW Y13, Y7, Y7
  383. VPADDW Y14, Y9, Y9
  384. // level 5
  385. // zetas
  386. VMOVDQU 736(SI), Y12
  387. VMOVDQU 768(SI), Y13
  388. VMOVDQU 800(SI), Y14
  389. VMOVDQU 832(SI), Y15
  390. // shuffle: regroup 64-bit elements (even quadwords of each register pair together, odd ones together)
  391. VSHUFPD $0x00, Y4, Y3, Y11
  392. VSHUFPD $0x0F, Y4, Y3, Y3
  393. VSHUFPD $0x00, Y6, Y5, Y4
  394. VSHUFPD $0x0F, Y6, Y5, Y5
  395. VSHUFPD $0x00, Y8, Y7, Y6
  396. VSHUFPD $0x0F, Y8, Y7, Y7
  397. VSHUFPD $0x00, Y10, Y9, Y8
  398. VSHUFPD $0x0F, Y10, Y9, Y9
  399. // mul
  400. VPMULLW Y12, Y3, Y10
  401. VPMULHW Y12, Y3, Y3
  402. VPMULLW Y13, Y5, Y12
  403. VPMULHW Y13, Y5, Y5
  404. VPMULLW Y14, Y7, Y13
  405. VPMULHW Y14, Y7, Y7
  406. VPMULLW Y15, Y9, Y14
  407. VPMULHW Y15, Y9, Y9
  408. // reduce
  409. VPMULLW Y0, Y10, Y10
  410. VPMULLW Y0, Y12, Y12
  411. VPMULLW Y0, Y13, Y13
  412. VPMULLW Y0, Y14, Y14
  413. VPMULHW Y1, Y10, Y10
  414. VPMULHW Y1, Y12, Y12
  415. VPMULHW Y1, Y13, Y13
  416. VPMULHW Y1, Y14, Y14
  417. VPSUBW Y10, Y3, Y10
  418. VPSUBW Y12, Y5, Y12
  419. VPSUBW Y13, Y7, Y13
  420. VPSUBW Y14, Y9, Y14
  421. // update
  422. VPSUBW Y10, Y11, Y3
  423. VPSUBW Y12, Y4, Y5
  424. VPSUBW Y13, Y6, Y7
  425. VPSUBW Y14, Y8, Y9
  426. VPADDW Y10, Y11, Y10
  427. VPADDW Y12, Y4, Y4
  428. VPADDW Y13, Y6, Y6
  429. VPADDW Y14, Y8, Y8
  430. // level 6
  431. // shuffle: regroup 32-bit elements (VPSHUFD $0xB1 swaps adjacent dwords, VPBLENDD merges the pair)
  432. VPSHUFD $0xB1, Y10, Y12
  433. VPSHUFD $0xB1, Y3, Y13
  434. VPSHUFD $0xB1, Y4, Y14
  435. VPSHUFD $0xB1, Y5, Y15
  436. VPBLENDD $0x55, Y10, Y13, Y10
  437. VPBLENDD $0xAA, Y3, Y12, Y3
  438. VPBLENDD $0x55, Y4, Y15, Y4
  439. VPBLENDD $0xAA, Y5, Y14, Y5
  440. VPSHUFD $0xB1, Y6, Y12
  441. VPSHUFD $0xB1, Y7, Y13
  442. VPSHUFD $0xB1, Y8, Y14
  443. VPSHUFD $0xB1, Y9, Y15
  444. VPBLENDD $0x55, Y6, Y13, Y6
  445. VPBLENDD $0xAA, Y7, Y12, Y7
  446. VPBLENDD $0x55, Y8, Y15, Y8
  447. VPBLENDD $0xAA, Y9, Y14, Y9
  448. // zetas
  449. VMOVDQU 992(SI), Y12
  450. VMOVDQU 1024(SI), Y13
  451. VMOVDQU 1056(SI), Y14
  452. VMOVDQU 1088(SI), Y15
  453. // mul
  454. VPMULLW Y12, Y3, Y11
  455. VPMULHW Y12, Y3, Y3
  456. VPMULLW Y13, Y5, Y12
  457. VPMULHW Y13, Y5, Y5
  458. VPMULLW Y14, Y7, Y13
  459. VPMULHW Y14, Y7, Y7
  460. VPMULLW Y15, Y9, Y14
  461. VPMULHW Y15, Y9, Y9
  462. // reduce
  463. VPMULLW Y0, Y11, Y11
  464. VPMULLW Y0, Y12, Y12
  465. VPMULLW Y0, Y13, Y13
  466. VPMULLW Y0, Y14, Y14
  467. VPMULHW Y1, Y11, Y11
  468. VPMULHW Y1, Y12, Y12
  469. VPMULHW Y1, Y13, Y13
  470. VPMULHW Y1, Y14, Y14
  471. VPSUBW Y11, Y3, Y11
  472. VPSUBW Y12, Y5, Y12
  473. VPSUBW Y13, Y7, Y13
  474. VPSUBW Y14, Y9, Y14
  475. // reduce 2
  476. VPSRAW $13, Y10, Y3
  477. VPSRAW $13, Y4, Y5
  478. VPSRAW $13, Y6, Y7
  479. VPSRAW $13, Y8, Y9
  480. VPAND Y2, Y10, Y10
  481. VPAND Y2, Y4, Y4
  482. VPAND Y2, Y6, Y6
  483. VPAND Y2, Y8, Y8
  484. VPSUBW Y3, Y10, Y10
  485. VPSUBW Y5, Y4, Y4
  486. VPSUBW Y7, Y6, Y6
  487. VPSUBW Y9, Y8, Y8
  488. VPSLLW $9, Y3, Y3
  489. VPSLLW $9, Y5, Y5
  490. VPSLLW $9, Y7, Y7
  491. VPSLLW $9, Y9, Y9
  492. VPADDW Y3, Y10, Y10
  493. VPADDW Y5, Y4, Y4
  494. VPADDW Y7, Y6, Y6
  495. VPADDW Y9, Y8, Y8
  496. // update
  497. VPSUBW Y11, Y10, Y3
  498. VPSUBW Y12, Y4, Y5
  499. VPSUBW Y13, Y6, Y7
  500. VPSUBW Y14, Y8, Y9
  501. VPADDW Y11, Y10, Y10
  502. VPADDW Y12, Y4, Y4
  503. VPADDW Y13, Y6, Y6
  504. VPADDW Y14, Y8, Y8
  505. // level 7
  506. // shuffle: regroup 16-bit elements (VPSHUFB with vpshufb_idx swaps adjacent words, VPBLENDW merges the pair)
  507. VMOVDQU ·vpshufb_idx<>(SB), Y15
  508. VPSHUFB Y15, Y10, Y11
  509. VPSHUFB Y15, Y3, Y12
  510. VPSHUFB Y15, Y4, Y13
  511. VPSHUFB Y15, Y5, Y14
  512. VPBLENDW $0x55, Y10, Y12, Y10
  513. VPBLENDW $0xAA, Y3, Y11, Y3
  514. VPBLENDW $0x55, Y4, Y14, Y4
  515. VPBLENDW $0xAA, Y5, Y13, Y5
  516. VPSHUFB Y15, Y6, Y11
  517. VPSHUFB Y15, Y7, Y12
  518. VPSHUFB Y15, Y8, Y13
  519. VPSHUFB Y15, Y9, Y14
  520. VPBLENDW $0x55, Y6, Y12, Y6
  521. VPBLENDW $0xAA, Y7, Y11, Y7
  522. VPBLENDW $0x55, Y8, Y14, Y8
  523. VPBLENDW $0xAA, Y9, Y13, Y9
  524. // zetas
  525. VMOVDQU 1248(SI), Y12
  526. VMOVDQU 1280(SI), Y13
  527. VMOVDQU 1312(SI), Y14
  528. VMOVDQU 1344(SI), Y15
  529. // mul
  530. VPMULLW Y12, Y3, Y11
  531. VPMULHW Y12, Y3, Y3
  532. VPMULLW Y13, Y5, Y12
  533. VPMULHW Y13, Y5, Y5
  534. VPMULLW Y14, Y7, Y13
  535. VPMULHW Y14, Y7, Y7
  536. VPMULLW Y15, Y9, Y14
  537. VPMULHW Y15, Y9, Y9
  538. // reduce
  539. VPMULLW Y0, Y11, Y11
  540. VPMULLW Y0, Y12, Y12
  541. VPMULLW Y0, Y13, Y13
  542. VPMULLW Y0, Y14, Y14
  543. VPMULHW Y1, Y11, Y11
  544. VPMULHW Y1, Y12, Y12
  545. VPMULHW Y1, Y13, Y13
  546. VPMULHW Y1, Y14, Y14
  547. VPSUBW Y11, Y3, Y11
  548. VPSUBW Y12, Y5, Y12
  549. VPSUBW Y13, Y7, Y13
  550. VPSUBW Y14, Y9, Y14
  551. // reduce 3: a -> a + q, plus a further 2q in lanes where a is negative (sign mask from VPSRAW $15)
  552. VMOVDQU ·q2_x16<>(SB), Y15
  553. VPSRAW $15, Y10, Y3
  554. VPSRAW $15, Y4, Y5
  555. VPSRAW $15, Y6, Y7
  556. VPSRAW $15, Y8, Y9
  557. VPAND Y15, Y3, Y3
  558. VPAND Y15, Y5, Y5
  559. VPAND Y15, Y7, Y7
  560. VPAND Y15, Y9, Y9
  561. VPADDW Y1, Y10, Y10
  562. VPADDW Y1, Y4, Y4
  563. VPADDW Y1, Y6, Y6
  564. VPADDW Y1, Y8, Y8
  565. VPADDW Y3, Y10, Y10
  566. VPADDW Y5, Y4, Y4
  567. VPADDW Y7, Y6, Y6
  568. VPADDW Y9, Y8, Y8
  569. // update
  570. VPSUBW Y11, Y10, Y3
  571. VPSUBW Y12, Y4, Y5
  572. VPSUBW Y13, Y6, Y7
  573. VPSUBW Y14, Y8, Y9
  574. VPADDW Y11, Y10, Y10
  575. VPADDW Y12, Y4, Y4
  576. VPADDW Y13, Y6, Y6
  577. VPADDW Y14, Y8, Y8
  578. // reorder: interleave 16-bit words (VPUNPCKLWD/VPUNPCKHWD), then recombine 128-bit lanes before storing
  579. VPUNPCKLWD Y3, Y10, Y12
  580. VPUNPCKHWD Y3, Y10, Y13
  581. VPUNPCKLWD Y5, Y4, Y14
  582. VPUNPCKHWD Y5, Y4, Y15
  583. VPUNPCKLWD Y7, Y6, Y3
  584. VPUNPCKHWD Y7, Y6, Y4
  585. VPUNPCKLWD Y9, Y8, Y5
  586. VPUNPCKHWD Y9, Y8, Y6
  587. VPERM2I128 $0x20, Y13, Y12, Y11
  588. VPERM2I128 $0x31, Y13, Y12, Y12
  589. VPERM2I128 $0x20, Y15, Y14, Y13
  590. VPERM2I128 $0x31, Y15, Y14, Y14
  591. VPERM2I128 $0x20, Y4, Y3, Y15
  592. VPERM2I128 $0x31, Y4, Y3, Y3
  593. VPERM2I128 $0x20, Y6, Y5, Y4
  594. VPERM2I128 $0x31, Y6, Y5, Y5
  595. // store
  596. VMOVDQU Y11, (DI)
  597. VMOVDQU Y12, 32(DI)
  598. VMOVDQU Y13, 64(DI)
  599. VMOVDQU Y14, 96(DI)
  600. VMOVDQU Y15, 128(DI)
  601. VMOVDQU Y3, 160(DI)
  602. VMOVDQU Y4, 192(DI)
  603. VMOVDQU Y5, 224(DI)
  604. ADDQ $256, DI
  605. // second round: levels 1-7 on the second 256 bytes (same steps as the first round, with different zetas)
  606. // zetas
  607. VMOVDQU 64(SI), Y3
  608. // load
  609. VMOVDQU (DI), Y4
  610. VMOVDQU 32(DI), Y5
  611. VMOVDQU 64(DI), Y6
  612. VMOVDQU 96(DI), Y7
  613. VMOVDQU 128(DI), Y8
  614. VMOVDQU 160(DI), Y9
  615. VMOVDQU 192(DI), Y10
  616. VMOVDQU 224(DI), Y11
  617. // level 1
  618. // mul
  619. VPMULLW Y3, Y8, Y12
  620. VPMULHW Y3, Y8, Y8
  621. VPMULLW Y3, Y9, Y13
  622. VPMULHW Y3, Y9, Y9
  623. VPMULLW Y3, Y10, Y14
  624. VPMULHW Y3, Y10, Y10
  625. VPMULLW Y3, Y11, Y15
  626. VPMULHW Y3, Y11, Y11
  627. // reduce
  628. VPMULLW Y0, Y12, Y12
  629. VPMULLW Y0, Y13, Y13
  630. VPMULLW Y0, Y14, Y14
  631. VPMULLW Y0, Y15, Y15
  632. VPMULHW Y1, Y12, Y12
  633. VPMULHW Y1, Y13, Y13
  634. VPMULHW Y1, Y14, Y14
  635. VPMULHW Y1, Y15, Y15
  636. VPSUBW Y12, Y8, Y12
  637. VPSUBW Y13, Y9, Y13
  638. VPSUBW Y14, Y10, Y14
  639. VPSUBW Y15, Y11, Y15
  640. // update
  641. VPSUBW Y12, Y4, Y8
  642. VPSUBW Y13, Y5, Y9
  643. VPSUBW Y14, Y6, Y10
  644. VPSUBW Y15, Y7, Y11
  645. VPADDW Y12, Y4, Y4
  646. VPADDW Y13, Y5, Y5
  647. VPADDW Y14, Y6, Y6
  648. VPADDW Y15, Y7, Y7
  649. // level 2
  650. // zetas
  651. VMOVDQU 160(SI), Y15
  652. VMOVDQU 192(SI), Y3
  653. // mul
  654. VPMULLW Y15, Y6, Y12
  655. VPMULHW Y15, Y6, Y6
  656. VPMULLW Y15, Y7, Y13
  657. VPMULHW Y15, Y7, Y7
  658. VPMULLW Y3, Y10, Y14
  659. VPMULHW Y3, Y10, Y10
  660. VPMULLW Y3, Y11, Y15
  661. VPMULHW Y3, Y11, Y11
  662. // reduce
  663. VPMULLW Y0, Y12, Y12
  664. VPMULLW Y0, Y13, Y13
  665. VPMULLW Y0, Y14, Y14
  666. VPMULLW Y0, Y15, Y15
  667. VPMULHW Y1, Y12, Y12
  668. VPMULHW Y1, Y13, Y13
  669. VPMULHW Y1, Y14, Y14
  670. VPMULHW Y1, Y15, Y15
  671. VPSUBW Y12, Y6, Y12
  672. VPSUBW Y13, Y7, Y13
  673. VPSUBW Y14, Y10, Y14
  674. VPSUBW Y15, Y11, Y15
  675. // update
  676. VPSUBW Y12, Y4, Y6
  677. VPSUBW Y13, Y5, Y7
  678. VPSUBW Y14, Y8, Y10
  679. VPSUBW Y15, Y9, Y11
  680. VPADDW Y12, Y4, Y4
  681. VPADDW Y13, Y5, Y5
  682. VPADDW Y14, Y8, Y8
  683. VPADDW Y15, Y9, Y9
  684. // level 3
  685. // zetas
  686. VMOVDQU 352(SI), Y13
  687. VMOVDQU 384(SI), Y14
  688. VMOVDQU 416(SI), Y15
  689. VMOVDQU 448(SI), Y3
  690. // mul
  691. VPMULLW Y13, Y5, Y12
  692. VPMULHW Y13, Y5, Y5
  693. VPMULLW Y14, Y7, Y13
  694. VPMULHW Y14, Y7, Y7
  695. VPMULLW Y15, Y9, Y14
  696. VPMULHW Y15, Y9, Y9
  697. VPMULLW Y3, Y11, Y15
  698. VPMULHW Y3, Y11, Y11
  699. // reduce
  700. VPMULLW Y0, Y12, Y12
  701. VPMULLW Y0, Y13, Y13
  702. VPMULLW Y0, Y14, Y14
  703. VPMULLW Y0, Y15, Y15
  704. VPMULHW Y1, Y12, Y12
  705. VPMULHW Y1, Y13, Y13
  706. VPMULHW Y1, Y14, Y14
  707. VPMULHW Y1, Y15, Y15
  708. VPSUBW Y12, Y5, Y12
  709. VPSUBW Y13, Y7, Y13
  710. VPSUBW Y14, Y9, Y14
  711. VPSUBW Y15, Y11, Y15
  712. // reduce 2
  713. VPSRAW $13, Y4, Y5
  714. VPSRAW $13, Y6, Y7
  715. VPSRAW $13, Y8, Y9
  716. VPSRAW $13, Y10, Y11
  717. VPAND Y2, Y4, Y4
  718. VPAND Y2, Y6, Y6
  719. VPAND Y2, Y8, Y8
  720. VPAND Y2, Y10, Y10
  721. VPSUBW Y5, Y4, Y4
  722. VPSUBW Y7, Y6, Y6
  723. VPSUBW Y9, Y8, Y8
  724. VPSUBW Y11, Y10, Y10
  725. VPSLLW $9, Y5, Y5
  726. VPSLLW $9, Y7, Y7
  727. VPSLLW $9, Y9, Y9
  728. VPSLLW $9, Y11, Y11
  729. VPADDW Y5, Y4, Y4
  730. VPADDW Y7, Y6, Y6
  731. VPADDW Y9, Y8, Y8
  732. VPADDW Y11, Y10, Y10
  733. // update
  734. VPSUBW Y12, Y4, Y5
  735. VPSUBW Y13, Y6, Y7
  736. VPSUBW Y14, Y8, Y9
  737. VPSUBW Y15, Y10, Y11
  738. VPADDW Y12, Y4, Y4
  739. VPADDW Y13, Y6, Y6
  740. VPADDW Y14, Y8, Y8
  741. VPADDW Y15, Y10, Y10
  742. // level 4
  743. // zetas
  744. VMOVDQU 608(SI), Y12
  745. VMOVDQU 640(SI), Y13
  746. VMOVDQU 672(SI), Y14
  747. VMOVDQU 704(SI), Y15
  748. // shuffle
  749. VPERM2I128 $0x02, Y4, Y5, Y3
  750. VPERM2I128 $0x13, Y4, Y5, Y4
  751. VPERM2I128 $0x02, Y6, Y7, Y5
  752. VPERM2I128 $0x13, Y6, Y7, Y6
  753. VPERM2I128 $0x02, Y8, Y9, Y7
  754. VPERM2I128 $0x13, Y8, Y9, Y8
  755. VPERM2I128 $0x02, Y10, Y11, Y9
  756. VPERM2I128 $0x13, Y10, Y11, Y10
  757. // mul
  758. VPMULLW Y12, Y4, Y11
  759. VPMULHW Y12, Y4, Y4
  760. VPMULLW Y13, Y6, Y12
  761. VPMULHW Y13, Y6, Y6
  762. VPMULLW Y14, Y8, Y13
  763. VPMULHW Y14, Y8, Y8
  764. VPMULLW Y15, Y10, Y14
  765. VPMULHW Y15, Y10, Y10
  766. // reduce
  767. VPMULLW Y0, Y11, Y11
  768. VPMULLW Y0, Y12, Y12
  769. VPMULLW Y0, Y13, Y13
  770. VPMULLW Y0, Y14, Y14
  771. VPMULHW Y1, Y11, Y11
  772. VPMULHW Y1, Y12, Y12
  773. VPMULHW Y1, Y13, Y13
  774. VPMULHW Y1, Y14, Y14
  775. VPSUBW Y11, Y4, Y11
  776. VPSUBW Y12, Y6, Y12
  777. VPSUBW Y13, Y8, Y13
  778. VPSUBW Y14, Y10, Y14
  779. // update
  780. VPSUBW Y11, Y3, Y4
  781. VPSUBW Y12, Y5, Y6
  782. VPSUBW Y13, Y7, Y8
  783. VPSUBW Y14, Y9, Y10
  784. VPADDW Y11, Y3, Y3
  785. VPADDW Y12, Y5, Y5
  786. VPADDW Y13, Y7, Y7
  787. VPADDW Y14, Y9, Y9
  788. // level 5
  789. // zetas
  790. VMOVDQU 864(SI), Y12
  791. VMOVDQU 896(SI), Y13
  792. VMOVDQU 928(SI), Y14
  793. VMOVDQU 960(SI), Y15
  794. // shuffle
  795. VSHUFPD $0x00, Y4, Y3, Y11
  796. VSHUFPD $0x0F, Y4, Y3, Y3
  797. VSHUFPD $0x00, Y6, Y5, Y4
  798. VSHUFPD $0x0F, Y6, Y5, Y5
  799. VSHUFPD $0x00, Y8, Y7, Y6
  800. VSHUFPD $0x0F, Y8, Y7, Y7
  801. VSHUFPD $0x00, Y10, Y9, Y8
  802. VSHUFPD $0x0F, Y10, Y9, Y9
  803. // mul
  804. VPMULLW Y12, Y3, Y10
  805. VPMULHW Y12, Y3, Y3
  806. VPMULLW Y13, Y5, Y12
  807. VPMULHW Y13, Y5, Y5
  808. VPMULLW Y14, Y7, Y13
  809. VPMULHW Y14, Y7, Y7
  810. VPMULLW Y15, Y9, Y14
  811. VPMULHW Y15, Y9, Y9
  812. // reduce
  813. VPMULLW Y0, Y10, Y10
  814. VPMULLW Y0, Y12, Y12
  815. VPMULLW Y0, Y13, Y13
  816. VPMULLW Y0, Y14, Y14
  817. VPMULHW Y1, Y10, Y10
  818. VPMULHW Y1, Y12, Y12
  819. VPMULHW Y1, Y13, Y13
  820. VPMULHW Y1, Y14, Y14
  821. VPSUBW Y10, Y3, Y10
  822. VPSUBW Y12, Y5, Y12
  823. VPSUBW Y13, Y7, Y13
  824. VPSUBW Y14, Y9, Y14
  825. // update
  826. VPSUBW Y10, Y11, Y3
  827. VPSUBW Y12, Y4, Y5
  828. VPSUBW Y13, Y6, Y7
  829. VPSUBW Y14, Y8, Y9
  830. VPADDW Y10, Y11, Y10
  831. VPADDW Y12, Y4, Y4
  832. VPADDW Y13, Y6, Y6
  833. VPADDW Y14, Y8, Y8
  834. // level 6
  835. // shuffle
  836. VPSHUFD $0xB1, Y10, Y12
  837. VPSHUFD $0xB1, Y3, Y13
  838. VPSHUFD $0xB1, Y4, Y14
  839. VPSHUFD $0xB1, Y5, Y15
  840. VPBLENDD $0x55, Y10, Y13, Y10
  841. VPBLENDD $0xAA, Y3, Y12, Y3
  842. VPBLENDD $0x55, Y4, Y15, Y4
  843. VPBLENDD $0xAA, Y5, Y14, Y5
  844. VPSHUFD $0xB1, Y6, Y12
  845. VPSHUFD $0xB1, Y7, Y13
  846. VPSHUFD $0xB1, Y8, Y14
  847. VPSHUFD $0xB1, Y9, Y15
  848. VPBLENDD $0x55, Y6, Y13, Y6
  849. VPBLENDD $0xAA, Y7, Y12, Y7
  850. VPBLENDD $0x55, Y8, Y15, Y8
  851. VPBLENDD $0xAA, Y9, Y14, Y9
  852. // zetas
  853. VMOVDQU 1120(SI), Y12
  854. VMOVDQU 1152(SI), Y13
  855. VMOVDQU 1184(SI), Y14
  856. VMOVDQU 1216(SI), Y15
  857. // mul
  858. VPMULLW Y12, Y3, Y11
  859. VPMULHW Y12, Y3, Y3
  860. VPMULLW Y13, Y5, Y12
  861. VPMULHW Y13, Y5, Y5
  862. VPMULLW Y14, Y7, Y13
  863. VPMULHW Y14, Y7, Y7
  864. VPMULLW Y15, Y9, Y14
  865. VPMULHW Y15, Y9, Y9
  866. // reduce
  867. VPMULLW Y0, Y11, Y11
  868. VPMULLW Y0, Y12, Y12
  869. VPMULLW Y0, Y13, Y13
  870. VPMULLW Y0, Y14, Y14
  871. VPMULHW Y1, Y11, Y11
  872. VPMULHW Y1, Y12, Y12
  873. VPMULHW Y1, Y13, Y13
  874. VPMULHW Y1, Y14, Y14
  875. VPSUBW Y11, Y3, Y11
  876. VPSUBW Y12, Y5, Y12
  877. VPSUBW Y13, Y7, Y13
  878. VPSUBW Y14, Y9, Y14
  879. // reduce 2
  880. VPSRAW $13, Y10, Y3
  881. VPSRAW $13, Y4, Y5
  882. VPSRAW $13, Y6, Y7
  883. VPSRAW $13, Y8, Y9
  884. VPAND Y2, Y10, Y10
  885. VPAND Y2, Y4, Y4
  886. VPAND Y2, Y6, Y6
  887. VPAND Y2, Y8, Y8
  888. VPSUBW Y3, Y10, Y10
  889. VPSUBW Y5, Y4, Y4
  890. VPSUBW Y7, Y6, Y6
  891. VPSUBW Y9, Y8, Y8
  892. VPSLLW $9, Y3, Y3
  893. VPSLLW $9, Y5, Y5
  894. VPSLLW $9, Y7, Y7
  895. VPSLLW $9, Y9, Y9
  896. VPADDW Y3, Y10, Y10
  897. VPADDW Y5, Y4, Y4
  898. VPADDW Y7, Y6, Y6
  899. VPADDW Y9, Y8, Y8
  900. // update
  901. VPSUBW Y11, Y10, Y3
  902. VPSUBW Y12, Y4, Y5
  903. VPSUBW Y13, Y6, Y7
  904. VPSUBW Y14, Y8, Y9
  905. VPADDW Y11, Y10, Y10
  906. VPADDW Y12, Y4, Y4
  907. VPADDW Y13, Y6, Y6
  908. VPADDW Y14, Y8, Y8
  909. // level 7
  910. // shuffle
  911. VMOVDQU ·vpshufb_idx<>(SB), Y15
  912. VPSHUFB Y15, Y10, Y11
  913. VPSHUFB Y15, Y3, Y12
  914. VPSHUFB Y15, Y4, Y13
  915. VPSHUFB Y15, Y5, Y14
  916. VPBLENDW $0x55, Y10, Y12, Y10
  917. VPBLENDW $0xAA, Y3, Y11, Y3
  918. VPBLENDW $0x55, Y4, Y14, Y4
  919. VPBLENDW $0xAA, Y5, Y13, Y5
  920. VPSHUFB Y15, Y6, Y11
  921. VPSHUFB Y15, Y7, Y12
  922. VPSHUFB Y15, Y8, Y13
  923. VPSHUFB Y15, Y9, Y14
  924. VPBLENDW $0x55, Y6, Y12, Y6
  925. VPBLENDW $0xAA, Y7, Y11, Y7
  926. VPBLENDW $0x55, Y8, Y14, Y8
  927. VPBLENDW $0xAA, Y9, Y13, Y9
  928. // zetas
  929. VMOVDQU 1376(SI), Y12
  930. VMOVDQU 1408(SI), Y13
  931. VMOVDQU 1440(SI), Y14
  932. VMOVDQU 1472(SI), Y15
  933. // mul
  934. VPMULLW Y12, Y3, Y11
  935. VPMULHW Y12, Y3, Y3
  936. VPMULLW Y13, Y5, Y12
  937. VPMULHW Y13, Y5, Y5
  938. VPMULLW Y14, Y7, Y13
  939. VPMULHW Y14, Y7, Y7
  940. VPMULLW Y15, Y9, Y14
  941. VPMULHW Y15, Y9, Y9
  942. // reduce
  943. VPMULLW Y0, Y11, Y11
  944. VPMULLW Y0, Y12, Y12
  945. VPMULLW Y0, Y13, Y13
  946. VPMULLW Y0, Y14, Y14
  947. VPMULHW Y1, Y11, Y11
  948. VPMULHW Y1, Y12, Y12
  949. VPMULHW Y1, Y13, Y13
  950. VPMULHW Y1, Y14, Y14
  951. VPSUBW Y11, Y3, Y11
  952. VPSUBW Y12, Y5, Y12
  953. VPSUBW Y13, Y7, Y13
  954. VPSUBW Y14, Y9, Y14
  955. // reduce 3
  956. VMOVDQU ·q2_x16<>(SB), Y15
  957. VPSRAW $15, Y10, Y3
  958. VPSRAW $15, Y4, Y5
  959. VPSRAW $15, Y6, Y7
  960. VPSRAW $15, Y8, Y9
  961. VPAND Y15, Y3, Y3
  962. VPAND Y15, Y5, Y5
  963. VPAND Y15, Y7, Y7
  964. VPAND Y15, Y9, Y9
  965. VPADDW Y1, Y10, Y10
  966. VPADDW Y1, Y4, Y4
  967. VPADDW Y1, Y6, Y6
  968. VPADDW Y1, Y8, Y8
  969. VPADDW Y3, Y10, Y10
  970. VPADDW Y5, Y4, Y4
  971. VPADDW Y7, Y6, Y6
  972. VPADDW Y9, Y8, Y8
  973. // update
  974. VPSUBW Y11, Y10, Y3
  975. VPSUBW Y12, Y4, Y5
  976. VPSUBW Y13, Y6, Y7
  977. VPSUBW Y14, Y8, Y9
  978. VPADDW Y11, Y10, Y10
  979. VPADDW Y12, Y4, Y4
  980. VPADDW Y13, Y6, Y6
  981. VPADDW Y14, Y8, Y8
  982. // reorder
  983. VPUNPCKLWD Y3, Y10, Y12
  984. VPUNPCKHWD Y3, Y10, Y13
  985. VPUNPCKLWD Y5, Y4, Y14
  986. VPUNPCKHWD Y5, Y4, Y15
  987. VPUNPCKLWD Y7, Y6, Y3
  988. VPUNPCKHWD Y7, Y6, Y4
  989. VPUNPCKLWD Y9, Y8, Y5
  990. VPUNPCKHWD Y9, Y8, Y6
  991. VPERM2I128 $0x20, Y13, Y12, Y11
  992. VPERM2I128 $0x31, Y13, Y12, Y12
  993. VPERM2I128 $0x20, Y15, Y14, Y13
  994. VPERM2I128 $0x31, Y15, Y14, Y14
  995. VPERM2I128 $0x20, Y4, Y3, Y15
  996. VPERM2I128 $0x31, Y4, Y3, Y3
  997. VPERM2I128 $0x20, Y6, Y5, Y4
  998. VPERM2I128 $0x31, Y6, Y5, Y5
  999. // store
  1000. VMOVDQU Y11, (DI)
  1001. VMOVDQU Y12, 32(DI)
  1002. VMOVDQU Y13, 64(DI)
  1003. VMOVDQU Y14, 96(DI)
  1004. VMOVDQU Y15, 128(DI)
  1005. VMOVDQU Y3, 160(DI)
  1006. VMOVDQU Y4, 192(DI)
  1007. VMOVDQU Y5, 224(DI)
  1008. VZEROUPPER
  1009. RET
  1010. // Go 1.10's VPERMQ support expects the imm8 to be an `int8`, instead of a
  1011. // `uint8`. While this is fixed in master, use the signed representation
  1012. // for now till it's reasonable to expect versions with the fix to be widely
  1013. // available.
  1014. // -40 is 0xd8 read as an int8; 0xd8 selects quadwords 0, 2, 1, 3, i.e. it swaps the two middle quadwords.
  1015. // See: https://github.com/golang/go/issues/24378
  1016. #define invntt_VPERMQ_IDX $-40 // $0xd8
  1017. // func invnttAVX2(inout, omegas *uint16)
  1018. TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
  1019. MOVQ inout+0(FP), DI
  1020. MOVQ omegas+8(FP), SI
  1021. VMOVDQU ·qinv_x16<>(SB), Y0
  1022. VMOVDQU ·q_x16<>(SB), Y1
  1023. VMOVDQU ·v_x16<>(SB), Y2
  1024. MOVQ SI, R8
  1025. // first round
  1026. // load
  1027. VMOVDQU (DI), Y4
  1028. VMOVDQU 32(DI), Y5
  1029. VMOVDQU 64(DI), Y6
  1030. VMOVDQU 96(DI), Y7
  1031. VMOVDQU 128(DI), Y8
  1032. VMOVDQU 160(DI), Y9
  1033. VMOVDQU 192(DI), Y10
  1034. VMOVDQU 224(DI), Y11
  1035. // reorder
  1036. VMOVDQU ·lowdword<>(SB), Y3
  1037. VPAND Y3, Y4, Y12
  1038. VPAND Y3, Y5, Y13
  1039. VPAND Y3, Y6, Y14
  1040. VPAND Y3, Y7, Y15
  1041. VPSRLD $16, Y4, Y4
  1042. VPSRLD $16, Y5, Y5
  1043. VPSRLD $16, Y6, Y6
  1044. VPSRLD $16, Y7, Y7
  1045. VPACKUSDW Y5, Y4, Y5
  1046. VPACKUSDW Y13, Y12, Y4
  1047. VPACKUSDW Y7, Y6, Y7
  1048. VPACKUSDW Y15, Y14, Y6
  1049. VPERMQ invntt_VPERMQ_IDX, Y4, Y4
  1050. VPERMQ invntt_VPERMQ_IDX, Y5, Y5
  1051. VPERMQ invntt_VPERMQ_IDX, Y6, Y6
  1052. VPERMQ invntt_VPERMQ_IDX, Y7, Y7
  1053. VPAND Y3, Y8, Y12
  1054. VPAND Y3, Y9, Y13
  1055. VPAND Y3, Y10, Y14
  1056. VPAND Y3, Y11, Y15
  1057. VPSRLD $16, Y8, Y8
  1058. VPSRLD $16, Y9, Y9
  1059. VPSRLD $16, Y10, Y10
  1060. VPSRLD $16, Y11, Y11
  1061. VPACKUSDW Y9, Y8, Y9
  1062. VPACKUSDW Y13, Y12, Y8
  1063. VPACKUSDW Y11, Y10, Y11
  1064. VPACKUSDW Y15, Y14, Y10
  1065. VPERMQ invntt_VPERMQ_IDX, Y8, Y8
  1066. VPERMQ invntt_VPERMQ_IDX, Y9, Y9
  1067. VPERMQ invntt_VPERMQ_IDX, Y10, Y10
  1068. VPERMQ invntt_VPERMQ_IDX, Y11, Y11
  1069. // level 0
  1070. // update
  1071. VPSUBW Y5, Y4, Y12
  1072. VPSUBW Y7, Y6, Y13
  1073. VPSUBW Y9, Y8, Y14
  1074. VPSUBW Y11, Y10, Y15
  1075. VPADDW Y4, Y5, Y4
  1076. VPADDW Y6, Y7, Y6
  1077. VPADDW Y8, Y9, Y8
  1078. VPADDW Y10, Y11, Y10
  1079. // zetas
  1080. VMOVDQU (R8), Y7
  1081. VMOVDQU 32(R8), Y9
  1082. VMOVDQU 64(R8), Y11
  1083. VMOVDQU 96(R8), Y3
  1084. // mul
  1085. VPMULLW Y7, Y12, Y5
  1086. VPMULHW Y7, Y12, Y12
  1087. VPMULLW Y9, Y13, Y7
  1088. VPMULHW Y9, Y13, Y13
  1089. VPMULLW Y11, Y14, Y9
  1090. VPMULHW Y11, Y14, Y14
  1091. VPMULLW Y3, Y15, Y11
  1092. VPMULHW Y3, Y15, Y15
  1093. // reduce
  1094. VPMULLW Y0, Y5, Y5
  1095. VPMULLW Y0, Y7, Y7
  1096. VPMULLW Y0, Y9, Y9
  1097. VPMULLW Y0, Y11, Y11
  1098. VPMULHW Y1, Y5, Y5
  1099. VPMULHW Y1, Y7, Y7
  1100. VPMULHW Y1, Y9, Y9
  1101. VPMULHW Y1, Y11, Y11
  1102. VPSUBW Y5, Y12, Y5
  1103. VPSUBW Y7, Y13, Y7
  1104. VPSUBW Y9, Y14, Y9
  1105. VPSUBW Y11, Y15, Y11
  1106. // level 1
  1107. // shuffle
  1108. VMOVDQU ·vpshufb_idx<>(SB), Y3
  1109. VPSHUFB Y3, Y4, Y12
  1110. VPSHUFB Y3, Y5, Y13
  1111. VPSHUFB Y3, Y6, Y14
  1112. VPSHUFB Y3, Y7, Y15
  1113. VPBLENDW $0x55, Y4, Y13, Y4
  1114. VPBLENDW $0xAA, Y5, Y12, Y5
  1115. VPBLENDW $0x55, Y6, Y15, Y6
  1116. VPBLENDW $0xAA, Y7, Y14, Y7
  1117. VPSHUFB Y3, Y8, Y12
  1118. VPSHUFB Y3, Y9, Y13
  1119. VPSHUFB Y3, Y10, Y14
  1120. VPSHUFB Y3, Y11, Y15
  1121. VPBLENDW $0x55, Y8, Y13, Y8
  1122. VPBLENDW $0xAA, Y9, Y12, Y9
  1123. VPBLENDW $0x55, Y10, Y15, Y10
  1124. VPBLENDW $0xAA, Y11, Y14, Y11
  1125. // update
  1126. VPSUBW Y5, Y4, Y12
  1127. VPSUBW Y7, Y6, Y13
  1128. VPSUBW Y9, Y8, Y14
  1129. VPSUBW Y11, Y10, Y15
  1130. VPADDW Y4, Y5, Y4
  1131. VPADDW Y6, Y7, Y6
  1132. VPADDW Y8, Y9, Y8
  1133. VPADDW Y10, Y11, Y10
  1134. // zetas
  1135. VMOVDQU 256(R8), Y7
  1136. VMOVDQU 288(R8), Y9
  1137. VMOVDQU 320(R8), Y11
  1138. VMOVDQU 352(R8), Y3
  1139. // mul
  1140. VPMULLW Y7, Y12, Y5
  1141. VPMULHW Y7, Y12, Y12
  1142. VPMULLW Y9, Y13, Y7
  1143. VPMULHW Y9, Y13, Y13
  1144. VPMULLW Y11, Y14, Y9
  1145. VPMULHW Y11, Y14, Y14
  1146. VPMULLW Y3, Y15, Y11
  1147. VPMULHW Y3, Y15, Y15
  1148. // reduce
  1149. VPMULLW Y0, Y5, Y5
  1150. VPMULLW Y0, Y7, Y7
  1151. VPMULLW Y0, Y9, Y9
  1152. VPMULLW Y0, Y11, Y11
  1153. VPMULHW Y1, Y5, Y5
  1154. VPMULHW Y1, Y7, Y7
  1155. VPMULHW Y1, Y9, Y9
  1156. VPMULHW Y1, Y11, Y11
  1157. VPSUBW Y5, Y12, Y5
  1158. VPSUBW Y7, Y13, Y7
  1159. VPSUBW Y9, Y14, Y9
  1160. VPSUBW Y11, Y15, Y11
  1161. // reduce 2
  1162. VPMULHW Y2, Y4, Y12
  1163. VPMULHW Y2, Y6, Y13
  1164. VPMULHW Y2, Y8, Y14
  1165. VPMULHW Y2, Y10, Y15
  1166. VPSRAW $11, Y12, Y12
  1167. VPSRAW $11, Y13, Y13
  1168. VPSRAW $11, Y14, Y14
  1169. VPSRAW $11, Y15, Y15
  1170. VPMULLW Y1, Y12, Y12
  1171. VPMULLW Y1, Y13, Y13
  1172. VPMULLW Y1, Y14, Y14
  1173. VPMULLW Y1, Y15, Y15
  1174. VPSUBW Y12, Y4, Y4
  1175. VPSUBW Y13, Y6, Y6
  1176. VPSUBW Y14, Y8, Y8
  1177. VPSUBW Y15, Y10, Y10
  1178. // level 2
  1179. // shuffle
  1180. VPSHUFD $0xB1, Y4, Y12
  1181. VPSHUFD $0xB1, Y5, Y13
  1182. VPSHUFD $0xB1, Y6, Y14
  1183. VPSHUFD $0xB1, Y7, Y15
  1184. VPBLENDD $0x55, Y4, Y13, Y4
  1185. VPBLENDD $0xAA, Y5, Y12, Y5
  1186. VPBLENDD $0x55, Y6, Y15, Y6
  1187. VPBLENDD $0xAA, Y7, Y14, Y7
  1188. VPSHUFD $0xB1, Y8, Y12
  1189. VPSHUFD $0xB1, Y9, Y13
  1190. VPSHUFD $0xB1, Y10, Y14
  1191. VPSHUFD $0xB1, Y11, Y15
  1192. VPBLENDD $0x55, Y8, Y13, Y8
  1193. VPBLENDD $0xAA, Y9, Y12, Y9
  1194. VPBLENDD $0x55, Y10, Y15, Y10
  1195. VPBLENDD $0xAA, Y11, Y14, Y11
  1196. // update
  1197. VPSUBW Y5, Y4, Y12
  1198. VPSUBW Y7, Y6, Y13
  1199. VPSUBW Y9, Y8, Y14
  1200. VPSUBW Y11, Y10, Y15
  1201. VPADDW Y4, Y5, Y4
  1202. VPADDW Y6, Y7, Y6
  1203. VPADDW Y8, Y9, Y8
  1204. VPADDW Y10, Y11, Y10
  1205. // zetas
  1206. VMOVDQU 512(R8), Y7
  1207. VMOVDQU 544(R8), Y9
  1208. VMOVDQU 576(R8), Y11
  1209. VMOVDQU 608(R8), Y3
  1210. // mul
  1211. VPMULLW Y7, Y12, Y5
  1212. VPMULHW Y7, Y12, Y12
  1213. VPMULLW Y9, Y13, Y7
  1214. VPMULHW Y9, Y13, Y13
  1215. VPMULLW Y11, Y14, Y9
  1216. VPMULHW Y11, Y14, Y14
  1217. VPMULLW Y3, Y15, Y11
  1218. VPMULHW Y3, Y15, Y15
  1219. // reduce
  1220. VPMULLW Y0, Y5, Y5
  1221. VPMULLW Y0, Y7, Y7
  1222. VPMULLW Y0, Y9, Y9
  1223. VPMULLW Y0, Y11, Y11
  1224. VPMULHW Y1, Y5, Y5
  1225. VPMULHW Y1, Y7, Y7
  1226. VPMULHW Y1, Y9, Y9
  1227. VPMULHW Y1, Y11, Y11
  1228. VPSUBW Y5, Y12, Y5
  1229. VPSUBW Y7, Y13, Y7
  1230. VPSUBW Y9, Y14, Y9
  1231. VPSUBW Y11, Y15, Y11
  1232. // level 3
  1233. // shuffle
  1234. VSHUFPD $0x00, Y5, Y4, Y3
  1235. VSHUFPD $0x0F, Y5, Y4, Y4
  1236. VSHUFPD $0x00, Y7, Y6, Y5
  1237. VSHUFPD $0x0F, Y7, Y6, Y6
  1238. VSHUFPD $0x00, Y9, Y8, Y7
  1239. VSHUFPD $0x0F, Y9, Y8, Y8
  1240. VSHUFPD $0x00, Y11, Y10, Y9
  1241. VSHUFPD $0x0F, Y11, Y10, Y10
  1242. // update
  1243. VPSUBW Y4, Y3, Y12
  1244. VPSUBW Y6, Y5, Y13
  1245. VPSUBW Y8, Y7, Y14
  1246. VPSUBW Y10, Y9, Y15
  1247. VPADDW Y3, Y4, Y3
  1248. VPADDW Y5, Y6, Y5
  1249. VPADDW Y7, Y8, Y7
  1250. VPADDW Y9, Y10, Y9
  1251. // zetas
  1252. VMOVDQU 768(R8), Y6
  1253. VMOVDQU 800(R8), Y8
  1254. VMOVDQU 832(R8), Y10
  1255. VMOVDQU 864(R8), Y11
  1256. // mul
  1257. VPMULLW Y6, Y12, Y4
  1258. VPMULHW Y6, Y12, Y12
  1259. VPMULLW Y8, Y13, Y6
  1260. VPMULHW Y8, Y13, Y13
  1261. VPMULLW Y10, Y14, Y8
  1262. VPMULHW Y10, Y14, Y14
  1263. VPMULLW Y11, Y15, Y10
  1264. VPMULHW Y11, Y15, Y15
  1265. // reduce
  1266. VPMULLW Y0, Y4, Y4
  1267. VPMULLW Y0, Y6, Y6
  1268. VPMULLW Y0, Y8, Y8
  1269. VPMULLW Y0, Y10, Y10
  1270. VPMULHW Y1, Y4, Y4
  1271. VPMULHW Y1, Y6, Y6
  1272. VPMULHW Y1, Y8, Y8
  1273. VPMULHW Y1, Y10, Y10
  1274. VPSUBW Y4, Y12, Y4
  1275. VPSUBW Y6, Y13, Y6
  1276. VPSUBW Y8, Y14, Y8
  1277. VPSUBW Y10, Y15, Y10
  1278. // reduce 2
  1279. VPMULHW Y2, Y3, Y12
  1280. VPMULHW Y2, Y5, Y13
  1281. VPMULHW Y2, Y7, Y14
  1282. VPMULHW Y2, Y9, Y15
  1283. VPSRAW $11, Y12, Y12
  1284. VPSRAW $11, Y13, Y13
  1285. VPSRAW $11, Y14, Y14
  1286. VPSRAW $11, Y15, Y15
  1287. VPMULLW Y1, Y12, Y12
  1288. VPMULLW Y1, Y13, Y13
  1289. VPMULLW Y1, Y14, Y14
  1290. VPMULLW Y1, Y15, Y15
  1291. VPSUBW Y12, Y3, Y3
  1292. VPSUBW Y13, Y5, Y5
  1293. VPSUBW Y14, Y7, Y7
  1294. VPSUBW Y15, Y9, Y9
  1295. // level 4
  1296. // shuffle
  1297. VPERM2I128 $0x02, Y3, Y4, Y11
  1298. VPERM2I128 $0x13, Y3, Y4, Y3
  1299. VPERM2I128 $0x02, Y5, Y6, Y4
  1300. VPERM2I128 $0x13, Y5, Y6, Y5
  1301. VPERM2I128 $0x02, Y7, Y8, Y6
  1302. VPERM2I128 $0x13, Y7, Y8, Y7
  1303. VPERM2I128 $0x02, Y9, Y10, Y8
  1304. VPERM2I128 $0x13, Y9, Y10, Y9
  1305. // update
  1306. VMOVDQA Y11, Y12
  1307. VMOVDQA Y4, Y13
  1308. VMOVDQA Y6, Y14
  1309. VMOVDQA Y8, Y15
  1310. VPADDW Y11, Y3, Y10
  1311. VPADDW Y4, Y5, Y4
  1312. VPADDW Y6, Y7, Y6
  1313. VPADDW Y8, Y9, Y8
  1314. VPSUBW Y3, Y12, Y3
  1315. VPSUBW Y5, Y13, Y5
  1316. VPSUBW Y7, Y14, Y7
  1317. VPSUBW Y9, Y15, Y9
  1318. // zetas
  1319. VMOVDQU 1024(R8), Y12
  1320. VMOVDQU 1056(R8), Y13
  1321. VMOVDQU 1088(R8), Y14
  1322. VMOVDQU 1120(R8), Y15
  1323. // mul
  1324. VPMULLW Y12, Y3, Y11
  1325. VPMULHW Y12, Y3, Y3
  1326. VPMULLW Y13, Y5, Y12
  1327. VPMULHW Y13, Y5, Y5
  1328. VPMULLW Y14, Y7, Y13
  1329. VPMULHW Y14, Y7, Y7
  1330. VPMULLW Y15, Y9, Y14
  1331. VPMULHW Y15, Y9, Y9
  1332. // reduce
  1333. VPMULLW Y0, Y11, Y11
  1334. VPMULLW Y0, Y12, Y12
  1335. VPMULLW Y0, Y13, Y13
  1336. VPMULLW Y0, Y14, Y14
  1337. VPMULHW Y1, Y11, Y11
  1338. VPMULHW Y1, Y12, Y12
  1339. VPMULHW Y1, Y13, Y13
  1340. VPMULHW Y1, Y14, Y14
  1341. VPSUBW Y11, Y3, Y3
  1342. VPSUBW Y12, Y5, Y5
  1343. VPSUBW Y13, Y7, Y7
  1344. VPSUBW Y14, Y9, Y9
  1345. // level 5
  1346. // update
  1347. VMOVDQA Y10, Y12
  1348. VMOVDQA Y3, Y13
  1349. VMOVDQA Y6, Y14
  1350. VMOVDQA Y7, Y15
  1351. VPADDW Y10, Y4, Y10
  1352. VPADDW Y3, Y5, Y3
  1353. VPADDW Y6, Y8, Y6
  1354. VPADDW Y7, Y9, Y7
  1355. VPSUBW Y4, Y12, Y4
  1356. VPSUBW Y5, Y13, Y5
  1357. VPSUBW Y8, Y14, Y8
  1358. VPSUBW Y9, Y15, Y9
  1359. // zetas
  1360. VMOVDQU 1280(SI), Y14
  1361. VMOVDQU 1312(SI), Y15
  1362. // mul
  1363. VPMULLW Y14, Y4, Y11
  1364. VPMULLW Y14, Y5, Y12
  1365. VPMULLW Y15, Y8, Y13
  1366. VPMULHW Y14, Y4, Y4
  1367. VPMULHW Y14, Y5, Y5
  1368. VPMULHW Y15, Y8, Y8
  1369. VPMULLW Y15, Y9, Y14
  1370. VPMULHW Y15, Y9, Y9
  1371. // reduce
  1372. VPMULLW Y0, Y11, Y11
  1373. VPMULLW Y0, Y12, Y12
  1374. VPMULLW Y0, Y13, Y13
  1375. VPMULLW Y0, Y14, Y14
  1376. VPMULHW Y1, Y11, Y11
  1377. VPMULHW Y1, Y12, Y12
  1378. VPMULHW Y1, Y13, Y13
  1379. VPMULHW Y1, Y14, Y14
  1380. VPSUBW Y11, Y4, Y4
  1381. VPSUBW Y12, Y5, Y5
  1382. VPSUBW Y13, Y8, Y8
  1383. VPSUBW Y14, Y9, Y9
  1384. // reduce 2
  1385. VPMULHW Y2, Y10, Y12
  1386. VPMULHW Y2, Y6, Y13
  1387. VPSRAW $11, Y12, Y12
  1388. VPSRAW $11, Y13, Y13
  1389. VPMULLW Y1, Y12, Y12
  1390. VPMULLW Y1, Y13, Y13
  1391. VPSUBW Y12, Y10, Y10
  1392. VPSUBW Y13, Y6, Y6
  1393. // level 6
  1394. // update
  1395. VMOVDQA Y10, Y12
  1396. VMOVDQA Y3, Y13
  1397. VMOVDQA Y4, Y14
  1398. VMOVDQA Y5, Y15
  1399. VPADDW Y10, Y6, Y10
  1400. VPADDW Y3, Y7, Y3
  1401. VPADDW Y4, Y8, Y4
  1402. VPADDW Y5, Y9, Y5
  1403. VPSUBW Y6, Y12, Y6
  1404. VPSUBW Y7, Y13, Y7
  1405. VPSUBW Y8, Y14, Y8
  1406. VPSUBW Y9, Y15, Y9
  1407. // zetas
  1408. VMOVDQU 1408(SI), Y15
  1409. // mul
  1410. VPMULLW Y15, Y6, Y11
  1411. VPMULLW Y15, Y7, Y12
  1412. VPMULLW Y15, Y8, Y13
  1413. VPMULLW Y15, Y9, Y14
  1414. VPMULHW Y15, Y6, Y6
  1415. VPMULHW Y15, Y7, Y7
  1416. VPMULHW Y15, Y8, Y8
  1417. VPMULHW Y15, Y9, Y9
  1418. // reduce
  1419. VPMULLW Y0, Y11, Y11
  1420. VPMULLW Y0, Y12, Y12
  1421. VPMULLW Y0, Y13, Y13
  1422. VPMULLW Y0, Y14, Y14
  1423. VPMULHW Y1, Y11, Y11
  1424. VPMULHW Y1, Y12, Y12
  1425. VPMULHW Y1, Y13, Y13
  1426. VPMULHW Y1, Y14, Y14
  1427. VPSUBW Y11, Y6, Y6
  1428. VPSUBW Y12, Y7, Y7
  1429. VPSUBW Y13, Y8, Y8
  1430. VPSUBW Y14, Y9, Y9
  1431. // reduce 2
  1432. VPMULHW Y2, Y3, Y12
  1433. VPSRAW $11, Y12, Y12
  1434. VPMULLW Y1, Y12, Y12
  1435. VPSUBW Y12, Y3, Y3
  1436. // store
  1437. VMOVDQU Y10, (DI)
  1438. VMOVDQU Y3, 32(DI)
  1439. VMOVDQU Y4, 64(DI)
  1440. VMOVDQU Y5, 96(DI)
  1441. VMOVDQU Y6, 128(DI)
  1442. VMOVDQU Y7, 160(DI)
  1443. VMOVDQU Y8, 192(DI)
  1444. VMOVDQU Y9, 224(DI)
  1445. ADDQ $256, DI
  1446. ADDQ $128, R8
  1447. // second round
  1448. // load
  1449. VMOVDQU (DI), Y4
  1450. VMOVDQU 32(DI), Y5
  1451. VMOVDQU 64(DI), Y6
  1452. VMOVDQU 96(DI), Y7
  1453. VMOVDQU 128(DI), Y8
  1454. VMOVDQU 160(DI), Y9
  1455. VMOVDQU 192(DI), Y10
  1456. VMOVDQU 224(DI), Y11
  1457. // reorder
  1458. VMOVDQU ·lowdword<>(SB), Y3
  1459. VPAND Y3, Y4, Y12
  1460. VPAND Y3, Y5, Y13
  1461. VPAND Y3, Y6, Y14
  1462. VPAND Y3, Y7, Y15
  1463. VPSRLD $16, Y4, Y4
  1464. VPSRLD $16, Y5, Y5
  1465. VPSRLD $16, Y6, Y6
  1466. VPSRLD $16, Y7, Y7
  1467. VPACKUSDW Y5, Y4, Y5
  1468. VPACKUSDW Y13, Y12, Y4
  1469. VPACKUSDW Y7, Y6, Y7
  1470. VPACKUSDW Y15, Y14, Y6
  1471. VPERMQ invntt_VPERMQ_IDX, Y4, Y4
  1472. VPERMQ invntt_VPERMQ_IDX, Y5, Y5
  1473. VPERMQ invntt_VPERMQ_IDX, Y6, Y6
  1474. VPERMQ invntt_VPERMQ_IDX, Y7, Y7
  1475. VPAND Y3, Y8, Y12
  1476. VPAND Y3, Y9, Y13
  1477. VPAND Y3, Y10, Y14
  1478. VPAND Y3, Y11, Y15
  1479. VPSRLD $16, Y8, Y8
  1480. VPSRLD $16, Y9, Y9
  1481. VPSRLD $16, Y10, Y10
  1482. VPSRLD $16, Y11, Y11
  1483. VPACKUSDW Y9, Y8, Y9
  1484. VPACKUSDW Y13, Y12, Y8
  1485. VPACKUSDW Y11, Y10, Y11
  1486. VPACKUSDW Y15, Y14, Y10
  1487. VPERMQ invntt_VPERMQ_IDX, Y8, Y8
  1488. VPERMQ invntt_VPERMQ_IDX, Y9, Y9
  1489. VPERMQ invntt_VPERMQ_IDX, Y10, Y10
  1490. VPERMQ invntt_VPERMQ_IDX, Y11, Y11
  1491. // level 0
  1492. // update
  1493. VMOVDQA Y4, Y12
  1494. VMOVDQA Y6, Y13
  1495. VMOVDQA Y8, Y14
  1496. VMOVDQA Y10, Y15
  1497. VPADDW Y4, Y5, Y4
  1498. VPADDW Y6, Y7, Y6
  1499. VPADDW Y8, Y9, Y8
  1500. VPADDW Y10, Y11, Y10
  1501. VPSUBW Y5, Y12, Y5
  1502. VPSUBW Y7, Y13, Y7
  1503. VPSUBW Y9, Y14, Y9
  1504. VPSUBW Y11, Y15, Y11
  1505. // zetas
  1506. VMOVDQU (R8), Y13
  1507. VMOVDQU 32(R8), Y14
  1508. VMOVDQU 64(R8), Y15
  1509. VMOVDQU 96(R8), Y3
  1510. // mul
  1511. VPMULLW Y13, Y5, Y12
  1512. VPMULHW Y13, Y5, Y5
  1513. VPMULLW Y14, Y7, Y13
  1514. VPMULHW Y14, Y7, Y7
  1515. VPMULLW Y15, Y9, Y14
  1516. VPMULHW Y15, Y9, Y9
  1517. VPMULLW Y3, Y11, Y15
  1518. VPMULHW Y3, Y11, Y11
  1519. // reduce
  1520. VPMULLW Y0, Y12, Y12
  1521. VPMULLW Y0, Y13, Y13
  1522. VPMULLW Y0, Y14, Y14
  1523. VPMULLW Y0, Y15, Y15
  1524. VPMULHW Y1, Y12, Y12
  1525. VPMULHW Y1, Y13, Y13
  1526. VPMULHW Y1, Y14, Y14
  1527. VPMULHW Y1, Y15, Y15
  1528. VPSUBW Y12, Y5, Y5
  1529. VPSUBW Y13, Y7, Y7
  1530. VPSUBW Y14, Y9, Y9
  1531. VPSUBW Y15, Y11, Y11
  1532. // level 1
  1533. // shuffle
  1534. VMOVDQU ·vpshufb_idx<>(SB), Y3
  1535. VPSHUFB Y3, Y4, Y12
  1536. VPSHUFB Y3, Y5, Y13
  1537. VPSHUFB Y3, Y6, Y14
  1538. VPSHUFB Y3, Y7, Y15
  1539. VPBLENDW $0x55, Y4, Y13, Y4
  1540. VPBLENDW $0xAA, Y5, Y12, Y5
  1541. VPBLENDW $0x55, Y6, Y15, Y6
  1542. VPBLENDW $0xAA, Y7, Y14, Y7
  1543. VPSHUFB Y3, Y8, Y12
  1544. VPSHUFB Y3, Y9, Y13
  1545. VPSHUFB Y3, Y10, Y14
  1546. VPSHUFB Y3, Y11, Y15
  1547. VPBLENDW $0x55, Y8, Y13, Y8
  1548. VPBLENDW $0xAA, Y9, Y12, Y9
  1549. VPBLENDW $0x55, Y10, Y15, Y10
  1550. VPBLENDW $0xAA, Y11, Y14, Y11
  1551. // update
  1552. VMOVDQA Y4, Y12
  1553. VMOVDQA Y6, Y13
  1554. VMOVDQA Y8, Y14
  1555. VMOVDQA Y10, Y15
  1556. VPADDW Y4, Y5, Y4
  1557. VPADDW Y6, Y7, Y6
  1558. VPADDW Y8, Y9, Y8
  1559. VPADDW Y10, Y11, Y10
  1560. VPSUBW Y5, Y12, Y5
  1561. VPSUBW Y7, Y13, Y7
  1562. VPSUBW Y9, Y14, Y9
  1563. VPSUBW Y11, Y15, Y11
  1564. // zetas
  1565. VMOVDQU 256(R8), Y13
  1566. VMOVDQU 288(R8), Y14
  1567. VMOVDQU 320(R8), Y15
  1568. VMOVDQU 352(R8), Y3
  1569. // mul
  1570. VPMULLW Y13, Y5, Y12
  1571. VPMULHW Y13, Y5, Y5
  1572. VPMULLW Y14, Y7, Y13
  1573. VPMULHW Y14, Y7, Y7
  1574. VPMULLW Y15, Y9, Y14
  1575. VPMULHW Y15, Y9, Y9
  1576. VPMULLW Y3, Y11, Y15
  1577. VPMULHW Y3, Y11, Y11
  1578. // reduce
  1579. VPMULLW Y0, Y12, Y12
  1580. VPMULLW Y0, Y13, Y13
  1581. VPMULLW Y0, Y14, Y14
  1582. VPMULLW Y0, Y15, Y15
  1583. VPMULHW Y1, Y12, Y12
  1584. VPMULHW Y1, Y13, Y13
  1585. VPMULHW Y1, Y14, Y14
  1586. VPMULHW Y1, Y15, Y15
  1587. VPSUBW Y12, Y5, Y5
  1588. VPSUBW Y13, Y7, Y7
  1589. VPSUBW Y14, Y9, Y9
  1590. VPSUBW Y15, Y11, Y11
  1591. // reduce 2
  1592. VPMULHW Y2, Y4, Y12
  1593. VPMULHW Y2, Y6, Y13
  1594. VPMULHW Y2, Y8, Y14
  1595. VPMULHW Y2, Y10, Y15
  1596. VPSRAW $11, Y12, Y12
  1597. VPSRAW $11, Y13, Y13
  1598. VPSRAW $11, Y14, Y14
  1599. VPSRAW $11, Y15, Y15
  1600. VPMULLW Y1, Y12, Y12
  1601. VPMULLW Y1, Y13, Y13
  1602. VPMULLW Y1, Y14, Y14
  1603. VPMULLW Y1, Y15, Y15
  1604. VPSUBW Y12, Y4, Y4
  1605. VPSUBW Y13, Y6, Y6
  1606. VPSUBW Y14, Y8, Y8
  1607. VPSUBW Y15, Y10, Y10
  1608. // level 2
  1609. // shuffle
  1610. VPSHUFD $0xB1, Y4, Y12
  1611. VPSHUFD $0xB1, Y5, Y13
  1612. VPSHUFD $0xB1, Y6, Y14
  1613. VPSHUFD $0xB1, Y7, Y15
  1614. VPBLENDD $0x55, Y4, Y13, Y4
  1615. VPBLENDD $0xAA, Y5, Y12, Y5
  1616. VPBLENDD $0x55, Y6, Y15, Y6
  1617. VPBLENDD $0xAA, Y7, Y14, Y7
  1618. VPSHUFD $0xB1, Y8, Y12
  1619. VPSHUFD $0xB1, Y9, Y13
  1620. VPSHUFD $0xB1, Y10, Y14
  1621. VPSHUFD $0xB1, Y11, Y15
  1622. VPBLENDD $0x55, Y8, Y13, Y8
  1623. VPBLENDD $0xAA, Y9, Y12, Y9
  1624. VPBLENDD $0x55, Y10, Y15, Y10
  1625. VPBLENDD $0xAA, Y11, Y14, Y11
  1626. // update
  1627. VMOVDQA Y4, Y12
  1628. VMOVDQA Y6, Y13
  1629. VMOVDQA Y8, Y14
  1630. VMOVDQA Y10, Y15
  1631. VPADDW Y4, Y5, Y4
  1632. VPADDW Y6, Y7, Y6
  1633. VPADDW Y8, Y9, Y8
  1634. VPADDW Y10, Y11, Y10
  1635. VPSUBW Y5, Y12, Y5
  1636. VPSUBW Y7, Y13, Y7
  1637. VPSUBW Y9, Y14, Y9
  1638. VPSUBW Y11, Y15, Y11
  1639. // zetas
  1640. VMOVDQU 512(R8), Y13
  1641. VMOVDQU 544(R8), Y14
  1642. VMOVDQU 576(R8), Y15
  1643. VMOVDQU 608(R8), Y3
  1644. // mul
  1645. VPMULLW Y13, Y5, Y12
  1646. VPMULHW Y13, Y5, Y5
  1647. VPMULLW Y14, Y7, Y13
  1648. VPMULHW Y14, Y7, Y7
  1649. VPMULLW Y15, Y9, Y14
  1650. VPMULHW Y15, Y9, Y9
  1651. VPMULLW Y3, Y11, Y15
  1652. VPMULHW Y3, Y11, Y11
  1653. // reduce
  1654. VPMULLW Y0, Y12, Y12
  1655. VPMULLW Y0, Y13, Y13
  1656. VPMULLW Y0, Y14, Y14
  1657. VPMULLW Y0, Y15, Y15
  1658. VPMULHW Y1, Y12, Y12
  1659. VPMULHW Y1, Y13, Y13
  1660. VPMULHW Y1, Y14, Y14
  1661. VPMULHW Y1, Y15, Y15
  1662. VPSUBW Y12, Y5, Y5
  1663. VPSUBW Y13, Y7, Y7
  1664. VPSUBW Y14, Y9, Y9
  1665. VPSUBW Y15, Y11, Y11
  1666. // level 3
  1667. // shuffle
  1668. VSHUFPD $0x00, Y5, Y4, Y3
  1669. VSHUFPD $0x0F, Y5, Y4, Y4
  1670. VSHUFPD $0x00, Y7, Y6, Y5
  1671. VSHUFPD $0x0F, Y7, Y6, Y6
  1672. VSHUFPD $0x00, Y9, Y8, Y7
  1673. VSHUFPD $0x0F, Y9, Y8, Y8
  1674. VSHUFPD $0x00, Y11, Y10, Y9
  1675. VSHUFPD $0x0F, Y11, Y10, Y10
  1676. // update
  1677. VMOVDQA Y3, Y12
  1678. VMOVDQA Y5, Y13
  1679. VMOVDQA Y7, Y14
  1680. VMOVDQA Y9, Y15
  1681. VPADDW Y3, Y4, Y3
  1682. VPADDW Y5, Y6, Y5
  1683. VPADDW Y7, Y8, Y7
  1684. VPADDW Y9, Y10, Y9
  1685. VPSUBW Y4, Y12, Y4
  1686. VPSUBW Y6, Y13, Y6
  1687. VPSUBW Y8, Y14, Y8
  1688. VPSUBW Y10, Y15, Y10
  1689. // zetas
  1690. VMOVDQU 768(R8), Y12
  1691. VMOVDQU 800(R8), Y13
  1692. VMOVDQU 832(R8), Y14
  1693. VMOVDQU 864(R8), Y15
  1694. // mul
  1695. VPMULLW Y12, Y4, Y11
  1696. VPMULHW Y12, Y4, Y4
  1697. VPMULLW Y13, Y6, Y12
  1698. VPMULHW Y13, Y6, Y6
  1699. VPMULLW Y14, Y8, Y13
  1700. VPMULHW Y14, Y8, Y8
  1701. VPMULLW Y15, Y10, Y14
  1702. VPMULHW Y15, Y10, Y10
  1703. // reduce
  1704. VPMULLW Y0, Y11, Y11
  1705. VPMULLW Y0, Y12, Y12
  1706. VPMULLW Y0, Y13, Y13
  1707. VPMULLW Y0, Y14, Y14
  1708. VPMULHW Y1, Y11, Y11
  1709. VPMULHW Y1, Y12, Y12
  1710. VPMULHW Y1, Y13, Y13
  1711. VPMULHW Y1, Y14, Y14
  1712. VPSUBW Y11, Y4, Y4
  1713. VPSUBW Y12, Y6, Y6
  1714. VPSUBW Y13, Y8, Y8
  1715. VPSUBW Y14, Y10, Y10
  1716. // reduce 2
  1717. VPMULHW Y2, Y3, Y12
  1718. VPMULHW Y2, Y5, Y13
  1719. VPMULHW Y2, Y7, Y14
  1720. VPMULHW Y2, Y9, Y15
  1721. VPSRAW $11, Y12, Y12
  1722. VPSRAW $11, Y13, Y13
  1723. VPSRAW $11, Y14, Y14
  1724. VPSRAW $11, Y15, Y15
  1725. VPMULLW Y1, Y12, Y12
  1726. VPMULLW Y1, Y13, Y13
  1727. VPMULLW Y1, Y14, Y14
  1728. VPMULLW Y1, Y15, Y15
  1729. VPSUBW Y12, Y3, Y3
  1730. VPSUBW Y13, Y5, Y5
  1731. VPSUBW Y14, Y7, Y7
  1732. VPSUBW Y15, Y9, Y9
  1733. // level 4
  1734. // shuffle
  1735. VPERM2I128 $0x02, Y3, Y4, Y11
  1736. VPERM2I128 $0x13, Y3, Y4, Y3
  1737. VPERM2I128 $0x02, Y5, Y6, Y4
  1738. VPERM2I128 $0x13, Y5, Y6, Y5
  1739. VPERM2I128 $0x02, Y7, Y8, Y6
  1740. VPERM2I128 $0x13, Y7, Y8, Y7
  1741. VPERM2I128 $0x02, Y9, Y10, Y8
  1742. VPERM2I128 $0x13, Y9, Y10, Y9
  1743. // update
  1744. VMOVDQA Y11, Y12
  1745. VMOVDQA Y4, Y13
  1746. VMOVDQA Y6, Y14
  1747. VMOVDQA Y8, Y15
  1748. VPADDW Y11, Y3, Y10
  1749. VPADDW Y4, Y5, Y4
  1750. VPADDW Y6, Y7, Y6
  1751. VPADDW Y8, Y9, Y8
  1752. VPSUBW Y3, Y12, Y3
  1753. VPSUBW Y5, Y13, Y5
  1754. VPSUBW Y7, Y14, Y7
  1755. VPSUBW Y9, Y15, Y9
  1756. // zetas
  1757. VMOVDQU 1024(R8), Y12
  1758. VMOVDQU 1056(R8), Y13
  1759. VMOVDQU 1088(R8), Y14
  1760. VMOVDQU 1120(R8), Y15
  1761. // mul
  1762. VPMULLW Y12, Y3, Y11
  1763. VPMULHW Y12, Y3, Y3
  1764. VPMULLW Y13, Y5, Y12
  1765. VPMULHW Y13, Y5, Y5
  1766. VPMULLW Y14, Y7, Y13
  1767. VPMULHW Y14, Y7, Y7
  1768. VPMULLW Y15, Y9, Y14
  1769. VPMULHW Y15, Y9, Y9
  1770. // reduce
  1771. VPMULLW Y0, Y11, Y11
  1772. VPMULLW Y0, Y12, Y12
  1773. VPMULLW Y0, Y13, Y13
  1774. VPMULLW Y0, Y14, Y14
  1775. VPMULHW Y1, Y11, Y11
  1776. VPMULHW Y1, Y12, Y12
  1777. VPMULHW Y1, Y13, Y13
  1778. VPMULHW Y1, Y14, Y14
  1779. VPSUBW Y11, Y3, Y3
  1780. VPSUBW Y12, Y5, Y5
  1781. VPSUBW Y13, Y7, Y7
  1782. VPSUBW Y14, Y9, Y9
  1783. // level 5
  1784. // update
  1785. VMOVDQA Y10, Y12
  1786. VMOVDQA Y3, Y13
  1787. VMOVDQA Y6, Y14
  1788. VMOVDQA Y7, Y15
  1789. VPADDW Y10, Y4, Y10
  1790. VPADDW Y3, Y5, Y3
  1791. VPADDW Y6, Y8, Y6
  1792. VPADDW Y7, Y9, Y7
  1793. VPSUBW Y4, Y12, Y4
  1794. VPSUBW Y5, Y13, Y5
  1795. VPSUBW Y8, Y14, Y8
  1796. VPSUBW Y9, Y15, Y9
  1797. // zetas
  1798. VMOVDQU 1344(SI), Y14
  1799. VMOVDQU 1376(SI), Y15
  1800. // mul
  1801. VPMULLW Y14, Y4, Y11
  1802. VPMULLW Y14, Y5, Y12
  1803. VPMULLW Y15, Y8, Y13
  1804. VPMULHW Y14, Y4, Y4
  1805. VPMULHW Y14, Y5, Y5
  1806. VPMULHW Y15, Y8, Y8
  1807. VPMULLW Y15, Y9, Y14
  1808. VPMULHW Y15, Y9, Y9
  1809. // reduce
  1810. VPMULLW Y0, Y11, Y11
  1811. VPMULLW Y0, Y12, Y12
  1812. VPMULLW Y0, Y13, Y13
  1813. VPMULLW Y0, Y14, Y14
  1814. VPMULHW Y1, Y11, Y11
  1815. VPMULHW Y1, Y12, Y12
  1816. VPMULHW Y1, Y13, Y13
  1817. VPMULHW Y1, Y14, Y14
  1818. VPSUBW Y11, Y4, Y4
  1819. VPSUBW Y12, Y5, Y5
  1820. VPSUBW Y13, Y8, Y8
  1821. VPSUBW Y14, Y9, Y9
  1822. // reduce 2
  1823. VPMULHW Y2, Y10, Y12
  1824. VPMULHW Y2, Y6, Y13
  1825. VPSRAW $11, Y12, Y12
  1826. VPSRAW $11, Y13, Y13
  1827. VPMULLW Y1, Y12, Y12
  1828. VPMULLW Y1, Y13, Y13
  1829. VPSUBW Y12, Y10, Y10
  1830. VPSUBW Y13, Y6, Y6
  1831. // level 6
  1832. // update
  1833. VMOVDQA Y10, Y12
  1834. VMOVDQA Y3, Y13
  1835. VMOVDQA Y4, Y14
  1836. VMOVDQA Y5, Y15
  1837. VPADDW Y10, Y6, Y10
  1838. VPADDW Y3, Y7, Y3
  1839. VPADDW Y4, Y8, Y4
  1840. VPADDW Y5, Y9, Y5
  1841. VPSUBW Y6, Y12, Y6
  1842. VPSUBW Y7, Y13, Y7
  1843. VPSUBW Y8, Y14, Y8
  1844. VPSUBW Y9, Y15, Y9
  1845. // zetas
  1846. VMOVDQU 1440(SI), Y15
  1847. // mul
  1848. VPMULLW Y15, Y6, Y11
  1849. VPMULLW Y15, Y7, Y12
  1850. VPMULLW Y15, Y8, Y13
  1851. VPMULLW Y15, Y9, Y14
  1852. VPMULHW Y15, Y6, Y6
  1853. VPMULHW Y15, Y7, Y7
  1854. VPMULHW Y15, Y8, Y8
  1855. VPMULHW Y15, Y9, Y9
  1856. // reduce
  1857. VPMULLW Y0, Y11, Y11
  1858. VPMULLW Y0, Y12, Y12
  1859. VPMULLW Y0, Y13, Y13
  1860. VPMULLW Y0, Y14, Y14
  1861. VPMULHW Y1, Y11, Y11
  1862. VPMULHW Y1, Y12, Y12
  1863. VPMULHW Y1, Y13, Y13
  1864. VPMULHW Y1, Y14, Y14
  1865. VPSUBW Y11, Y6, Y6
  1866. VPSUBW Y12, Y7, Y7
  1867. VPSUBW Y13, Y8, Y8
  1868. VPSUBW Y14, Y9, Y9
  1869. // reduce 2
  1870. VPMULHW Y2, Y3, Y12
  1871. VPSRAW $11, Y12, Y12
  1872. VPMULLW Y1, Y12, Y12
  1873. VPSUBW Y12, Y3, Y3
  1874. // store
  1875. VMOVDQU Y10, (DI)
  1876. VMOVDQU Y3, 32(DI)
  1877. VMOVDQU Y4, 64(DI)
  1878. VMOVDQU Y5, 96(DI)
  1879. VMOVDQU Y6, 128(DI)
  1880. VMOVDQU Y7, 160(DI)
  1881. VMOVDQU Y8, 192(DI)
  1882. VMOVDQU Y9, 224(DI)
  1883. SUBQ $256, DI
  1884. // f
  1885. VMOVDQU ·f_x16<>(SB), Y2
  1886. // first round
  1887. // load
  1888. VMOVDQU (DI), Y4
  1889. VMOVDQU 32(DI), Y5
  1890. VMOVDQU 64(DI), Y6
  1891. VMOVDQU 96(DI), Y7
  1892. VMOVDQU 256(DI), Y8
  1893. VMOVDQU 288(DI), Y9
  1894. VMOVDQU 320(DI), Y10
  1895. VMOVDQU 352(DI), Y11
  1896. // level 7
  1897. // update
  1898. VMOVDQA Y4, Y12
  1899. VMOVDQA Y5, Y13
  1900. VMOVDQA Y6, Y14
  1901. VMOVDQA Y7, Y15
  1902. VPADDW Y4, Y8, Y4
  1903. VPADDW Y5, Y9, Y5
  1904. VPADDW Y6, Y10, Y6
  1905. VPADDW Y7, Y11, Y7
  1906. VPSUBW Y8, Y12, Y8
  1907. VPSUBW Y9, Y13, Y9
  1908. VPSUBW Y10, Y14, Y10
  1909. VPSUBW Y11, Y15, Y11
  1910. // zeta
  1911. VMOVDQU 1472(SI), Y3
  1912. // mul
  1913. VPMULLW Y3, Y8, Y12
  1914. VPMULLW Y3, Y9, Y13
  1915. VPMULLW Y3, Y10, Y14
  1916. VPMULLW Y3, Y11, Y15
  1917. VPMULHW Y3, Y8, Y8
  1918. VPMULHW Y3, Y9, Y9
  1919. VPMULHW Y3, Y10, Y10
  1920. VPMULHW Y3, Y11, Y11
  1921. // reduce
  1922. VPMULLW Y0, Y12, Y12
  1923. VPMULLW Y0, Y13, Y13
  1924. VPMULLW Y0, Y14, Y14
  1925. VPMULLW Y0, Y15, Y15
  1926. VPMULHW Y1, Y12, Y12
  1927. VPMULHW Y1, Y13, Y13
  1928. VPMULHW Y1, Y14, Y14
  1929. VPMULHW Y1, Y15, Y15
  1930. VPSUBW Y12, Y8, Y8
  1931. VPSUBW Y13, Y9, Y9
  1932. VPSUBW Y14, Y10, Y10
  1933. VPSUBW Y15, Y11, Y11
  1934. VPADDW Y1, Y8, Y8
  1935. VPADDW Y1, Y9, Y9
  1936. VPADDW Y1, Y10, Y10
  1937. VPADDW Y1, Y11, Y11
  1938. // mul
  1939. VPMULLW Y2, Y4, Y12
  1940. VPMULLW Y2, Y5, Y13
  1941. VPMULLW Y2, Y6, Y14
  1942. VPMULLW Y2, Y7, Y15
  1943. VPMULHW Y2, Y4, Y4
  1944. VPMULHW Y2, Y5, Y5
  1945. VPMULHW Y2, Y6, Y6
  1946. VPMULHW Y2, Y7, Y7
  1947. // reduce
  1948. VPMULLW Y0, Y12, Y12
  1949. VPMULLW Y0, Y13, Y13
  1950. VPMULLW Y0, Y14, Y14
  1951. VPMULLW Y0, Y15, Y15
  1952. VPMULHW Y1, Y12, Y12
  1953. VPMULHW Y1, Y13, Y13
  1954. VPMULHW Y1, Y14, Y14
  1955. VPMULHW Y1, Y15, Y15
  1956. VPSUBW Y12, Y4, Y4
  1957. VPSUBW Y13, Y5, Y5
  1958. VPSUBW Y14, Y6, Y6
  1959. VPSUBW Y15, Y7, Y7
  1960. VPADDW Y1, Y4, Y4
  1961. VPADDW Y1, Y5, Y5
  1962. VPADDW Y1, Y6, Y6
  1963. VPADDW Y1, Y7, Y7
  1964. // store
  1965. VMOVDQU Y4, (DI)
  1966. VMOVDQU Y5, 32(DI)
  1967. VMOVDQU Y6, 64(DI)
  1968. VMOVDQU Y7, 96(DI)
  1969. VMOVDQU Y8, 256(DI)
  1970. VMOVDQU Y9, 288(DI)
  1971. VMOVDQU Y10, 320(DI)
  1972. VMOVDQU Y11, 352(DI)
  1973. ADDQ $128, DI
  1974. // second round
  1975. // load
  1976. VMOVDQU (DI), Y4
  1977. VMOVDQU 32(DI), Y5
  1978. VMOVDQU 64(DI), Y6
  1979. VMOVDQU 96(DI), Y7
  1980. VMOVDQU 256(DI), Y8
  1981. VMOVDQU 288(DI), Y9
  1982. VMOVDQU 320(DI), Y10
  1983. VMOVDQU 352(DI), Y11
  1984. // zeta
  1985. VMOVDQU 1472(SI), Y3
  1986. // level 7
  1987. // update
  1988. VMOVDQA Y4, Y12
  1989. VMOVDQA Y5, Y13
  1990. VMOVDQA Y6, Y14
  1991. VMOVDQA Y7, Y15
  1992. VPADDW Y4, Y8, Y4
  1993. VPADDW Y5, Y9, Y5
  1994. VPADDW Y6, Y10, Y6
  1995. VPADDW Y7, Y11, Y7
  1996. VPSUBW Y8, Y12, Y8
  1997. VPSUBW Y9, Y13, Y9
  1998. VPSUBW Y10, Y14, Y10
  1999. VPSUBW Y11, Y15, Y11
  2000. // mul
  2001. VPMULLW Y3, Y8, Y12
  2002. VPMULLW Y3, Y9, Y13
  2003. VPMULLW Y3, Y10, Y14
  2004. VPMULLW Y3, Y11, Y15
  2005. VPMULHW Y3, Y8, Y8
  2006. VPMULHW Y3, Y9, Y9
  2007. VPMULHW Y3, Y10, Y10
  2008. VPMULHW Y3, Y11, Y11
  2009. // reduce
  2010. VPMULLW Y0, Y12, Y12
  2011. VPMULLW Y0, Y13, Y13
  2012. VPMULLW Y0, Y14, Y14
  2013. VPMULLW Y0, Y15, Y15
  2014. VPMULHW Y1, Y12, Y12
  2015. VPMULHW Y1, Y13, Y13
  2016. VPMULHW Y1, Y14, Y14
  2017. VPMULHW Y1, Y15, Y15
  2018. VPSUBW Y12, Y8, Y8
  2019. VPSUBW Y13, Y9, Y9
  2020. VPSUBW Y14, Y10, Y10
  2021. VPSUBW Y15, Y11, Y11
  2022. VPADDW Y1, Y8, Y8
  2023. VPADDW Y1, Y9, Y9
  2024. VPADDW Y1, Y10, Y10
  2025. VPADDW Y1, Y11, Y11
  2026. // mul
  2027. VPMULLW Y2, Y4, Y12
  2028. VPMULLW Y2, Y5, Y13
  2029. VPMULLW Y2, Y6, Y14
  2030. VPMULLW Y2, Y7, Y15
  2031. VPMULHW Y2, Y4, Y4
  2032. VPMULHW Y2, Y5, Y5
  2033. VPMULHW Y2, Y6, Y6
  2034. VPMULHW Y2, Y7, Y7
  2035. // reduce
  2036. VPMULLW Y0, Y12, Y12
  2037. VPMULLW Y0, Y13, Y13
  2038. VPMULLW Y0, Y14, Y14
  2039. VPMULLW Y0, Y15, Y15
  2040. VPMULHW Y1, Y12, Y12
  2041. VPMULHW Y1, Y13, Y13
  2042. VPMULHW Y1, Y14, Y14
  2043. VPMULHW Y1, Y15, Y15
  2044. VPSUBW Y12, Y4, Y4
  2045. VPSUBW Y13, Y5, Y5
  2046. VPSUBW Y14, Y6, Y6
  2047. VPSUBW Y15, Y7, Y7
  2048. VPADDW Y1, Y4, Y4
  2049. VPADDW Y1, Y5, Y5
  2050. VPADDW Y1, Y6, Y6
  2051. VPADDW Y1, Y7, Y7
  2052. // store
  2053. VMOVDQU Y4, (DI)
  2054. VMOVDQU Y5, 32(DI)
  2055. VMOVDQU Y6, 64(DI)
  2056. VMOVDQU Y7, 96(DI)
  2057. VMOVDQU Y8, 256(DI)
  2058. VMOVDQU Y9, 288(DI)
  2059. VMOVDQU Y10, 320(DI)
  2060. VMOVDQU Y11, 352(DI)
  2061. VZEROUPPER
  2062. RET
  2063. // func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
  //
  // Lane-wise (16-bit) multiply-accumulate over two input pairs: for every
  // lane, dst receives the reduced sum of the a[0]*b[0] and a[1]*b[1] terms.
  // a and b are each read as an array of two pointers. 512 bytes (256 uint16
  // values) are read through each of the four pointers and written to dst:
  // five loop passes of 96 bytes followed by one 32-byte tail.
  //
  // Register roles: DI = dst, SI = a[0], R8 = a[1], DX = b[0], R11 = b[1],
  // AX = pass counter, BX = byte offset shared by dst and all inputs,
  // Y0 = qinv_x16, Y1 = q_x16, Y2 = montsq_x16, Y3-Y15 = scratch.
  //
  // NOTE(review): the "reduce" steps look like a signed Montgomery-style
  // reduction and "reduce 2" like a Barrett-style reduction; the constants
  // are defined elsewhere in the file, confirm against params.go.
  2064. TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24 // no local frame, 24 bytes of arguments
  2065. MOVQ dst+0(FP), DI
  2066. MOVQ a+8(FP), SI
  2067. MOVQ b+16(FP), DX
  2068. VMOVDQU ·qinv_x16<>(SB), Y0
  2069. VMOVDQU ·q_x16<>(SB), Y1
  2070. VMOVDQU ·montsq_x16<>(SB), Y2
  2071. XORQ AX, AX // AX = 0: loop pass counter
  2072. XORQ BX, BX // BX = 0: byte offset
  2073. MOVQ 8(SI), R8 // R8 = a[1] (read before SI is overwritten)
  2074. MOVQ (SI), SI // SI = a[0]
  2075. MOVQ 8(DX), R11 // R11 = b[1] (read before DX is overwritten)
  2076. MOVQ (DX), DX // DX = b[0]
  2077. looptop2:
  2078. // load a: three 32-byte vectors from a[0] (Y4-Y6) and three from a[1] (Y7-Y9)
  2079. VMOVDQU (SI)(BX*1), Y4
  2080. VMOVDQU 32(SI)(BX*1), Y5
  2081. VMOVDQU 64(SI)(BX*1), Y6
  2082. VMOVDQU (R8)(BX*1), Y7
  2083. VMOVDQU 32(R8)(BX*1), Y8
  2084. VMOVDQU 64(R8)(BX*1), Y9
  2085. // mul montsq: a * Y2; low halves go to Y3-Y8, high halves to Y10-Y15
  2086. VPMULLW Y2, Y4, Y3
  2087. VPMULHW Y2, Y4, Y10
  2088. VPMULLW Y2, Y5, Y4
  2089. VPMULHW Y2, Y5, Y11
  2090. VPMULLW Y2, Y6, Y5
  2091. VPMULHW Y2, Y6, Y12
  2092. VPMULLW Y2, Y7, Y6
  2093. VPMULHW Y2, Y7, Y13
  2094. VPMULLW Y2, Y8, Y7
  2095. VPMULHW Y2, Y8, Y14
  2096. VPMULLW Y2, Y9, Y8
  2097. VPMULHW Y2, Y9, Y15
  2098. // reduce: r = high - mulhi(low * qinv, q); results in Y3-Y8
  2099. VPMULLW Y0, Y3, Y3
  2100. VPMULLW Y0, Y4, Y4
  2101. VPMULLW Y0, Y5, Y5
  2102. VPMULLW Y0, Y6, Y6
  2103. VPMULLW Y0, Y7, Y7
  2104. VPMULLW Y0, Y8, Y8
  2105. VPMULHW Y1, Y3, Y3
  2106. VPMULHW Y1, Y4, Y4
  2107. VPMULHW Y1, Y5, Y5
  2108. VPMULHW Y1, Y6, Y6
  2109. VPMULHW Y1, Y7, Y7
  2110. VPMULHW Y1, Y8, Y8
  2111. VPSUBW Y3, Y10, Y3
  2112. VPSUBW Y4, Y11, Y4
  2113. VPSUBW Y5, Y12, Y5
  2114. VPSUBW Y6, Y13, Y6
  2115. VPSUBW Y7, Y14, Y7
  2116. VPSUBW Y8, Y15, Y8
  2117. // load b: three vectors from b[0] (Y9-Y11) and three from b[1] (Y12-Y14)
  2118. VMOVDQU (DX)(BX*1), Y9
  2119. VMOVDQU 32(DX)(BX*1), Y10
  2120. VMOVDQU 64(DX)(BX*1), Y11
  2121. VMOVDQU (R11)(BX*1), Y12
  2122. VMOVDQU 32(R11)(BX*1), Y13
  2123. VMOVDQU 64(R11)(BX*1), Y14
  2124. // mul: b * reduced a; low halves go to Y15, Y3-Y7 (each a register is
  // overwritten only after its own high-half multiply has read it),
  // high halves stay in Y9-Y14
  2125. VPMULLW Y3, Y9, Y15
  2126. VPMULHW Y3, Y9, Y9
  2127. VPMULLW Y4, Y10, Y3
  2128. VPMULHW Y4, Y10, Y10
  2129. VPMULLW Y5, Y11, Y4
  2130. VPMULHW Y5, Y11, Y11
  2131. VPMULLW Y6, Y12, Y5
  2132. VPMULHW Y6, Y12, Y12
  2133. VPMULLW Y7, Y13, Y6
  2134. VPMULHW Y7, Y13, Y13
  2135. VPMULLW Y8, Y14, Y7
  2136. VPMULHW Y8, Y14, Y14
  2137. // reduce: same step as above; a[0]*b[0] terms end in Y15, Y3, Y4 and
  // a[1]*b[1] terms in Y5, Y6, Y7
  2138. VPMULLW Y0, Y15, Y15
  2139. VPMULLW Y0, Y3, Y3
  2140. VPMULLW Y0, Y4, Y4
  2141. VPMULLW Y0, Y5, Y5
  2142. VPMULLW Y0, Y6, Y6
  2143. VPMULLW Y0, Y7, Y7
  2144. VPMULHW Y1, Y15, Y15
  2145. VPMULHW Y1, Y3, Y3
  2146. VPMULHW Y1, Y4, Y4
  2147. VPMULHW Y1, Y5, Y5
  2148. VPMULHW Y1, Y6, Y6
  2149. VPMULHW Y1, Y7, Y7
  2150. VPSUBW Y15, Y9, Y15
  2151. VPSUBW Y3, Y10, Y3
  2152. VPSUBW Y4, Y11, Y4
  2153. VPSUBW Y5, Y12, Y5
  2154. VPSUBW Y6, Y13, Y6
  2155. VPSUBW Y7, Y14, Y7
  2156. // add: accumulate the a[0]*b[0] terms into the a[1]*b[1] terms (Y5-Y7)
  2157. VPADDW Y15, Y5, Y5
  2158. VPADDW Y3, Y6, Y6
  2159. VPADDW Y4, Y7, Y7
  2160. // reduce 2: x = x - (mulhi(x, v) >> 11) * q, arithmetic shift. v_x16 is
  // reloaded on every pass because Y3 is used as scratch above.
  2161. VMOVDQU ·v_x16<>(SB), Y3
  2162. VPMULHW Y3, Y5, Y8
  2163. VPMULHW Y3, Y6, Y9
  2164. VPMULHW Y3, Y7, Y10
  2165. VPSRAW $11, Y8, Y8
  2166. VPSRAW $11, Y9, Y9
  2167. VPSRAW $11, Y10, Y10
  2168. VPMULLW Y1, Y8, Y8
  2169. VPMULLW Y1, Y9, Y9
  2170. VPMULLW Y1, Y10, Y10
  2171. VPSUBW Y8, Y5, Y5
  2172. VPSUBW Y9, Y6, Y6
  2173. VPSUBW Y10, Y7, Y7
  2174. // store: 96 bytes to dst at the same offset the inputs were read from
  2175. VMOVDQU Y5, (DI)(BX*1)
  2176. VMOVDQU Y6, 32(DI)(BX*1)
  2177. VMOVDQU Y7, 64(DI)(BX*1)
  2178. ADDQ $1, AX
  2179. ADDQ $96, BX
  2180. CMPQ AX, $5
  2181. JB looptop2 // repeat while AX < 5 (unsigned)
  2182. // load: tail, one 32-byte vector per input at offset BX (480 after five passes)
  2183. VMOVDQU (SI)(BX*1), Y4
  2184. VMOVDQU (R8)(BX*1), Y7
  2185. VMOVDQU (DX)(BX*1), Y9
  2186. VMOVDQU (R11)(BX*1), Y12
  2187. // mul montsq: low halves in Y3, Y6; high halves in Y10, Y13
  2188. VPMULLW Y2, Y4, Y3
  2189. VPMULHW Y2, Y4, Y10
  2190. VPMULLW Y2, Y7, Y6
  2191. VPMULHW Y2, Y7, Y13
  2192. // reduce: r = high - mulhi(low * qinv, q); results in Y3 (a[0]) and Y6 (a[1])
  2193. VPMULLW Y0, Y3, Y3
  2194. VPMULLW Y0, Y6, Y6
  2195. VPMULHW Y1, Y3, Y3
  2196. VPMULHW Y1, Y6, Y6
  2197. VPSUBW Y3, Y10, Y3
  2198. VPSUBW Y6, Y13, Y6
  2199. // mul: b * reduced a; low halves in Y15, Y5; high halves in Y9, Y12
  2200. VPMULLW Y3, Y9, Y15
  2201. VPMULHW Y3, Y9, Y9
  2202. VPMULLW Y6, Y12, Y5
  2203. VPMULHW Y6, Y12, Y12
  2204. // reduce: results in Y15 (a[0]*b[0] term) and Y5 (a[1]*b[1] term)
  2205. VPMULLW Y0, Y15, Y15
  2206. VPMULLW Y0, Y5, Y5
  2207. VPMULHW Y1, Y15, Y15
  2208. VPMULHW Y1, Y5, Y5
  2209. VPSUBW Y15, Y9, Y15
  2210. VPSUBW Y5, Y12, Y5
  2211. // add: Y5 = sum of both terms
  2212. VPADDW Y15, Y5, Y5
  2213. // reduce 2: x = x - (mulhi(x, v) >> 11) * q
  2214. VMOVDQU ·v_x16<>(SB), Y3
  2215. VPMULHW Y3, Y5, Y8
  2216. VPSRAW $11, Y8, Y8
  2217. VPMULLW Y1, Y8, Y8
  2218. VPSUBW Y8, Y5, Y5
  2219. // store: final 32 bytes of dst
  2220. VMOVDQU Y5, (DI)(BX*1)
  2221. VZEROUPPER // clear the upper YMM halves before returning
  2222. RET
  2223. // func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
  //
  // Lane-wise (16-bit) multiply-accumulate over three input pairs: for every
  // lane, dst receives the reduced sum of the a[0]*b[0], a[1]*b[1] and
  // a[2]*b[2] terms. a and b are each read as an array of three pointers.
  // 512 bytes (256 uint16 values) are read through each of the six pointers
  // and written to dst, in eight loop passes of 64 bytes.
  //
  // Register roles: DI = dst, SI = a[0], R8 = a[1], R9 = a[2], DX = b[0],
  // R11 = b[1], R12 = b[2], AX = pass counter, BX = byte offset shared by dst
  // and all inputs, Y0 = qinv_x16, Y1 = q_x16, Y2 = montsq_x16,
  // Y3-Y15 = scratch.
  //
  // NOTE(review): the "reduce" steps look like a signed Montgomery-style
  // reduction and "reduce 2" like a Barrett-style reduction; the constants
  // are defined elsewhere in the file, confirm against params.go.
  2224. TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24 // no local frame, 24 bytes of arguments
  2225. MOVQ dst+0(FP), DI
  2226. MOVQ a+8(FP), SI
  2227. MOVQ b+16(FP), DX
  2228. VMOVDQU ·qinv_x16<>(SB), Y0
  2229. VMOVDQU ·q_x16<>(SB), Y1
  2230. VMOVDQU ·montsq_x16<>(SB), Y2
  2231. XORQ AX, AX // AX = 0: loop pass counter
  2232. XORQ BX, BX // BX = 0: byte offset
  2233. MOVQ (16)(SI), R9 // R9 = a[2]
  2234. MOVQ 8(SI), R8 // R8 = a[1]
  2235. MOVQ (SI), SI // SI = a[0] (loaded last because it overwrites the array pointer)
  2236. MOVQ 16(DX), R12 // R12 = b[2]
  2237. MOVQ 8(DX), R11 // R11 = b[1]
  2238. MOVQ (DX), DX // DX = b[0] (loaded last because it overwrites the array pointer)
  2239. looptop3:
  2240. // load a: two 32-byte vectors each from a[0] (Y4, Y5), a[1] (Y6, Y7), a[2] (Y8, Y9)
  2241. VMOVDQU (SI)(BX*1), Y4
  2242. VMOVDQU 32(SI)(BX*1), Y5
  2243. VMOVDQU (R8)(BX*1), Y6
  2244. VMOVDQU 32(R8)(BX*1), Y7
  2245. VMOVDQU (R9)(BX*1), Y8
  2246. VMOVDQU 32(R9)(BX*1), Y9
  2247. // mul montsq: a * Y2; low halves go to Y3-Y8, high halves to Y10-Y15
  2248. VPMULLW Y2, Y4, Y3
  2249. VPMULHW Y2, Y4, Y10
  2250. VPMULLW Y2, Y5, Y4
  2251. VPMULHW Y2, Y5, Y11
  2252. VPMULLW Y2, Y6, Y5
  2253. VPMULHW Y2, Y6, Y12
  2254. VPMULLW Y2, Y7, Y6
  2255. VPMULHW Y2, Y7, Y13
  2256. VPMULLW Y2, Y8, Y7
  2257. VPMULHW Y2, Y8, Y14
  2258. VPMULLW Y2, Y9, Y8
  2259. VPMULHW Y2, Y9, Y15
  2260. // reduce: r = high - mulhi(low * qinv, q); results in Y3-Y8
  2261. VPMULLW Y0, Y3, Y3
  2262. VPMULLW Y0, Y4, Y4
  2263. VPMULLW Y0, Y5, Y5
  2264. VPMULLW Y0, Y6, Y6
  2265. VPMULLW Y0, Y7, Y7
  2266. VPMULLW Y0, Y8, Y8
  2267. VPMULHW Y1, Y3, Y3
  2268. VPMULHW Y1, Y4, Y4
  2269. VPMULHW Y1, Y5, Y5
  2270. VPMULHW Y1, Y6, Y6
  2271. VPMULHW Y1, Y7, Y7
  2272. VPMULHW Y1, Y8, Y8
  2273. VPSUBW Y3, Y10, Y3
  2274. VPSUBW Y4, Y11, Y4
  2275. VPSUBW Y5, Y12, Y5
  2276. VPSUBW Y6, Y13, Y6
  2277. VPSUBW Y7, Y14, Y7
  2278. VPSUBW Y8, Y15, Y8
  2279. // load b: two vectors each from b[0] (Y9, Y10), b[1] (Y11, Y12), b[2] (Y13, Y14)
  2280. VMOVDQU (DX)(BX*1), Y9
  2281. VMOVDQU 32(DX)(BX*1), Y10
  2282. VMOVDQU (R11)(BX*1), Y11
  2283. VMOVDQU 32(R11)(BX*1), Y12
  2284. VMOVDQU (R12)(BX*1), Y13
  2285. VMOVDQU 32(R12)(BX*1), Y14
  2286. // mul: b * reduced a; low halves go to Y15, Y3-Y7 (each a register is
  // overwritten only after its own high-half multiply has read it),
  // high halves stay in Y9-Y14
  2287. VPMULLW Y3, Y9, Y15
  2288. VPMULHW Y3, Y9, Y9
  2289. VPMULLW Y4, Y10, Y3
  2290. VPMULHW Y4, Y10, Y10
  2291. VPMULLW Y5, Y11, Y4
  2292. VPMULHW Y5, Y11, Y11
  2293. VPMULLW Y6, Y12, Y5
  2294. VPMULHW Y6, Y12, Y12
  2295. VPMULLW Y7, Y13, Y6
  2296. VPMULHW Y7, Y13, Y13
  2297. VPMULLW Y8, Y14, Y7
  2298. VPMULHW Y8, Y14, Y14
  2299. // reduce: same step as above; a[0]*b[0] terms end in Y15, Y3,
  // a[1]*b[1] terms in Y4, Y5 and a[2]*b[2] terms in Y6, Y7
  2300. VPMULLW Y0, Y15, Y15
  2301. VPMULLW Y0, Y3, Y3
  2302. VPMULLW Y0, Y4, Y4
  2303. VPMULLW Y0, Y5, Y5
  2304. VPMULLW Y0, Y6, Y6
  2305. VPMULLW Y0, Y7, Y7
  2306. VPMULHW Y1, Y15, Y15
  2307. VPMULHW Y1, Y3, Y3
  2308. VPMULHW Y1, Y4, Y4
  2309. VPMULHW Y1, Y5, Y5
  2310. VPMULHW Y1, Y6, Y6
  2311. VPMULHW Y1, Y7, Y7
  2312. VPSUBW Y15, Y9, Y15
  2313. VPSUBW Y3, Y10, Y3
  2314. VPSUBW Y4, Y11, Y4
  2315. VPSUBW Y5, Y12, Y5
  2316. VPSUBW Y6, Y13, Y6
  2317. VPSUBW Y7, Y14, Y7
  2318. // add: chain the three terms; Y4, Y5 hold the first two summed, then
  // Y6, Y7 receive the total of all three
  2319. VPADDW Y15, Y4, Y4
  2320. VPADDW Y3, Y5, Y5
  2321. VPADDW Y4, Y6, Y6
  2322. VPADDW Y5, Y7, Y7
  2323. // reduce 2: x = x - (mulhi(x, v) >> 11) * q, arithmetic shift. v_x16 is
  // reloaded on every pass because Y3 is used as scratch above.
  2324. VMOVDQU ·v_x16<>(SB), Y3
  2325. VPMULHW Y3, Y6, Y8
  2326. VPMULHW Y3, Y7, Y9
  2327. VPSRAW $11, Y8, Y8
  2328. VPSRAW $11, Y9, Y9
  2329. VPMULLW Y1, Y8, Y8
  2330. VPMULLW Y1, Y9, Y9
  2331. VPSUBW Y8, Y6, Y6
  2332. VPSUBW Y9, Y7, Y7
  2333. // store: 64 bytes to dst at the same offset the inputs were read from
  2334. VMOVDQU Y6, (DI)(BX*1)
  2335. VMOVDQU Y7, 32(DI)(BX*1)
  2336. ADDQ $1, AX
  2337. ADDQ $64, BX
  2338. CMPQ AX, $8
  2339. JB looptop3 // repeat while AX < 8 (unsigned)
  2340. VZEROUPPER // clear the upper YMM halves before returning
  2341. RET
  2342. // func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
        //
        // Writes 256 16-bit words to dst.  a and b are each read as an array of
        // four pointers.  For every word index i and for k = 0..3 the routine
        // computes reduce(reduce(a[k][i] * montsq) * b[k][i]), sums the four
        // results with wrapping 16-bit adds, applies the "reduce 2" step, and
        // stores the result at dst[i].  Each loop iteration consumes 32 bytes
        // (one YMM register) from every input; the loop runs 16 times.
        //
        // Register use: DI = dst, SI/R8/R9/R10 = a[0..3], DX/R11/R12/R13 = b[0..3],
        // AX = iteration counter, BX = byte offset into every buffer,
        // Y0 = qinv_x16, Y1 = q_x16, Y2 = montsq_x16, Y3 = v_x16.
        //
        // NOTE(review): the "reduce" sequences have the shape of a signed
        // Montgomery reduction and "reduce 2" the shape of a Barrett reduction;
        // confirm against the definitions of the *_x16 constants.
  2343. TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
  2344. MOVQ dst+0(FP), DI
  2345. MOVQ a+8(FP), SI
  2346. MOVQ b+16(FP), DX
  2347. VMOVDQU ·qinv_x16<>(SB), Y0
  2348. VMOVDQU ·q_x16<>(SB), Y1
  2349. VMOVDQU ·montsq_x16<>(SB), Y2
  2350. VMOVDQU ·v_x16<>(SB), Y3 // loaded once: Y3 is not written inside the loop
  2351. XORQ AX, AX // AX = iteration counter
  2352. XORQ BX, BX // BX = byte offset shared by dst, a[k] and b[k]
  2353. MOVQ 24(SI), R10 // a[3]
  2354. MOVQ 16(SI), R9 // a[2]
  2355. MOVQ 8(SI), R8 // a[1]
  2356. MOVQ (SI), SI // a[0] (SI is overwritten last, after a[1..3] are read through it)
  2357. MOVQ 24(DX), R13 // b[3]
  2358. MOVQ 16(DX), R12 // b[2]
  2359. MOVQ 8(DX), R11 // b[1]
  2360. MOVQ (DX), DX // b[0] (DX is overwritten last, after b[1..3] are read through it)
  2361. looptop4:
  2362. // load a: 32 bytes from each of a[0..3] into Y6..Y9
  2363. VMOVDQU (SI)(BX*1), Y6
  2364. VMOVDQU (R8)(BX*1), Y7
  2365. VMOVDQU (R9)(BX*1), Y8
  2366. VMOVDQU (R10)(BX*1), Y9
  2367. // mul montsq: low product halves go to Y5..Y8, high halves to Y10..Y13
  2368. VPMULLW Y2, Y6, Y5
  2369. VPMULHW Y2, Y6, Y10
  2370. VPMULLW Y2, Y7, Y6
  2371. VPMULHW Y2, Y7, Y11
  2372. VPMULLW Y2, Y8, Y7
  2373. VPMULHW Y2, Y8, Y12
  2374. VPMULLW Y2, Y9, Y8
  2375. VPMULHW Y2, Y9, Y13
  2376. // reduce: Yn = high - mulhi(low * qinv, q), results left in Y5..Y8
  2377. VPMULLW Y0, Y5, Y5
  2378. VPMULLW Y0, Y6, Y6
  2379. VPMULLW Y0, Y7, Y7
  2380. VPMULLW Y0, Y8, Y8
  2381. VPMULHW Y1, Y5, Y5
  2382. VPMULHW Y1, Y6, Y6
  2383. VPMULHW Y1, Y7, Y7
  2384. VPMULHW Y1, Y8, Y8
  2385. VPSUBW Y5, Y10, Y5
  2386. VPSUBW Y6, Y11, Y6
  2387. VPSUBW Y7, Y12, Y7
  2388. VPSUBW Y8, Y13, Y8
  2389. // load b: 32 bytes from each of b[0..3] into Y9..Y12
  2390. VMOVDQU (DX)(BX*1), Y9
  2391. VMOVDQU (R11)(BX*1), Y10
  2392. VMOVDQU (R12)(BX*1), Y11
  2393. VMOVDQU (R13)(BX*1), Y12
  2394. // mul: low halves rotate into Y4..Y7; high halves stay in Y9..Y12
  2395. VPMULLW Y5, Y9, Y4
  2396. VPMULHW Y5, Y9, Y9
  2397. VPMULLW Y6, Y10, Y5
  2398. VPMULHW Y6, Y10, Y10
  2399. VPMULLW Y7, Y11, Y6
  2400. VPMULHW Y7, Y11, Y11
  2401. VPMULLW Y8, Y12, Y7
  2402. VPMULHW Y8, Y12, Y12
  2403. // reduce: same sequence as the first reduce, results in Y4..Y7
  2404. VPMULLW Y0, Y4, Y4
  2405. VPMULLW Y0, Y5, Y5
  2406. VPMULLW Y0, Y6, Y6
  2407. VPMULLW Y0, Y7, Y7
  2408. VPMULHW Y1, Y4, Y4
  2409. VPMULHW Y1, Y5, Y5
  2410. VPMULHW Y1, Y6, Y6
  2411. VPMULHW Y1, Y7, Y7
  2412. VPSUBW Y4, Y9, Y4
  2413. VPSUBW Y5, Y10, Y5
  2414. VPSUBW Y6, Y11, Y6
  2415. VPSUBW Y7, Y12, Y7
  2416. // add: chain the four products so that Y7 ends up holding their sum
  2417. VPADDW Y4, Y5, Y5
  2418. VPADDW Y5, Y6, Y6
  2419. VPADDW Y6, Y7, Y7
  2420. // reduce 2: Y8 = Y7 - (mulhi(Y7, v) >> 11) * q
  2421. VPMULHW Y3, Y7, Y8
  2422. VPSRAW $11, Y8, Y8
  2423. VPMULLW Y1, Y8, Y8
  2424. VPSUBW Y8, Y7, Y8
  2425. // store: 32 bytes of results to dst at the current offset
  2426. VMOVDQU Y8, (DI)(BX*1)
  2427. ADDQ $1, AX
  2428. ADDQ $32, BX // advance by the 32 bytes handled this iteration
  2429. CMPQ AX, $16 // 16 iterations * 32 bytes = 512 bytes per buffer
  2430. JB looptop4
  2431. VZEROUPPER
  2432. RET