hwaccel_amd64.s 53 KB


  1. // +build !noasm,go1.10
  2. // hwaccel_amd64.s - AMD64 optimized routines.
  3. //
  4. // To the extent possible under law, Yawning Angel has waived all copyright
  5. // and related or neighboring rights to the software, using the Creative
  6. // Commons "CC0" public domain dedication. See LICENSE or
  7. // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
  8. #include "textflag.h"
  // func cpuidAmd64(cpuidParams *uint32)
  //
  // Runs CPUID using the four consecutive uint32 slots at cpuidParams as an
  // in/out buffer:
  //   in:  slot 0 -> EAX, slot 2 -> ECX
  //   out: EAX, EBX, ECX, EDX -> slots 0, 1, 2, 3
  TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
      MOVQ cpuidParams+0(FP), R8 // R8 = buffer; CPUID only overwrites AX/BX/CX/DX
      MOVL 8(R8), CX
      MOVL (R8), AX
      CPUID
      MOVL AX, (R8)
      MOVL BX, 4(R8)
      MOVL CX, 8(R8)
      MOVL DX, 12(R8)
      RET
  // func xgetbv0Amd64(xcrVec *uint32)
  //
  // Executes XGETBV with ECX = 0 and writes the result to the two consecutive
  // uint32 slots at xcrVec: EAX -> slot 0, EDX -> slot 1.
  TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
      XORL CX, CX                // ECX = 0 selects the register XGETBV reads
      MOVQ xcrVec+0(FP), DI      // DI = output buffer (untouched by XGETBV)
      XGETBV
      MOVL AX, (DI)
      MOVL DX, 4(DI)
      RET
  28. // Routines taken from the `avx2` implementation, converted to Go's assembly
  29. // dialect. I do this in lieu of cutting myself to see if I still can feel
  30. // pain.
  31. //
  32. // The conversion is mostly direct except:
  33. // * Instead of aligned loads, unaligned loads are used, as there is no
  34. // meaningful difference on modern Intel systems, and it's not immediately
  35. // obvious to me how Go will align global data.
  36. // * The polyvec_pointwise_acc family of routines take vectors of pointers
  37. // due to the different internal memory layout of a polyvec.
  38. // * The constants are renamed slightly.
  39. // Note:
  40. // * These must be kept in sync with the values in params.go.
  41. // Currently assumes Q = 7681, Q_INV = 57857.
  42. // * Caution, Little endian so things will look different from avx2/consts.c.
  // VPSHUFB control vector. Byte order in memory is 2,3,0,1, 6,7,4,5, ... so
  // the two 16-bit words of every 32-bit group trade places. The 16 byte
  // pattern is repeated for the upper 128-bit lane.
  DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302
  DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
  DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
  DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
  GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32

  // hwaccel_DATA_X4 emits a 32 byte, pointer-free, read-only symbol `sym` whose
  // contents are the 64-bit pattern `qword` repeated four times.
  #define hwaccel_DATA_X4(sym, qword) \
      DATA sym+0x00(SB)/8, $qword; \
      DATA sym+0x08(SB)/8, $qword; \
      DATA sym+0x10(SB)/8, $qword; \
      DATA sym+0x18(SB)/8, $qword; \
      GLOBL sym(SB), (NOPTR+RODATA), $32

  hwaccel_DATA_X4(·low_mask<>, 0x1fff1fff1fff1fff)   // 16 x 0x1fff (2^13 - 1)
  hwaccel_DATA_X4(·lowdword<>, 0x0000ffff0000ffff)   // 8 x 0x0000ffff (low word of each dword)
  hwaccel_DATA_X4(·q_x16<>, 0x1e011e011e011e01)      // 16 x 0x1e01 = 7681 (Q)
  hwaccel_DATA_X4(·q2_x16<>, 0x3c023c023c023c02)     // 16 x 0x3c02 = 15362 (2 * Q)
  hwaccel_DATA_X4(·qinv_x16<>, 0xe201e201e201e201)   // 16 x 0xe201 = 57857 (Q_INV)
  hwaccel_DATA_X4(·f_x16<>, 0x0100010001000100)      // 16 x 0x0100 = 256
  hwaccel_DATA_X4(·v_x16<>, 0x4442444244424442)      // 16 x 0x4442 = 17474
  hwaccel_DATA_X4(·montsq_x16<>, 0x15c115c115c115c1) // 16 x 0x15c1 = 5569
  hwaccel_DATA_X4(·mask11<>, 0x1111111111111111)     // 32 x 0x11
  hwaccel_DATA_X4(·mask0f<>, 0x0f0f0f0f0f0f0f0f)     // 32 x 0x0f
  // func nttAVX2(inout, zetas *uint16)
  //
  // In-place transform of the 512 bytes at inout (sixteen 32-byte vectors, i.e.
  // 256 uint16 values). Multipliers are read from zetas at byte offsets 0..1503.
  // All memory accesses are unaligned (VMOVDQU). Requires AVX2.
  //
  // Fixed register roles for the whole routine:
  //   DI = inout cursor, SI = zetas
  //   Y0 = qinv_x16, Y1 = q_x16, Y2 = low_mask
  //   Y3..Y15 = working vectors / scratch, reassigned level by level
  //
  // The body is written with the helper macros below. Each macro expands to
  // the listed instructions in the listed order; none of them touches memory
  // except the LOAD/STORE helpers.

  // lo = low 16 bits of x*z, x = high 16 bits of the signed product x*z.
  #define ntt_MUL(z, x, lo) \
      VPMULLW z, x, lo; \
      VPMULHW z, x, x

  // Finishes four products started by ntt_MUL (Montgomery-style reduction):
  //   lN = hN - mulhi(q, lN * qinv)
  #define ntt_REDUCE4(l0, h0, l1, h1, l2, h2, l3, h3) \
      VPMULLW Y0, l0, l0; \
      VPMULLW Y0, l1, l1; \
      VPMULLW Y0, l2, l2; \
      VPMULLW Y0, l3, l3; \
      VPMULHW Y1, l0, l0; \
      VPMULHW Y1, l1, l1; \
      VPMULHW Y1, l2, l2; \
      VPMULHW Y1, l3, l3; \
      VPSUBW l0, h0, l0; \
      VPSUBW l1, h1, l1; \
      VPSUBW l2, h2, l2; \
      VPSUBW l3, h3, l3

  // Four butterflies: dN = aN - tN (all four first), then sN = aN + tN.
  #define ntt_BUTTERFLY4(t0, a0, d0, s0, t1, a1, d1, s1, t2, a2, d2, s2, t3, a3, d3, s3) \
      VPSUBW t0, a0, d0; \
      VPSUBW t1, a1, d1; \
      VPSUBW t2, a2, d2; \
      VPSUBW t3, a3, d3; \
      VPADDW t0, a0, s0; \
      VPADDW t1, a1, s1; \
      VPADDW t2, a2, s2; \
      VPADDW t3, a3, s3

  // "reduce 2": xN = (xN & 0x1fff) - (xN >> 13) + ((xN >> 13) << 9), using an
  // arithmetic shift. This keeps xN mod 7681 since 2^13 = 7681 + 2^9 - 1.
  // tN is scratch.
  #define ntt_REDUCE2(x0, t0, x1, t1, x2, t2, x3, t3) \
      VPSRAW $13, x0, t0; \
      VPSRAW $13, x1, t1; \
      VPSRAW $13, x2, t2; \
      VPSRAW $13, x3, t3; \
      VPAND Y2, x0, x0; \
      VPAND Y2, x1, x1; \
      VPAND Y2, x2, x2; \
      VPAND Y2, x3, x3; \
      VPSUBW t0, x0, x0; \
      VPSUBW t1, x1, x1; \
      VPSUBW t2, x2, x2; \
      VPSUBW t3, x3, x3; \
      VPSLLW $9, t0, t0; \
      VPSLLW $9, t1, t1; \
      VPSLLW $9, t2, t2; \
      VPSLLW $9, t3, t3; \
      VPADDW t0, x0, x0; \
      VPADDW t1, x1, x1; \
      VPADDW t2, x2, x2; \
      VPADDW t3, x3, x3

  // "reduce 3": xN = xN + q + ((xN >> 15) & Y15). The caller loads q2_x16 into
  // Y15 first, so the last term is 2q for negative lanes and 0 otherwise.
  // tN is scratch.
  #define ntt_REDUCE3(x0, t0, x1, t1, x2, t2, x3, t3) \
      VPSRAW $15, x0, t0; \
      VPSRAW $15, x1, t1; \
      VPSRAW $15, x2, t2; \
      VPSRAW $15, x3, t3; \
      VPAND Y15, t0, t0; \
      VPAND Y15, t1, t1; \
      VPAND Y15, t2, t2; \
      VPAND Y15, t3, t3; \
      VPADDW Y1, x0, x0; \
      VPADDW Y1, x1, x1; \
      VPADDW Y1, x2, x2; \
      VPADDW Y1, x3, x3; \
      VPADDW t0, x0, x0; \
      VPADDW t1, x1, x1; \
      VPADDW t2, x2, x2; \
      VPADDW t3, x3, x3

  // Level 4 regrouping of one vector pair at 128-bit granularity.
  #define ntt_SHUF128(a, b, lo, hi) \
      VPERM2I128 $0x02, a, b, lo; \
      VPERM2I128 $0x13, a, b, hi

  // Level 5 regrouping of one vector pair at 64-bit granularity.
  #define ntt_SHUF64(a, b, lo, hi) \
      VSHUFPD $0x00, b, a, lo; \
      VSHUFPD $0x0F, b, a, hi

  // Level 6 regrouping of two vector pairs (a,b) and (c,d) at 32-bit
  // granularity. Clobbers Y12..Y15.
  #define ntt_SHUF32(a, b, c, d) \
      VPSHUFD $0xB1, a, Y12; \
      VPSHUFD $0xB1, b, Y13; \
      VPSHUFD $0xB1, c, Y14; \
      VPSHUFD $0xB1, d, Y15; \
      VPBLENDD $0x55, a, Y13, a; \
      VPBLENDD $0xAA, b, Y12, b; \
      VPBLENDD $0x55, c, Y15, c; \
      VPBLENDD $0xAA, d, Y14, d

  // Level 7 regrouping of two vector pairs (a,b) and (c,d) at 16-bit
  // granularity. Expects vpshufb_idx in Y15; clobbers Y11..Y14.
  #define ntt_SHUF16(a, b, c, d) \
      VPSHUFB Y15, a, Y11; \
      VPSHUFB Y15, b, Y12; \
      VPSHUFB Y15, c, Y13; \
      VPSHUFB Y15, d, Y14; \
      VPBLENDW $0x55, a, Y12, a; \
      VPBLENDW $0xAA, b, Y11, b; \
      VPBLENDW $0x55, c, Y14, c; \
      VPBLENDW $0xAA, d, Y13, d

  // Final reorder, step 1: interleave the 16-bit words of a and b.
  #define ntt_UNPACK16(a, b, lo, hi) \
      VPUNPCKLWD b, a, lo; \
      VPUNPCKHWD b, a, hi

  // Final reorder, step 2: d0 = low 128-bit halves of (lo, hi),
  // d1 = high 128-bit halves of (lo, hi).
  #define ntt_JOIN128(lo, hi, d0, d1) \
      VPERM2I128 $0x20, hi, lo, d0; \
      VPERM2I128 $0x31, hi, lo, d1

  // Y4..Y7 <- bytes 0..127 at DI, Y8..Y11 <- bytes 256..383 at DI.
  #define ntt_LOAD_SPLIT \
      VMOVDQU (DI), Y4; \
      VMOVDQU 32(DI), Y5; \
      VMOVDQU 64(DI), Y6; \
      VMOVDQU 96(DI), Y7; \
      VMOVDQU 256(DI), Y8; \
      VMOVDQU 288(DI), Y9; \
      VMOVDQU 320(DI), Y10; \
      VMOVDQU 352(DI), Y11

  // Inverse of ntt_LOAD_SPLIT.
  #define ntt_STORE_SPLIT \
      VMOVDQU Y4, (DI); \
      VMOVDQU Y5, 32(DI); \
      VMOVDQU Y6, 64(DI); \
      VMOVDQU Y7, 96(DI); \
      VMOVDQU Y8, 256(DI); \
      VMOVDQU Y9, 288(DI); \
      VMOVDQU Y10, 320(DI); \
      VMOVDQU Y11, 352(DI)

  // Y4..Y11 <- bytes 0..255 at DI.
  #define ntt_LOAD_CONTIG \
      VMOVDQU (DI), Y4; \
      VMOVDQU 32(DI), Y5; \
      VMOVDQU 64(DI), Y6; \
      VMOVDQU 96(DI), Y7; \
      VMOVDQU 128(DI), Y8; \
      VMOVDQU 160(DI), Y9; \
      VMOVDQU 192(DI), Y10; \
      VMOVDQU 224(DI), Y11

  // Bytes 0..255 at DI <- Y11, Y12, Y13, Y14, Y15, Y3, Y4, Y5 (the register
  // order left behind by the final reorder).
  #define ntt_STORE_CONTIG \
      VMOVDQU Y11, (DI); \
      VMOVDQU Y12, 32(DI); \
      VMOVDQU Y13, 64(DI); \
      VMOVDQU Y14, 96(DI); \
      VMOVDQU Y15, 128(DI); \
      VMOVDQU Y3, 160(DI); \
      VMOVDQU Y4, 192(DI); \
      VMOVDQU Y5, 224(DI)

  TEXT ·nttAVX2(SB), NOSPLIT, $0-16
      MOVQ inout+0(FP), DI
      MOVQ zetas+8(FP), SI

      VMOVDQU ·qinv_x16<>(SB), Y0
      VMOVDQU ·q_x16<>(SB), Y1
      VMOVDQU ·low_mask<>(SB), Y2

      // ---- level 0: vector i is paired with vector i+8 (256 bytes apart) ----
      // One multiplier vector (zetas bytes 0..31) serves both halves.
      VMOVDQU (SI), Y3

      // level 0, vectors 0..3 with 8..11
      ntt_LOAD_SPLIT
      ntt_MUL(Y3, Y8, Y12)
      ntt_MUL(Y3, Y9, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y8, Y13, Y9, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y8, Y4, Y13, Y5, Y9, Y5, Y14, Y6, Y10, Y6, Y15, Y7, Y11, Y7)
      ntt_STORE_SPLIT

      ADDQ $128, DI

      // level 0, vectors 4..7 with 12..15
      ntt_LOAD_SPLIT
      ntt_MUL(Y3, Y8, Y12)
      ntt_MUL(Y3, Y9, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y8, Y13, Y9, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y8, Y4, Y13, Y5, Y9, Y5, Y14, Y6, Y10, Y6, Y15, Y7, Y11, Y7)
      ntt_STORE_SPLIT

      SUBQ $128, DI

      // ---- levels 1..7 on the first 256 bytes ----

      // level 1 (zetas +32)
      VMOVDQU 32(SI), Y3
      ntt_LOAD_CONTIG
      ntt_MUL(Y3, Y8, Y12)
      ntt_MUL(Y3, Y9, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y8, Y13, Y9, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y8, Y4, Y13, Y5, Y9, Y5, Y14, Y6, Y10, Y6, Y15, Y7, Y11, Y7)

      // level 2 (zetas +96, +128)
      VMOVDQU 96(SI), Y15
      VMOVDQU 128(SI), Y3
      ntt_MUL(Y15, Y6, Y12)
      ntt_MUL(Y15, Y7, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y6, Y13, Y7, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y6, Y4, Y13, Y5, Y7, Y5, Y14, Y8, Y10, Y8, Y15, Y9, Y11, Y9)

      // level 3 (zetas +224 .. +320)
      VMOVDQU 224(SI), Y13
      VMOVDQU 256(SI), Y14
      VMOVDQU 288(SI), Y15
      VMOVDQU 320(SI), Y3
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y5, Y13, Y7, Y14, Y9, Y15, Y11)
      ntt_REDUCE2(Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8, Y15, Y10, Y11, Y10)

      // level 4 (zetas +480 .. +576)
      VMOVDQU 480(SI), Y12
      VMOVDQU 512(SI), Y13
      VMOVDQU 544(SI), Y14
      VMOVDQU 576(SI), Y15
      ntt_SHUF128(Y4, Y5, Y3, Y4)
      ntt_SHUF128(Y6, Y7, Y5, Y6)
      ntt_SHUF128(Y8, Y9, Y7, Y8)
      ntt_SHUF128(Y10, Y11, Y9, Y10)
      ntt_MUL(Y12, Y4, Y11)
      ntt_MUL(Y13, Y6, Y12)
      ntt_MUL(Y14, Y8, Y13)
      ntt_MUL(Y15, Y10, Y14)
      ntt_REDUCE4(Y11, Y4, Y12, Y6, Y13, Y8, Y14, Y10)
      ntt_BUTTERFLY4(Y11, Y3, Y4, Y3, Y12, Y5, Y6, Y5, Y13, Y7, Y8, Y7, Y14, Y9, Y10, Y9)

      // level 5 (zetas +736 .. +832)
      VMOVDQU 736(SI), Y12
      VMOVDQU 768(SI), Y13
      VMOVDQU 800(SI), Y14
      VMOVDQU 832(SI), Y15
      ntt_SHUF64(Y3, Y4, Y11, Y3)
      ntt_SHUF64(Y5, Y6, Y4, Y5)
      ntt_SHUF64(Y7, Y8, Y6, Y7)
      ntt_SHUF64(Y9, Y10, Y8, Y9)
      ntt_MUL(Y12, Y3, Y10)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y10, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      ntt_BUTTERFLY4(Y10, Y11, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // level 6 (zetas +992 .. +1088)
      ntt_SHUF32(Y10, Y3, Y4, Y5)
      ntt_SHUF32(Y6, Y7, Y8, Y9)
      VMOVDQU 992(SI), Y12
      VMOVDQU 1024(SI), Y13
      VMOVDQU 1056(SI), Y14
      VMOVDQU 1088(SI), Y15
      ntt_MUL(Y12, Y3, Y11)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y11, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      ntt_REDUCE2(Y10, Y3, Y4, Y5, Y6, Y7, Y8, Y9)
      ntt_BUTTERFLY4(Y11, Y10, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // level 7 (zetas +1248 .. +1344)
      VMOVDQU ·vpshufb_idx<>(SB), Y15
      ntt_SHUF16(Y10, Y3, Y4, Y5)
      ntt_SHUF16(Y6, Y7, Y8, Y9)
      VMOVDQU 1248(SI), Y12
      VMOVDQU 1280(SI), Y13
      VMOVDQU 1312(SI), Y14
      VMOVDQU 1344(SI), Y15
      ntt_MUL(Y12, Y3, Y11)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y11, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      VMOVDQU ·q2_x16<>(SB), Y15
      ntt_REDUCE3(Y10, Y3, Y4, Y5, Y6, Y7, Y8, Y9)
      ntt_BUTTERFLY4(Y11, Y10, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // reorder and write back
      ntt_UNPACK16(Y10, Y3, Y12, Y13)
      ntt_UNPACK16(Y4, Y5, Y14, Y15)
      ntt_UNPACK16(Y6, Y7, Y3, Y4)
      ntt_UNPACK16(Y8, Y9, Y5, Y6)
      ntt_JOIN128(Y12, Y13, Y11, Y12)
      ntt_JOIN128(Y14, Y15, Y13, Y14)
      ntt_JOIN128(Y3, Y4, Y15, Y3)
      ntt_JOIN128(Y5, Y6, Y4, Y5)
      ntt_STORE_CONTIG

      ADDQ $256, DI

      // ---- levels 1..7 on the second 256 bytes ----
      // Same instruction stream as above; only the zetas offsets differ.

      // level 1 (zetas +64)
      VMOVDQU 64(SI), Y3
      ntt_LOAD_CONTIG
      ntt_MUL(Y3, Y8, Y12)
      ntt_MUL(Y3, Y9, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y8, Y13, Y9, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y8, Y4, Y13, Y5, Y9, Y5, Y14, Y6, Y10, Y6, Y15, Y7, Y11, Y7)

      // level 2 (zetas +160, +192)
      VMOVDQU 160(SI), Y15
      VMOVDQU 192(SI), Y3
      ntt_MUL(Y15, Y6, Y12)
      ntt_MUL(Y15, Y7, Y13)
      ntt_MUL(Y3, Y10, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y6, Y13, Y7, Y14, Y10, Y15, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y6, Y4, Y13, Y5, Y7, Y5, Y14, Y8, Y10, Y8, Y15, Y9, Y11, Y9)

      // level 3 (zetas +352 .. +448)
      VMOVDQU 352(SI), Y13
      VMOVDQU 384(SI), Y14
      VMOVDQU 416(SI), Y15
      VMOVDQU 448(SI), Y3
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_MUL(Y3, Y11, Y15)
      ntt_REDUCE4(Y12, Y5, Y13, Y7, Y14, Y9, Y15, Y11)
      ntt_REDUCE2(Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11)
      ntt_BUTTERFLY4(Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8, Y15, Y10, Y11, Y10)

      // level 4 (zetas +608 .. +704)
      VMOVDQU 608(SI), Y12
      VMOVDQU 640(SI), Y13
      VMOVDQU 672(SI), Y14
      VMOVDQU 704(SI), Y15
      ntt_SHUF128(Y4, Y5, Y3, Y4)
      ntt_SHUF128(Y6, Y7, Y5, Y6)
      ntt_SHUF128(Y8, Y9, Y7, Y8)
      ntt_SHUF128(Y10, Y11, Y9, Y10)
      ntt_MUL(Y12, Y4, Y11)
      ntt_MUL(Y13, Y6, Y12)
      ntt_MUL(Y14, Y8, Y13)
      ntt_MUL(Y15, Y10, Y14)
      ntt_REDUCE4(Y11, Y4, Y12, Y6, Y13, Y8, Y14, Y10)
      ntt_BUTTERFLY4(Y11, Y3, Y4, Y3, Y12, Y5, Y6, Y5, Y13, Y7, Y8, Y7, Y14, Y9, Y10, Y9)

      // level 5 (zetas +864 .. +960)
      VMOVDQU 864(SI), Y12
      VMOVDQU 896(SI), Y13
      VMOVDQU 928(SI), Y14
      VMOVDQU 960(SI), Y15
      ntt_SHUF64(Y3, Y4, Y11, Y3)
      ntt_SHUF64(Y5, Y6, Y4, Y5)
      ntt_SHUF64(Y7, Y8, Y6, Y7)
      ntt_SHUF64(Y9, Y10, Y8, Y9)
      ntt_MUL(Y12, Y3, Y10)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y10, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      ntt_BUTTERFLY4(Y10, Y11, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // level 6 (zetas +1120 .. +1216)
      ntt_SHUF32(Y10, Y3, Y4, Y5)
      ntt_SHUF32(Y6, Y7, Y8, Y9)
      VMOVDQU 1120(SI), Y12
      VMOVDQU 1152(SI), Y13
      VMOVDQU 1184(SI), Y14
      VMOVDQU 1216(SI), Y15
      ntt_MUL(Y12, Y3, Y11)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y11, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      ntt_REDUCE2(Y10, Y3, Y4, Y5, Y6, Y7, Y8, Y9)
      ntt_BUTTERFLY4(Y11, Y10, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // level 7 (zetas +1376 .. +1472)
      VMOVDQU ·vpshufb_idx<>(SB), Y15
      ntt_SHUF16(Y10, Y3, Y4, Y5)
      ntt_SHUF16(Y6, Y7, Y8, Y9)
      VMOVDQU 1376(SI), Y12
      VMOVDQU 1408(SI), Y13
      VMOVDQU 1440(SI), Y14
      VMOVDQU 1472(SI), Y15
      ntt_MUL(Y12, Y3, Y11)
      ntt_MUL(Y13, Y5, Y12)
      ntt_MUL(Y14, Y7, Y13)
      ntt_MUL(Y15, Y9, Y14)
      ntt_REDUCE4(Y11, Y3, Y12, Y5, Y13, Y7, Y14, Y9)
      VMOVDQU ·q2_x16<>(SB), Y15
      ntt_REDUCE3(Y10, Y3, Y4, Y5, Y6, Y7, Y8, Y9)
      ntt_BUTTERFLY4(Y11, Y10, Y3, Y10, Y12, Y4, Y5, Y4, Y13, Y6, Y7, Y6, Y14, Y8, Y9, Y8)

      // reorder and write back
      ntt_UNPACK16(Y10, Y3, Y12, Y13)
      ntt_UNPACK16(Y4, Y5, Y14, Y15)
      ntt_UNPACK16(Y6, Y7, Y3, Y4)
      ntt_UNPACK16(Y8, Y9, Y5, Y6)
      ntt_JOIN128(Y12, Y13, Y11, Y12)
      ntt_JOIN128(Y14, Y15, Y13, Y14)
      ntt_JOIN128(Y3, Y4, Y15, Y3)
      ntt_JOIN128(Y5, Y6, Y4, Y5)
      ntt_STORE_CONTIG

      // Clear the upper YMM halves before handing control back to the caller.
      VZEROUPPER
      RET
  1020. // Go 1.10's VPERMQ support expects the imm8 to be a `int8`, instead of a
  1021. // `uint8`. While this is fixed in master, use the signed representation
  1022. // for now till it's reasonable to expect versions with the fix to be widely
  1023. // available.
  1024. //
  1025. // See: https://github.com/golang/go/issues/24378
  1026. #define invntt_VPERMQ_IDX $-40 // $0xd8
  1027. // func invnttAVX2(inout, omegas *uint16)
  1028. TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
  1029. MOVQ inout+0(FP), DI
  1030. MOVQ omegas+8(FP), SI
  1031. VMOVDQU ·qinv_x16<>(SB), Y0
  1032. VMOVDQU ·q_x16<>(SB), Y1
  1033. VMOVDQU ·v_x16<>(SB), Y2
  1034. MOVQ SI, R8
  1035. // first round
  1036. // load
  1037. VMOVDQU (DI), Y4
  1038. VMOVDQU 32(DI), Y5
  1039. VMOVDQU 64(DI), Y6
  1040. VMOVDQU 96(DI), Y7
  1041. VMOVDQU 128(DI), Y8
  1042. VMOVDQU 160(DI), Y9
  1043. VMOVDQU 192(DI), Y10
  1044. VMOVDQU 224(DI), Y11
  1045. // reorder
  1046. VMOVDQU ·lowdword<>(SB), Y3
  1047. VPAND Y3, Y4, Y12
  1048. VPAND Y3, Y5, Y13
  1049. VPAND Y3, Y6, Y14
  1050. VPAND Y3, Y7, Y15
  1051. VPSRLD $16, Y4, Y4
  1052. VPSRLD $16, Y5, Y5
  1053. VPSRLD $16, Y6, Y6
  1054. VPSRLD $16, Y7, Y7
  1055. VPACKUSDW Y5, Y4, Y5
  1056. VPACKUSDW Y13, Y12, Y4
  1057. VPACKUSDW Y7, Y6, Y7
  1058. VPACKUSDW Y15, Y14, Y6
  1059. VPERMQ invntt_VPERMQ_IDX, Y4, Y4
  1060. VPERMQ invntt_VPERMQ_IDX, Y5, Y5
  1061. VPERMQ invntt_VPERMQ_IDX, Y6, Y6
  1062. VPERMQ invntt_VPERMQ_IDX, Y7, Y7
  1063. VPAND Y3, Y8, Y12
  1064. VPAND Y3, Y9, Y13
  1065. VPAND Y3, Y10, Y14
  1066. VPAND Y3, Y11, Y15
  1067. VPSRLD $16, Y8, Y8
  1068. VPSRLD $16, Y9, Y9
  1069. VPSRLD $16, Y10, Y10
  1070. VPSRLD $16, Y11, Y11
  1071. VPACKUSDW Y9, Y8, Y9
  1072. VPACKUSDW Y13, Y12, Y8
  1073. VPACKUSDW Y11, Y10, Y11
  1074. VPACKUSDW Y15, Y14, Y10
  1075. VPERMQ invntt_VPERMQ_IDX, Y8, Y8
  1076. VPERMQ invntt_VPERMQ_IDX, Y9, Y9
  1077. VPERMQ invntt_VPERMQ_IDX, Y10, Y10
  1078. VPERMQ invntt_VPERMQ_IDX, Y11, Y11
  1079. // level 0
  1080. // update
  1081. VPSUBW Y5, Y4, Y12
  1082. VPSUBW Y7, Y6, Y13
  1083. VPSUBW Y9, Y8, Y14
  1084. VPSUBW Y11, Y10, Y15
  1085. VPADDW Y4, Y5, Y4
  1086. VPADDW Y6, Y7, Y6
  1087. VPADDW Y8, Y9, Y8
  1088. VPADDW Y10, Y11, Y10
  1089. // zetas
  1090. VMOVDQU (R8), Y7
  1091. VMOVDQU 32(R8), Y9
  1092. VMOVDQU 64(R8), Y11
  1093. VMOVDQU 96(R8), Y3
  1094. // mul
  1095. VPMULLW Y7, Y12, Y5
  1096. VPMULHW Y7, Y12, Y12
  1097. VPMULLW Y9, Y13, Y7
  1098. VPMULHW Y9, Y13, Y13
  1099. VPMULLW Y11, Y14, Y9
  1100. VPMULHW Y11, Y14, Y14
  1101. VPMULLW Y3, Y15, Y11
  1102. VPMULHW Y3, Y15, Y15
  1103. // reduce
  1104. VPMULLW Y0, Y5, Y5
  1105. VPMULLW Y0, Y7, Y7
  1106. VPMULLW Y0, Y9, Y9
  1107. VPMULLW Y0, Y11, Y11
  1108. VPMULHW Y1, Y5, Y5
  1109. VPMULHW Y1, Y7, Y7
  1110. VPMULHW Y1, Y9, Y9
  1111. VPMULHW Y1, Y11, Y11
  1112. VPSUBW Y5, Y12, Y5
  1113. VPSUBW Y7, Y13, Y7
  1114. VPSUBW Y9, Y14, Y9
  1115. VPSUBW Y11, Y15, Y11
  1116. // level 1
  1117. // shuffle
  1118. VMOVDQU ·vpshufb_idx<>(SB), Y3
  1119. VPSHUFB Y3, Y4, Y12
  1120. VPSHUFB Y3, Y5, Y13
  1121. VPSHUFB Y3, Y6, Y14
  1122. VPSHUFB Y3, Y7, Y15
  1123. VPBLENDW $0x55, Y4, Y13, Y4
  1124. VPBLENDW $0xAA, Y5, Y12, Y5
  1125. VPBLENDW $0x55, Y6, Y15, Y6
  1126. VPBLENDW $0xAA, Y7, Y14, Y7
  1127. VPSHUFB Y3, Y8, Y12
  1128. VPSHUFB Y3, Y9, Y13
  1129. VPSHUFB Y3, Y10, Y14
  1130. VPSHUFB Y3, Y11, Y15
  1131. VPBLENDW $0x55, Y8, Y13, Y8
  1132. VPBLENDW $0xAA, Y9, Y12, Y9
  1133. VPBLENDW $0x55, Y10, Y15, Y10
  1134. VPBLENDW $0xAA, Y11, Y14, Y11
  1135. // update
  1136. VPSUBW Y5, Y4, Y12
  1137. VPSUBW Y7, Y6, Y13
  1138. VPSUBW Y9, Y8, Y14
  1139. VPSUBW Y11, Y10, Y15
  1140. VPADDW Y4, Y5, Y4
  1141. VPADDW Y6, Y7, Y6
  1142. VPADDW Y8, Y9, Y8
  1143. VPADDW Y10, Y11, Y10
  1144. // zetas
  1145. VMOVDQU 256(R8), Y7
  1146. VMOVDQU 288(R8), Y9
  1147. VMOVDQU 320(R8), Y11
  1148. VMOVDQU 352(R8), Y3
  1149. // mul
  1150. VPMULLW Y7, Y12, Y5
  1151. VPMULHW Y7, Y12, Y12
  1152. VPMULLW Y9, Y13, Y7
  1153. VPMULHW Y9, Y13, Y13
  1154. VPMULLW Y11, Y14, Y9
  1155. VPMULHW Y11, Y14, Y14
  1156. VPMULLW Y3, Y15, Y11
  1157. VPMULHW Y3, Y15, Y15
  1158. // reduce
  1159. VPMULLW Y0, Y5, Y5
  1160. VPMULLW Y0, Y7, Y7
  1161. VPMULLW Y0, Y9, Y9
  1162. VPMULLW Y0, Y11, Y11
  1163. VPMULHW Y1, Y5, Y5
  1164. VPMULHW Y1, Y7, Y7
  1165. VPMULHW Y1, Y9, Y9
  1166. VPMULHW Y1, Y11, Y11
  1167. VPSUBW Y5, Y12, Y5
  1168. VPSUBW Y7, Y13, Y7
  1169. VPSUBW Y9, Y14, Y9
  1170. VPSUBW Y11, Y15, Y11
  1171. // reduce 2
  1172. VPMULHW Y2, Y4, Y12
  1173. VPMULHW Y2, Y6, Y13
  1174. VPMULHW Y2, Y8, Y14
  1175. VPMULHW Y2, Y10, Y15
  1176. VPSRAW $11, Y12, Y12
  1177. VPSRAW $11, Y13, Y13
  1178. VPSRAW $11, Y14, Y14
  1179. VPSRAW $11, Y15, Y15
  1180. VPMULLW Y1, Y12, Y12
  1181. VPMULLW Y1, Y13, Y13
  1182. VPMULLW Y1, Y14, Y14
  1183. VPMULLW Y1, Y15, Y15
  1184. VPSUBW Y12, Y4, Y4
  1185. VPSUBW Y13, Y6, Y6
  1186. VPSUBW Y14, Y8, Y8
  1187. VPSUBW Y15, Y10, Y10
  1188. // level 2
  1189. // shuffle
  1190. VPSHUFD $0xB1, Y4, Y12
  1191. VPSHUFD $0xB1, Y5, Y13
  1192. VPSHUFD $0xB1, Y6, Y14
  1193. VPSHUFD $0xB1, Y7, Y15
  1194. VPBLENDD $0x55, Y4, Y13, Y4
  1195. VPBLENDD $0xAA, Y5, Y12, Y5
  1196. VPBLENDD $0x55, Y6, Y15, Y6
  1197. VPBLENDD $0xAA, Y7, Y14, Y7
  1198. VPSHUFD $0xB1, Y8, Y12
  1199. VPSHUFD $0xB1, Y9, Y13
  1200. VPSHUFD $0xB1, Y10, Y14
  1201. VPSHUFD $0xB1, Y11, Y15
  1202. VPBLENDD $0x55, Y8, Y13, Y8
  1203. VPBLENDD $0xAA, Y9, Y12, Y9
  1204. VPBLENDD $0x55, Y10, Y15, Y10
  1205. VPBLENDD $0xAA, Y11, Y14, Y11
  1206. // update
  1207. VPSUBW Y5, Y4, Y12
  1208. VPSUBW Y7, Y6, Y13
  1209. VPSUBW Y9, Y8, Y14
  1210. VPSUBW Y11, Y10, Y15
  1211. VPADDW Y4, Y5, Y4
  1212. VPADDW Y6, Y7, Y6
  1213. VPADDW Y8, Y9, Y8
  1214. VPADDW Y10, Y11, Y10
  1215. // zetas
  1216. VMOVDQU 512(R8), Y7
  1217. VMOVDQU 544(R8), Y9
  1218. VMOVDQU 576(R8), Y11
  1219. VMOVDQU 608(R8), Y3
  1220. // mul
  1221. VPMULLW Y7, Y12, Y5
  1222. VPMULHW Y7, Y12, Y12
  1223. VPMULLW Y9, Y13, Y7
  1224. VPMULHW Y9, Y13, Y13
  1225. VPMULLW Y11, Y14, Y9
  1226. VPMULHW Y11, Y14, Y14
  1227. VPMULLW Y3, Y15, Y11
  1228. VPMULHW Y3, Y15, Y15
  1229. // reduce
  1230. VPMULLW Y0, Y5, Y5
  1231. VPMULLW Y0, Y7, Y7
  1232. VPMULLW Y0, Y9, Y9
  1233. VPMULLW Y0, Y11, Y11
  1234. VPMULHW Y1, Y5, Y5
  1235. VPMULHW Y1, Y7, Y7
  1236. VPMULHW Y1, Y9, Y9
  1237. VPMULHW Y1, Y11, Y11
  1238. VPSUBW Y5, Y12, Y5
  1239. VPSUBW Y7, Y13, Y7
  1240. VPSUBW Y9, Y14, Y9
  1241. VPSUBW Y11, Y15, Y11
  1242. // level 3
  1243. // shuffle
  1244. VSHUFPD $0x00, Y5, Y4, Y3
  1245. VSHUFPD $0x0F, Y5, Y4, Y4
  1246. VSHUFPD $0x00, Y7, Y6, Y5
  1247. VSHUFPD $0x0F, Y7, Y6, Y6
  1248. VSHUFPD $0x00, Y9, Y8, Y7
  1249. VSHUFPD $0x0F, Y9, Y8, Y8
  1250. VSHUFPD $0x00, Y11, Y10, Y9
  1251. VSHUFPD $0x0F, Y11, Y10, Y10
  1252. // update
  1253. VPSUBW Y4, Y3, Y12
  1254. VPSUBW Y6, Y5, Y13
  1255. VPSUBW Y8, Y7, Y14
  1256. VPSUBW Y10, Y9, Y15
  1257. VPADDW Y3, Y4, Y3
  1258. VPADDW Y5, Y6, Y5
  1259. VPADDW Y7, Y8, Y7
  1260. VPADDW Y9, Y10, Y9
  1261. // zetas
  1262. VMOVDQU 768(R8), Y6
  1263. VMOVDQU 800(R8), Y8
  1264. VMOVDQU 832(R8), Y10
  1265. VMOVDQU 864(R8), Y11
  1266. // mul
  1267. VPMULLW Y6, Y12, Y4
  1268. VPMULHW Y6, Y12, Y12
  1269. VPMULLW Y8, Y13, Y6
  1270. VPMULHW Y8, Y13, Y13
  1271. VPMULLW Y10, Y14, Y8
  1272. VPMULHW Y10, Y14, Y14
  1273. VPMULLW Y11, Y15, Y10
  1274. VPMULHW Y11, Y15, Y15
  1275. // reduce
  1276. VPMULLW Y0, Y4, Y4
  1277. VPMULLW Y0, Y6, Y6
  1278. VPMULLW Y0, Y8, Y8
  1279. VPMULLW Y0, Y10, Y10
  1280. VPMULHW Y1, Y4, Y4
  1281. VPMULHW Y1, Y6, Y6
  1282. VPMULHW Y1, Y8, Y8
  1283. VPMULHW Y1, Y10, Y10
  1284. VPSUBW Y4, Y12, Y4
  1285. VPSUBW Y6, Y13, Y6
  1286. VPSUBW Y8, Y14, Y8
  1287. VPSUBW Y10, Y15, Y10
  1288. // reduce 2
  1289. VPMULHW Y2, Y3, Y12
  1290. VPMULHW Y2, Y5, Y13
  1291. VPMULHW Y2, Y7, Y14
  1292. VPMULHW Y2, Y9, Y15
  1293. VPSRAW $11, Y12, Y12
  1294. VPSRAW $11, Y13, Y13
  1295. VPSRAW $11, Y14, Y14
  1296. VPSRAW $11, Y15, Y15
  1297. VPMULLW Y1, Y12, Y12
  1298. VPMULLW Y1, Y13, Y13
  1299. VPMULLW Y1, Y14, Y14
  1300. VPMULLW Y1, Y15, Y15
  1301. VPSUBW Y12, Y3, Y3
  1302. VPSUBW Y13, Y5, Y5
  1303. VPSUBW Y14, Y7, Y7
  1304. VPSUBW Y15, Y9, Y9
  1305. // level 4
  1306. // shuffle
  1307. VPERM2I128 $0x02, Y3, Y4, Y11
  1308. VPERM2I128 $0x13, Y3, Y4, Y3
  1309. VPERM2I128 $0x02, Y5, Y6, Y4
  1310. VPERM2I128 $0x13, Y5, Y6, Y5
  1311. VPERM2I128 $0x02, Y7, Y8, Y6
  1312. VPERM2I128 $0x13, Y7, Y8, Y7
  1313. VPERM2I128 $0x02, Y9, Y10, Y8
  1314. VPERM2I128 $0x13, Y9, Y10, Y9
  1315. // update
  1316. VMOVDQA Y11, Y12
  1317. VMOVDQA Y4, Y13
  1318. VMOVDQA Y6, Y14
  1319. VMOVDQA Y8, Y15
  1320. VPADDW Y11, Y3, Y10
  1321. VPADDW Y4, Y5, Y4
  1322. VPADDW Y6, Y7, Y6
  1323. VPADDW Y8, Y9, Y8
  1324. VPSUBW Y3, Y12, Y3
  1325. VPSUBW Y5, Y13, Y5
  1326. VPSUBW Y7, Y14, Y7
  1327. VPSUBW Y9, Y15, Y9
  1328. // zetas
  1329. VMOVDQU 1024(R8), Y12
  1330. VMOVDQU 1056(R8), Y13
  1331. VMOVDQU 1088(R8), Y14
  1332. VMOVDQU 1120(R8), Y15
  1333. // mul
  1334. VPMULLW Y12, Y3, Y11
  1335. VPMULHW Y12, Y3, Y3
  1336. VPMULLW Y13, Y5, Y12
  1337. VPMULHW Y13, Y5, Y5
  1338. VPMULLW Y14, Y7, Y13
  1339. VPMULHW Y14, Y7, Y7
  1340. VPMULLW Y15, Y9, Y14
  1341. VPMULHW Y15, Y9, Y9
  1342. // reduce
  1343. VPMULLW Y0, Y11, Y11
  1344. VPMULLW Y0, Y12, Y12
  1345. VPMULLW Y0, Y13, Y13
  1346. VPMULLW Y0, Y14, Y14
  1347. VPMULHW Y1, Y11, Y11
  1348. VPMULHW Y1, Y12, Y12
  1349. VPMULHW Y1, Y13, Y13
  1350. VPMULHW Y1, Y14, Y14
  1351. VPSUBW Y11, Y3, Y3
  1352. VPSUBW Y12, Y5, Y5
  1353. VPSUBW Y13, Y7, Y7
  1354. VPSUBW Y14, Y9, Y9
  1355. // level 5
  1356. // update
  1357. VMOVDQA Y10, Y12
  1358. VMOVDQA Y3, Y13
  1359. VMOVDQA Y6, Y14
  1360. VMOVDQA Y7, Y15
  1361. VPADDW Y10, Y4, Y10
  1362. VPADDW Y3, Y5, Y3
  1363. VPADDW Y6, Y8, Y6
  1364. VPADDW Y7, Y9, Y7
  1365. VPSUBW Y4, Y12, Y4
  1366. VPSUBW Y5, Y13, Y5
  1367. VPSUBW Y8, Y14, Y8
  1368. VPSUBW Y9, Y15, Y9
  1369. // zetas
  1370. VMOVDQU 1280(SI), Y14
  1371. VMOVDQU 1312(SI), Y15
  1372. // mul
  1373. VPMULLW Y14, Y4, Y11
  1374. VPMULLW Y14, Y5, Y12
  1375. VPMULLW Y15, Y8, Y13
  1376. VPMULHW Y14, Y4, Y4
  1377. VPMULHW Y14, Y5, Y5
  1378. VPMULHW Y15, Y8, Y8
  1379. VPMULLW Y15, Y9, Y14
  1380. VPMULHW Y15, Y9, Y9
  1381. // reduce
  1382. VPMULLW Y0, Y11, Y11
  1383. VPMULLW Y0, Y12, Y12
  1384. VPMULLW Y0, Y13, Y13
  1385. VPMULLW Y0, Y14, Y14
  1386. VPMULHW Y1, Y11, Y11
  1387. VPMULHW Y1, Y12, Y12
  1388. VPMULHW Y1, Y13, Y13
  1389. VPMULHW Y1, Y14, Y14
  1390. VPSUBW Y11, Y4, Y4
  1391. VPSUBW Y12, Y5, Y5
  1392. VPSUBW Y13, Y8, Y8
  1393. VPSUBW Y14, Y9, Y9
  1394. // reduce 2
  1395. VPMULHW Y2, Y10, Y12
  1396. VPMULHW Y2, Y6, Y13
  1397. VPSRAW $11, Y12, Y12
  1398. VPSRAW $11, Y13, Y13
  1399. VPMULLW Y1, Y12, Y12
  1400. VPMULLW Y1, Y13, Y13
  1401. VPSUBW Y12, Y10, Y10
  1402. VPSUBW Y13, Y6, Y6
  1403. // level 6
  1404. // update
  1405. VMOVDQA Y10, Y12
  1406. VMOVDQA Y3, Y13
  1407. VMOVDQA Y4, Y14
  1408. VMOVDQA Y5, Y15
  1409. VPADDW Y10, Y6, Y10
  1410. VPADDW Y3, Y7, Y3
  1411. VPADDW Y4, Y8, Y4
  1412. VPADDW Y5, Y9, Y5
  1413. VPSUBW Y6, Y12, Y6
  1414. VPSUBW Y7, Y13, Y7
  1415. VPSUBW Y8, Y14, Y8
  1416. VPSUBW Y9, Y15, Y9
  1417. // zetas
  1418. VMOVDQU 1408(SI), Y15
  1419. // mul
  1420. VPMULLW Y15, Y6, Y11
  1421. VPMULLW Y15, Y7, Y12
  1422. VPMULLW Y15, Y8, Y13
  1423. VPMULLW Y15, Y9, Y14
  1424. VPMULHW Y15, Y6, Y6
  1425. VPMULHW Y15, Y7, Y7
  1426. VPMULHW Y15, Y8, Y8
  1427. VPMULHW Y15, Y9, Y9
  1428. // reduce
  1429. VPMULLW Y0, Y11, Y11
  1430. VPMULLW Y0, Y12, Y12
  1431. VPMULLW Y0, Y13, Y13
  1432. VPMULLW Y0, Y14, Y14
  1433. VPMULHW Y1, Y11, Y11
  1434. VPMULHW Y1, Y12, Y12
  1435. VPMULHW Y1, Y13, Y13
  1436. VPMULHW Y1, Y14, Y14
  1437. VPSUBW Y11, Y6, Y6
  1438. VPSUBW Y12, Y7, Y7
  1439. VPSUBW Y13, Y8, Y8
  1440. VPSUBW Y14, Y9, Y9
  1441. // reduce 2
  1442. VPMULHW Y2, Y3, Y12
  1443. VPSRAW $11, Y12, Y12
  1444. VPMULLW Y1, Y12, Y12
  1445. VPSUBW Y12, Y3, Y3
  1446. // store
  1447. VMOVDQU Y10, (DI)
  1448. VMOVDQU Y3, 32(DI)
  1449. VMOVDQU Y4, 64(DI)
  1450. VMOVDQU Y5, 96(DI)
  1451. VMOVDQU Y6, 128(DI)
  1452. VMOVDQU Y7, 160(DI)
  1453. VMOVDQU Y8, 192(DI)
  1454. VMOVDQU Y9, 224(DI)
  1455. ADDQ $256, DI
  1456. ADDQ $128, R8
  1457. // second round
  1458. // load
  1459. VMOVDQU (DI), Y4
  1460. VMOVDQU 32(DI), Y5
  1461. VMOVDQU 64(DI), Y6
  1462. VMOVDQU 96(DI), Y7
  1463. VMOVDQU 128(DI), Y8
  1464. VMOVDQU 160(DI), Y9
  1465. VMOVDQU 192(DI), Y10
  1466. VMOVDQU 224(DI), Y11
  1467. // reorder
  1468. VMOVDQU ·lowdword<>(SB), Y3
  1469. VPAND Y3, Y4, Y12
  1470. VPAND Y3, Y5, Y13
  1471. VPAND Y3, Y6, Y14
  1472. VPAND Y3, Y7, Y15
  1473. VPSRLD $16, Y4, Y4
  1474. VPSRLD $16, Y5, Y5
  1475. VPSRLD $16, Y6, Y6
  1476. VPSRLD $16, Y7, Y7
  1477. VPACKUSDW Y5, Y4, Y5
  1478. VPACKUSDW Y13, Y12, Y4
  1479. VPACKUSDW Y7, Y6, Y7
  1480. VPACKUSDW Y15, Y14, Y6
  1481. VPERMQ invntt_VPERMQ_IDX, Y4, Y4
  1482. VPERMQ invntt_VPERMQ_IDX, Y5, Y5
  1483. VPERMQ invntt_VPERMQ_IDX, Y6, Y6
  1484. VPERMQ invntt_VPERMQ_IDX, Y7, Y7
  1485. VPAND Y3, Y8, Y12
  1486. VPAND Y3, Y9, Y13
  1487. VPAND Y3, Y10, Y14
  1488. VPAND Y3, Y11, Y15
  1489. VPSRLD $16, Y8, Y8
  1490. VPSRLD $16, Y9, Y9
  1491. VPSRLD $16, Y10, Y10
  1492. VPSRLD $16, Y11, Y11
  1493. VPACKUSDW Y9, Y8, Y9
  1494. VPACKUSDW Y13, Y12, Y8
  1495. VPACKUSDW Y11, Y10, Y11
  1496. VPACKUSDW Y15, Y14, Y10
  1497. VPERMQ invntt_VPERMQ_IDX, Y8, Y8
  1498. VPERMQ invntt_VPERMQ_IDX, Y9, Y9
  1499. VPERMQ invntt_VPERMQ_IDX, Y10, Y10
  1500. VPERMQ invntt_VPERMQ_IDX, Y11, Y11
  1501. // level 0
  1502. // update
  1503. VMOVDQA Y4, Y12
  1504. VMOVDQA Y6, Y13
  1505. VMOVDQA Y8, Y14
  1506. VMOVDQA Y10, Y15
  1507. VPADDW Y4, Y5, Y4
  1508. VPADDW Y6, Y7, Y6
  1509. VPADDW Y8, Y9, Y8
  1510. VPADDW Y10, Y11, Y10
  1511. VPSUBW Y5, Y12, Y5
  1512. VPSUBW Y7, Y13, Y7
  1513. VPSUBW Y9, Y14, Y9
  1514. VPSUBW Y11, Y15, Y11
  1515. // zetas
  1516. VMOVDQU (R8), Y13
  1517. VMOVDQU 32(R8), Y14
  1518. VMOVDQU 64(R8), Y15
  1519. VMOVDQU 96(R8), Y3
  1520. // mul
  1521. VPMULLW Y13, Y5, Y12
  1522. VPMULHW Y13, Y5, Y5
  1523. VPMULLW Y14, Y7, Y13
  1524. VPMULHW Y14, Y7, Y7
  1525. VPMULLW Y15, Y9, Y14
  1526. VPMULHW Y15, Y9, Y9
  1527. VPMULLW Y3, Y11, Y15
  1528. VPMULHW Y3, Y11, Y11
  1529. // reduce
  1530. VPMULLW Y0, Y12, Y12
  1531. VPMULLW Y0, Y13, Y13
  1532. VPMULLW Y0, Y14, Y14
  1533. VPMULLW Y0, Y15, Y15
  1534. VPMULHW Y1, Y12, Y12
  1535. VPMULHW Y1, Y13, Y13
  1536. VPMULHW Y1, Y14, Y14
  1537. VPMULHW Y1, Y15, Y15
  1538. VPSUBW Y12, Y5, Y5
  1539. VPSUBW Y13, Y7, Y7
  1540. VPSUBW Y14, Y9, Y9
  1541. VPSUBW Y15, Y11, Y11
  1542. // level 1
  1543. // shuffle
  1544. VMOVDQU ·vpshufb_idx<>(SB), Y3
  1545. VPSHUFB Y3, Y4, Y12
  1546. VPSHUFB Y3, Y5, Y13
  1547. VPSHUFB Y3, Y6, Y14
  1548. VPSHUFB Y3, Y7, Y15
  1549. VPBLENDW $0x55, Y4, Y13, Y4
  1550. VPBLENDW $0xAA, Y5, Y12, Y5
  1551. VPBLENDW $0x55, Y6, Y15, Y6
  1552. VPBLENDW $0xAA, Y7, Y14, Y7
  1553. VPSHUFB Y3, Y8, Y12
  1554. VPSHUFB Y3, Y9, Y13
  1555. VPSHUFB Y3, Y10, Y14
  1556. VPSHUFB Y3, Y11, Y15
  1557. VPBLENDW $0x55, Y8, Y13, Y8
  1558. VPBLENDW $0xAA, Y9, Y12, Y9
  1559. VPBLENDW $0x55, Y10, Y15, Y10
  1560. VPBLENDW $0xAA, Y11, Y14, Y11
  1561. // update
  1562. VMOVDQA Y4, Y12
  1563. VMOVDQA Y6, Y13
  1564. VMOVDQA Y8, Y14
  1565. VMOVDQA Y10, Y15
  1566. VPADDW Y4, Y5, Y4
  1567. VPADDW Y6, Y7, Y6
  1568. VPADDW Y8, Y9, Y8
  1569. VPADDW Y10, Y11, Y10
  1570. VPSUBW Y5, Y12, Y5
  1571. VPSUBW Y7, Y13, Y7
  1572. VPSUBW Y9, Y14, Y9
  1573. VPSUBW Y11, Y15, Y11
  1574. // zetas
  1575. VMOVDQU 256(R8), Y13
  1576. VMOVDQU 288(R8), Y14
  1577. VMOVDQU 320(R8), Y15
  1578. VMOVDQU 352(R8), Y3
  1579. // mul
  1580. VPMULLW Y13, Y5, Y12
  1581. VPMULHW Y13, Y5, Y5
  1582. VPMULLW Y14, Y7, Y13
  1583. VPMULHW Y14, Y7, Y7
  1584. VPMULLW Y15, Y9, Y14
  1585. VPMULHW Y15, Y9, Y9
  1586. VPMULLW Y3, Y11, Y15
  1587. VPMULHW Y3, Y11, Y11
  1588. // reduce
  1589. VPMULLW Y0, Y12, Y12
  1590. VPMULLW Y0, Y13, Y13
  1591. VPMULLW Y0, Y14, Y14
  1592. VPMULLW Y0, Y15, Y15
  1593. VPMULHW Y1, Y12, Y12
  1594. VPMULHW Y1, Y13, Y13
  1595. VPMULHW Y1, Y14, Y14
  1596. VPMULHW Y1, Y15, Y15
  1597. VPSUBW Y12, Y5, Y5
  1598. VPSUBW Y13, Y7, Y7
  1599. VPSUBW Y14, Y9, Y9
  1600. VPSUBW Y15, Y11, Y11
  1601. // reduce 2
  1602. VPMULHW Y2, Y4, Y12
  1603. VPMULHW Y2, Y6, Y13
  1604. VPMULHW Y2, Y8, Y14
  1605. VPMULHW Y2, Y10, Y15
  1606. VPSRAW $11, Y12, Y12
  1607. VPSRAW $11, Y13, Y13
  1608. VPSRAW $11, Y14, Y14
  1609. VPSRAW $11, Y15, Y15
  1610. VPMULLW Y1, Y12, Y12
  1611. VPMULLW Y1, Y13, Y13
  1612. VPMULLW Y1, Y14, Y14
  1613. VPMULLW Y1, Y15, Y15
  1614. VPSUBW Y12, Y4, Y4
  1615. VPSUBW Y13, Y6, Y6
  1616. VPSUBW Y14, Y8, Y8
  1617. VPSUBW Y15, Y10, Y10
  1618. // level 2
  1619. // shuffle
  1620. VPSHUFD $0xB1, Y4, Y12
  1621. VPSHUFD $0xB1, Y5, Y13
  1622. VPSHUFD $0xB1, Y6, Y14
  1623. VPSHUFD $0xB1, Y7, Y15
  1624. VPBLENDD $0x55, Y4, Y13, Y4
  1625. VPBLENDD $0xAA, Y5, Y12, Y5
  1626. VPBLENDD $0x55, Y6, Y15, Y6
  1627. VPBLENDD $0xAA, Y7, Y14, Y7
  1628. VPSHUFD $0xB1, Y8, Y12
  1629. VPSHUFD $0xB1, Y9, Y13
  1630. VPSHUFD $0xB1, Y10, Y14
  1631. VPSHUFD $0xB1, Y11, Y15
  1632. VPBLENDD $0x55, Y8, Y13, Y8
  1633. VPBLENDD $0xAA, Y9, Y12, Y9
  1634. VPBLENDD $0x55, Y10, Y15, Y10
  1635. VPBLENDD $0xAA, Y11, Y14, Y11
  1636. // update
  1637. VMOVDQA Y4, Y12
  1638. VMOVDQA Y6, Y13
  1639. VMOVDQA Y8, Y14
  1640. VMOVDQA Y10, Y15
  1641. VPADDW Y4, Y5, Y4
  1642. VPADDW Y6, Y7, Y6
  1643. VPADDW Y8, Y9, Y8
  1644. VPADDW Y10, Y11, Y10
  1645. VPSUBW Y5, Y12, Y5
  1646. VPSUBW Y7, Y13, Y7
  1647. VPSUBW Y9, Y14, Y9
  1648. VPSUBW Y11, Y15, Y11
  1649. // zetas
  1650. VMOVDQU 512(R8), Y13
  1651. VMOVDQU 544(R8), Y14
  1652. VMOVDQU 576(R8), Y15
  1653. VMOVDQU 608(R8), Y3
  1654. // mul
  1655. VPMULLW Y13, Y5, Y12
  1656. VPMULHW Y13, Y5, Y5
  1657. VPMULLW Y14, Y7, Y13
  1658. VPMULHW Y14, Y7, Y7
  1659. VPMULLW Y15, Y9, Y14
  1660. VPMULHW Y15, Y9, Y9
  1661. VPMULLW Y3, Y11, Y15
  1662. VPMULHW Y3, Y11, Y11
  1663. // reduce
  1664. VPMULLW Y0, Y12, Y12
  1665. VPMULLW Y0, Y13, Y13
  1666. VPMULLW Y0, Y14, Y14
  1667. VPMULLW Y0, Y15, Y15
  1668. VPMULHW Y1, Y12, Y12
  1669. VPMULHW Y1, Y13, Y13
  1670. VPMULHW Y1, Y14, Y14
  1671. VPMULHW Y1, Y15, Y15
  1672. VPSUBW Y12, Y5, Y5
  1673. VPSUBW Y13, Y7, Y7
  1674. VPSUBW Y14, Y9, Y9
  1675. VPSUBW Y15, Y11, Y11
  1676. // level 3
  1677. // shuffle
  1678. VSHUFPD $0x00, Y5, Y4, Y3
  1679. VSHUFPD $0x0F, Y5, Y4, Y4
  1680. VSHUFPD $0x00, Y7, Y6, Y5
  1681. VSHUFPD $0x0F, Y7, Y6, Y6
  1682. VSHUFPD $0x00, Y9, Y8, Y7
  1683. VSHUFPD $0x0F, Y9, Y8, Y8
  1684. VSHUFPD $0x00, Y11, Y10, Y9
  1685. VSHUFPD $0x0F, Y11, Y10, Y10
  1686. // update
  1687. VMOVDQA Y3, Y12
  1688. VMOVDQA Y5, Y13
  1689. VMOVDQA Y7, Y14
  1690. VMOVDQA Y9, Y15
  1691. VPADDW Y3, Y4, Y3
  1692. VPADDW Y5, Y6, Y5
  1693. VPADDW Y7, Y8, Y7
  1694. VPADDW Y9, Y10, Y9
  1695. VPSUBW Y4, Y12, Y4
  1696. VPSUBW Y6, Y13, Y6
  1697. VPSUBW Y8, Y14, Y8
  1698. VPSUBW Y10, Y15, Y10
  1699. // zetas
  1700. VMOVDQU 768(R8), Y12
  1701. VMOVDQU 800(R8), Y13
  1702. VMOVDQU 832(R8), Y14
  1703. VMOVDQU 864(R8), Y15
  1704. // mul
  1705. VPMULLW Y12, Y4, Y11
  1706. VPMULHW Y12, Y4, Y4
  1707. VPMULLW Y13, Y6, Y12
  1708. VPMULHW Y13, Y6, Y6
  1709. VPMULLW Y14, Y8, Y13
  1710. VPMULHW Y14, Y8, Y8
  1711. VPMULLW Y15, Y10, Y14
  1712. VPMULHW Y15, Y10, Y10
  1713. // reduce
  1714. VPMULLW Y0, Y11, Y11
  1715. VPMULLW Y0, Y12, Y12
  1716. VPMULLW Y0, Y13, Y13
  1717. VPMULLW Y0, Y14, Y14
  1718. VPMULHW Y1, Y11, Y11
  1719. VPMULHW Y1, Y12, Y12
  1720. VPMULHW Y1, Y13, Y13
  1721. VPMULHW Y1, Y14, Y14
  1722. VPSUBW Y11, Y4, Y4
  1723. VPSUBW Y12, Y6, Y6
  1724. VPSUBW Y13, Y8, Y8
  1725. VPSUBW Y14, Y10, Y10
  1726. // reduce 2
  1727. VPMULHW Y2, Y3, Y12
  1728. VPMULHW Y2, Y5, Y13
  1729. VPMULHW Y2, Y7, Y14
  1730. VPMULHW Y2, Y9, Y15
  1731. VPSRAW $11, Y12, Y12
  1732. VPSRAW $11, Y13, Y13
  1733. VPSRAW $11, Y14, Y14
  1734. VPSRAW $11, Y15, Y15
  1735. VPMULLW Y1, Y12, Y12
  1736. VPMULLW Y1, Y13, Y13
  1737. VPMULLW Y1, Y14, Y14
  1738. VPMULLW Y1, Y15, Y15
  1739. VPSUBW Y12, Y3, Y3
  1740. VPSUBW Y13, Y5, Y5
  1741. VPSUBW Y14, Y7, Y7
  1742. VPSUBW Y15, Y9, Y9
  1743. // level 4
  1744. // shuffle
  1745. VPERM2I128 $0x02, Y3, Y4, Y11
  1746. VPERM2I128 $0x13, Y3, Y4, Y3
  1747. VPERM2I128 $0x02, Y5, Y6, Y4
  1748. VPERM2I128 $0x13, Y5, Y6, Y5
  1749. VPERM2I128 $0x02, Y7, Y8, Y6
  1750. VPERM2I128 $0x13, Y7, Y8, Y7
  1751. VPERM2I128 $0x02, Y9, Y10, Y8
  1752. VPERM2I128 $0x13, Y9, Y10, Y9
  1753. // update
  1754. VMOVDQA Y11, Y12
  1755. VMOVDQA Y4, Y13
  1756. VMOVDQA Y6, Y14
  1757. VMOVDQA Y8, Y15
  1758. VPADDW Y11, Y3, Y10
  1759. VPADDW Y4, Y5, Y4
  1760. VPADDW Y6, Y7, Y6
  1761. VPADDW Y8, Y9, Y8
  1762. VPSUBW Y3, Y12, Y3
  1763. VPSUBW Y5, Y13, Y5
  1764. VPSUBW Y7, Y14, Y7
  1765. VPSUBW Y9, Y15, Y9
  1766. // zetas
  1767. VMOVDQU 1024(R8), Y12
  1768. VMOVDQU 1056(R8), Y13
  1769. VMOVDQU 1088(R8), Y14
  1770. VMOVDQU 1120(R8), Y15
  1771. // mul
  1772. VPMULLW Y12, Y3, Y11
  1773. VPMULHW Y12, Y3, Y3
  1774. VPMULLW Y13, Y5, Y12
  1775. VPMULHW Y13, Y5, Y5
  1776. VPMULLW Y14, Y7, Y13
  1777. VPMULHW Y14, Y7, Y7
  1778. VPMULLW Y15, Y9, Y14
  1779. VPMULHW Y15, Y9, Y9
  1780. // reduce
  1781. VPMULLW Y0, Y11, Y11
  1782. VPMULLW Y0, Y12, Y12
  1783. VPMULLW Y0, Y13, Y13
  1784. VPMULLW Y0, Y14, Y14
  1785. VPMULHW Y1, Y11, Y11
  1786. VPMULHW Y1, Y12, Y12
  1787. VPMULHW Y1, Y13, Y13
  1788. VPMULHW Y1, Y14, Y14
  1789. VPSUBW Y11, Y3, Y3
  1790. VPSUBW Y12, Y5, Y5
  1791. VPSUBW Y13, Y7, Y7
  1792. VPSUBW Y14, Y9, Y9
  1793. // level 5
  1794. // update
  1795. VMOVDQA Y10, Y12
  1796. VMOVDQA Y3, Y13
  1797. VMOVDQA Y6, Y14
  1798. VMOVDQA Y7, Y15
  1799. VPADDW Y10, Y4, Y10
  1800. VPADDW Y3, Y5, Y3
  1801. VPADDW Y6, Y8, Y6
  1802. VPADDW Y7, Y9, Y7
  1803. VPSUBW Y4, Y12, Y4
  1804. VPSUBW Y5, Y13, Y5
  1805. VPSUBW Y8, Y14, Y8
  1806. VPSUBW Y9, Y15, Y9
  1807. // zetas
  1808. VMOVDQU 1344(SI), Y14
  1809. VMOVDQU 1376(SI), Y15
  1810. // mul
  1811. VPMULLW Y14, Y4, Y11
  1812. VPMULLW Y14, Y5, Y12
  1813. VPMULLW Y15, Y8, Y13
  1814. VPMULHW Y14, Y4, Y4
  1815. VPMULHW Y14, Y5, Y5
  1816. VPMULHW Y15, Y8, Y8
  1817. VPMULLW Y15, Y9, Y14
  1818. VPMULHW Y15, Y9, Y9
  1819. // reduce
  1820. VPMULLW Y0, Y11, Y11
  1821. VPMULLW Y0, Y12, Y12
  1822. VPMULLW Y0, Y13, Y13
  1823. VPMULLW Y0, Y14, Y14
  1824. VPMULHW Y1, Y11, Y11
  1825. VPMULHW Y1, Y12, Y12
  1826. VPMULHW Y1, Y13, Y13
  1827. VPMULHW Y1, Y14, Y14
  1828. VPSUBW Y11, Y4, Y4
  1829. VPSUBW Y12, Y5, Y5
  1830. VPSUBW Y13, Y8, Y8
  1831. VPSUBW Y14, Y9, Y9
  1832. // reduce 2
  1833. VPMULHW Y2, Y10, Y12
  1834. VPMULHW Y2, Y6, Y13
  1835. VPSRAW $11, Y12, Y12
  1836. VPSRAW $11, Y13, Y13
  1837. VPMULLW Y1, Y12, Y12
  1838. VPMULLW Y1, Y13, Y13
  1839. VPSUBW Y12, Y10, Y10
  1840. VPSUBW Y13, Y6, Y6
  1841. // level 6
  1842. // update
  1843. VMOVDQA Y10, Y12
  1844. VMOVDQA Y3, Y13
  1845. VMOVDQA Y4, Y14
  1846. VMOVDQA Y5, Y15
  1847. VPADDW Y10, Y6, Y10
  1848. VPADDW Y3, Y7, Y3
  1849. VPADDW Y4, Y8, Y4
  1850. VPADDW Y5, Y9, Y5
  1851. VPSUBW Y6, Y12, Y6
  1852. VPSUBW Y7, Y13, Y7
  1853. VPSUBW Y8, Y14, Y8
  1854. VPSUBW Y9, Y15, Y9
  1855. // zetas
  1856. VMOVDQU 1440(SI), Y15
  1857. // mul
  1858. VPMULLW Y15, Y6, Y11
  1859. VPMULLW Y15, Y7, Y12
  1860. VPMULLW Y15, Y8, Y13
  1861. VPMULLW Y15, Y9, Y14
  1862. VPMULHW Y15, Y6, Y6
  1863. VPMULHW Y15, Y7, Y7
  1864. VPMULHW Y15, Y8, Y8
  1865. VPMULHW Y15, Y9, Y9
  1866. // reduce
  1867. VPMULLW Y0, Y11, Y11
  1868. VPMULLW Y0, Y12, Y12
  1869. VPMULLW Y0, Y13, Y13
  1870. VPMULLW Y0, Y14, Y14
  1871. VPMULHW Y1, Y11, Y11
  1872. VPMULHW Y1, Y12, Y12
  1873. VPMULHW Y1, Y13, Y13
  1874. VPMULHW Y1, Y14, Y14
  1875. VPSUBW Y11, Y6, Y6
  1876. VPSUBW Y12, Y7, Y7
  1877. VPSUBW Y13, Y8, Y8
  1878. VPSUBW Y14, Y9, Y9
  1879. // reduce 2
  1880. VPMULHW Y2, Y3, Y12
  1881. VPSRAW $11, Y12, Y12
  1882. VPMULLW Y1, Y12, Y12
  1883. VPSUBW Y12, Y3, Y3
  1884. // store
  1885. VMOVDQU Y10, (DI)
  1886. VMOVDQU Y3, 32(DI)
  1887. VMOVDQU Y4, 64(DI)
  1888. VMOVDQU Y5, 96(DI)
  1889. VMOVDQU Y6, 128(DI)
  1890. VMOVDQU Y7, 160(DI)
  1891. VMOVDQU Y8, 192(DI)
  1892. VMOVDQU Y9, 224(DI)
  1893. SUBQ $256, DI
  1894. // f
  1895. VMOVDQU ·f_x16<>(SB), Y2
  1896. // first round
  1897. // load
  1898. VMOVDQU (DI), Y4
  1899. VMOVDQU 32(DI), Y5
  1900. VMOVDQU 64(DI), Y6
  1901. VMOVDQU 96(DI), Y7
  1902. VMOVDQU 256(DI), Y8
  1903. VMOVDQU 288(DI), Y9
  1904. VMOVDQU 320(DI), Y10
  1905. VMOVDQU 352(DI), Y11
  1906. // level 7
  1907. // update
  1908. VMOVDQA Y4, Y12
  1909. VMOVDQA Y5, Y13
  1910. VMOVDQA Y6, Y14
  1911. VMOVDQA Y7, Y15
  1912. VPADDW Y4, Y8, Y4
  1913. VPADDW Y5, Y9, Y5
  1914. VPADDW Y6, Y10, Y6
  1915. VPADDW Y7, Y11, Y7
  1916. VPSUBW Y8, Y12, Y8
  1917. VPSUBW Y9, Y13, Y9
  1918. VPSUBW Y10, Y14, Y10
  1919. VPSUBW Y11, Y15, Y11
  1920. // zeta
  1921. VMOVDQU 1472(SI), Y3
  1922. // mul
  1923. VPMULLW Y3, Y8, Y12
  1924. VPMULLW Y3, Y9, Y13
  1925. VPMULLW Y3, Y10, Y14
  1926. VPMULLW Y3, Y11, Y15
  1927. VPMULHW Y3, Y8, Y8
  1928. VPMULHW Y3, Y9, Y9
  1929. VPMULHW Y3, Y10, Y10
  1930. VPMULHW Y3, Y11, Y11
  1931. // reduce
  1932. VPMULLW Y0, Y12, Y12
  1933. VPMULLW Y0, Y13, Y13
  1934. VPMULLW Y0, Y14, Y14
  1935. VPMULLW Y0, Y15, Y15
  1936. VPMULHW Y1, Y12, Y12
  1937. VPMULHW Y1, Y13, Y13
  1938. VPMULHW Y1, Y14, Y14
  1939. VPMULHW Y1, Y15, Y15
  1940. VPSUBW Y12, Y8, Y8
  1941. VPSUBW Y13, Y9, Y9
  1942. VPSUBW Y14, Y10, Y10
  1943. VPSUBW Y15, Y11, Y11
  1944. VPADDW Y1, Y8, Y8
  1945. VPADDW Y1, Y9, Y9
  1946. VPADDW Y1, Y10, Y10
  1947. VPADDW Y1, Y11, Y11
  1948. // mul
  1949. VPMULLW Y2, Y4, Y12
  1950. VPMULLW Y2, Y5, Y13
  1951. VPMULLW Y2, Y6, Y14
  1952. VPMULLW Y2, Y7, Y15
  1953. VPMULHW Y2, Y4, Y4
  1954. VPMULHW Y2, Y5, Y5
  1955. VPMULHW Y2, Y6, Y6
  1956. VPMULHW Y2, Y7, Y7
  1957. // reduce
  1958. VPMULLW Y0, Y12, Y12
  1959. VPMULLW Y0, Y13, Y13
  1960. VPMULLW Y0, Y14, Y14
  1961. VPMULLW Y0, Y15, Y15
  1962. VPMULHW Y1, Y12, Y12
  1963. VPMULHW Y1, Y13, Y13
  1964. VPMULHW Y1, Y14, Y14
  1965. VPMULHW Y1, Y15, Y15
  1966. VPSUBW Y12, Y4, Y4
  1967. VPSUBW Y13, Y5, Y5
  1968. VPSUBW Y14, Y6, Y6
  1969. VPSUBW Y15, Y7, Y7
  1970. VPADDW Y1, Y4, Y4
  1971. VPADDW Y1, Y5, Y5
  1972. VPADDW Y1, Y6, Y6
  1973. VPADDW Y1, Y7, Y7
  1974. // store
  1975. VMOVDQU Y4, (DI)
  1976. VMOVDQU Y5, 32(DI)
  1977. VMOVDQU Y6, 64(DI)
  1978. VMOVDQU Y7, 96(DI)
  1979. VMOVDQU Y8, 256(DI)
  1980. VMOVDQU Y9, 288(DI)
  1981. VMOVDQU Y10, 320(DI)
  1982. VMOVDQU Y11, 352(DI)
  1983. ADDQ $128, DI
  1984. // second round
  1985. // load
  1986. VMOVDQU (DI), Y4
  1987. VMOVDQU 32(DI), Y5
  1988. VMOVDQU 64(DI), Y6
  1989. VMOVDQU 96(DI), Y7
  1990. VMOVDQU 256(DI), Y8
  1991. VMOVDQU 288(DI), Y9
  1992. VMOVDQU 320(DI), Y10
  1993. VMOVDQU 352(DI), Y11
  1994. // zeta
  1995. VMOVDQU 1472(SI), Y3
  1996. // level 7
  1997. // update
  1998. VMOVDQA Y4, Y12
  1999. VMOVDQA Y5, Y13
  2000. VMOVDQA Y6, Y14
  2001. VMOVDQA Y7, Y15
  2002. VPADDW Y4, Y8, Y4
  2003. VPADDW Y5, Y9, Y5
  2004. VPADDW Y6, Y10, Y6
  2005. VPADDW Y7, Y11, Y7
  2006. VPSUBW Y8, Y12, Y8
  2007. VPSUBW Y9, Y13, Y9
  2008. VPSUBW Y10, Y14, Y10
  2009. VPSUBW Y11, Y15, Y11
  2010. // mul
  2011. VPMULLW Y3, Y8, Y12
  2012. VPMULLW Y3, Y9, Y13
  2013. VPMULLW Y3, Y10, Y14
  2014. VPMULLW Y3, Y11, Y15
  2015. VPMULHW Y3, Y8, Y8
  2016. VPMULHW Y3, Y9, Y9
  2017. VPMULHW Y3, Y10, Y10
  2018. VPMULHW Y3, Y11, Y11
  2019. // reduce
  2020. VPMULLW Y0, Y12, Y12
  2021. VPMULLW Y0, Y13, Y13
  2022. VPMULLW Y0, Y14, Y14
  2023. VPMULLW Y0, Y15, Y15
  2024. VPMULHW Y1, Y12, Y12
  2025. VPMULHW Y1, Y13, Y13
  2026. VPMULHW Y1, Y14, Y14
  2027. VPMULHW Y1, Y15, Y15
  2028. VPSUBW Y12, Y8, Y8
  2029. VPSUBW Y13, Y9, Y9
  2030. VPSUBW Y14, Y10, Y10
  2031. VPSUBW Y15, Y11, Y11
  2032. VPADDW Y1, Y8, Y8
  2033. VPADDW Y1, Y9, Y9
  2034. VPADDW Y1, Y10, Y10
  2035. VPADDW Y1, Y11, Y11
  2036. // mul
  2037. VPMULLW Y2, Y4, Y12
  2038. VPMULLW Y2, Y5, Y13
  2039. VPMULLW Y2, Y6, Y14
  2040. VPMULLW Y2, Y7, Y15
  2041. VPMULHW Y2, Y4, Y4
  2042. VPMULHW Y2, Y5, Y5
  2043. VPMULHW Y2, Y6, Y6
  2044. VPMULHW Y2, Y7, Y7
  2045. // reduce
  2046. VPMULLW Y0, Y12, Y12
  2047. VPMULLW Y0, Y13, Y13
  2048. VPMULLW Y0, Y14, Y14
  2049. VPMULLW Y0, Y15, Y15
  2050. VPMULHW Y1, Y12, Y12
  2051. VPMULHW Y1, Y13, Y13
  2052. VPMULHW Y1, Y14, Y14
  2053. VPMULHW Y1, Y15, Y15
  2054. VPSUBW Y12, Y4, Y4
  2055. VPSUBW Y13, Y5, Y5
  2056. VPSUBW Y14, Y6, Y6
  2057. VPSUBW Y15, Y7, Y7
  2058. VPADDW Y1, Y4, Y4
  2059. VPADDW Y1, Y5, Y5
  2060. VPADDW Y1, Y6, Y6
  2061. VPADDW Y1, Y7, Y7
  2062. // store
  2063. VMOVDQU Y4, (DI)
  2064. VMOVDQU Y5, 32(DI)
  2065. VMOVDQU Y6, 64(DI)
  2066. VMOVDQU Y7, 96(DI)
  2067. VMOVDQU Y8, 256(DI)
  2068. VMOVDQU Y9, 288(DI)
  2069. VMOVDQU Y10, 320(DI)
  2070. VMOVDQU Y11, 352(DI)
  2071. VZEROUPPER
  2072. RET
  2073. // func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
  //
  // Pointwise multiply-accumulate over two pairs of inputs, on 16-bit lanes:
  // for each lane, a[j] is first multiplied by the montsq constant and reduced,
  // the result is multiplied by b[j] and reduced again, the two products
  // (j = 0 and j = 1) are added, and the sum gets a final "reduce 2" pass
  // before being written to dst.
  //
  // a and b are arrays of (at least) two pointers; only elements 0 and 1 are
  // read. 512 bytes are read through each of a[0], a[1], b[0], b[1] and
  // 512 bytes are written to dst: five loop iterations of 96 bytes each
  // followed by one 32-byte tail.
  //
  // Register roles:
  //   DI = dst, SI = a[0], R8 = a[1], DX = b[0], R11 = b[1]
  //   AX = loop iteration counter, BX = running byte offset
  //   Y0 = qinv_x16, Y1 = q_x16, Y2 = montsq_x16 (held for the whole routine)
  //   Y3-Y15 = scratch; Y3 is also reloaded with v_x16 for "reduce 2"
  //
  // Operand order is Go assembler order (destination last), so
  // `VPSUBW Y3, Y10, Y3` computes Y3 = Y10 - Y3.
  //
  // NOTE(review): the constant tables (qinv_x16, q_x16, montsq_x16, v_x16) are
  // defined outside this block; presumably each is one value broadcast across
  // all 16 lanes - confirm against their DATA definitions.
  2074. TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24
  2075. MOVQ dst+0(FP), DI
  2076. MOVQ a+8(FP), SI
  2077. MOVQ b+16(FP), DX
  2078. VMOVDQU ·qinv_x16<>(SB), Y0
  2079. VMOVDQU ·q_x16<>(SB), Y1
  2080. VMOVDQU ·montsq_x16<>(SB), Y2
  2081. XORQ AX, AX // AX = 0: iteration counter
  2082. XORQ BX, BX // BX = 0: byte offset into every input and into dst
  2083. MOVQ 8(SI), R8 // R8 = a[1]; loaded before SI is overwritten below
  2084. MOVQ (SI), SI // SI = a[0]; replaces the pointer-array base
  2085. MOVQ 8(DX), R11 // R11 = b[1]; loaded before DX is overwritten below
  2086. MOVQ (DX), DX // DX = b[0]; replaces the pointer-array base
  2087. looptop2:
  2088. // load a: three 32-byte vectors each from a[0] (Y4-Y6) and a[1] (Y7-Y9)
  2089. VMOVDQU (SI)(BX*1), Y4
  2090. VMOVDQU 32(SI)(BX*1), Y5
  2091. VMOVDQU 64(SI)(BX*1), Y6
  2092. VMOVDQU (R8)(BX*1), Y7
  2093. VMOVDQU 32(R8)(BX*1), Y8
  2094. VMOVDQU 64(R8)(BX*1), Y9
  2095. // mul montsq: low 16 bits of a*montsq go to Y3-Y8, high 16 bits to Y10-Y15.
  // Each low product overwrites the input register freed by the previous pair.
  2096. VPMULLW Y2, Y4, Y3
  2097. VPMULHW Y2, Y4, Y10
  2098. VPMULLW Y2, Y5, Y4
  2099. VPMULHW Y2, Y5, Y11
  2100. VPMULLW Y2, Y6, Y5
  2101. VPMULHW Y2, Y6, Y12
  2102. VPMULLW Y2, Y7, Y6
  2103. VPMULHW Y2, Y7, Y13
  2104. VPMULLW Y2, Y8, Y7
  2105. VPMULHW Y2, Y8, Y14
  2106. VPMULLW Y2, Y9, Y8
  2107. VPMULHW Y2, Y9, Y15
  2108. // reduce (Montgomery-style): t = low * qinv (low 16 bits);
  // result = high - mulhi(t, q). Results land in Y3-Y8.
  2109. VPMULLW Y0, Y3, Y3
  2110. VPMULLW Y0, Y4, Y4
  2111. VPMULLW Y0, Y5, Y5
  2112. VPMULLW Y0, Y6, Y6
  2113. VPMULLW Y0, Y7, Y7
  2114. VPMULLW Y0, Y8, Y8
  2115. VPMULHW Y1, Y3, Y3
  2116. VPMULHW Y1, Y4, Y4
  2117. VPMULHW Y1, Y5, Y5
  2118. VPMULHW Y1, Y6, Y6
  2119. VPMULHW Y1, Y7, Y7
  2120. VPMULHW Y1, Y8, Y8
  2121. VPSUBW Y3, Y10, Y3
  2122. VPSUBW Y4, Y11, Y4
  2123. VPSUBW Y5, Y12, Y5
  2124. VPSUBW Y6, Y13, Y6
  2125. VPSUBW Y7, Y14, Y7
  2126. VPSUBW Y8, Y15, Y8
  2127. // load b: three 32-byte vectors each from b[0] (Y9-Y11) and b[1] (Y12-Y14)
  2128. VMOVDQU (DX)(BX*1), Y9
  2129. VMOVDQU 32(DX)(BX*1), Y10
  2130. VMOVDQU 64(DX)(BX*1), Y11
  2131. VMOVDQU (R11)(BX*1), Y12
  2132. VMOVDQU 32(R11)(BX*1), Y13
  2133. VMOVDQU 64(R11)(BX*1), Y14
  2134. // mul: scaled a (Y3-Y8) times b (Y9-Y14). Low halves go to
  // Y15, Y3, Y4, Y5, Y6, Y7 (shifted by one register); high halves stay in Y9-Y14.
  2135. VPMULLW Y3, Y9, Y15
  2136. VPMULHW Y3, Y9, Y9
  2137. VPMULLW Y4, Y10, Y3
  2138. VPMULHW Y4, Y10, Y10
  2139. VPMULLW Y5, Y11, Y4
  2140. VPMULHW Y5, Y11, Y11
  2141. VPMULLW Y6, Y12, Y5
  2142. VPMULHW Y6, Y12, Y12
  2143. VPMULLW Y7, Y13, Y6
  2144. VPMULHW Y7, Y13, Y13
  2145. VPMULLW Y8, Y14, Y7
  2146. VPMULHW Y8, Y14, Y14
  2147. // reduce: same sequence as above. Afterwards Y15, Y3, Y4 hold the
  // a[0]*b[0] products and Y5, Y6, Y7 hold the a[1]*b[1] products.
  2148. VPMULLW Y0, Y15, Y15
  2149. VPMULLW Y0, Y3, Y3
  2150. VPMULLW Y0, Y4, Y4
  2151. VPMULLW Y0, Y5, Y5
  2152. VPMULLW Y0, Y6, Y6
  2153. VPMULLW Y0, Y7, Y7
  2154. VPMULHW Y1, Y15, Y15
  2155. VPMULHW Y1, Y3, Y3
  2156. VPMULHW Y1, Y4, Y4
  2157. VPMULHW Y1, Y5, Y5
  2158. VPMULHW Y1, Y6, Y6
  2159. VPMULHW Y1, Y7, Y7
  2160. VPSUBW Y15, Y9, Y15
  2161. VPSUBW Y3, Y10, Y3
  2162. VPSUBW Y4, Y11, Y4
  2163. VPSUBW Y5, Y12, Y5
  2164. VPSUBW Y6, Y13, Y6
  2165. VPSUBW Y7, Y14, Y7
  2166. // add: accumulate the a[0]*b[0] products into the a[1]*b[1] products (Y5-Y7)
  2167. VPADDW Y15, Y5, Y5
  2168. VPADDW Y3, Y6, Y6
  2169. VPADDW Y4, Y7, Y7
  2170. // reduce 2 (looks like a Barrett-style step): t = mulhi(x, v) >> 11
  // (arithmetic shift); x -= t * q.
  2171. VMOVDQU ·v_x16<>(SB), Y3 // reloaded every iteration: Y3 is scratch above and Y0-Y15 are all in use
  2172. VPMULHW Y3, Y5, Y8
  2173. VPMULHW Y3, Y6, Y9
  2174. VPMULHW Y3, Y7, Y10
  2175. VPSRAW $11, Y8, Y8
  2176. VPSRAW $11, Y9, Y9
  2177. VPSRAW $11, Y10, Y10
  2178. VPMULLW Y1, Y8, Y8
  2179. VPMULLW Y1, Y9, Y9
  2180. VPMULLW Y1, Y10, Y10
  2181. VPSUBW Y8, Y5, Y5
  2182. VPSUBW Y9, Y6, Y6
  2183. VPSUBW Y10, Y7, Y7
  2184. // store: 96 bytes of results to dst at the current offset
  2185. VMOVDQU Y5, (DI)(BX*1)
  2186. VMOVDQU Y6, 32(DI)(BX*1)
  2187. VMOVDQU Y7, 64(DI)(BX*1)
  2188. ADDQ $1, AX
  2189. ADDQ $96, BX // advance by the 3 x 32 bytes just processed
  2190. CMPQ AX, $5
  2191. JB looptop2 // unsigned AX < 5: run five iterations (offsets 0..479)
  2192. // load: tail, one 32-byte vector from each input at offset BX (480 here)
  2193. VMOVDQU (SI)(BX*1), Y4
  2194. VMOVDQU (R8)(BX*1), Y7
  2195. VMOVDQU (DX)(BX*1), Y9
  2196. VMOVDQU (R11)(BX*1), Y12
  2197. // mul montsq: low halves to Y3 (a[0]) and Y6 (a[1]), high halves to Y10 and Y13
  2198. VPMULLW Y2, Y4, Y3
  2199. VPMULHW Y2, Y4, Y10
  2200. VPMULLW Y2, Y7, Y6
  2201. VPMULHW Y2, Y7, Y13
  2202. // reduce: same Montgomery-style sequence as in the loop
  2203. VPMULLW Y0, Y3, Y3
  2204. VPMULLW Y0, Y6, Y6
  2205. VPMULHW Y1, Y3, Y3
  2206. VPMULHW Y1, Y6, Y6
  2207. VPSUBW Y3, Y10, Y3
  2208. VPSUBW Y6, Y13, Y6
  2209. // mul: scaled a times b; low halves to Y15 and Y5, high halves stay in Y9 and Y12
  2210. VPMULLW Y3, Y9, Y15
  2211. VPMULHW Y3, Y9, Y9
  2212. VPMULLW Y6, Y12, Y5
  2213. VPMULHW Y6, Y12, Y12
  2214. // reduce: Y15 = a[0]*b[0] product, Y5 = a[1]*b[1] product
  2215. VPMULLW Y0, Y15, Y15
  2216. VPMULLW Y0, Y5, Y5
  2217. VPMULHW Y1, Y15, Y15
  2218. VPMULHW Y1, Y5, Y5
  2219. VPSUBW Y15, Y9, Y15
  2220. VPSUBW Y5, Y12, Y5
  2221. // add: sum of the two products in Y5
  2222. VPADDW Y15, Y5, Y5
  2223. // reduce 2: same v_x16-based step as in the loop
  2224. VMOVDQU ·v_x16<>(SB), Y3
  2225. VPMULHW Y3, Y5, Y8
  2226. VPSRAW $11, Y8, Y8
  2227. VPMULLW Y1, Y8, Y8
  2228. VPSUBW Y8, Y5, Y5
  2229. // store: final 32 bytes of dst
  2230. VMOVDQU Y5, (DI)(BX*1)
  2231. VZEROUPPER // clear upper YMM halves before returning to non-AVX code
  2232. RET
  2233. // func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
  2234. TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24
  2235. MOVQ dst+0(FP), DI
  2236. MOVQ a+8(FP), SI
  2237. MOVQ b+16(FP), DX
  2238. VMOVDQU ·qinv_x16<>(SB), Y0
  2239. VMOVDQU ·q_x16<>(SB), Y1
  2240. VMOVDQU ·montsq_x16<>(SB), Y2
  2241. XORQ AX, AX
  2242. XORQ BX, BX
  2243. MOVQ (16)(SI), R9 // a[2]
  2244. MOVQ 8(SI), R8 // a[1]
  2245. MOVQ (SI), SI // a[0]
  2246. MOVQ 16(DX), R12 // b[2]
  2247. MOVQ 8(DX), R11 // b[1]
  2248. MOVQ (DX), DX // b[0]
  2249. looptop3:
  2250. // load a
  2251. VMOVDQU (SI)(BX*1), Y4
  2252. VMOVDQU 32(SI)(BX*1), Y5
  2253. VMOVDQU (R8)(BX*1), Y6
  2254. VMOVDQU 32(R8)(BX*1), Y7
  2255. VMOVDQU (R9)(BX*1), Y8
  2256. VMOVDQU 32(R9)(BX*1), Y9
  2257. // mul montsq
  2258. VPMULLW Y2, Y4, Y3
  2259. VPMULHW Y2, Y4, Y10
  2260. VPMULLW Y2, Y5, Y4
  2261. VPMULHW Y2, Y5, Y11
  2262. VPMULLW Y2, Y6, Y5
  2263. VPMULHW Y2, Y6, Y12
  2264. VPMULLW Y2, Y7, Y6
  2265. VPMULHW Y2, Y7, Y13
  2266. VPMULLW Y2, Y8, Y7
  2267. VPMULHW Y2, Y8, Y14
  2268. VPMULLW Y2, Y9, Y8
  2269. VPMULHW Y2, Y9, Y15
  2270. // reduce
  2271. VPMULLW Y0, Y3, Y3
  2272. VPMULLW Y0, Y4, Y4
  2273. VPMULLW Y0, Y5, Y5
  2274. VPMULLW Y0, Y6, Y6
  2275. VPMULLW Y0, Y7, Y7
  2276. VPMULLW Y0, Y8, Y8
  2277. VPMULHW Y1, Y3, Y3
  2278. VPMULHW Y1, Y4, Y4
  2279. VPMULHW Y1, Y5, Y5
  2280. VPMULHW Y1, Y6, Y6
  2281. VPMULHW Y1, Y7, Y7
  2282. VPMULHW Y1, Y8, Y8
  2283. VPSUBW Y3, Y10, Y3
  2284. VPSUBW Y4, Y11, Y4
  2285. VPSUBW Y5, Y12, Y5
  2286. VPSUBW Y6, Y13, Y6
  2287. VPSUBW Y7, Y14, Y7
  2288. VPSUBW Y8, Y15, Y8
  2289. // load b
  2290. VMOVDQU (DX)(BX*1), Y9
  2291. VMOVDQU 32(DX)(BX*1), Y10
  2292. VMOVDQU (R11)(BX*1), Y11
  2293. VMOVDQU 32(R11)(BX*1), Y12
  2294. VMOVDQU (R12)(BX*1), Y13
  2295. VMOVDQU 32(R12)(BX*1), Y14
  2296. // mul
  2297. VPMULLW Y3, Y9, Y15
  2298. VPMULHW Y3, Y9, Y9
  2299. VPMULLW Y4, Y10, Y3
  2300. VPMULHW Y4, Y10, Y10
  2301. VPMULLW Y5, Y11, Y4
  2302. VPMULHW Y5, Y11, Y11
  2303. VPMULLW Y6, Y12, Y5
  2304. VPMULHW Y6, Y12, Y12
  2305. VPMULLW Y7, Y13, Y6
  2306. VPMULHW Y7, Y13, Y13
  2307. VPMULLW Y8, Y14, Y7
  2308. VPMULHW Y8, Y14, Y14
  2309. // reduce
  2310. VPMULLW Y0, Y15, Y15
  2311. VPMULLW Y0, Y3, Y3
  2312. VPMULLW Y0, Y4, Y4
  2313. VPMULLW Y0, Y5, Y5
  2314. VPMULLW Y0, Y6, Y6
  2315. VPMULLW Y0, Y7, Y7
  2316. VPMULHW Y1, Y15, Y15
  2317. VPMULHW Y1, Y3, Y3
  2318. VPMULHW Y1, Y4, Y4
  2319. VPMULHW Y1, Y5, Y5
  2320. VPMULHW Y1, Y6, Y6
  2321. VPMULHW Y1, Y7, Y7
  2322. VPSUBW Y15, Y9, Y15
  2323. VPSUBW Y3, Y10, Y3
  2324. VPSUBW Y4, Y11, Y4
  2325. VPSUBW Y5, Y12, Y5
  2326. VPSUBW Y6, Y13, Y6
  2327. VPSUBW Y7, Y14, Y7
  2328. // add
  2329. VPADDW Y15, Y4, Y4
  2330. VPADDW Y3, Y5, Y5
  2331. VPADDW Y4, Y6, Y6
  2332. VPADDW Y5, Y7, Y7
  2333. // reduce 2
  2334. VMOVDQU ·v_x16<>(SB), Y3
  2335. VPMULHW Y3, Y6, Y8
  2336. VPMULHW Y3, Y7, Y9
  2337. VPSRAW $11, Y8, Y8
  2338. VPSRAW $11, Y9, Y9
  2339. VPMULLW Y1, Y8, Y8
  2340. VPMULLW Y1, Y9, Y9
  2341. VPSUBW Y8, Y6, Y6
  2342. VPSUBW Y9, Y7, Y7
  2343. // store
  2344. VMOVDQU Y6, (DI)(BX*1)
  2345. VMOVDQU Y7, 32(DI)(BX*1)
  2346. ADDQ $1, AX
  2347. ADDQ $64, BX
  2348. CMPQ AX, $8
  2349. JB looptop3
  2350. VZEROUPPER
  2351. RET
  2352. // func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
  //
  // Pointwise multiply-accumulate over four pairs of 16-bit coefficient
  // vectors. a and b each point at an array of four pointers (read at
  // offsets 0, 8, 16 and 24). For every 16-bit lane the routine computes
  //
  //   dst = reduce2( sum over i in 0..3 of reduce(reduce(a[i]*montsq) * b[i]) )
  //
  // where "reduce" and "reduce 2" are the instruction sequences labelled as
  // such in the loop body.
  //
  // The loop runs 16 times and handles one 32-byte vector of each input per
  // iteration, i.e. 512 bytes are read from every a[i]/b[i] and 512 bytes
  // are written to dst.
  //
  // Register use:
  //   DI                dst
  //   SI, R8, R9, R10   a[0], a[1], a[2], a[3]
  //   DX, R11, R12, R13 b[0], b[1], b[2], b[3]
  //   AX                iteration counter (0..15)
  //   BX                byte offset into every buffer (advances by 32)
  //   Y0, Y1, Y2, Y3    qinv_x16, q_x16, montsq_x16, v_x16 (loop invariant)
  //   Y4..Y13           working data
  //
  // Go assembler operand order applies: the last operand is the destination.
  2353. TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
  2354. MOVQ dst+0(FP), DI
  2355. MOVQ a+8(FP), SI
  2356. MOVQ b+16(FP), DX
  2357. VMOVDQU ·qinv_x16<>(SB), Y0
  2358. VMOVDQU ·q_x16<>(SB), Y1
  2359. VMOVDQU ·montsq_x16<>(SB), Y2
  2360. VMOVDQU ·v_x16<>(SB), Y3
  2361. XORQ AX, AX // iteration counter = 0
  2362. XORQ BX, BX // byte offset = 0
  // Dereference the pointer arrays. SI and DX are overwritten last because
  // they are the base registers for the other loads.
  2363. MOVQ 24(SI), R10 // a[3]
  2364. MOVQ 16(SI), R9 // a[2]
  2365. MOVQ 8(SI), R8 // a[1]
  2366. MOVQ (SI), SI // a[0]
  2367. MOVQ 24(DX), R13 // b[3]
  2368. MOVQ 16(DX), R12 // b[2]
  2369. MOVQ 8(DX), R11 // b[1]
  2370. MOVQ (DX), DX // b[0]
  2371. looptop4:
  2372. // load a: one 32-byte vector from each of a[0]..a[3]
  2373. VMOVDQU (SI)(BX*1), Y6
  2374. VMOVDQU (R8)(BX*1), Y7
  2375. VMOVDQU (R9)(BX*1), Y8
  2376. VMOVDQU (R10)(BX*1), Y9
  2377. // mul montsq: low 16 bits of a*montsq go to Y5..Y8, signed high 16 bits to Y10..Y13
  2378. VPMULLW Y2, Y6, Y5
  2379. VPMULHW Y2, Y6, Y10
  2380. VPMULLW Y2, Y7, Y6
  2381. VPMULHW Y2, Y7, Y11
  2382. VPMULLW Y2, Y8, Y7
  2383. VPMULHW Y2, Y8, Y12
  2384. VPMULLW Y2, Y9, Y8
  2385. VPMULHW Y2, Y9, Y13
  2386. // reduce: t = low16(lo * qinv); t = high16(t * q); result = hi - t
  // (Montgomery-style reduction, going by the qinv/montsq constant names.)
  2387. VPMULLW Y0, Y5, Y5
  2388. VPMULLW Y0, Y6, Y6
  2389. VPMULLW Y0, Y7, Y7
  2390. VPMULLW Y0, Y8, Y8
  2391. VPMULHW Y1, Y5, Y5
  2392. VPMULHW Y1, Y6, Y6
  2393. VPMULHW Y1, Y7, Y7
  2394. VPMULHW Y1, Y8, Y8
  2395. VPSUBW Y5, Y10, Y5
  2396. VPSUBW Y6, Y11, Y6
  2397. VPSUBW Y7, Y12, Y7
  2398. VPSUBW Y8, Y13, Y8
  2399. // load b: same layout as a, into Y9..Y12
  2400. VMOVDQU (DX)(BX*1), Y9
  2401. VMOVDQU (R11)(BX*1), Y10
  2402. VMOVDQU (R12)(BX*1), Y11
  2403. VMOVDQU (R13)(BX*1), Y12
  2404. // mul: reduced a times b; low halves rotate into Y4..Y7, high halves stay in Y9..Y12
  2405. VPMULLW Y5, Y9, Y4
  2406. VPMULHW Y5, Y9, Y9
  2407. VPMULLW Y6, Y10, Y5
  2408. VPMULHW Y6, Y10, Y10
  2409. VPMULLW Y7, Y11, Y6
  2410. VPMULHW Y7, Y11, Y11
  2411. VPMULLW Y8, Y12, Y7
  2412. VPMULHW Y8, Y12, Y12
  2413. // reduce: same three-step sequence as above, applied to the a*b products
  2414. VPMULLW Y0, Y4, Y4
  2415. VPMULLW Y0, Y5, Y5
  2416. VPMULLW Y0, Y6, Y6
  2417. VPMULLW Y0, Y7, Y7
  2418. VPMULHW Y1, Y4, Y4
  2419. VPMULHW Y1, Y5, Y5
  2420. VPMULHW Y1, Y6, Y6
  2421. VPMULHW Y1, Y7, Y7
  2422. VPSUBW Y4, Y9, Y4
  2423. VPSUBW Y5, Y10, Y5
  2424. VPSUBW Y6, Y11, Y6
  2425. VPSUBW Y7, Y12, Y7
  2426. // add: running sum of the four products (Y4..Y7), total ends up in Y7
  2427. VPADDW Y4, Y5, Y5
  2428. VPADDW Y5, Y6, Y6
  2429. VPADDW Y6, Y7, Y7
  2430. // reduce 2: result = x - q * (high16(x * v) >> 11), arithmetic shift
  // (Barrett-style reduction, going by the shape of the sequence.)
  2431. VPMULHW Y3, Y7, Y8
  2432. VPSRAW $11, Y8, Y8
  2433. VPMULLW Y1, Y8, Y8
  2434. VPSUBW Y8, Y7, Y8
  2435. // store: 32 bytes of results at the current offset in dst
  2436. VMOVDQU Y8, (DI)(BX*1)
  2437. ADDQ $1, AX
  2438. ADDQ $32, BX
  2439. CMPQ AX, $16
  2440. JB looptop4 // unsigned: repeat while AX < 16
  2441. VZEROUPPER
  2442. RET
  2443. // func cbdEta4AVX2(dst *uint16, b *byte)
  //
  // Expands 256 input bytes at b into 256 16-bit values at dst (512 bytes).
  // For every input byte the output word is
  //
  //   q + (sum of bits selected from the low nibble)
  //     - (sum of bits selected from the high nibble)
  //
  // assuming mask11 is 0x11 and mask0f is 0x0f in every byte, as their names
  // suggest -- TODO confirm against the DATA definitions, which are outside
  // this part of the file. ("cbd" presumably stands for centered binomial
  // distribution; the name is the only evidence here.)
  //
  // The loop runs 8 times: each iteration consumes 32 bytes and produces
  // 64 bytes.
  //
  // Register use:
  //   DI  dst cursor (advances by 64)
  //   SI  input cursor (advances by 32)
  //   DX  input bytes remaining (counts down from 256 in steps of 32)
  //   Y0, Y1, Y2  mask11, mask0f, q_x16 (loop invariant)
  //   Y3..Y5      working data
  //
  // Go assembler operand order applies: the last operand is the destination.
  2444. TEXT ·cbdEta4AVX2(SB), NOSPLIT, $0-16
  2445. MOVQ dst+0(FP), DI
  2446. MOVQ b+8(FP), SI
  2447. VMOVDQU ·mask11<>(SB), Y0
  2448. VMOVDQU ·mask0f<>(SB), Y1
  2449. VMOVDQU ·q_x16<>(SB), Y2
  2450. MOVQ $256, DX
  2451. looptop:
  // NOTE(review): VMOVUPD is used for the integer loads/stores in this
  // routine while the pointwiseAcc routines use VMOVDQU; the bytes moved are
  // the same.
  2452. VMOVUPD 0(SI), Y3
  // Accumulate (x >> j) & mask11 for j = 0..3 with byte-wise adds; the
  // running sum lives in Y4 and the final add lands in Y3.
  2453. VPAND Y3, Y0, Y4
  2454. VPSRLW $1, Y3, Y3
  2455. VPAND Y3, Y0, Y5
  2456. VPADDB Y5, Y4, Y4
  2457. VPSRLW $1, Y3, Y3
  2458. VPAND Y3, Y0, Y5
  2459. VPADDB Y5, Y4, Y4
  2460. VPSRLW $1, Y3, Y3
  2461. VPAND Y3, Y0, Y3
  2462. VPADDB Y3, Y4, Y3
  // Split each byte of the sum: Y3 keeps the mask0f part, Y4 the mask0f part
  // of the sum shifted right by 4; then subtract byte-wise (Y3 = Y3 - Y4).
  2463. VPSRLW $4, Y3, Y4
  2464. VPAND Y3, Y1, Y3
  2465. VPAND Y4, Y1, Y4
  2466. VPSUBB Y4, Y3, Y3
  // Sign-extend the low 16 bytes to 16-bit words, add q, store 32 bytes.
  2467. VPMOVSXBW X3, Y4
  2468. VPADDW Y2, Y4, Y4
  2469. VMOVUPD Y4, 0(DI)
  // Both sources are Y3, so selector 0x21 swaps the 128-bit halves: the
  // former upper 16 bytes become X3 for the second sign-extension.
  2470. VPERM2F128 $0x21, Y3, Y3, Y3
  2471. VPMOVSXBW X3, Y4
  2472. VPADDW Y2, Y4, Y4
  2473. VMOVUPD Y4, 32(DI)
  2474. ADDQ $64, DI
  2475. ADDQ $32, SI
  2476. SUBQ $32, DX
  2477. JA looptop // flags from SUBQ: repeat while the remaining count is above zero
  2478. VZEROUPPER
  2479. RET