// +build !noasm,go1.10

// hwaccel_amd64.s - AMD64 optimized routines
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#include "textflag.h"

// func cpuidAmd64(cpuidParams *uint32)
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
	MOVQ cpuidParams+0(FP), R15
	MOVL 0(R15), AX
	MOVL 8(R15), CX
	CPUID
	MOVL AX, 0(R15)
	MOVL BX, 4(R15)
	MOVL CX, 8(R15)
	MOVL DX, 12(R15)
	RET

// func xgetbv0Amd64(xcrVec *uint32)
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
	MOVQ xcrVec+0(FP), BX
	XORL CX, CX
	XGETBV
	MOVL AX, 0(BX)
	MOVL DX, 4(BX)
	RET
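
// Note: these two routines are presumably consumed by Go-side feature
// detection (CPUID leaf 7 for the AVX2 bit, XGETBV on XCR0 to confirm the
// OS preserves the YMM state); the actual Go callers live outside this
// file.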

DATA ·chacha_constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha_constants<>+0x04(SB)/4, $0x3320646E
DATA ·chacha_constants<>+0x08(SB)/4, $0x79622D32
DATA ·chacha_constants<>+0x0c(SB)/4, $0x6B206574
DATA ·chacha_constants<>+0x10(SB)/8, $0x0504070601000302
DATA ·chacha_constants<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
DATA ·chacha_constants<>+0x20(SB)/8, $0x0605040702010003
DATA ·chacha_constants<>+0x28(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·chacha_constants<>(SB), (NOPTR+RODATA), $48
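
// The first 16 bytes above are the ChaCha "expand 32-byte k" constant.
// The two 16 byte vectors that follow are VPSHUFB masks implementing a
// 32-bit rotate-left by 16 and by 8 respectively, which is cheaper than a
// shift/shift/combine sequence for those two rotation amounts.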

// func chachaXORKeyStreamAVX2(s *chachaState, in, out []byte)
TEXT ·chachaXORKeyStreamAVX2(SB), NOSPLIT, $544-56
	// This is Andrew Moon's AVX2 ChaCha implementation taken from
	// supercop-20171218, with some minor changes, primarily calling
	// convention and assembly dialect related.

	// Align the stack on a 32 byte boundary.
	MOVQ SP, BP
	ADDQ $32, BP
	ANDQ $-32, BP

	// Go calling convention -> SYSV AMD64 (and a fixup).
	MOVQ s+0(FP), DI    // &s -> DI
	ADDQ $16, DI        // Skip the ChaCha constants in the chachaState.
	MOVQ in+8(FP), SI   // &in[0] -> SI
	MOVQ out+32(FP), DX // &out[0] -> DX
	MOVQ in+16(FP), CX  // len(in) -> CX

	// Begin the main body of `chacha_blocks_avx2`.
	//
	// Mostly a direct translation except:
	//  * The number of rounds is always 20.
	//  * %rbp is used instead of %rsp.
	LEAQ ·chacha_constants<>(SB), AX
	VMOVDQU 0(AX), X8
	VMOVDQU 16(AX), X6
	VMOVDQU 32(AX), X7
	VMOVDQU 0(DI), X9
	VMOVDQU 16(DI), X10
	VMOVDQU 32(DI), X11
	// MOVQ 48(DI), AX
	MOVQ $1, R9
	VMOVDQA X8, 0(BP)
	VMOVDQA X9, 16(BP)
	VMOVDQA X10, 32(BP)
	VMOVDQA X11, 48(BP)
	// MOVQ AX, 64(BP)
	VMOVDQA X6, 448(BP)
	VMOVDQA X6, 464(BP)
	VMOVDQA X7, 480(BP)
	VMOVDQA X7, 496(BP)
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256
	JMP chacha_blocks_avx2_below256
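
	// The 512 byte path below processes eight 64 byte blocks per iteration
	// in the YMM registers, the 256 byte path four blocks in the XMM
	// registers; anything below 256 bytes is handled one 64 byte block at
	// a time.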
chacha_blocks_avx2_atleast512:
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	LEAQ 5(AX), R11
	LEAQ 6(AX), R12
	LEAQ 7(AX), R13
	LEAQ 8(AX), R14
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	MOVL BX, 16+128(BP)
	MOVL R11, 20+128(BP)
	MOVL R12, 24+128(BP)
	MOVL R13, 28+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	SHRQ $32, BX
	SHRQ $32, R11
	SHRQ $32, R12
	SHRQ $32, R13
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVL BX, 16+160(BP)
	MOVL R11, 20+160(BP)
	MOVL R12, 24+160(BP)
	MOVL R13, 28+160(BP)
	MOVQ R14, 48(BP)
	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), Y0
	VPBROADCASTD 4+0(BP), Y1
	VPBROADCASTD 8+0(BP), Y2
	VPBROADCASTD 12+0(BP), Y3
	VPBROADCASTD 16(BP), Y4
	VPBROADCASTD 4+16(BP), Y5
	VPBROADCASTD 8+16(BP), Y6
	VPBROADCASTD 12+16(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13

chacha_blocks_avx2_mainloop1:
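	// Eight-block double round: AX counts down from 20 by 2, so the body
	// (a column round followed by a diagonal round) runs 10 times.
	// Rotations by 16 and 8 reuse the VPSHUFB masks cached at 448(BP) and
	// 480(BP); rotations by 12 and 7 are shift/shift/xor. Y12 or Y15 is
	// spilled to 96(BP) as needed, since 16 state words plus a temporary
	// exceed the register file.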
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR Y12, Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPSHUFB 448(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 448(BP), Y14, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $12, Y4, Y12
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $12, Y5, Y12
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $12, Y6, Y12
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $12, Y7, Y12
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR 96(BP), Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPSHUFB 480(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 480(BP), Y14, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $7, Y4, Y12
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $7, Y5, Y12
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $7, Y6, Y12
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $7, Y7, Y12
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR Y15, Y0, Y15
	VPXOR 96(BP), Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 448(BP), Y13, Y13
	VPSHUFB 448(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $12, Y5, Y15
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $12, Y6, Y15
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $12, Y7, Y15
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $12, Y4, Y15
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y15, Y4
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR 96(BP), Y0, Y15
	VPXOR Y12, Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 480(BP), Y13, Y13
	VPSHUFB 480(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $7, Y5, Y15
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $7, Y6, Y15
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $7, Y7, Y15
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $7, Y4, Y15
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y15, Y4
	VMOVDQA 96(BP), Y15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop1
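
	// The loop leaves word i of all eight blocks sliced across one YMM
	// register. Below, the original input words are added back in (the
	// ChaCha feed-forward) and a dword/qword/128-bit-lane transpose turns
	// the sliced registers into eight contiguous 64 byte keystream blocks
	// before the XOR with the input.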
	VMOVDQA Y8, 192(BP)
	VMOVDQA Y9, 224(BP)
	VMOVDQA Y10, 256(BP)
	VMOVDQA Y11, 288(BP)
	VMOVDQA Y12, 320(BP)
	VMOVDQA Y13, 352(BP)
	VMOVDQA Y14, 384(BP)
	VMOVDQA Y15, 416(BP)
	VPBROADCASTD 0(BP), Y8
	VPBROADCASTD 4+0(BP), Y9
	VPBROADCASTD 8+0(BP), Y10
	VPBROADCASTD 12+0(BP), Y11
	VPBROADCASTD 16(BP), Y12
	VPBROADCASTD 4+16(BP), Y13
	VPBROADCASTD 8+16(BP), Y14
	VPBROADCASTD 12+16(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput1
	VPXOR 0(SI), Y8, Y8
	VPXOR 64(SI), Y9, Y9
	VPXOR 128(SI), Y10, Y10
	VPXOR 192(SI), Y11, Y11
	VPXOR 256(SI), Y12, Y12
	VPXOR 320(SI), Y13, Y13
	VPXOR 384(SI), Y14, Y14
	VPXOR 448(SI), Y15, Y15
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VPXOR 32(SI), Y8, Y8
	VPXOR 96(SI), Y9, Y9
	VPXOR 160(SI), Y10, Y10
	VPXOR 224(SI), Y11, Y11
	VPXOR 288(SI), Y12, Y12
	VPXOR 352(SI), Y13, Y13
	VPXOR 416(SI), Y14, Y14
	VPXOR 480(SI), Y15, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)
	ADDQ $512, SI
	JMP chacha_blocks_avx2_mainloop1_cont

chacha_blocks_avx2_noinput1:
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)

chacha_blocks_avx2_mainloop1_cont:
	ADDQ $512, DX
	SUBQ $512, CX
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JB chacha_blocks_avx2_below256_fixup

chacha_blocks_avx2_atleast256:
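	// Four-block (256 byte) variant: structurally identical to the
	// eight-block path above, but operating on the XMM registers.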
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVQ BX, 48(BP)
	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), X0
	VPBROADCASTD 4+0(BP), X1
	VPBROADCASTD 8+0(BP), X2
	VPBROADCASTD 12+0(BP), X3
	VPBROADCASTD 16(BP), X4
	VPBROADCASTD 4+16(BP), X5
	VPBROADCASTD 8+16(BP), X6
	VPBROADCASTD 12+16(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15

chacha_blocks_avx2_mainloop2:
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR X12, X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 448(BP), X12, X12
	VPSHUFB 448(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 448(BP), X14, X14
	VPSHUFB 448(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $12, X4, X12
	VPSRLD $20, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $12, X5, X12
	VPSRLD $20, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $12, X6, X12
	VPSRLD $20, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $12, X7, X12
	VPSRLD $20, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR 96(BP), X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 480(BP), X12, X12
	VPSHUFB 480(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 480(BP), X14, X14
	VPSHUFB 480(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $7, X4, X12
	VPSRLD $25, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $7, X5, X12
	VPSRLD $25, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $7, X6, X12
	VPSRLD $25, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $7, X7, X12
	VPSRLD $25, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR X15, X0, X15
	VPXOR 96(BP), X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 448(BP), X15, X15
	VPSHUFB 448(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 448(BP), X13, X13
	VPSHUFB 448(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $12, X5, X15
	VPSRLD $20, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $12, X6, X15
	VPSRLD $20, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $12, X7, X15
	VPSRLD $20, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $12, X4, X15
	VPSRLD $20, X4, X4
	VPXOR X4, X15, X4
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR 96(BP), X0, X15
	VPXOR X12, X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 480(BP), X15, X15
	VPSHUFB 480(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 480(BP), X13, X13
	VPSHUFB 480(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $7, X5, X15
	VPSRLD $25, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $7, X6, X15
	VPSRLD $25, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $7, X7, X15
	VPSRLD $25, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $7, X4, X15
	VPSRLD $25, X4, X4
	VPXOR X4, X15, X4
	VMOVDQA 96(BP), X15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop2
	VMOVDQA X8, 192(BP)
	VMOVDQA X9, 208(BP)
	VMOVDQA X10, 224(BP)
	VMOVDQA X11, 240(BP)
	VMOVDQA X12, 256(BP)
	VMOVDQA X13, 272(BP)
	VMOVDQA X14, 288(BP)
	VMOVDQA X15, 304(BP)
	VPBROADCASTD 0(BP), X8
	VPBROADCASTD 4+0(BP), X9
	VPBROADCASTD 8+0(BP), X10
	VPBROADCASTD 12+0(BP), X11
	VPBROADCASTD 16(BP), X12
	VPBROADCASTD 4+16(BP), X13
	VPBROADCASTD 8+16(BP), X14
	VPBROADCASTD 12+16(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput2
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 64(SI), X2, X2
	VPXOR 80(SI), X3, X3
	VPXOR 128(SI), X4, X4
	VPXOR 144(SI), X5, X5
	VPXOR 192(SI), X6, X6
	VPXOR 208(SI), X7, X7
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VPXOR 32(SI), X0, X0
	VPXOR 48(SI), X1, X1
	VPXOR 96(SI), X2, X2
	VPXOR 112(SI), X3, X3
	VPXOR 160(SI), X4, X4
	VPXOR 176(SI), X5, X5
	VPXOR 224(SI), X6, X6
	VPXOR 240(SI), X7, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)
	ADDQ $256, SI
	JMP chacha_blocks_avx2_mainloop2_cont

chacha_blocks_avx2_noinput2:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)

chacha_blocks_avx2_mainloop2_cont:
	ADDQ $256, DX
	SUBQ $256, CX
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256

chacha_blocks_avx2_below256_fixup:
	VMOVDQA 448(BP), X6
	VMOVDQA 480(BP), X7
	VMOVDQA 0(BP), X8
	VMOVDQA 16(BP), X9
	VMOVDQA 32(BP), X10
	VMOVDQA 48(BP), X11
	MOVQ $1, R9

chacha_blocks_avx2_below256:
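	// One 64 byte block at a time. X5 holds the 64-bit counter increment;
	// blocks shorter than 64 bytes are staged through the aligned scratch
	// space at 0(BP) so the block body below always reads and writes a
	// full 64 bytes.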
	VMOVQ R9, X5
	ANDQ CX, CX
	JZ chacha_blocks_avx2_done
	CMPQ CX, $64
	JAE chacha_blocks_avx2_above63
	MOVQ DX, R9
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput3
	MOVQ CX, R10
	MOVQ BP, DX
	ADDQ R10, SI
	ADDQ R10, DX
	NEGQ R10

chacha_blocks_avx2_copyinput:
	MOVB (SI)(R10*1), AX
	MOVB AX, (DX)(R10*1)
	INCQ R10
	JNZ chacha_blocks_avx2_copyinput
	MOVQ BP, SI

chacha_blocks_avx2_noinput3:
	MOVQ BP, DX

chacha_blocks_avx2_above63:
	VMOVDQA X8, X0
	VMOVDQA X9, X1
	VMOVDQA X10, X2
	VMOVDQA X11, X3
	// MOVQ 64(BP), AX
	MOVQ $20, AX

chacha_blocks_avx2_mainloop3:
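	// Single-block double round: the diagonal round is formed by rotating
	// the state rows with VPSHUFD ($0x93/$0x4e/$0x39) so the column
	// arithmetic can be reused, then rotating them back.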
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x93, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x39, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x39, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x93, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop3
	VPADDD X0, X8, X0
	VPADDD X1, X9, X1
	VPADDD X2, X10, X2
	VPADDD X3, X11, X3
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput4
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 32(SI), X2, X2
	VPXOR 48(SI), X3, X3
	ADDQ $64, SI

chacha_blocks_avx2_noinput4:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 32(DX)
	VMOVDQU X3, 48(DX)
	VPADDQ X11, X5, X11
	CMPQ CX, $64
	JBE chacha_blocks_avx2_mainloop3_finishup
	ADDQ $64, DX
	SUBQ $64, CX
	JMP chacha_blocks_avx2_below256

chacha_blocks_avx2_mainloop3_finishup:
	CMPQ CX, $64
	JE chacha_blocks_avx2_done
	ADDQ CX, R9
	ADDQ CX, DX
	NEGQ CX

chacha_blocks_avx2_copyoutput:
	MOVB (DX)(CX*1), AX
	MOVB AX, (R9)(CX*1)
	INCQ CX
	JNZ chacha_blocks_avx2_copyoutput

chacha_blocks_avx2_done:
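	// Write the updated counter back into the caller's chachaState and
	// clear the upper YMM state before returning to Go code.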
	VMOVDQU X11, 32(DI)
	VZEROUPPER
	RET

DATA ·m60_mask<>+0x00(SB)/8, $0x0fffffffffffffff
DATA ·m60_mask<>+0x08(SB)/8, $0x0fffffffffffffff
GLOBL ·m60_mask<>(SB), (NOPTR+RODATA), $16
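
// hashStepAVX2 is the HS1 hash core: NH compression of each 64 byte chunk
// feeding a polynomial accumulator. The mask above truncates the NH sums
// to 60 bits ahead of the mod 2^61-1 polynomial step (2^61-1 is loaded
// into R8 below).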

// func hashStepAVX2(ctx *hs1Ctx, in []byte, accum *[hs1HashRounds]uint64)
TEXT ·hashStepAVX2(SB), NOSPLIT, $0-40
	MOVQ in+16(FP), CX     // len(in) -> CX (inbytes)
	ANDQ CX, CX
	JZ hash_step_done
	MOVQ ctx+0(FP), DI     // ctx -> DI
	MOVQ in+8(FP), SI      // &in[0] -> SI (mp)
	MOVQ accum+32(FP), R15 // accum -> R15
	MOVO ·m60_mask<>(SB), X15
	MOVQ (R15), AX         // accum[0] -> AX
	MOVQ 8(R15), BX        // accum[1] -> BX
	MOVQ 16(R15), R11      // accum[2] -> R11
	MOVQ 24(R15), R12      // accum[3] -> R12
	MOVQ 32(R15), R13      // accum[4] -> R13
	MOVQ 40(R15), R14      // accum[5] -> R14

hash_step_start:
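	// Per 64 byte chunk: zero the six NH accumulators, then run two passes
	// of the loop below, each of which adds 32 message bytes into six
	// overlapping 32 byte windows of the NH key, pairs the resulting
	// 32-bit sums with VPSHUFD, and multiplies the pairs into 64-bit
	// products with VPMULUDQ, accumulating into Y0..Y5.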
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	MOVQ DI, R9 // R9 -> ctx->nh_key (kp)
	MOVQ $2, R8

hash_rounds_loop:
	VMOVDQU (SI), Y14 // mp
	VMOVDQU (R9), Y6
	VMOVDQU 16(R9), Y7
	VPADDD Y14, Y6, Y6
	VPADDD Y14, Y7, Y7
	VPSHUFD $0x05, Y6, Y8
	VPSHUFD $0xaf, Y6, Y9
	VPMULUDQ Y8, Y9, Y6
	VPADDQ Y0, Y6, Y0
	VPSHUFD $0x05, Y7, Y8
	VPSHUFD $0xaf, Y7, Y9
	VPMULUDQ Y8, Y9, Y7
	VPADDQ Y1, Y7, Y1
	VMOVDQU 32(R9), Y6
	VMOVDQU 48(R9), Y7
	VPADDD Y14, Y6, Y6
	VPADDD Y14, Y7, Y7
	VPSHUFD $0x05, Y6, Y8
	VPSHUFD $0xaf, Y6, Y9
	VPMULUDQ Y8, Y9, Y6
	VPADDQ Y2, Y6, Y2
	VPSHUFD $0x05, Y7, Y8
	VPSHUFD $0xaf, Y7, Y9
	VPMULUDQ Y8, Y9, Y7
	VPADDQ Y3, Y7, Y3
	VMOVDQU 64(R9), Y6
	VMOVDQU 80(R9), Y7
	VPADDD Y14, Y6, Y6
	VPADDD Y14, Y7, Y7
	VPSHUFD $0x05, Y6, Y8
	VPSHUFD $0xaf, Y6, Y9
	VPMULUDQ Y8, Y9, Y6
	VPADDQ Y4, Y6, Y4
	VPSHUFD $0x05, Y7, Y8
	VPSHUFD $0xaf, Y7, Y9
	VPMULUDQ Y8, Y9, Y7
	VPADDQ Y5, Y7, Y5
	ADDQ $32, SI
	ADDQ $32, R9
	DECQ R8
	JNZ hash_rounds_loop
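
	// Fold each NH accumulator (high 128-bit lane into the low lane, high
	// qword into the low qword), mask to 60 bits, and fold the previous
	// accumulator value into it via a multiply by the polynomial key at
	// 144..184(DI), reduced mod 2^61-1 (R8). The MULX instructions are
	// hand-encoded as BYTE sequences, with the intended instruction noted
	// in the trailing comments.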
	MOVQ $0x1fffffffffffffff, R8 // m61
	VEXTRACTI128 $1, Y0, X6
	VEXTRACTI128 $1, Y1, X7
	VPADDQ X0, X6, X0
	VPADDQ X1, X7, X1
	VPSRLDQ $8, X0, X6
	VPSRLDQ $8, X1, X7
	VPADDQ X0, X6, X0
	VPADDQ X1, X7, X1
	VPAND X0, X15, X0
	VPAND X1, X15, X1
	MOVQ AX, DX
	VMOVQ X0, AX
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0x90; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 144(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, AX
	MOVQ BX, DX
	VMOVQ X1, BX
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0x98; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 152(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, BX
	VEXTRACTI128 $1, Y2, X6
	VEXTRACTI128 $1, Y3, X7
	VPADDQ X2, X6, X2
	VPADDQ X3, X7, X3
	VPSRLDQ $8, X2, X6
	VPSRLDQ $8, X3, X7
	VPADDQ X2, X6, X2
	VPADDQ X3, X7, X3
	VPAND X2, X15, X2
	VPAND X3, X15, X3
	MOVQ R11, DX
	VMOVQ X2, R11
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0xa0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 160(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, R11
	MOVQ R12, DX
	VMOVQ X3, R12
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0xa8; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 168(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, R12
	VEXTRACTI128 $1, Y4, X6
	VEXTRACTI128 $1, Y5, X7
	VPADDQ X4, X6, X4
	VPADDQ X5, X7, X5
	VPSRLDQ $8, X4, X6
	VPSRLDQ $8, X5, X7
	VPADDQ X4, X6, X4
	VPADDQ X5, X7, X5
	VPAND X4, X15, X4
	VPAND X5, X15, X5
	MOVQ R13, DX
	VMOVQ X4, R13
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0xb0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 176(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, R13
	MOVQ R14, DX
	VMOVQ X5, R14
	BYTE $0xc4; BYTE $0xe2; BYTE $0xab; BYTE $0xf6; BYTE $0x97; BYTE $0xb8; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MULX 184(DI), R10, DX
	MOVQ R10, R9
	ANDQ R8, R9
	SHRQ $61, DX, R10
	ADDQ R9, R10
	ADDQ R10, R14
	SUBQ $64, CX
	JNZ hash_step_start
	MOVQ AX, (R15)    // AX -> accum[0]
	MOVQ BX, 8(R15)   // BX -> accum[1]
	MOVQ R11, 16(R15) // R11 -> accum[2]
	MOVQ R12, 24(R15) // R12 -> accum[3]
	MOVQ R13, 32(R15) // R13 -> accum[4]
	MOVQ R14, 40(R15) // R14 -> accum[5]

hash_step_done:
	VZEROUPPER
	RET