#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to chacha20, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
#
# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
# The least terrible option is to use a Python code generator, so that's
# what I did.
#
# Code based on Ted Krovetz's vec128 C implementation, with corrections
# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
# output pointers.
#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

x = Argument(ptr(uint32_t))
inp = Argument(ptr(const_uint8_t))
outp = Argument(ptr(uint8_t))
nrBlocks = Argument(ptr(size_t))

#
# SSE2 helper functions. A temporary register is explicitly passed in because
# the main fast loop uses every single register (and even spills), so manual
# control is needed.
#
# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
# in the C code, but the C code has the luxury of an optimizer reordering
# everything, while this does not.
#
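# Each ROTW*_sse2 helper implements a 32-bit left-rotate with shifts, since
# SSE2 has no vector rotate: rotl(x, n) == (x << n) | (x >> (32 - n)), and the
# OR can be a XOR because the two shifted values have no overlapping bits.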
def ROTW16_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 16)
    PSRLD(d, 16)
    PXOR(d, tmp)

def ROTW12_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 12)
    PSRLD(b, 20)
    PXOR(b, tmp)

def ROTW8_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 8)
    PSRLD(d, 24)
    PXOR(d, tmp)

def ROTW7_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 7)
    PSRLD(b, 25)
    PXOR(b, tmp)
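# WriteXor_sse2 XORs 64 bytes of keystream (v0..v3) against the input at
# offset d and stores the result to the output, using unaligned loads and
# stores so callers do not have to align their buffers.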
def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
    MOVDQU(tmp, [inp+d])
    PXOR(tmp, v0)
    MOVDQU([outp+d], tmp)
    MOVDQU(tmp, [inp+d+16])
    PXOR(tmp, v1)
    MOVDQU([outp+d+16], tmp)
    MOVDQU(tmp, [inp+d+32])
    PXOR(tmp, v2)
    MOVDQU([outp+d+32], tmp)
    MOVDQU(tmp, [inp+d+48])
    PXOR(tmp, v3)
    MOVDQU([outp+d+48], tmp)

# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will
# process 4/2/1 blocks at a time.
with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 0x20)

    # Build the counter increment vector on the stack, and allocate the
    # scratch space.
    xmm_v0 = XMMRegister()
    PXOR(xmm_v0, xmm_v0)
    SUB(registers.rsp, 16+16)
    MOVDQA([registers.rsp], xmm_v0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp], reg_tmp)
    mem_one = [registers.rsp]     # (Stack) Counter increment vector
    mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.

    mem_s0 = [reg_x]      # (Memory) Cipher state [0..3]
    mem_s1 = [reg_x+16]   # (Memory) Cipher state [4..7]
    mem_s2 = [reg_x+32]   # (Memory) Cipher state [8..11]
    mem_s3 = [reg_x+48]   # (Memory) Cipher state [12..15]

    # xmm_v0 allocated above...
    xmm_v1 = XMMRegister()
    xmm_v2 = XMMRegister()
    xmm_v3 = XMMRegister()
    xmm_v4 = XMMRegister()
    xmm_v5 = XMMRegister()
    xmm_v6 = XMMRegister()
    xmm_v7 = XMMRegister()
    xmm_v8 = XMMRegister()
    xmm_v9 = XMMRegister()
    xmm_v10 = XMMRegister()
    xmm_v11 = XMMRegister()
    xmm_v12 = XMMRegister()
    xmm_v13 = XMMRegister()
    xmm_v14 = XMMRegister()
    xmm_v15 = XMMRegister()
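    # xmm_v12 doubles as the scratch register, so the fourth block's "a" row
    # is spilled to mem_tmp0 around the rotate helpers (the Save/Restore
    # pairs in the 4-block round loop below).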
    xmm_tmp = xmm_v12

    #
    # 4 blocks at a time.
    #
    reg_rounds = GeneralPurposeRegister64()
    vector_loop4 = Loop()
    SUB(reg_blocks, 4)
    JB(vector_loop4.end)
    with vector_loop4:
        MOVDQU(xmm_v0, mem_s0)
        MOVDQU(xmm_v1, mem_s1)
        MOVDQU(xmm_v2, mem_s2)
        MOVDQU(xmm_v3, mem_s3)
        MOVDQA(xmm_v4, xmm_v0)
        MOVDQA(xmm_v5, xmm_v1)
        MOVDQA(xmm_v6, xmm_v2)
        MOVDQA(xmm_v7, xmm_v3)
        PADDQ(xmm_v7, mem_one)
        MOVDQA(xmm_v8, xmm_v0)
        MOVDQA(xmm_v9, xmm_v1)
        MOVDQA(xmm_v10, xmm_v2)
        MOVDQA(xmm_v11, xmm_v7)
        PADDQ(xmm_v11, mem_one)
        MOVDQA(xmm_v12, xmm_v0)
        MOVDQA(xmm_v13, xmm_v1)
        MOVDQA(xmm_v14, xmm_v2)
        MOVDQA(xmm_v15, xmm_v11)
        PADDQ(xmm_v15, mem_one)
        MOV(reg_rounds, 20)
        rounds_loop4 = Loop()
        with rounds_loop4:
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)
            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)
            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)
            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
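            # (PSHUFD immediates: 0x39 rotates a row's dwords by one lane
            # (1,2,3,0), 0x4e swaps halves (2,3,0,1), 0x93 rotates by three
            # (3,0,1,2), diagonalizing the state for the second half-round.)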
            PSHUFD(xmm_v1, xmm_v1, 0x39)
            PSHUFD(xmm_v5, xmm_v5, 0x39)
            PSHUFD(xmm_v9, xmm_v9, 0x39)
            PSHUFD(xmm_v13, xmm_v13, 0x39)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x93)
            PSHUFD(xmm_v7, xmm_v7, 0x93)
            PSHUFD(xmm_v11, xmm_v11, 0x93)
            PSHUFD(xmm_v15, xmm_v15, 0x93)
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)
            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)
            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)
            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x93)
            PSHUFD(xmm_v5, xmm_v5, 0x93)
            PSHUFD(xmm_v9, xmm_v9, 0x93)
            PSHUFD(xmm_v13, xmm_v13, 0x93)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x39)
            PSHUFD(xmm_v7, xmm_v7, 0x39)
            PSHUFD(xmm_v11, xmm_v11, 0x39)
            PSHUFD(xmm_v15, xmm_v15, 0x39)
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            SUB(reg_rounds, 2)
            JNZ(rounds_loop4.begin)
        MOVDQA(mem_tmp0, xmm_tmp)
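        # Add the saved state back into each block's working state (the
        # ChaCha feed-forward), bumping the 64-bit block counter in the
        # fourth row by one per block, then XOR the keystream into the input.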
        PADDD(xmm_v0, mem_s0)
        PADDD(xmm_v1, mem_s1)
        PADDD(xmm_v2, mem_s2)
        PADDD(xmm_v3, mem_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
        MOVDQU(xmm_v3, mem_s3)
        PADDQ(xmm_v3, mem_one)
        PADDD(xmm_v4, mem_s0)
        PADDD(xmm_v5, mem_s1)
        PADDD(xmm_v6, mem_s2)
        PADDD(xmm_v7, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
        PADDQ(xmm_v3, mem_one)
        PADDD(xmm_v8, mem_s0)
        PADDD(xmm_v9, mem_s1)
        PADDD(xmm_v10, mem_s2)
        PADDD(xmm_v11, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
        PADDQ(xmm_v3, mem_one)
        MOVDQA(xmm_tmp, mem_tmp0)
        PADDD(xmm_v12, mem_s0)
        PADDD(xmm_v13, mem_s1)
        PADDD(xmm_v14, mem_s2)
        PADDD(xmm_v15, xmm_v3)
        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
        PADDQ(xmm_v3, mem_one)
        MOVDQU(mem_s3, xmm_v3)
        ADD(reg_inp, 4 * 64)
        ADD(reg_outp, 4 * 64)
        SUB(reg_blocks, 4)
        JAE(vector_loop4.begin)

    ADD(reg_blocks, 4)
    out = Label()
    JZ(out)

    # Past this point, we no longer need to use every single register to hold
    # the in-progress state.
    xmm_s0 = xmm_v8
    xmm_s1 = xmm_v9
    xmm_s2 = xmm_v10
    xmm_s3 = xmm_v11
    xmm_one = xmm_v13
    MOVDQU(xmm_s0, mem_s0)
    MOVDQU(xmm_s1, mem_s1)
    MOVDQU(xmm_s2, mem_s2)
    MOVDQU(xmm_s3, mem_s3)
    MOVDQA(xmm_one, mem_one)

    #
    # 2 blocks at a time.
    #
    process_1_block = Label()
    SUB(reg_blocks, 2)
    JB(process_1_block) # < 2 blocks remaining.
    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)
    MOVDQA(xmm_v4, xmm_v0)
    MOVDQA(xmm_v5, xmm_v1)
    MOVDQA(xmm_v6, xmm_v2)
    MOVDQA(xmm_v7, xmm_v3)
    PADDQ(xmm_v7, xmm_one)
    MOV(reg_rounds, 20)
    rounds_loop2 = Loop()
    with rounds_loop2:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)
        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)
        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)
        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v5, xmm_v5, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)
        PSHUFD(xmm_v7, xmm_v7, 0x93)
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)
        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)
        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)
        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v5, xmm_v5, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)
        PSHUFD(xmm_v7, xmm_v7, 0x39)
        SUB(reg_rounds, 2)
        JNZ(rounds_loop2.begin)
    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)
    PADDD(xmm_v4, xmm_s0)
    PADDD(xmm_v5, xmm_s1)
    PADDD(xmm_v6, xmm_s2)
    PADDD(xmm_v7, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
    PADDQ(xmm_s3, xmm_one)
    ADD(reg_inp, 2 * 64)
    ADD(reg_outp, 2 * 64)
    SUB(reg_blocks, 2)

    LABEL(process_1_block)
    ADD(reg_blocks, 2)
    out_serial = Label()
    JZ(out_serial)

    #
    # 1 block at a time. Only executed once, because if there were more than
    # one block remaining, the parallel code paths would have processed them
    # already.
    #
    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)
    MOV(reg_rounds, 20)
    rounds_loop1 = Loop()
    with rounds_loop1:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)
        SUB(reg_rounds, 2)
        JNZ(rounds_loop1.begin)
    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    LABEL(out_serial)
    # Write back the updated counter. Stopping at 2^70 bytes is the user's
    # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks
    # because the counter is incremented in memory while looping.)
    MOVDQU(mem_s3, xmm_s3)

    LABEL(out)
    # Paranoia, cleanse the scratch space.
    PXOR(xmm_v0, xmm_v0)
    MOVDQA(mem_tmp0, xmm_v0)
    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)
    RETURN()

#
# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit,
# and more helpers are used to increase readability for destructive operations.
#
# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
#
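# A byte-shuffle based rotate would look something like the hypothetical
# sketch below (not used here); VPSHUFB needs a constant mask, e.g. one that
# rotates the bytes of every dword by two positions for the 16-bit rotate:
#
#   def ROTW16_avx2_shufb(mask16, d):
#       VPSHUFB(d, d, mask16)  # mask16 = 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 (per lane)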
def ADD_avx2(dst, src):
    VPADDD(dst, dst, src)

def XOR_avx2(dst, src):
    VPXOR(dst, dst, src)

def ROTW16_avx2(tmp, d):
    VPSLLD(tmp, d, 16)
    VPSRLD(d, d, 16)
    XOR_avx2(d, tmp)

def ROTW12_avx2(tmp, b):
    VPSLLD(tmp, b, 12)
    VPSRLD(b, b, 20)
    XOR_avx2(b, tmp)

def ROTW8_avx2(tmp, d):
    VPSLLD(tmp, d, 8)
    VPSRLD(d, d, 24)
    XOR_avx2(d, tmp)

def ROTW7_avx2(tmp, b):
    VPSLLD(tmp, b, 7)
    VPSRLD(b, b, 25)
    XOR_avx2(b, tmp)
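# In the AVX2 code each ymm register holds one state row for two consecutive
# blocks (block N in the low 128-bit lane, block N+1 in the high lane), so
# WriteXor_avx2 uses VPERM2I128 to regroup the rows into two sequential
# 64-byte blocks before XORing them against the input.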
def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
    VPERM2I128(tmp, v0, v1, 0x20)
    VPXOR(tmp, tmp, [inp+d])
    VMOVDQU([outp+d], tmp)
    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
    VPERM2I128(tmp, v2, v3, 0x20)
    VPXOR(tmp, tmp, [inp+d+32])
    VMOVDQU([outp+d+32], tmp)
    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
    VPERM2I128(tmp, v0, v1, 0x31)
    VPXOR(tmp, tmp, [inp+d+64])
    VMOVDQU([outp+d+64], tmp)
    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
    VPERM2I128(tmp, v2, v3, 0x31)
    VPXOR(tmp, tmp, [inp+d+96])
    VMOVDQU([outp+d+96], tmp)

# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, and will process
# 8/4/2/1 blocks at a time.
with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 0x20)

    x_s0 = [reg_x]      # (Memory) Cipher state [0..3]
    x_s1 = [reg_x+16]   # (Memory) Cipher state [4..7]
    x_s2 = [reg_x+32]   # (Memory) Cipher state [8..11]
    x_s3 = [reg_x+48]   # (Memory) Cipher state [12..15]

    ymm_v0 = YMMRegister()
    ymm_v1 = YMMRegister()
    ymm_v2 = YMMRegister()
    ymm_v3 = YMMRegister()
    ymm_v4 = YMMRegister()
    ymm_v5 = YMMRegister()
    ymm_v6 = YMMRegister()
    ymm_v7 = YMMRegister()
    ymm_v8 = YMMRegister()
    ymm_v9 = YMMRegister()
    ymm_v10 = YMMRegister()
    ymm_v11 = YMMRegister()
    ymm_v12 = YMMRegister()
    ymm_v13 = YMMRegister()
    ymm_v14 = YMMRegister()
    ymm_v15 = YMMRegister()
    ymm_tmp0 = ymm_v12

    # Allocate the necessary stack space for the counter vector and two ymm
    # registers that we will spill.
    SUB(registers.rsp, 96)
    mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space.
    mem_s3 = [registers.rsp+32]   # (Stack) Working copy of s3. (8x)
    mem_inc = [registers.rsp]     # (Stack) Counter increment vector.

    # Increment the counter for one side of the state vector.
    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
    VMOVDQU(mem_inc, ymm_tmp0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp+16], reg_tmp)
    VBROADCASTI128(ymm_v3, x_s3)
    VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
    VMOVDQA(mem_s3, ymm_v3)

    # Since we process 2xN blocks at a time, the counter increment for both
    # sides of the state vector is 2.
    MOV(reg_tmp, 0x00000002)
    MOV([registers.rsp], reg_tmp)
    MOV([registers.rsp+16], reg_tmp)
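    # mem_inc now holds +2 in the low qword of each 128-bit lane, while the
    # VPADDQ above already offset the high lane's counter by +1, so the two
    # lanes of every ymm row track consecutive block counters.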
    out_write_even = Label()
    out_write_odd = Label()

    #
    # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's
    # a decent gain despite all the pain...
    #
    reg_rounds = GeneralPurposeRegister64()
    vector_loop8 = Loop()
    SUB(reg_blocks, 8)
    JB(vector_loop8.end)
    with vector_loop8:
        VBROADCASTI128(ymm_v0, x_s0)
        VBROADCASTI128(ymm_v1, x_s1)
        VBROADCASTI128(ymm_v2, x_s2)
        VMOVDQA(ymm_v3, mem_s3)
        VMOVDQA(ymm_v4, ymm_v0)
        VMOVDQA(ymm_v5, ymm_v1)
        VMOVDQA(ymm_v6, ymm_v2)
        VPADDQ(ymm_v7, ymm_v3, mem_inc)
        VMOVDQA(ymm_v8, ymm_v0)
        VMOVDQA(ymm_v9, ymm_v1)
        VMOVDQA(ymm_v10, ymm_v2)
        VPADDQ(ymm_v11, ymm_v7, mem_inc)
        VMOVDQA(ymm_v12, ymm_v0)
        VMOVDQA(ymm_v13, ymm_v1)
        VMOVDQA(ymm_v14, ymm_v2)
        VPADDQ(ymm_v15, ymm_v11, mem_inc)
        MOV(reg_rounds, 20)
        rounds_loop8 = Loop()
        with rounds_loop8:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)
            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)
            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)
            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v5, ymm_v5, 0x39)
            VPSHUFD(ymm_v9, ymm_v9, 0x39)
            VPSHUFD(ymm_v13, ymm_v13, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            VPSHUFD(ymm_v7, ymm_v7, 0x93)
            VPSHUFD(ymm_v11, ymm_v11, 0x93)
            VPSHUFD(ymm_v15, ymm_v15, 0x93)
            # a += b; d ^= a; d = ROTW16(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)
            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)
            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)
            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v5, ymm_v5, 0x93)
            VPSHUFD(ymm_v9, ymm_v9, 0x93)
            VPSHUFD(ymm_v13, ymm_v13, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            VPSHUFD(ymm_v7, ymm_v7, 0x39)
            VPSHUFD(ymm_v11, ymm_v11, 0x39)
            VPSHUFD(ymm_v15, ymm_v15, 0x39)
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            SUB(reg_rounds, 2)
            JNZ(rounds_loop8.begin)
        # ymm_v12 is in mem_tmp0 and is current....
        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
        VBROADCASTI128(ymm_tmp0, x_s0)
        ADD_avx2(ymm_v0, ymm_tmp0)
        ADD_avx2(ymm_v4, ymm_tmp0)
        ADD_avx2(ymm_v8, ymm_tmp0)
        ADD_avx2(ymm_tmp0, mem_tmp0)
        VMOVDQA(mem_tmp0, ymm_tmp0)
        VBROADCASTI128(ymm_tmp0, x_s1)
        ADD_avx2(ymm_v1, ymm_tmp0)
        ADD_avx2(ymm_v5, ymm_tmp0)
        ADD_avx2(ymm_v9, ymm_tmp0)
        ADD_avx2(ymm_v13, ymm_tmp0)
        VBROADCASTI128(ymm_tmp0, x_s2)
        ADD_avx2(ymm_v2, ymm_tmp0)
        ADD_avx2(ymm_v6, ymm_tmp0)
        ADD_avx2(ymm_v10, ymm_tmp0)
        ADD_avx2(ymm_v14, ymm_tmp0)
        ADD_avx2(ymm_v3, mem_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
        VMOVDQA(ymm_v3, mem_s3)
        ADD_avx2(ymm_v3, mem_inc)
        ADD_avx2(ymm_v7, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
        ADD_avx2(ymm_v3, mem_inc)
        ADD_avx2(ymm_v11, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
        ADD_avx2(ymm_v3, mem_inc)
        VMOVDQA(ymm_v12, mem_tmp0)
        ADD_avx2(ymm_v15, ymm_v3)
        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
        ADD_avx2(ymm_v3, mem_inc)
        VMOVDQA(mem_s3, ymm_v3)
        ADD(reg_inp, 8 * 64)
        ADD(reg_outp, 8 * 64)
        SUB(reg_blocks, 8)
        JAE(vector_loop8.begin)

    # ymm_v3 contains a current copy of mem_s3 either from when it was built,
    # or because the loop updates it. Copy this before we mess with the block
    # counter in case we need to write it back and return.
    ymm_s3 = ymm_v11
    VMOVDQA(ymm_s3, ymm_v3)
    ADD(reg_blocks, 8)
    JZ(out_write_even)

    # We now actually can do everything in registers.
    ymm_s0 = ymm_v8
    VBROADCASTI128(ymm_s0, x_s0)
    ymm_s1 = ymm_v9
    VBROADCASTI128(ymm_s1, x_s1)
    ymm_s2 = ymm_v10
    VBROADCASTI128(ymm_s2, x_s2)
    ymm_inc = ymm_v14
    VMOVDQA(ymm_inc, mem_inc)

    #
    # 4 blocks at a time.
    #
    process_2_blocks = Label()
    SUB(reg_blocks, 4)
    JB(process_2_blocks) # < 4 blocks remaining.
    VMOVDQA(ymm_v0, ymm_s0)
    VMOVDQA(ymm_v1, ymm_s1)
    VMOVDQA(ymm_v2, ymm_s2)
    VMOVDQA(ymm_v3, ymm_s3)
    VMOVDQA(ymm_v4, ymm_v0)
    VMOVDQA(ymm_v5, ymm_v1)
    VMOVDQA(ymm_v6, ymm_v2)
    VPADDQ(ymm_v7, ymm_v3, ymm_inc)
    MOV(reg_rounds, 20)
    rounds_loop4 = Loop()
    with rounds_loop4:
        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)
        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)
        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)
        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x39)
        VPSHUFD(ymm_v5, ymm_v5, 0x39)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x93)
        VPSHUFD(ymm_v7, ymm_v7, 0x93)
        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)
        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)
        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)
        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)
        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x93)
        VPSHUFD(ymm_v5, ymm_v5, 0x93)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x39)
        VPSHUFD(ymm_v7, ymm_v7, 0x39)
        SUB(reg_rounds, 2)
        JNZ(rounds_loop4.begin)
    ADD_avx2(ymm_v0, ymm_s0)
    ADD_avx2(ymm_v1, ymm_s1)
    ADD_avx2(ymm_v2, ymm_s2)
    ADD_avx2(ymm_v3, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
    ADD_avx2(ymm_s3, ymm_inc)
    ADD_avx2(ymm_v4, ymm_s0)
    ADD_avx2(ymm_v5, ymm_s1)
    ADD_avx2(ymm_v6, ymm_s2)
    ADD_avx2(ymm_v7, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
    ADD_avx2(ymm_s3, ymm_inc)
    ADD(reg_inp, 4 * 64)
    ADD(reg_outp, 4 * 64)
    SUB(reg_blocks, 4)

    LABEL(process_2_blocks)
    ADD(reg_blocks, 4)
    JZ(out_write_even) # 0 blocks left.

    #
    # 2/1 blocks at a time. The two codepaths are unified because
    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
    # much.
    #
    vector_loop2 = Loop()
    with vector_loop2:
        VMOVDQA(ymm_v0, ymm_s0)
        VMOVDQA(ymm_v1, ymm_s1)
        VMOVDQA(ymm_v2, ymm_s2)
        VMOVDQA(ymm_v3, ymm_s3)
        MOV(reg_rounds, 20)
        rounds_loop2 = Loop()
        with rounds_loop2:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            SUB(reg_rounds, 2)
            JNZ(rounds_loop2.begin)
        ADD_avx2(ymm_v0, ymm_s0)
        ADD_avx2(ymm_v1, ymm_s1)
        ADD_avx2(ymm_v2, ymm_s2)
        ADD_avx2(ymm_v3, ymm_s3)
        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
        VMOVDQU([reg_outp], ymm_tmp0)
        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
        VMOVDQU([reg_outp+32], ymm_tmp0)
        SUB(reg_blocks, 1)
        JZ(out_write_odd)
        ADD_avx2(ymm_s3, ymm_inc)
        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
        VMOVDQU([reg_outp+64], ymm_tmp0)
        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
        VMOVDQU([reg_outp+96], ymm_tmp0)
        SUB(reg_blocks, 1)
        JZ(out_write_even)
        ADD(reg_inp, 2 * 64)
        ADD(reg_outp, 2 * 64)
        JMP(vector_loop2.begin)

    LABEL(out_write_odd)
    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
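    # (Swapping the 128-bit lanes moves the high lane's counter, which is one
    # block ahead, into the low lane so the correct next counter gets written
    # back below.)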
    LABEL(out_write_even)
    VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_s3
    # Paranoia, cleanse the scratch space.
    VPXOR(ymm_v0, ymm_v0, ymm_v0)
    VMOVDQA(mem_tmp0, ymm_v0)
    VMOVDQA(mem_s3, ymm_v0)
    # Clear all YMM (and XMM) registers.
    VZEROALL()
    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)
    RETURN()

#
# CPUID
#
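# cpuidParams points at a uint32[4]: on entry element [0] holds the leaf
# (EAX) and [2] the sub-leaf (ECX); on return the four elements receive
# EAX, EBX, ECX, and EDX.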
cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAmd64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)
    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])
    CPUID()
    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)
    RETURN()

#
# XGETBV (ECX = 0)
#
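# With ECX = 0, XGETBV returns XCR0 in EDX:EAX, which the Go caller
# presumably uses to confirm that the OS saves/restores the XMM/YMM state
# before selecting the AVX2 code path.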
xcrVec = Argument(ptr(uint32_t))

with Function("xgetbv0Amd64", (xcrVec,)):
    reg_vec = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_vec, xcrVec)
    XOR(registers.ecx, registers.ecx)
    XGETBV()
    MOV([reg_vec], registers.eax)
    MOV([reg_vec+4], registers.edx)
    RETURN()