#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to chacha20, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
#
# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
# The least terrible option is to use a Python code generator, so that's
# what I did.
#
# Code based on Ted Krovetz's vec128 C implementation, with corrections
# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
# output pointers.
#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
#
from peachpy import *
from peachpy.x86_64 import *

x = Argument(ptr(uint32_t))
inp = Argument(ptr(const_uint8_t))
outp = Argument(ptr(uint8_t))
nrBlocks = Argument(ptr(size_t))

#
# SSE2 helper functions. A temporary register is explicitly passed in because
# the main fast loop uses every single register (and even spills) so manual
# control is needed.
#
# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
# in the C code, but the C code has the luxury of an optimizer reordering
# everything, while this does not.
#
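# Each ROTWn_sse2 helper rotates every 32 bit lane of its operand left by n
# bits: the scratch register takes the lanes shifted left by n, the operand is
# shifted right by 32 - n, and the PXOR recombines them (the two halves are
# disjoint, so XOR is equivalent to OR here).
#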
def ROTW16_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 16)
    PSRLD(d, 16)
    PXOR(d, tmp)

def ROTW12_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 12)
    PSRLD(b, 20)
    PXOR(b, tmp)

def ROTW8_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 8)
    PSRLD(d, 24)
    PXOR(d, tmp)

def ROTW7_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 7)
    PSRLD(b, 25)
    PXOR(b, tmp)
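
# WriteXor_sse2: XOR 64 bytes of keystream (v0..v3) against the input at byte
# offset d and write the result to the output at the same offset, using
# unaligned loads and stores so neither pointer needs any particular alignment.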
def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
    MOVDQU(tmp, [inp+d])
    PXOR(tmp, v0)
    MOVDQU([outp+d], tmp)
    MOVDQU(tmp, [inp+d+16])
    PXOR(tmp, v1)
    MOVDQU([outp+d+16], tmp)
    MOVDQU(tmp, [inp+d+32])
    PXOR(tmp, v2)
    MOVDQU([outp+d+32], tmp)
    MOVDQU(tmp, [inp+d+48])
    PXOR(tmp, v3)
    MOVDQU([outp+d+48], tmp)

# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will
# process 4/2/1 blocks at a time. x (the ChaCha20 state) must be 16 byte
# aligned.
with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    reg_align = GeneralPurposeRegister64()
    MOV(reg_sp_save, registers.rsp)
    MOV(reg_align, 0x1f)
    NOT(reg_align)
    AND(registers.rsp, reg_align)
    SUB(registers.rsp, 0x20)
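    # reg_align ends up as ~0x1f, so the AND rounds rsp down to a 32 byte
    # boundary; the SUB then reserves a further 32 bytes below that boundary.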
    # Build the counter increment vector on the stack, and allocate the
    # scratch space.
    xmm_v0 = XMMRegister()
    PXOR(xmm_v0, xmm_v0)
    SUB(registers.rsp, 16+16)
    MOVDQA([registers.rsp], xmm_v0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp], reg_tmp)
    mem_one = [registers.rsp]     # (Stack) Counter increment vector
    mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.

    mem_s0 = [reg_x]              # (Memory) Cipher state [0..3]
    mem_s1 = [reg_x+16]           # (Memory) Cipher state [4..7]
    mem_s2 = [reg_x+32]           # (Memory) Cipher state [8..11]
    mem_s3 = [reg_x+48]           # (Memory) Cipher state [12..15]
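    # mem_one holds {1, 0} as two 64 bit lanes, so PADDQ against it bumps the
    # 64 bit block counter in state words 12..13 by one without disturbing the
    # nonce in words 14..15.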
    # xmm_v0 allocated above...
    xmm_v1 = XMMRegister()
    xmm_v2 = XMMRegister()
    xmm_v3 = XMMRegister()
    xmm_v4 = XMMRegister()
    xmm_v5 = XMMRegister()
    xmm_v6 = XMMRegister()
    xmm_v7 = XMMRegister()
    xmm_v8 = XMMRegister()
    xmm_v9 = XMMRegister()
    xmm_v10 = XMMRegister()
    xmm_v11 = XMMRegister()
    xmm_v12 = XMMRegister()
    xmm_v13 = XMMRegister()
    xmm_v14 = XMMRegister()
    xmm_v15 = XMMRegister()

    xmm_tmp = xmm_v12

    #
    # 4 blocks at a time.
    #
    vector_loop4 = Loop()
    SUB(reg_blocks, 4)
    JB(vector_loop4.end)
    with vector_loop4:
        MOVDQA(xmm_v0, mem_s0)
        MOVDQA(xmm_v1, mem_s1)
        MOVDQA(xmm_v2, mem_s2)
        MOVDQA(xmm_v3, mem_s3)

        MOVDQA(xmm_v4, xmm_v0)
        MOVDQA(xmm_v5, xmm_v1)
        MOVDQA(xmm_v6, xmm_v2)
        MOVDQA(xmm_v7, xmm_v3)
        PADDQ(xmm_v7, mem_one)

        MOVDQA(xmm_v8, xmm_v0)
        MOVDQA(xmm_v9, xmm_v1)
        MOVDQA(xmm_v10, xmm_v2)
        MOVDQA(xmm_v11, xmm_v7)
        PADDQ(xmm_v11, mem_one)

        MOVDQA(xmm_v12, xmm_v0)
        MOVDQA(xmm_v13, xmm_v1)
        MOVDQA(xmm_v14, xmm_v2)
        MOVDQA(xmm_v15, xmm_v11)
        PADDQ(xmm_v15, mem_one)
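        # xmm_v0..v3 hold block 0, v4..v7 block 1, v8..v11 block 2, and
        # v12..v15 block 3; each successive block's row 3 has had the counter
        # bumped by one more via PADDQ against mem_one.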
        reg_rounds = GeneralPurposeRegister64()
        MOV(reg_rounds, 20)
        rounds_loop4 = Loop()
        with rounds_loop4:
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
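            # xmm_tmp aliases xmm_v12 (block 3, row 0), so its live value is
            # spilled to the stack while it serves as the rotate scratch
            # register and reloaded before the next step that reads v12.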
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x39)
            PSHUFD(xmm_v5, xmm_v5, 0x39)
            PSHUFD(xmm_v9, xmm_v9, 0x39)
            PSHUFD(xmm_v13, xmm_v13, 0x39)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x93)
            PSHUFD(xmm_v7, xmm_v7, 0x93)
            PSHUFD(xmm_v11, xmm_v11, 0x93)
            PSHUFD(xmm_v15, xmm_v15, 0x93)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x93)
            PSHUFD(xmm_v5, xmm_v5, 0x93)
            PSHUFD(xmm_v9, xmm_v9, 0x93)
            PSHUFD(xmm_v13, xmm_v13, 0x93)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x39)
            PSHUFD(xmm_v7, xmm_v7, 0x39)
            PSHUFD(xmm_v11, xmm_v11, 0x39)
            PSHUFD(xmm_v15, xmm_v15, 0x39)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop4.begin)
        MOVDQA(mem_tmp0, xmm_tmp)

        PADDD(xmm_v0, mem_s0)
        PADDD(xmm_v1, mem_s1)
        PADDD(xmm_v2, mem_s2)
        PADDD(xmm_v3, mem_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
        MOVDQA(xmm_v3, mem_s3)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v4, mem_s0)
        PADDD(xmm_v5, mem_s1)
        PADDD(xmm_v6, mem_s2)
        PADDD(xmm_v7, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v8, mem_s0)
        PADDD(xmm_v9, mem_s1)
        PADDD(xmm_v10, mem_s2)
        PADDD(xmm_v11, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
        PADDQ(xmm_v3, mem_one)
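        # Block 3: xmm_tmp aliases xmm_v12, so reload its post-rounds value
        # from the stack and use xmm_v0 (already written out) as the scratch
        # register for the final WriteXor.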
        MOVDQA(xmm_tmp, mem_tmp0)
        PADDD(xmm_v12, mem_s0)
        PADDD(xmm_v13, mem_s1)
        PADDD(xmm_v14, mem_s2)
        PADDD(xmm_v15, xmm_v3)
        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
        PADDQ(xmm_v3, mem_one)

        MOVDQA(mem_s3, xmm_v3)

        ADD(reg_inp, 4 * 64)
        ADD(reg_outp, 4 * 64)

        SUB(reg_blocks, 4)
        JAE(vector_loop4.begin)

    ADD(reg_blocks, 4)
    out = Label()
    JZ(out)
    # Past this point, we no longer need to use every single register to hold
    # the in progress state.
    xmm_s0 = xmm_v8
    xmm_s1 = xmm_v9
    xmm_s2 = xmm_v10
    xmm_s3 = xmm_v11
    xmm_one = xmm_v13
    MOVDQA(xmm_s0, mem_s0)
    MOVDQA(xmm_s1, mem_s1)
    MOVDQA(xmm_s2, mem_s2)
    MOVDQA(xmm_s3, mem_s3)
    MOVDQA(xmm_one, mem_one)

    #
    # 2 blocks at a time.
    #
    SUB(reg_blocks, 2)
    vector_loop2 = Loop()
    JB(vector_loop2.end)
    with vector_loop2:
        MOVDQA(xmm_v0, xmm_s0)
        MOVDQA(xmm_v1, xmm_s1)
        MOVDQA(xmm_v2, xmm_s2)
        MOVDQA(xmm_v3, xmm_s3)

        MOVDQA(xmm_v4, xmm_v0)
        MOVDQA(xmm_v5, xmm_v1)
        MOVDQA(xmm_v6, xmm_v2)
        MOVDQA(xmm_v7, xmm_v3)
        PADDQ(xmm_v7, xmm_one)

        reg_rounds = GeneralPurposeRegister64()
        MOV(reg_rounds, 20)
        rounds_loop2 = Loop()
        with rounds_loop2:
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)

            # a += b; d ^= a; d = ROTW8(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x39)
            PSHUFD(xmm_v5, xmm_v5, 0x39)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x93)
            PSHUFD(xmm_v7, xmm_v7, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)

            # a += b; d ^= a; d = ROTW8(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x93)
            PSHUFD(xmm_v5, xmm_v5, 0x93)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x39)
            PSHUFD(xmm_v7, xmm_v7, 0x39)

            SUB(reg_rounds, 2)
            JNZ(rounds_loop2.begin)
        PADDD(xmm_v0, xmm_s0)
        PADDD(xmm_v1, xmm_s1)
        PADDD(xmm_v2, xmm_s2)
        PADDD(xmm_v3, xmm_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
        PADDQ(xmm_s3, xmm_one)

        PADDD(xmm_v4, xmm_s0)
        PADDD(xmm_v5, xmm_s1)
        PADDD(xmm_v6, xmm_s2)
        PADDD(xmm_v7, xmm_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
        PADDQ(xmm_s3, xmm_one)

        ADD(reg_inp, 2 * 64)
        ADD(reg_outp, 2 * 64)

        SUB(reg_blocks, 2)
        JAE(vector_loop2.begin)

    ADD(reg_blocks, 2)
    out_serial = Label()
    JZ(out_serial)
    #
    # 1 block at a time. Only executed once, because if there was > 1,
    # the parallel code would have processed it already.
    #
    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)

    reg_rounds = GeneralPurposeRegister64()
    MOV(reg_rounds, 20)
    rounds_loop1 = Loop()
    with rounds_loop1:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop1.begin)

    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    LABEL(out_serial)
    # Write back the updated counter. Stopping at 2^70 bytes is the user's
    # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks
    # because the counter is incremented in memory while looping.)
    MOVDQA(mem_s3, xmm_s3)

    LABEL(out)

    # Paranoia, cleanse the scratch space.
    PXOR(xmm_v0, xmm_v0)
    MOVDQA(mem_tmp0, xmm_v0)

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit,
# and more helpers are used to increase readability for destructive operations.
#
# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
#
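# A rough sketch of what that could look like (not wired in; the shuffle mask
# below is my own note, not something this file defines). Per 16 byte lane the
# 16 bit rotate mask would be 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13, kept
# in a spare register or on the stack:
#
#   def ROTW16_shufb_avx2(mask, d):
#       VPSHUFB(d, d, mask)
#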
def ADD_avx2(dst, src):
    VPADDD(dst, dst, src)

def XOR_avx2(dst, src):
    VPXOR(dst, dst, src)

def ROTW16_avx2(tmp, d):
    VPSLLD(tmp, d, 16)
    VPSRLD(d, d, 16)
    XOR_avx2(d, tmp)

def ROTW12_avx2(tmp, b):
    VPSLLD(tmp, b, 12)
    VPSRLD(b, b, 20)
    XOR_avx2(b, tmp)

def ROTW8_avx2(tmp, d):
    VPSLLD(tmp, d, 8)
    VPSRLD(d, d, 24)
    XOR_avx2(d, tmp)

def ROTW7_avx2(tmp, b):
    VPSLLD(tmp, b, 7)
    VPSRLD(b, b, 25)
    XOR_avx2(b, tmp)
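
# In the AVX2 code paths each ymm register holds one row of two consecutive
# blocks: the even block in the low 128 bit lane, the odd block in the high
# lane. WriteXor_avx2 regroups the rows with VPERM2I128 into two contiguous
# 64 byte blocks before XORing 128 bytes of input into the output.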
def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
    VPERM2I128(tmp, v0, v1, 0x20)
    VPXOR(tmp, tmp, [inp+d])
    VMOVDQU([outp+d], tmp)

    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
    VPERM2I128(tmp, v2, v3, 0x20)
    VPXOR(tmp, tmp, [inp+d+32])
    VMOVDQU([outp+d+32], tmp)

    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
    VPERM2I128(tmp, v0, v1, 0x31)
    VPXOR(tmp, tmp, [inp+d+64])
    VMOVDQU([outp+d+64], tmp)

    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
    VPERM2I128(tmp, v2, v3, 0x31)
    VPXOR(tmp, tmp, [inp+d+96])
    VMOVDQU([outp+d+96], tmp)

# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, will process
# 8/4/2 blocks at a time. Same alignment requirement as the SSE2 version:
# x (the ChaCha20 state) must be 16 byte aligned.
with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    reg_align = GeneralPurposeRegister64()
    MOV(reg_sp_save, registers.rsp)
    MOV(reg_align, 0x1f)
    NOT(reg_align)
    AND(registers.rsp, reg_align)
    SUB(registers.rsp, 0x20)
    x_s0 = [reg_x]    # (Memory) Cipher state [0..3]
    x_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
    x_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
    x_s3 = [reg_x+48] # (Memory) Cipher state [12..15]

    ymm_v0 = YMMRegister()
    ymm_v1 = YMMRegister()
    ymm_v2 = YMMRegister()
    ymm_v3 = YMMRegister()
    ymm_v4 = YMMRegister()
    ymm_v5 = YMMRegister()
    ymm_v6 = YMMRegister()
    ymm_v7 = YMMRegister()
    ymm_v8 = YMMRegister()
    ymm_v9 = YMMRegister()
    ymm_v10 = YMMRegister()
    ymm_v11 = YMMRegister()
    ymm_v12 = YMMRegister()
    ymm_v13 = YMMRegister()
    ymm_v14 = YMMRegister()
    ymm_v15 = YMMRegister()

    ymm_tmp0 = ymm_v12
    # Allocate the necessary stack space for the counter vector and two ymm
    # registers that we will spill.
    SUB(registers.rsp, 96)
    mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space.
    mem_s3 = [registers.rsp+32]   # (Stack) Working copy of s3. (8x)
    mem_inc = [registers.rsp]     # (Stack) Counter increment vector.
    # Increment the counter for one side of the state vector.
    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
    VMOVDQU(mem_inc, ymm_tmp0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp+16], reg_tmp)
    VBROADCASTI128(ymm_v3, x_s3)
    VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
    VMOVDQA(mem_s3, ymm_v3)
    # As we process 2xN blocks at a time, the counter increment for both
    # sides of the state vector is 2.
    MOV(reg_tmp, 0x00000002)
    MOV([registers.rsp], reg_tmp)
    MOV([registers.rsp+16], reg_tmp)
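    # Layout note: each ymm register holds the same row of two consecutive
    # blocks (even block in the low lane, odd block in the high lane), so the
    # working s3 above already carries counter and counter+1, and the per-lane
    # increment in mem_inc is 2 per block pair.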
    out_write_even = Label()
    out_write_odd = Label()

    #
    # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's
    # a decent gain despite all the pain...
    #
    vector_loop8 = Loop()
    SUB(reg_blocks, 8)
    JB(vector_loop8.end)
    with vector_loop8:
        VBROADCASTI128(ymm_v0, x_s0)
        VBROADCASTI128(ymm_v1, x_s1)
        VBROADCASTI128(ymm_v2, x_s2)
        VMOVDQA(ymm_v3, mem_s3)

        VMOVDQA(ymm_v4, ymm_v0)
        VMOVDQA(ymm_v5, ymm_v1)
        VMOVDQA(ymm_v6, ymm_v2)
        VPADDQ(ymm_v7, ymm_v3, mem_inc)

        VMOVDQA(ymm_v8, ymm_v0)
        VMOVDQA(ymm_v9, ymm_v1)
        VMOVDQA(ymm_v10, ymm_v2)
        VPADDQ(ymm_v11, ymm_v7, mem_inc)

        VMOVDQA(ymm_v12, ymm_v0)
        VMOVDQA(ymm_v13, ymm_v1)
        VMOVDQA(ymm_v14, ymm_v2)
        VPADDQ(ymm_v15, ymm_v11, mem_inc)

        reg_rounds = GeneralPurposeRegister64()
        MOV(reg_rounds, 20)
        rounds_loop8 = Loop()
        with rounds_loop8:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v5, ymm_v5, 0x39)
            VPSHUFD(ymm_v9, ymm_v9, 0x39)
            VPSHUFD(ymm_v13, ymm_v13, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            VPSHUFD(ymm_v7, ymm_v7, 0x93)
            VPSHUFD(ymm_v11, ymm_v11, 0x93)
            VPSHUFD(ymm_v15, ymm_v15, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v5, ymm_v5, 0x93)
            VPSHUFD(ymm_v9, ymm_v9, 0x93)
            VPSHUFD(ymm_v13, ymm_v13, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            VPSHUFD(ymm_v7, ymm_v7, 0x39)
            VPSHUFD(ymm_v11, ymm_v11, 0x39)
            VPSHUFD(ymm_v15, ymm_v15, 0x39)

            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop8.begin)
        # ymm_v12 is in mem_tmp0 and is current....
        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
        VBROADCASTI128(ymm_tmp0, x_s0)
        ADD_avx2(ymm_v0, ymm_tmp0)
        ADD_avx2(ymm_v4, ymm_tmp0)
        ADD_avx2(ymm_v8, ymm_tmp0)
        ADD_avx2(ymm_tmp0, mem_tmp0)
        VMOVDQA(mem_tmp0, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s1)
        ADD_avx2(ymm_v1, ymm_tmp0)
        ADD_avx2(ymm_v5, ymm_tmp0)
        ADD_avx2(ymm_v9, ymm_tmp0)
        ADD_avx2(ymm_v13, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s2)
        ADD_avx2(ymm_v2, ymm_tmp0)
        ADD_avx2(ymm_v6, ymm_tmp0)
        ADD_avx2(ymm_v10, ymm_tmp0)
        ADD_avx2(ymm_v14, ymm_tmp0)

        ADD_avx2(ymm_v3, mem_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
        VMOVDQA(ymm_v3, mem_s3)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v7, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v11, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(ymm_v12, mem_tmp0)
        ADD_avx2(ymm_v15, ymm_v3)
        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(mem_s3, ymm_v3)

        ADD(reg_inp, 8 * 64)
        ADD(reg_outp, 8 * 64)

        SUB(reg_blocks, 8)
        JAE(vector_loop8.begin)
    # ymm_v3 contains a current copy of mem_s3 either from when it was built,
    # or because the loop updates it. Copy this before we mess with the block
    # counter in case we need to write it back and return.
    ymm_s3 = ymm_v11
    VMOVDQA(ymm_s3, ymm_v3)

    ADD(reg_blocks, 8)
    JZ(out_write_even)

    # We now actually can do everything in registers.
    ymm_s0 = ymm_v8
    VBROADCASTI128(ymm_s0, x_s0)
    ymm_s1 = ymm_v9
    VBROADCASTI128(ymm_s1, x_s1)
    ymm_s2 = ymm_v10
    VBROADCASTI128(ymm_s2, x_s2)
    ymm_inc = ymm_v14
    VMOVDQA(ymm_inc, mem_inc)

    #
    # 4 blocks at a time.
    #
    SUB(reg_blocks, 4)
    vector_loop4 = Loop()
    JB(vector_loop4.end)
    with vector_loop4:
        VMOVDQA(ymm_v0, ymm_s0)
        VMOVDQA(ymm_v1, ymm_s1)
        VMOVDQA(ymm_v2, ymm_s2)
        VMOVDQA(ymm_v3, ymm_s3)

        VMOVDQA(ymm_v4, ymm_v0)
        VMOVDQA(ymm_v5, ymm_v1)
        VMOVDQA(ymm_v6, ymm_v2)
        VPADDQ(ymm_v7, ymm_v3, ymm_inc)

        reg_rounds = GeneralPurposeRegister64()
        MOV(reg_rounds, 20)
        rounds_loop4 = Loop()
        with rounds_loop4:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v5, ymm_v5, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            VPSHUFD(ymm_v7, ymm_v7, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v5, ymm_v5, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            VPSHUFD(ymm_v7, ymm_v7, 0x39)

            SUB(reg_rounds, 2)
            JNZ(rounds_loop4.begin)
        ADD_avx2(ymm_v0, ymm_s0)
        ADD_avx2(ymm_v1, ymm_s1)
        ADD_avx2(ymm_v2, ymm_s2)
        ADD_avx2(ymm_v3, ymm_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
        ADD_avx2(ymm_s3, ymm_inc)

        ADD_avx2(ymm_v4, ymm_s0)
        ADD_avx2(ymm_v5, ymm_s1)
        ADD_avx2(ymm_v6, ymm_s2)
        ADD_avx2(ymm_v7, ymm_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
        ADD_avx2(ymm_s3, ymm_inc)

        ADD(reg_inp, 4 * 64)
        ADD(reg_outp, 4 * 64)

        SUB(reg_blocks, 4)
        JAE(vector_loop4.begin)

    ADD(reg_blocks, 4)
    JZ(out_write_even)
    #
    # 2/1 blocks at a time. The two codepaths are unified because
    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
    # much.
    #
    vector_loop2 = Loop()
    with vector_loop2:
        VMOVDQA(ymm_v0, ymm_s0)
        VMOVDQA(ymm_v1, ymm_s1)
        VMOVDQA(ymm_v2, ymm_s2)
        VMOVDQA(ymm_v3, ymm_s3)

        reg_rounds = GeneralPurposeRegister64()
        MOV(reg_rounds, 20)
        rounds_loop2 = Loop()
        with rounds_loop2:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)

            SUB(reg_rounds, 2)
            JNZ(rounds_loop2.begin)
        ADD_avx2(ymm_v0, ymm_s0)
        ADD_avx2(ymm_v1, ymm_s1)
        ADD_avx2(ymm_v2, ymm_s2)
        ADD_avx2(ymm_v3, ymm_s3)

        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
        VMOVDQU([reg_outp], ymm_tmp0)

        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
        VMOVDQU([reg_outp+32], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_odd)

        ADD_avx2(ymm_s3, ymm_inc)

        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
        VMOVDQU([reg_outp+64], ymm_tmp0)

        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
        VMOVDQU([reg_outp+96], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_even)

        ADD(reg_inp, 2 * 64)
        ADD(reg_outp, 2 * 64)

        JMP(vector_loop2.begin)
    LABEL(out_write_odd)
    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.

    LABEL(out_write_even)
    VMOVDQA(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_s3.

    # Paranoia, cleanse the scratch space.
    VPXOR(ymm_v0, ymm_v0, ymm_v0)
    VMOVDQA(mem_tmp0, ymm_v0)
    VMOVDQA(mem_s3, ymm_v0)

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# CPUID
#
cpuidParams = Argument(ptr(uint32_t))
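# cpuidParams points at four uint32 words: on entry [0] holds the EAX leaf and
# [1] the ECX sub-leaf; on return they are overwritten with EAX, EBX, ECX, EDX.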
with Function("cpuidAmd64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+4])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()

#
# XGETBV (ECX = 0)
#
xcrVec = Argument(ptr(uint32_t))
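# xcrVec receives XCR0 as two uint32 words (EAX, then EDX); the caller
# presumably checks the SSE/AVX state bits before taking the AVX2 code path.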
with Function("xgetbv0Amd64", (xcrVec,)):
    reg_vec = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_vec, xcrVec)

    XOR(registers.ecx, registers.ecx)

    XGETBV()

    MOV([reg_vec], registers.eax)
    MOV([reg_vec+4], registers.edx)

    RETURN()