#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to aez, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o aez_amd64.s aez_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAMD64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()
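
#
# Zero the entire SSE2 register file, presumably to scrub key material
# and plaintext out of the XMM registers after the AES-NI routines run.
#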
with Function("resetAMD64SSE2", ()):
    PXOR(registers.xmm0, registers.xmm0)
    PXOR(registers.xmm1, registers.xmm1)
    PXOR(registers.xmm2, registers.xmm2)
    PXOR(registers.xmm3, registers.xmm3)
    PXOR(registers.xmm4, registers.xmm4)
    PXOR(registers.xmm5, registers.xmm5)
    PXOR(registers.xmm6, registers.xmm6)
    PXOR(registers.xmm7, registers.xmm7)
    PXOR(registers.xmm8, registers.xmm8)
    PXOR(registers.xmm9, registers.xmm9)
    PXOR(registers.xmm10, registers.xmm10)
    PXOR(registers.xmm11, registers.xmm11)
    PXOR(registers.xmm12, registers.xmm12)
    PXOR(registers.xmm13, registers.xmm13)
    PXOR(registers.xmm14, registers.xmm14)
    PXOR(registers.xmm15, registers.xmm15)

    RETURN()

a = Argument(ptr(const_uint8_t))
b = Argument(ptr(const_uint8_t))
c = Argument(ptr(const_uint8_t))
d = Argument(ptr(const_uint8_t))
dst = Argument(ptr(uint8_t))
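
#
# Simple 16 byte block XOR helpers. Unaligned loads/stores (MOVDQU) are
# used throughout, presumably because the Go callers make no alignment
# guarantees about the slices involved.
#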
with Function("xorBytes1x16AMD64SSE2", (a, b, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()
    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])

    PXOR(xmm_a, xmm_b)
    MOVDQU([reg_dst], xmm_a)

    RETURN()

with Function("xorBytes4x16AMD64SSE2", (a, b, c, d, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_c = GeneralPurposeRegister64()
    reg_d = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_c, c)
    LOAD.ARGUMENT(reg_d, d)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()
    xmm_c = XMMRegister()
    xmm_d = XMMRegister()
    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])
    MOVDQU(xmm_c, [reg_c])
    MOVDQU(xmm_d, [reg_d])

    PXOR(xmm_a, xmm_b)
    PXOR(xmm_c, xmm_d)
    PXOR(xmm_a, xmm_c)
    MOVDQU([reg_dst], xmm_a)

    RETURN()

#
# AES-NI helper functions.
#
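# Each aesenc4xN helper runs the same four AESENC rounds (round keys j, i,
# l, then an all zero key) over N independent blocks, interleaving the
# rounds across blocks so the multi-cycle AESENC latency can overlap.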

def aesenc4x1(o, j, i, l, z):
    AESENC(o, j)
    AESENC(o, i)
    AESENC(o, l)
    AESENC(o, z)

def aesenc4x2(o0, o1, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o0, z)
    AESENC(o1, z)

def aesenc4x4(o0, o1, o2, o3, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)

def aesenc4x8(o0, o1, o2, o3, o4, o5, o6, o7, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o4, j)
    AESENC(o5, j)
    AESENC(o6, j)
    AESENC(o7, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o4, i)
    AESENC(o5, i)
    AESENC(o6, i)
    AESENC(o7, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o4, l)
    AESENC(o5, l)
    AESENC(o6, l)
    AESENC(o7, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)
    AESENC(o4, z)
    AESENC(o5, z)
    AESENC(o6, z)
    AESENC(o7, z)

#
# Sigh. PeachPy has "interesting" ideas of definitions for certain things,
# so just use the `zen` uarch, because it supports everything.
#

j = Argument(ptr(const_uint8_t))
i = Argument(ptr(const_uint8_t))
l = Argument(ptr(const_uint8_t))
k = Argument(ptr(const_uint8_t))
src = Argument(ptr(uint8_t))

with Function("aezAES4AMD64AESNI", (j, i, l, k, src, dst), target=uarch.zen):
    reg_j = GeneralPurposeRegister64()
    reg_i = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_i, i)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_j = XMMRegister()
    xmm_i = XMMRegister()
    xmm_l = XMMRegister()
    xmm_zero = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQA(xmm_j, [reg_j])
    MOVDQA(xmm_i, [reg_i])
    MOVDQA(xmm_l, [reg_l])
    PXOR(xmm_state, xmm_j)
    PXOR(xmm_i, xmm_l)
    PXOR(xmm_state, xmm_i)
    PXOR(xmm_zero, xmm_zero)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    aesenc4x1(xmm_state, xmm_j, xmm_i, xmm_l, xmm_zero)

    MOVDQU([reg_dst], xmm_state)

    RETURN()
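
# A plain Python sketch of what the above computes, assuming hypothetical
# helpers aesenc(block, round_key) (one AES round) and xor_bytes(...), with
# the key material k laid out as I || J || L:
#
#   def aes4(j, i, l, k, src):
#       state = xor_bytes(src, j, i, l)  # 16 byte blocks
#       for rk in (k[16:32], k[0:16], k[32:48], bytes(16)):  # J, I, L, 0
#           state = aesenc(state, rk)
#       return state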

with Function("aezAES10AMD64AESNI", (l, k, src, dst), target=uarch.zen):
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_j = XMMRegister()
    xmm_i = XMMRegister()
    xmm_l = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQU(xmm_l, [reg_l])
    PXOR(xmm_state, xmm_l)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)

    MOVDQU([reg_dst], xmm_state)

    RETURN()
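
# The above is ten AESENC rounds with the repeating round-key sequence
# I, J, L, I, J, L, I, J, L, I (k laid out as I || J || L), the state
# having first been whitened with the 16 byte block at l.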

def doubleBlock(blk, tmp0, tmp1, c):
    MOVDQA(tmp0, [c])
    PSHUFB(blk, tmp0)
    MOVDQA(tmp1, blk)
    PSRAD(tmp1, 31)
    PAND(tmp1, [c+16])
    PSHUFD(tmp1, tmp1, 0x93)
    PSLLD(blk, 1)
    PXOR(blk, tmp1)
    PSHUFB(blk, tmp0)
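
# doubleBlock multiplies a 128 bit block by x in GF(2^128) using 32 bit
# lane operations: PSRAD/PAND extract the inter-lane carries (and the
# reduction constant for the top bit), PSHUFD rotates the carries into
# place, and the PSHUFBs swap byte order on the way in and out (both
# masks live in the consts table). A reference sketch in plain Python,
# assuming the usual AEZ reduction constant 135 (x^128 + x^7 + x^2 + x + 1):
#
#   def double_block(blk: bytes) -> bytes:
#       v = int.from_bytes(blk, 'big') << 1
#       if v >> 128:
#           v ^= 135
#       return (v & ((1 << 128) - 1)).to_bytes(16, 'big')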

x = Argument(ptr(uint8_t))
consts = Argument(ptr(const_uint8_t))
sz = Argument(size_t)
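
#
# Pass 1 of the AEZ core: each 32 byte chunk (lhs, rhs) of src becomes
#   lhs' = lhs ^ aes4(rhs ^ J ^ I ^ L[i])  // E(1,i)
#   rhs' = rhs ^ aes4(lhs' ^ I)            // E(0,0)
# with the running checksum X absorbing every rhs'. The 256/128/64/32
# byte paths below are the same butterfly at decreasing interleave widths.
#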
with Function("aezCorePass1AMD64AESNI", (src, dst, x, i, l, k, consts, sz), target=uarch.zen):
    # This would be better as a port of the aesni pass_one() routine,
    # however that requires storing some intermediaries in reversed
    # form.
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    reg_x = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_src, src)     # src pointer
    LOAD.ARGUMENT(reg_dst, dst)     # dst pointer
    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_l, l)         # e.L[]
    LOAD.ARGUMENT(reg_bytes, sz)    # bytes remaining
    MOV(reg_idx, 1)                 # Index into e.L[]

    xmm_j = XMMRegister()     # AESENC Round key J
    xmm_i = XMMRegister()     # AESENC Round key I
    xmm_l = XMMRegister()     # AESENC Round key L
    xmm_x = XMMRegister()     # Checksum X
    xmm_iDbl = XMMRegister()  # e.I[1]
    xmm_tmp0 = XMMRegister()
    xmm_tmp1 = XMMRegister()
    xmm_zero = XMMRegister()  # [16]byte{0x00}
    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    MOVDQU(xmm_x, [reg_x])
    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])
    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])
    LOAD.ARGUMENT(reg_tmp, consts)  # doubleBlock constants
    PXOR(xmm_zero, xmm_zero)

    # Process 16 * 16 bytes at a time in a loop.
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
    with vector_loop256:
        # TODO: Make better use of registers, optimize scheduling.

        # o0 = aes4(o0 ^ J ^ I ^ L[1], keys) // E(1,1)
        # o1 = aes4(o1 ^ J ^ I ^ L[2], keys) // E(1,2)
        # o2 = aes4(o2 ^ J ^ I ^ L[3], keys) // E(1,3)
        # o3 = aes4(o3 ^ J ^ I ^ L[4], keys) // E(1,4)
        # o4 = aes4(o4 ^ J ^ I ^ L[5], keys) // E(1,5)
        # o5 = aes4(o5 ^ J ^ I ^ L[6], keys) // E(1,6)
        # o6 = aes4(o6 ^ J ^ I ^ L[7], keys) // E(1,7)
        # o7 = aes4(o7 ^ J ^ I ^ L[0], keys) // E(1,0)
        MOVDQU(xmm_o0, [reg_src+16])
        MOVDQU(xmm_o1, [reg_src+48])
        MOVDQU(xmm_o2, [reg_src+80])
        MOVDQU(xmm_o3, [reg_src+112])
        MOVDQU(xmm_o4, [reg_src+144])
        MOVDQU(xmm_o5, [reg_src+176])
        MOVDQU(xmm_o6, [reg_src+208])
        MOVDQU(xmm_o7, [reg_src+240])
        MOVDQA(xmm_tmp0, xmm_j)  # tmp = j ^ iDbl
        PXOR(xmm_tmp0, xmm_iDbl)
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp0)
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp0)
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp0)
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp0)
        PXOR(xmm_o0, [reg_l+16])   # L[1]
        PXOR(xmm_o1, [reg_l+32])   # L[2]
        PXOR(xmm_o2, [reg_l+48])   # L[3]
        PXOR(xmm_o3, [reg_l+64])   # L[4]
        PXOR(xmm_o4, [reg_l+80])   # L[5]
        PXOR(xmm_o5, [reg_l+96])   # L[6]
        PXOR(xmm_o6, [reg_l+112])  # L[7]
        PXOR(xmm_o7, [reg_l])      # L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[   :] = in[   :] ^ o0
        # dst[ 32:] = in[ 32:] ^ o1
        # dst[ 64:] = in[ 64:] ^ o2
        # dst[ 96:] = in[ 96:] ^ o3
        # dst[128:] = in[128:] ^ o4
        # dst[160:] = in[160:] ^ o5
        # dst[192:] = in[192:] ^ o6
        # dst[224:] = in[224:] ^ o7
        MOVDQU(xmm_tmp0, [reg_src])
        MOVDQU(xmm_tmp1, [reg_src+32])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+64])
        MOVDQU(xmm_tmp1, [reg_src+96])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+128])
        MOVDQU(xmm_tmp1, [reg_src+160])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+192])
        MOVDQU(xmm_tmp1, [reg_src+224])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # o0 = aes4(o0 ^ I, keys) // E(0,0)
        # o1 = aes4(o1 ^ I, keys) // E(0,0)
        # o2 = aes4(o2 ^ I, keys) // E(0,0)
        # o3 = aes4(o3 ^ I, keys) // E(0,0)
        # o4 = aes4(o4 ^ I, keys) // E(0,0)
        # o5 = aes4(o5 ^ I, keys) // E(0,0)
        # o6 = aes4(o6 ^ I, keys) // E(0,0)
        # o7 = aes4(o7 ^ I, keys) // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[ 16:] = o0 ^ in[ 16:]
        # dst[ 48:] = o1 ^ in[ 48:]
        # dst[ 80:] = o2 ^ in[ 80:]
        # dst[112:] = o3 ^ in[112:]
        # dst[144:] = o4 ^ in[144:]
        # dst[176:] = o5 ^ in[176:]
        # dst[208:] = o6 ^ in[208:]
        # dst[240:] = o7 ^ in[240:]
        MOVDQU(xmm_tmp0, [reg_src+16])
        MOVDQU(xmm_tmp1, [reg_src+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+80])
        MOVDQU(xmm_tmp1, [reg_src+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+144])
        MOVDQU(xmm_tmp1, [reg_src+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+208])
        MOVDQU(xmm_tmp1, [reg_src+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # X ^= o0 ^ o1 ^ ... ^ o7
        PXOR(xmm_x, xmm_o0)
        PXOR(xmm_x, xmm_o1)
        PXOR(xmm_x, xmm_o2)
        PXOR(xmm_x, xmm_o3)
        PXOR(xmm_x, xmm_o4)
        PXOR(xmm_x, xmm_o5)
        PXOR(xmm_x, xmm_o6)
        PXOR(xmm_x, xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)

        # Update book keeping.
        ADD(reg_src, 256)
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)
    ADD(reg_bytes, 256)

    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_src_l0 = xmm_tmp0
    xmm_src_l1 = xmm_tmp1
    xmm_src_r0 = xmm_o4  # Change these at your peril (tmp0 used in 8 * 16 path)
    xmm_src_r1 = xmm_o5
    xmm_src_r2 = xmm_o6
    xmm_src_r3 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(o0 ^ J ^ I ^ L[1], keys) // E(1,1)
    # o1 = aes4(o1 ^ J ^ I ^ L[2], keys) // E(1,2)
    # o2 = aes4(o2 ^ J ^ I ^ L[3], keys) // E(1,3)
    # o3 = aes4(o3 ^ J ^ I ^ L[4], keys) // E(1,4)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQU(xmm_src_r2, [reg_src+80])
    MOVDQU(xmm_src_r3, [reg_src+112])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    MOVDQA(xmm_o2, xmm_src_r2)
    MOVDQA(xmm_o3, xmm_src_r3)
    MOVDQA(xmm_tmp0, xmm_j)  # tmp0(src_l0) = j ^ iDbl
    PXOR(xmm_tmp0, xmm_iDbl)
    PXOR(xmm_o0, xmm_tmp0)
    PXOR(xmm_o1, xmm_tmp0)
    PXOR(xmm_o2, xmm_tmp0)
    PXOR(xmm_o3, xmm_tmp0)
    PXOR(xmm_o0, [reg_l+16])  # L[1]
    PXOR(xmm_o1, [reg_l+32])  # L[2]
    PXOR(xmm_o2, [reg_l+48])  # L[3]
    PXOR(xmm_o3, [reg_l+64])  # L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[  :] = in[  :] ^ o0
    # dst[32:] = in[32:] ^ o1
    # dst[64:] = in[64:] ^ o2
    # dst[96:] = in[96:] ^ o3
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU(xmm_src_l0, [reg_src+64])
    MOVDQU(xmm_src_l1, [reg_src+96])
    PXOR(xmm_o2, xmm_src_l0)
    PXOR(xmm_o3, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    # o1 = aes4(o1 ^ I, keys) // E(0,0)
    # o2 = aes4(o2 ^ I, keys) // E(0,0)
    # o3 = aes4(o3 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[ 16:] = o0 ^ in[ 16:]
    # dst[ 48:] = o1 ^ in[ 48:]
    # dst[ 80:] = o2 ^ in[ 80:]
    # dst[112:] = o3 ^ in[112:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    PXOR(xmm_o2, xmm_src_r2)
    PXOR(xmm_o3, xmm_src_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # X ^= o0 ^ o1 ^ o2 ^ o3
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)
    PXOR(xmm_x, xmm_o2)
    PXOR(xmm_x, xmm_o3)

    # Update book keeping.
    ADD(reg_src, 128)
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)

    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #
    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l)  # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[(i+0)%8], keys) // E(1,i)
    # o1 = aes4(o1 ^ J ^ I ^ L[(i+1)%8], keys) // E(1,i+1)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o1, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_l_offset])     # L[i]
    PXOR(xmm_o1, [reg_l_offset+16])  # L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[  :] = in[  :] ^ o0
    # dst[32:] = in[32:] ^ o1
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    # o1 = aes4(o1 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    # dst[48:] = o1 ^ in[48:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)

    # X ^= o0 ^ o1
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)

    # Update book keeping.
    ADD(reg_src, 64)
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)

    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table. This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)  # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[i%8], keys) // E(1,i)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQA(xmm_o0, xmm_src_r0)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[:] = in[:] ^ o0
    MOVDQU(xmm_src_l0, [reg_src])
    PXOR(xmm_o0, xmm_src_l0)
    MOVDQU([reg_dst], xmm_o0)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    PXOR(xmm_o0, xmm_src_r0)
    MOVDQU([reg_dst+16], xmm_o0)

    # X ^= o0
    PXOR(xmm_x, xmm_o0)

    LABEL(out)

    # Write back X.
    MOVDQU([reg_x], xmm_x)

    RETURN()
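
#
# Pass 2 of the AEZ core: the mirror of pass 1, keyed by the block S.
# Each 32 byte chunk of dst is updated in place with three aes4 calls
# (E(1,i) over S, then E(0,0), then E(1,i) again), the running checksum Y
# absorbs the intermediate left blocks, and the halves of each chunk are
# swapped on the way out.
#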

y = Argument(ptr(uint8_t))
s = Argument(ptr(const_uint8_t))
with Function("aezCorePass2AMD64AESNI", (dst, y, s, j, i, l, k, consts, sz), target=uarch.zen):
    reg_dst = GeneralPurposeRegister64()
    reg_y = GeneralPurposeRegister64()
    reg_s = GeneralPurposeRegister64()
    reg_j = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_dst, dst)   # dst pointer
    LOAD.ARGUMENT(reg_y, y)
    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_bytes, sz)  # bytes remaining
    MOV(reg_idx, 1)               # Index into e.L[]

    xmm_j = XMMRegister()     # AESENC Round key J
    xmm_i = XMMRegister()     # AESENC Round key I
    xmm_l = XMMRegister()     # AESENC Round key L
    xmm_s = XMMRegister()     # S
    xmm_y = XMMRegister()     # Checksum Y
    xmm_iDbl = XMMRegister()  # e.I[1]
    xmm_zero = XMMRegister()  # [16]byte{0x00}
    xmm_tmp0 = XMMRegister()
    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])
    MOVDQU(xmm_y, [reg_y])
    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])
    LOAD.ARGUMENT(reg_tmp, consts)
    PXOR(xmm_zero, xmm_zero)
    LOAD.ARGUMENT(reg_s, s)
    MOVDQU(xmm_s, [reg_s])
    PXOR(xmm_s, [reg_j+16])  # S ^= J[1] (Once per call, in theory)

    # Save the stack pointer, align stack to 32 bytes, and allocate
    # 256 bytes of scratch space.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 256)

    # Name strategic offsets.
    mem_dst_l0 = [registers.rsp]
    mem_dst_r0 = [registers.rsp+16]
    mem_dst_l1 = [registers.rsp+32]
    mem_dst_r1 = [registers.rsp+48]
    mem_dst_l2 = [registers.rsp+64]
    mem_dst_r2 = [registers.rsp+80]
    mem_dst_l3 = [registers.rsp+96]
    mem_dst_r3 = [registers.rsp+112]
    mem_dst_l4 = [registers.rsp+128]
    mem_dst_r4 = [registers.rsp+144]
    mem_dst_l5 = [registers.rsp+160]
    mem_dst_r5 = [registers.rsp+176]
    mem_dst_l6 = [registers.rsp+192]
    mem_dst_r6 = [registers.rsp+208]
    mem_dst_l7 = [registers.rsp+224]
    mem_dst_r7 = [registers.rsp+240]

    #
    # Process 16 * 16 bytes at a time in a loop.
    #
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
    with vector_loop256:
        # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys) // E(1,1)
        # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys) // E(1,2)
        # ...
        # o6 = aes4(J[1] ^ I ^ L[7] ^ S[:], keys) // E(1,7)
        # o7 = aes4(J[1] ^ I ^ L[0] ^ S[:], keys) // E(1,0)
        MOVDQA(xmm_o0, xmm_s)
        PXOR(xmm_o0, xmm_iDbl)  # o0 = s ^ I
        MOVDQA(xmm_o1, xmm_o0)  # o1 = o0
        MOVDQA(xmm_o2, xmm_o0)  # o2 = o0
        MOVDQA(xmm_o3, xmm_o0)  # o3 = o0
        MOVDQA(xmm_o4, xmm_o0)  # o4 = o0
        MOVDQA(xmm_o5, xmm_o0)  # o5 = o0
        MOVDQA(xmm_o6, xmm_o0)  # o6 = o0
        MOVDQA(xmm_o7, xmm_o0)  # o7 = o0
        PXOR(xmm_o0, [reg_l+16])   # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])   # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])   # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])   # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])   # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])   # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112])  # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])      # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # TODO: Figure out how the fuck to remove some of these loads/stores.
        xmm_tmp1 = xmm_s  # Use as scratch till the end of loop body.

        # dst_l0 ^= o0, ... dst_l7 ^= o7
        # Y ^= dst_l0 ^ ... ^ dst_l7
        MOVDQU(xmm_tmp0, [reg_dst])
        MOVDQU(xmm_tmp1, [reg_dst+32])
        PXOR(xmm_tmp0, xmm_o0)
        PXOR(xmm_tmp1, xmm_o1)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l0, xmm_tmp0)
        MOVDQA(mem_dst_l1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_dst+64])
        MOVDQU(xmm_tmp1, [reg_dst+96])
        PXOR(xmm_tmp0, xmm_o2)
        PXOR(xmm_tmp1, xmm_o3)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l2, xmm_tmp0)
        MOVDQA(mem_dst_l3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_dst+128])
        MOVDQU(xmm_tmp1, [reg_dst+160])
        PXOR(xmm_tmp0, xmm_o4)
        PXOR(xmm_tmp1, xmm_o5)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l4, xmm_tmp0)
        MOVDQA(mem_dst_l5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_dst+192])
        MOVDQU(xmm_tmp1, [reg_dst+224])
        PXOR(xmm_tmp0, xmm_o6)
        PXOR(xmm_tmp1, xmm_o7)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l6, xmm_tmp0)
        MOVDQA(mem_dst_l7, xmm_tmp1)

        # o0 ^= dst_r0, ... o7 ^= dst_r7
        # dst_r0 = o0, ... dst_r7 = o7
        MOVDQU(xmm_tmp0, [reg_dst+16])
        MOVDQU(xmm_tmp1, [reg_dst+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQA(mem_dst_r0, xmm_o0)
        MOVDQA(mem_dst_r1, xmm_o1)
        MOVDQU(xmm_tmp0, [reg_dst+80])
        MOVDQU(xmm_tmp1, [reg_dst+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQA(mem_dst_r2, xmm_o2)
        MOVDQA(mem_dst_r3, xmm_o3)
        MOVDQU(xmm_tmp0, [reg_dst+144])
        MOVDQU(xmm_tmp1, [reg_dst+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQA(mem_dst_r4, xmm_o4)
        MOVDQA(mem_dst_r5, xmm_o5)
        MOVDQU(xmm_tmp0, [reg_dst+208])
        MOVDQU(xmm_tmp1, [reg_dst+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQA(mem_dst_r6, xmm_o6)
        MOVDQA(mem_dst_r7, xmm_o7)

        # o0 = aes4(o0 ^ I[0]) // E(0,0)
        # ...
        # o7 = aes4(o7 ^ I[0]) // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # o0 ^= dst_l0, ... o7 ^= dst_l7
        # dst_l0 = o0, ... dst_l7 = o7
        #
        # nb: Stored into the right hand blocks of dst[], because we are
        # done with the left hand side.
        PXOR(xmm_o0, mem_dst_l0)
        PXOR(xmm_o1, mem_dst_l1)
        PXOR(xmm_o2, mem_dst_l2)
        PXOR(xmm_o3, mem_dst_l3)
        PXOR(xmm_o4, mem_dst_l4)
        PXOR(xmm_o5, mem_dst_l5)
        PXOR(xmm_o6, mem_dst_l6)
        PXOR(xmm_o7, mem_dst_l7)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # o0 = aes4(o0 ^ J[0] ^ I ^ L[1]) // E(1,1)
        # o1 = aes4(o1 ^ J[0] ^ I ^ L[2]) // E(1,2)
        # ...
        # o6 = aes4(o6 ^ J[0] ^ I ^ L[7]) // E(1,7)
        # o7 = aes4(o7 ^ J[0] ^ I ^ L[0]) // E(1,0)
        MOVDQA(xmm_tmp0, [reg_j])
        PXOR(xmm_tmp0, xmm_iDbl)  # tmp = J[0] ^ I
        PXOR(xmm_o0, xmm_tmp0)    # o0 ^= tmp
        PXOR(xmm_o1, xmm_tmp0)    # o1 ^= tmp
        PXOR(xmm_o2, xmm_tmp0)    # o2 ^= tmp
        PXOR(xmm_o3, xmm_tmp0)    # o3 ^= tmp
        PXOR(xmm_o4, xmm_tmp0)    # o4 ^= tmp
        PXOR(xmm_o5, xmm_tmp0)    # o5 ^= tmp
        PXOR(xmm_o6, xmm_tmp0)    # o6 ^= tmp
        PXOR(xmm_o7, xmm_tmp0)    # o7 ^= tmp
        PXOR(xmm_o0, [reg_l+16])   # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])   # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])   # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])   # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])   # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])   # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112])  # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])      # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst_r0 ^= o0, ... dst_r7 ^= o7
        # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l7, dst_r7 = dst_r7, dst_l7
        #
        # nb: dst_l0 ... dst_l7 already written after the previous aesenc4x8
        # call.
        PXOR(xmm_o0, mem_dst_r0)
        PXOR(xmm_o1, mem_dst_r1)
        PXOR(xmm_o2, mem_dst_r2)
        PXOR(xmm_o3, mem_dst_r3)
        PXOR(xmm_o4, mem_dst_r4)
        PXOR(xmm_o5, mem_dst_r5)
        PXOR(xmm_o6, mem_dst_r6)
        PXOR(xmm_o7, mem_dst_r7)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)
        MOVDQU(xmm_s, [reg_s])
        PXOR(xmm_s, [reg_j+16])  # Re-derive since it was used as scratch space.

        # Update book keeping.
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)

    # Purge the scratch space that we are done with.
    MOVDQA(mem_dst_r0, xmm_zero)
    MOVDQA(mem_dst_r1, xmm_zero)
    MOVDQA(mem_dst_r2, xmm_zero)
    MOVDQA(mem_dst_r3, xmm_zero)
    MOVDQA(mem_dst_l4, xmm_zero)
    MOVDQA(mem_dst_r4, xmm_zero)
    MOVDQA(mem_dst_l5, xmm_zero)
    MOVDQA(mem_dst_r5, xmm_zero)
    MOVDQA(mem_dst_l6, xmm_zero)
    MOVDQA(mem_dst_r6, xmm_zero)
    MOVDQA(mem_dst_l7, xmm_zero)
    MOVDQA(mem_dst_r7, xmm_zero)
    ADD(reg_bytes, 256)

    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_dst_l0 = xmm_o4
    xmm_dst_r0 = xmm_o5
    xmm_dst_l1 = xmm_o6
    xmm_dst_r1 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys) // E(1,1)
    # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys) // E(1,2)
    # o2 = aes4(J[1] ^ I ^ L[3] ^ S[:], keys) // E(1,3)
    # o3 = aes4(J[1] ^ I ^ L[4] ^ S[:], keys) // E(1,4)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)  # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)  # o1 = o0
    MOVDQA(xmm_o2, xmm_o0)  # o2 = o0
    MOVDQA(xmm_o3, xmm_o0)  # o3 = o0
    PXOR(xmm_o0, [reg_l+16])  # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32])  # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48])  # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64])  # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load the left halves of the dsts into registers.
    xmm_dst_l2 = xmm_dst_r0
    xmm_dst_l3 = xmm_dst_r1
    MOVDQU(xmm_dst_l0, [reg_dst])     # dst_l0 = dst[:]
    MOVDQU(xmm_dst_l1, [reg_dst+32])  # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_l2, [reg_dst+64])  # dst_l2 = dst[64:]
    MOVDQU(xmm_dst_l3, [reg_dst+96])  # dst_l3 = dst[96:]

    # dst_l0 ^= o0, ... dst_l3 ^= o3
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)
    PXOR(xmm_dst_l2, xmm_o2)
    PXOR(xmm_dst_l3, xmm_o3)

    # Y ^= dst_l0 ^ ... ^ dst_l3
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)
    PXOR(xmm_y, xmm_dst_l2)
    PXOR(xmm_y, xmm_dst_l3)

    # Store the altered left halves.
    MOVDQA(mem_dst_l0, xmm_dst_l0)
    MOVDQA(mem_dst_l1, xmm_dst_l1)
    MOVDQA(mem_dst_l2, xmm_dst_l2)
    MOVDQA(mem_dst_l3, xmm_dst_l3)

    # Load the right halves of dst into registers.
    xmm_dst_r2 = xmm_dst_l0
    xmm_dst_r3 = xmm_dst_l1
    MOVDQU(xmm_dst_r0, [reg_dst+16])   # dst_r0 = dst[ 16:]
    MOVDQU(xmm_dst_r1, [reg_dst+48])   # dst_r1 = dst[ 48:]
    MOVDQU(xmm_dst_r2, [reg_dst+80])   # dst_r2 = dst[ 80:]
    MOVDQU(xmm_dst_r3, [reg_dst+112])  # dst_r3 = dst[112:]

    # o0 ^= dst_r0, ... o3 ^= dst_r3
    # dst_r0 = o0, ... dst_r3 = o3
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)
    MOVDQA(xmm_dst_r2, xmm_o2)
    MOVDQA(xmm_dst_r3, xmm_o3)

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    # ...
    # o3 = aes4(o3 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, ... o3 ^= dst_l3
    # dst_l0 = o0, ... dst_l3 = o3
    #
    # nb: Stored into the right hand blocks of dst[], because we are
    # done with the left hand side.
    PXOR(xmm_o0, mem_dst_l0)
    PXOR(xmm_o1, mem_dst_l1)
    PXOR(xmm_o2, mem_dst_l2)
    PXOR(xmm_o3, mem_dst_l3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[1]) // E(1,1)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[2]) // E(1,2)
    # o2 = aes4(o2 ^ J[0] ^ I ^ L[3]) // E(1,3)
    # o3 = aes4(o3 ^ J[0] ^ I ^ L[4]) // E(1,4)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o2, [reg_j])
    PXOR(xmm_o3, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o2, xmm_iDbl)
    PXOR(xmm_o3, xmm_iDbl)
    PXOR(xmm_o0, [reg_l+16])  # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32])  # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48])  # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64])  # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0, ... dst_r3 ^= o3
    # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l3, dst_r3 = dst_r3, dst_l3
    #
    # nb: dst_l0 ... dst_l3 already written after the previous aesenc4x4
    # call.
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # Update book keeping.
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)

    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #
    # (Scratch space unused past this point, working set fits into registers.)
    #
    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l)  # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[(i+0)%8] ^ S[:], keys) // E(1,i)
    # o1 = aes4(J[1] ^ I ^ L[(i+1)%8] ^ S[:], keys) // E(1,i+1)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)  # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)  # o1 = o0
    PXOR(xmm_o0, [reg_l_offset])     # o0 ^= L[i]
    PXOR(xmm_o1, [reg_l_offset+16])  # o1 ^= L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])     # dst_l0 = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16])  # dst_r0 = dst[16:]
    MOVDQU(xmm_dst_l1, [reg_dst+32])  # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_r1, [reg_dst+48])  # dst_r1 = dst[48:]

    # dst_l0 ^= o0, dst_l1 ^= o1
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)

    # Y ^= dst_l0 ^ dst_l1
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)

    # o0 ^= dst_r0, o1 ^= dst_r1
    # dst_r0 = o0, dst_r1 = o1
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    # o1 = aes4(o1 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, o1 ^= dst_l1
    # dst_l0 = o0, dst_l1 = o1
    PXOR(xmm_o0, xmm_dst_l0)
    PXOR(xmm_o1, xmm_dst_l1)
    MOVDQA(xmm_dst_l0, xmm_o0)
    MOVDQA(xmm_dst_l1, xmm_o1)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[(i+0)%8]) // E(1,i)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[(i+1)%8]) // E(1,i+1)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_tmp])     # o0 ^= L[i]
    PXOR(xmm_o1, [reg_tmp+16])  # o1 ^= L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0
    # dst_r1 ^= o1
    PXOR(xmm_dst_r0, xmm_o0)
    PXOR(xmm_dst_r1, xmm_o1)

    # dst_l0, dst_r0 = dst_r0, dst_l0 .. dst_l1, dst_r1 = dst_r1, dst_l1
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)
    MOVDQU([reg_dst+32], xmm_dst_r1)
    MOVDQU([reg_dst+48], xmm_dst_l1)

    # Update book keeping.
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)

    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table. This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)  # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[i%8] ^ S[:], keys) // E(1,i)
    MOVDQA(xmm_o0, xmm_s)   # o0 = s
    PXOR(xmm_o0, xmm_iDbl)  # o0 ^= I
    PXOR(xmm_o0, [reg_l])   # o0 ^= L[i%8]
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])     # dst_l = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16])  # dst_r = dst[16:]

    # dst_l ^= o0
    PXOR(xmm_dst_l0, xmm_o0)

    # Y ^= dst_l
    PXOR(xmm_y, xmm_dst_l0)

    # o0 ^= dst_r
    # dst_r = o0
    PXOR(xmm_o0, xmm_dst_r0)
    MOVDQA(xmm_dst_r0, xmm_o0)

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l
    # dst_l = o0
    PXOR(xmm_o0, xmm_dst_l0)
    MOVDQA(xmm_dst_l0, xmm_o0)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[i%8]) // E(1,i)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r ^= o0
    PXOR(xmm_dst_r0, xmm_o0)

    # dst_l, dst_r = dst_r, dst_l
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)

    LABEL(out)

    # Write back Y.
    MOVDQU([reg_y], xmm_y)

    # Paranoia, cleanse the scratch space. Most of it is purged
    # at the end of the 16x16 loop, but the 8x16 case uses these 4.
    MOVDQA(mem_dst_l0, xmm_zero)
    MOVDQA(mem_dst_l1, xmm_zero)
    MOVDQA(mem_dst_l2, xmm_zero)
    MOVDQA(mem_dst_l3, xmm_zero)

    # Restore the stack pointer.
    MOV(registers.rsp, reg_sp_save)

    RETURN()