// chacha20_amd64.s


  // Generated by PeachPy 0.2.0 from chacha20_amd64.py
  //
  // NOTE(review): this copy has been edited by hand after generation
  // (unaligned access to the state, frame-based scratch space, round macros).
  // Port the same changes to the generator script before regenerating,
  // otherwise they will be lost.
  //
  // Syntax: Go assembler (Plan 9 operand order: source first, destination
  // last), amd64, SSE2 instructions only.

  // QUARTERROUND(a, b, c, d, t)
  //
  // One add/xor/rotate pass over four XMM registers, applied to all four
  // 32-bit lanes at once:
  //     a += b; d ^= a; d = rotl32(d, 16)
  //     c += d; b ^= c; b = rotl32(b, 12)
  //     a += b; d ^= a; d = rotl32(d,  8)
  //     c += d; b ^= c; b = rotl32(b,  7)
  // SSE2 has no packed rotate, so each rotl32(v, n) is built from a copy
  // shifted left by n, the original shifted right by 32-n, and a PXOR of both.
  // t is a scratch XMM register and is clobbered.
  #define QUARTERROUND(a, b, c, d, t) \
      PADDL b, a; PXOR a, d; MOVO d, t; PSLLL $16, t; PSRLL $16, d; PXOR t, d; \
      PADDL d, c; PXOR c, b; MOVO b, t; PSLLL $12, t; PSRLL $20, b; PXOR t, b; \
      PADDL b, a; PXOR a, d; MOVO d, t; PSLLL $8, t; PSRLL $24, d; PXOR t, d; \
      PADDL d, c; PXOR c, b; MOVO b, t; PSLLL $7, t; PSRLL $25, b; PXOR t, b

  // DOUBLEROUND(a, b, c, d, t)
  //
  // Two QUARTERROUND passes. Between them the lanes of b, c and d are rotated
  // with PSHUFL ($57, $78, $147) so the second pass mixes a different grouping
  // of words; the trailing PSHUFLs ($147, $78, $57) undo that lane rotation.
  // t is a scratch XMM register and is clobbered.
  #define DOUBLEROUND(a, b, c, d, t) \
      QUARTERROUND(a, b, c, d, t); \
      PSHUFL $57, b, b; \
      PSHUFL $78, c, c; \
      PSHUFL $147, d, d; \
      QUARTERROUND(a, b, c, d, t); \
      PSHUFL $147, b, b; \
      PSHUFL $78, c, c; \
      PSHUFL $57, d, d

  // func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
  //
  // For each of nrBlocks 64-byte blocks: runs 20 rounds (10 double rounds)
  // over a copy of the 64-byte state at x, adds the original state back in,
  // XORs the result with 64 bytes read from inp and stores the 64 bytes to
  // outp. After every block the low 64-bit lane of state row 3 (bytes 48..55
  // of x) is incremented by one; the final row 3 is written back to x before
  // returning (also when nrBlocks is 0, in which case it is unchanged).
  //
  // Arguments (32 bytes, addressed through the FP pseudo-register):
  //   x+0(FP)          pointer to the 64-byte state, treated as four 16-byte
  //                    rows. Accessed only with MOVOU, so no particular
  //                    alignment is required.
  //   inp+8(FP)        source bytes, 64 per block; accessed with MOVOU.
  //   outp+16(FP)      destination bytes, 64 per block; accessed with MOVOU.
  //   nrBlocks+24(FP)  block count. The value itself is used as the counter
  //                    (it is never dereferenced); the "*uint" spelling in the
  //                    prototype comment above is kept as generated.
  //
  // Register roles:
  //   AX = x, BX = inp cursor, CX = outp cursor, DX = blocks remaining
  //   SI = 16-byte-aligned scratch pointer inside this function's frame
  //   DI = round counter
  //   X0 = temporary; X1, X2, X3 = state rows 1, 2, 3 (X3 carries the counter)
  //   X4..X15 = working rows of up to three blocks processed side by side
  //
  // Frame: 48 bytes = 32 bytes of scratch plus up to 15 bytes of slack used to
  // align the scratch to 16. The hardware SP is never modified here, so the
  // frame size recorded by the assembler stays accurate for the whole body.
  //   0(SI)   128-bit constant with dwords {1, 0, 0, 0}; as a PADDQ operand
  //           it adds 1 to the low 64-bit lane.
  //   16(SI)  aligned copy of state row 0. Every XMM register is live in the
  //           three-block loop, so row 0 has to be re-read from memory; the
  //           aligned copy lets MOVO and PADDL use it as a memory operand
  //           without requiring x itself to be aligned.
  //
  // Clobbers: AX, BX, CX, DX, SI, DI, X0-X15, flags.
  // TEXT flag 4 is NOSPLIT.
  TEXT ·blocksAmd64SSE2(SB),4,$48-32
      MOVQ x+0(FP), AX
      MOVQ inp+8(FP), BX
      MOVQ outp+16(FP), CX
      MOVQ nrBlocks+24(FP), DX

      // SI = lowest 16-byte-aligned address at or above the frame bottom.
      LEAQ 15(SP), SI
      ANDQ $~15, SI

      // 0(SI) = {1, 0, 0, 0}: the per-block counter increment.
      MOVQ $1, 0(SI)
      MOVQ $0, 8(SI)

      // Load the state with unaligned moves; park row 0 in aligned scratch.
      MOVOU 0(AX), X0
      MOVO X0, 16(SI)
      MOVOU 16(AX), X1
      MOVOU 32(AX), X2
      MOVOU 48(AX), X3

      // DX = nrBlocks - 3; a borrow means fewer than three blocks remain.
      SUBQ $3, DX
      JCS vector_loop_end

  vector_loop_begin:
      // Block A (X4..X7) uses the current counter.
      MOVO 16(SI), X4
      MOVO X1, X5
      MOVO X2, X6
      MOVO X3, X7
      // Block B (X8..X11) uses counter + 1.
      MOVO 16(SI), X8
      MOVO X1, X9
      MOVO X2, X10
      MOVO X3, X11
      PADDQ 0(SI), X11
      // Block C (X12..X15) uses counter + 2.
      MOVO 16(SI), X12
      MOVO X1, X13
      MOVO X2, X14
      MOVO X11, X15
      PADDQ 0(SI), X15

      MOVQ $20, DI
  rounds_loop0_begin:
      DOUBLEROUND(X4, X5, X6, X7, X0)
      DOUBLEROUND(X8, X9, X10, X11, X0)
      DOUBLEROUND(X12, X13, X14, X15, X0)
      SUBQ $2, DI                       // each pass performs two rounds
      JNE rounds_loop0_begin

      // Block A: add the input state back in, then XOR with the input bytes.
      PADDL 16(SI), X4
      PADDL X1, X5
      PADDL X2, X6
      PADDL X3, X7
      MOVOU 0(BX), X0
      PXOR X4, X0
      MOVOU X0, 0(CX)
      MOVOU 16(BX), X0
      PXOR X5, X0
      MOVOU X0, 16(CX)
      MOVOU 32(BX), X0
      PXOR X6, X0
      MOVOU X0, 32(CX)
      MOVOU 48(BX), X0
      PXOR X7, X0
      MOVOU X0, 48(CX)
      PADDQ 0(SI), X3                   // X3 now matches block B's counter

      // Block B.
      PADDL 16(SI), X8
      PADDL X1, X9
      PADDL X2, X10
      PADDL X3, X11
      MOVOU 64(BX), X0
      PXOR X8, X0
      MOVOU X0, 64(CX)
      MOVOU 80(BX), X0
      PXOR X9, X0
      MOVOU X0, 80(CX)
      MOVOU 96(BX), X0
      PXOR X10, X0
      MOVOU X0, 96(CX)
      MOVOU 112(BX), X0
      PXOR X11, X0
      MOVOU X0, 112(CX)
      PADDQ 0(SI), X3                   // X3 now matches block C's counter

      // Block C.
      PADDL 16(SI), X12
      PADDL X1, X13
      PADDL X2, X14
      PADDL X3, X15
      MOVOU 128(BX), X0
      PXOR X12, X0
      MOVOU X0, 128(CX)
      MOVOU 144(BX), X0
      PXOR X13, X0
      MOVOU X0, 144(CX)
      MOVOU 160(BX), X0
      PXOR X14, X0
      MOVOU X0, 160(CX)
      MOVOU 176(BX), X0
      PXOR X15, X0
      MOVOU X0, 176(CX)
      PADDQ 0(SI), X3                   // counter for the next block

      ADDQ $192, BX
      ADDQ $192, CX
      SUBQ $3, DX
      JCC vector_loop_begin             // no borrow: at least three more blocks

  vector_loop_end:
      // Undo the bias of 3; DX = blocks still to do (0, 1 or 2).
      ADDQ $3, DX
      JEQ serial_loop_end
      MOVO 16(SI), X8                   // X8 = state row 0
      MOVO 0(SI), X9                    // X9 = counter increment

  serial_loop_begin:
      MOVO X8, X4
      MOVO X1, X5
      MOVO X2, X6
      MOVO X3, X7

      MOVQ $20, DI
  rounds_loop1_begin:
      DOUBLEROUND(X4, X5, X6, X7, X0)
      SUBQ $2, DI                       // each pass performs two rounds
      JNE rounds_loop1_begin

      PADDL X8, X4
      PADDL X1, X5
      PADDL X2, X6
      PADDL X3, X7
      MOVOU 0(BX), X0
      PXOR X4, X0
      MOVOU X0, 0(CX)
      MOVOU 16(BX), X0
      PXOR X5, X0
      MOVOU X0, 16(CX)
      MOVOU 32(BX), X0
      PXOR X6, X0
      MOVOU X0, 32(CX)
      MOVOU 48(BX), X0
      PXOR X7, X0
      MOVOU X0, 48(CX)
      PADDQ X9, X3                      // counter for the next block

      ADDQ $64, BX
      ADDQ $64, CX
      SUBQ $1, DX
      JNE serial_loop_begin

  serial_loop_end:
      // Persist the advanced counter row; unaligned store, x may be unaligned.
      MOVOU X3, 48(AX)
      RET