// +build !noasm,go1.10

// hwaccel_amd64.s - AMD64 optimized routines
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#include "textflag.h"

// func cpuidAmd64(cpuidParams *uint32)
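//
// The caller passes the leaf in cpuidParams[0] and the sub-leaf in
// cpuidParams[2]; EAX, EBX, ECX and EDX are written back to
// cpuidParams[0..3].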
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
	MOVQ cpuidParams+0(FP), R15
	MOVL 0(R15), AX
	MOVL 8(R15), CX
	CPUID
	MOVL AX, 0(R15)
	MOVL BX, 4(R15)
	MOVL CX, 8(R15)
	MOVL DX, 12(R15)
	RET

// func xgetbv0Amd64(xcrVec *uint32)
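//
// Reads XCR0 (ECX = 0) and stores EAX/EDX into xcrVec[0] and xcrVec[1].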
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
	MOVQ xcrVec+0(FP), BX
	XORL CX, CX
	XGETBV
	MOVL AX, 0(BX)
	MOVL DX, 4(BX)
	RET

// Based heavily on the `ymm` reference implementation, but using assembly
// language instead of using intrinsics like in a sane language.
//
// The TWEAK_LOW_LATENCY variant is used for the permutation.
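//
// The VPSHUFB index tables below implement the byte-aligned rotations:
// each 64 bit lane is rotated right by 8 bits (r0) and by 40 bits (r2).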
DATA ·vpshufb_idx_r0<>+0x00(SB)/8, $0x0007060504030201
DATA ·vpshufb_idx_r0<>+0x08(SB)/8, $0x080f0e0d0c0b0a09
DATA ·vpshufb_idx_r0<>+0x10(SB)/8, $0x0007060504030201
DATA ·vpshufb_idx_r0<>+0x18(SB)/8, $0x080f0e0d0c0b0a09
GLOBL ·vpshufb_idx_r0<>(SB), (NOPTR+RODATA), $32

DATA ·vpshufb_idx_r2<>+0x00(SB)/8, $0x0403020100070605
DATA ·vpshufb_idx_r2<>+0x08(SB)/8, $0x0c0b0a09080f0e0d
DATA ·vpshufb_idx_r2<>+0x10(SB)/8, $0x0403020100070605
DATA ·vpshufb_idx_r2<>+0x18(SB)/8, $0x0c0b0a09080f0e0d
GLOBL ·vpshufb_idx_r2<>(SB), (NOPTR+RODATA), $32
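
// Tag constants for domain separation: 0x02 (payload) and 0x08
// (finalization) sit in the most significant 64 bit lane and are XORed
// into the last row of the state before each permutation.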
DATA ·tag_payload<>+0x00(SB)/8, $0x0000000000000000
DATA ·tag_payload<>+0x08(SB)/8, $0x0000000000000000
DATA ·tag_payload<>+0x10(SB)/8, $0x0000000000000000
DATA ·tag_payload<>+0x18(SB)/8, $0x0000000000000002
GLOBL ·tag_payload<>(SB), (NOPTR+RODATA), $32

DATA ·tag_final<>+0x00(SB)/8, $0x0000000000000000
DATA ·tag_final<>+0x08(SB)/8, $0x0000000000000000
DATA ·tag_final<>+0x10(SB)/8, $0x0000000000000000
DATA ·tag_final<>+0x18(SB)/8, $0x0000000000000008
GLOBL ·tag_final<>(SB), (NOPTR+RODATA), $32
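
// G is the column step of the permutation, applied to all four columns at
// once (one 64 bit lane per column).  Additions are replaced by the
// non-linear function H(x, y) = (x ^ y) ^ ((x & y) << 1), and the four
// rotation amounts are 8, 19, 40 and 63 bits: the byte-aligned rotations
// (8 and 40) use VPSHUFB with the index tables above, 19 uses a pair of
// shifts and an or, and 63 uses an add (shift left by one) and a shift.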
#define G(A, B, C, D, T0, T1, R0, R2) \
	VPXOR A, B, T0 \
	VPAND A, B, T1 \
	VPADDQ T1, T1, T1 \
	VPXOR T0, T1, A \
	VPXOR D, T0, D \
	VPXOR D, T1, D \
	VPSHUFB R0, D, D \
	\
	VPXOR C, D, T0 \
	VPAND C, D, T1 \
	VPADDQ T1, T1, T1 \
	VPXOR T0, T1, C \
	VPXOR B, T0, B \
	VPXOR B, T1, B \
	VPSRLQ $19, B, T0 \
	VPSLLQ $45, B, T1 \
	VPOR T0, T1, B \
	\
	VPXOR A, B, T0 \
	VPAND A, B, T1 \
	VPADDQ T1, T1, T1 \
	VPXOR T0, T1, A \
	VPXOR D, T0, D \
	VPXOR D, T1, D \
	VPSHUFB R2, D, D \
	\
	VPXOR C, D, T0 \
	VPAND C, D, T1 \
	VPADDQ T1, T1, T1 \
	VPXOR T0, T1, C \
	VPXOR B, T0, B \
	VPXOR B, T1, B \
	VPADDQ B, B, T0 \
	VPSRLQ $63, B, T1 \
	VPOR T0, T1, B

// -109 -> 147 (See: https://github.com/golang/go/issues/24378)
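//
// DIAGONALIZE/UNDIAGONALIZE rotate the 64 bit lanes of rows B, C and D by
// one, two and three positions (and back again), so that G alternates
// between operating on the columns and the diagonals of the 4x4 state.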
#define DIAGONALIZE(A, B, C, D) \
	VPERMQ $-109, D, D \
	VPERMQ $78, C, C \
	VPERMQ $57, B, B

#define UNDIAGONALIZE(A, B, C, D) \
	VPERMQ $57, D, D \
	VPERMQ $78, C, C \
	VPERMQ $-109, B, B

// func initAVX2(s *uint64, key, nonce *byte, initConsts, instConsts *uint64)
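//
// The 1024 bit state is kept as four 256 bit rows (Y0..Y3).  initAVX2 loads
// the nonce into row 0, the key into row 1, and the initialization constants
// into rows 2 and 3, XORs the caller-supplied instance constants into row 3,
// runs the permutation (instConsts[1] doubles as the round count), and then
// XORs the key back into row 3 before storing the state.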
TEXT ·initAVX2(SB), NOSPLIT, $0-40
	MOVQ s+0(FP), R8
	MOVQ key+8(FP), R9
	MOVQ nonce+16(FP), R10
	MOVQ initConsts+24(FP), R11
	MOVQ instConsts+32(FP), R12

	MOVQ 8(R12), AX

	VMOVDQU (R10), Y0
	VMOVDQU (R9), Y1
	VMOVDQU (R11), Y2
	VMOVDQU 32(R11), Y3
	VMOVDQU (R12), Y4
	VMOVDQA Y1, Y5

	VPXOR Y3, Y4, Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VPXOR Y3, Y5, Y3

	VMOVDQU Y0, (R8)
	VMOVDQU Y1, 32(R8)
	VMOVDQU Y2, 64(R8)
	VMOVDQU Y3, 96(R8)

	VZEROUPPER
	RET

// func absorbBlocksAVX2(s *uint64, in *byte, rounds, blocks uint64, tag *uint64)
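//
// For each 96 byte block: XOR the tag constant into row 3, run the
// permutation for the requested number of rounds, then XOR the block into
// the 96 byte rate (rows 0..2).  Only the rate is written back per block;
// row 3 is stored once after the loop.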
TEXT ·absorbBlocksAVX2(SB), NOSPLIT, $0-40
	MOVQ s+0(FP), R8
	MOVQ in+8(FP), R10
	MOVQ rounds+16(FP), R11
	MOVQ blocks+24(FP), R12
	MOVQ tag+32(FP), R13

	VMOVDQU (R8), Y0
	VMOVDQU 32(R8), Y1
	VMOVDQU 64(R8), Y2
	VMOVDQU 96(R8), Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
	VMOVDQU (R13), Y11

loopblocks:
	VPXOR Y3, Y11, Y3

	MOVQ R11, AX

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VMOVDQU (R10), Y4
	VMOVDQU 32(R10), Y5
	VMOVDQU 64(R10), Y6

	VPXOR Y0, Y4, Y0
	VPXOR Y1, Y5, Y1
	VPXOR Y2, Y6, Y2

	VMOVDQU Y0, (R8)
	VMOVDQU Y1, 32(R8)
	VMOVDQU Y2, 64(R8)

	ADDQ $96, R10
	SUBQ $1, R12
	JNZ loopblocks

	VMOVDQU Y3, 96(R8)

	VZEROUPPER
	RET

// func encryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
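//
// Identical to absorption except that the updated rate (the plaintext XORed
// into the permuted state) is also written to out as the ciphertext, and the
// full state is stored back after the loop.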
TEXT ·encryptBlocksAVX2(SB), NOSPLIT, $0-40
	MOVQ s+0(FP), R8
	MOVQ out+8(FP), R9
	MOVQ in+16(FP), R10
	MOVQ rounds+24(FP), R11
	MOVQ blocks+32(FP), R12

	VMOVDQU (R8), Y0
	VMOVDQU 32(R8), Y1
	VMOVDQU 64(R8), Y2
	VMOVDQU 96(R8), Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
	VMOVDQU ·tag_payload<>(SB), Y11

loopblocks:
	VPXOR Y3, Y11, Y3

	MOVQ R11, AX

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VMOVDQU (R10), Y4
	VMOVDQU 32(R10), Y5
	VMOVDQU 64(R10), Y6

	VPXOR Y0, Y4, Y0
	VPXOR Y1, Y5, Y1
	VPXOR Y2, Y6, Y2

	VMOVDQU Y0, (R9)
	VMOVDQU Y1, 32(R9)
	VMOVDQU Y2, 64(R9)

	ADDQ $96, R9
	ADDQ $96, R10
	SUBQ $1, R12
	JNZ loopblocks

	VMOVDQU Y0, (R8)
	VMOVDQU Y1, 32(R8)
	VMOVDQU Y2, 64(R8)
	VMOVDQU Y3, 96(R8)

	VZEROUPPER
	RET

// func decryptBlocksAVX2(s *uint64, out, in *byte, rounds, blocks uint64)
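//
// The inverse of encryption: XOR the permuted rate with the ciphertext to
// recover the plaintext, then replace the rate with the ciphertext block
// (VMOVDQA Y4..Y6 back into Y0..Y2) before processing the next block.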
TEXT ·decryptBlocksAVX2(SB), NOSPLIT, $0-40
	MOVQ s+0(FP), R8
	MOVQ out+8(FP), R9
	MOVQ in+16(FP), R10
	MOVQ rounds+24(FP), R11
	MOVQ blocks+32(FP), R12

	VMOVDQU (R8), Y0
	VMOVDQU 32(R8), Y1
	VMOVDQU 64(R8), Y2
	VMOVDQU 96(R8), Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
	VMOVDQU ·tag_payload<>(SB), Y11

loopblocks:
	VPXOR Y3, Y11, Y3

	MOVQ R11, AX

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VMOVDQU (R10), Y4
	VMOVDQU 32(R10), Y5
	VMOVDQU 64(R10), Y6

	VPXOR Y0, Y4, Y0
	VPXOR Y1, Y5, Y1
	VPXOR Y2, Y6, Y2

	VMOVDQU Y0, (R9)
	VMOVDQU Y1, 32(R9)
	VMOVDQU Y2, 64(R9)

	VMOVDQA Y4, Y0
	VMOVDQA Y5, Y1
	VMOVDQA Y6, Y2

	ADDQ $96, R9
	ADDQ $96, R10
	SUBQ $1, R12
	JNZ loopblocks

	VMOVDQU Y0, (R8)
	VMOVDQU Y1, 32(R8)
	VMOVDQU Y2, 64(R8)
	VMOVDQU Y3, 96(R8)

	VZEROUPPER
	RET

// func decryptLastBlockAVX2(s *uint64, out, in *byte, rounds, inLen uint64)
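//
// Handles the final, possibly partial, block.  The current rate is first
// written to out, the inLen ciphertext bytes are copied over it, and the
// padding bits are flipped in place (0x01 immediately after the data, 0x80
// on the last byte of the rate).  The resulting 96 byte block is then XORed
// against the rate, leaving the plaintext in the first inLen bytes of out,
// and is stored back into the state as the new rate.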
TEXT ·decryptLastBlockAVX2(SB), NOSPLIT, $0-40
	MOVQ s+0(FP), R8
	MOVQ out+8(FP), R9
	MOVQ in+16(FP), R10
	MOVQ rounds+24(FP), AX
	MOVQ inLen+32(FP), R12

	VMOVDQU (R8), Y0
	VMOVDQU 32(R8), Y1
	VMOVDQU 64(R8), Y2
	VMOVDQU 96(R8), Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
	VMOVDQU ·tag_payload<>(SB), Y11

	VPXOR Y3, Y11, Y3

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VMOVDQU Y0, (R9)
	VMOVDQU Y1, 32(R9)
	VMOVDQU Y2, 64(R9)

	CMPQ R12, $0
	JEQ skipcopy
	XORQ AX, AX

loopcopy:
	MOVB (R10)(AX*1), BX
	MOVB BX, (R9)(AX*1)
	ADDQ $1, AX
	CMPQ AX, R12
	JNE loopcopy

skipcopy:
	XORB $0x01, (R9)(R12*1)
	XORB $0x80, 95(R9)

	VMOVDQU (R9), Y4
	VMOVDQU 32(R9), Y5
	VMOVDQU 64(R9), Y6

	VPXOR Y0, Y4, Y0
	VPXOR Y1, Y5, Y1
	VPXOR Y2, Y6, Y2

	VMOVDQU Y0, (R9)
	VMOVDQU Y1, 32(R9)
	VMOVDQU Y2, 64(R9)

	VMOVDQU Y4, (R8)
	VMOVDQU Y5, 32(R8)
	VMOVDQU Y6, 64(R8)
	VMOVDQU Y3, 96(R8)

	VZEROUPPER
	RET

// func finalizeAVX2(s *uint64, out, key *byte, rounds uint64)
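//
// Finalization: XOR the finalization tag into row 3 and permute, then XOR
// the key into row 3 and permute again, XOR the key in once more, and write
// row 3 (32 bytes) to out as the tag.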
TEXT ·finalizeAVX2(SB), NOSPLIT, $0-32
	MOVQ s+0(FP), R8
	MOVQ out+8(FP), R9
	MOVQ key+16(FP), R10
	MOVQ rounds+24(FP), R11

	VMOVDQU (R8), Y0
	VMOVDQU 32(R8), Y1
	VMOVDQU 64(R8), Y2
	VMOVDQU 96(R8), Y3

	VMOVDQU ·vpshufb_idx_r0<>(SB), Y13
	VMOVDQU ·vpshufb_idx_r2<>(SB), Y12
	VMOVDQU ·tag_final<>(SB), Y11

	VPXOR Y3, Y11, Y3

	MOVQ R11, AX

looprounds:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, AX
	JNZ looprounds

	VMOVDQU (R10), Y11
	VPXOR Y3, Y11, Y3

looprounds2:
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	DIAGONALIZE(Y0, Y1, Y2, Y3)
	G(Y0, Y1, Y2, Y3, Y15, Y14, Y13, Y12)
	UNDIAGONALIZE(Y0, Y1, Y2, Y3)
	SUBQ $1, R11
	JNZ looprounds2

	VPXOR Y3, Y11, Y3
	VMOVDQU Y3, (R9)

	VZEROUPPER
	RET