// hwaccel_amd64.s
  1. // +build !noasm,go1.10
  2. // hwaccel_amd64.s - AMD64 optimized routines
  3. //
  4. // To the extent possible under law, Yawning Angel has waived all copyright
  5. // and related or neighboring rights to the software, using the Creative
  6. // Commons "CC0" public domain dedication. See LICENSE or
  7. // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
  8. #include "textflag.h"
  9. // func cpuidAmd64(cpuidParams *uint32)
  10. TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
  11. MOVQ cpuidParams+0(FP), R15
  12. MOVL 0(R15), AX
  13. MOVL 8(R15), CX
  14. CPUID
  15. MOVL AX, 0(R15)
  16. MOVL BX, 4(R15)
  17. MOVL CX, 8(R15)
  18. MOVL DX, 12(R15)
  19. RET
  20. // func xgetbv0Amd64(xcrVec *uint32)
  21. TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
  22. MOVQ xcrVec+0(FP), BX
  23. XORL CX, CX
  24. XGETBV
  25. MOVL AX, 0(BX)
  26. MOVL DX, 4(BX)
  27. RET
  28. // Some useful macros for loading/storing the state, and the state update
  29. // function, along with aliases for the registers used for readability.
  30. // YMM Registers: Sx -> State, Mx -> Message, Tx -> Temporary
  31. // GP Registers: RAX, RBX, RCX -> Temporary
  32. #define S0 Y0
  33. #define S1 Y1
  34. #define S2 Y2
  35. #define S3 Y3
  36. #define S4 Y4
  37. #define M0 Y5
  38. #define T0 Y14
  39. #define T1 Y15
  40. // This essentially naively translated from the intrinsics, but neither GCC nor
  41. // clang's idea of what this should be appears to be better on Broadwell, and
  42. // there is a benefit to being easy to cross reference with the upstream
  43. // implementation.
  44. #define STATE_UPDATE() \
  45. VPXOR S0, S3, S0 \
  46. VPAND S1, S2, T0 \
  47. VPXOR S0, T0, S0 \
  48. VPSLLQ $13, S0, T0 \
  49. VPSRLQ $51, S0, T1 \
  50. VPOR T0, T1, S0 \
  51. VPERMQ $-109, S3, S3 \
  52. \
  53. VPXOR S1, M0, S1 \
  54. VPXOR S1, S4, S1 \
  55. VPAND S2, S3, T0 \
  56. VPXOR S1, T0, S1 \
  57. VPSLLQ $46, S1, T0 \
  58. VPSRLQ $18, S1, T1 \
  59. VPOR T0, T1, S1 \
  60. VPERMQ $78, S4, S4 \
  61. \
  62. VPXOR S2, M0, S2 \
  63. VPXOR S2, S0, S2 \
  64. VPAND S3, S4, T0 \
  65. VPXOR S2, T0, S2 \
  66. VPSLLQ $38, S2, T0 \
  67. VPSRLQ $26, S2, T1 \
  68. VPOR T0, T1, S2 \
  69. VPERMQ $57, S0, S0 \
  70. \
  71. VPXOR S3, M0, S3 \
  72. VPXOR S3, S1, S3 \
  73. VPAND S4, S0, T0 \
  74. VPXOR S3, T0, S3 \
  75. VPSLLQ $7, S3, T0 \
  76. VPSRLQ $57, S3, T1 \
  77. VPOR T0, T1, S3 \
  78. VPERMQ $78, S1, S1 \
  79. \
  80. VPXOR S4, M0, S4 \
  81. VPXOR S4, S2, S4 \
  82. VPAND S0, S1, T0 \
  83. VPXOR S4, T0, S4 \
  84. VPSLLQ $4, S4, T0 \
  85. VPSRLQ $60, S4, T1 \
  86. VPOR T0, T1, S4 \
  87. VPERMQ $-109, S2, S2
  88. #define COPY(DST, SRC, LEN) \
  89. MOVQ SRC, SI \
  90. MOVQ DST, DI \
  91. MOVQ LEN, CX \
  92. REP \
  93. MOVSB
  94. #define INIT_STATE(IV, KEY) \
  95. VPXOR S0, S0, S0 \
  96. MOVOU (IV), X0 \
  97. VMOVDQU (KEY), S1 \
  98. VPCMPEQD S2, S2, S2 \
  99. VPXOR S3, S3, S3 \
  100. VMOVDQU ·initializationConstants(SB), S4 \
  101. VPXOR M0, M0, M0 \
  102. VMOVDQA S1, Y6 \
  103. MOVQ $16, AX \
  104. \
  105. initLoop: \
  106. STATE_UPDATE() \
  107. SUBQ $1, AX \
  108. JNZ initLoop \
  109. \
  110. VPXOR Y6, S1, S1
  111. #define ABSORB_BLOCKS(A, ALEN, SCRATCH) \
  112. MOVQ ALEN, AX \
  113. SHRQ $5, AX \
  114. JZ absorbPartial \
  115. loopAbsorbFull: \
  116. VMOVDQU (A), M0 \
  117. STATE_UPDATE() \
  118. ADDQ $32, A \
  119. SUBQ $1, AX \
  120. JNZ loopAbsorbFull \
  121. absorbPartial: \
  122. ANDQ $31, ALEN \
  123. JZ absorbDone \
  124. COPY(SCRATCH, A, ALEN) \
  125. VMOVDQU (SCRATCH), M0 \
  126. STATE_UPDATE() \
  127. absorbDone:
  128. #define FINALIZE(TAG, ALEN, MLEN, SCRATCH) \
  129. SHLQ $3, ALEN \
  130. MOVQ ALEN, (SCRATCH) \
  131. SHLQ $3, MLEN \
  132. MOVQ MLEN, 8(SCRATCH) \
  133. \
  134. VPXOR S4, S0, S4 \
  135. VMOVDQU (SCRATCH), M0 \
  136. \
  137. MOVQ $10, AX \
  138. loopFinal: \
  139. STATE_UPDATE() \
  140. SUBQ $1, AX \
  141. JNZ loopFinal \
  142. \
  143. VPERMQ $57, S1, Y6 \
  144. VPXOR S0, Y6, Y6 \
  145. VPAND S2, S3, Y7 \
  146. VPXOR Y6, Y7, Y7 \
  147. MOVOU X7, (TAG)
  148. // func aeadEncryptAVX2(c, m, a []byte, nonce, key *byte)
  149. TEXT ·aeadEncryptAVX2(SB), NOSPLIT, $32-88
  150. MOVQ SP, R15
  151. VPXOR Y13, Y13, Y13
  152. VMOVDQU Y13, (R15)
  153. CLD
  154. // Initialize the state.
  155. MOVQ nonce+72(FP), R8
  156. MOVQ key+80(FP), R9
  157. INIT_STATE(R8, R9)
  158. // Absorb the AD.
  159. MOVQ a+48(FP), R8 // &a[0] -> R8
  160. MOVQ a+56(FP), R9 // len(a) -> R9
  161. ABSORB_BLOCKS(R8, R9, R15)
  162. // Encrypt the data.
  163. MOVQ m+24(FP), R8 // &m[0] -> R8
  164. MOVQ m+32(FP), R9 // len(m) -> R9
  165. MOVQ c+0(FP), R10 // &c[0] -> R10
  166. MOVQ R9, AX
  167. SHRQ $5, AX
  168. JZ encryptPartial
  169. loopEncryptFull:
  170. VMOVDQU (R8), M0
  171. VPERMQ $57, S1, Y6
  172. VPXOR S0, Y6, Y6
  173. VPAND S2, S3, Y7
  174. VPXOR Y6, Y7, Y6
  175. VPXOR M0, Y6, Y6
  176. VMOVDQU Y6, (R10)
  177. STATE_UPDATE()
  178. ADDQ $32, R8
  179. ADDQ $32, R10
  180. SUBQ $1, AX
  181. JNZ loopEncryptFull
  182. encryptPartial:
  183. ANDQ $31, R9
  184. JZ encryptDone
  185. VMOVDQU Y13, (R15)
  186. COPY(R15, R8, R9)
  187. VMOVDQU (R15), M0
  188. VPERMQ $57, S1, Y6
  189. VPXOR S0, Y6, Y6
  190. VPAND S2, S3, Y7
  191. VPXOR Y6, Y7, Y6
  192. VPXOR M0, Y6, Y6
  193. VMOVDQU Y6, (R15)
  194. STATE_UPDATE()
  195. COPY(R10, R15, R9)
  196. ADDQ R9, R10
  197. encryptDone:
  198. // Finalize and write the tag.
  199. MOVQ a+56(FP), R8 // len(a) -> R8
  200. MOVQ m+32(FP), R9 // len(m) -> R9
  201. VMOVDQU Y13, (R15)
  202. FINALIZE(R10, R8, R9, R15)
  203. VMOVDQU Y13, (R15)
  204. VZEROUPPER
  205. RET
  206. // func aeadDecryptAVX2(m, c, a []byte, nonce, key, tag *byte)
  207. TEXT ·aeadDecryptAVX2(SB), NOSPLIT, $32-96
  208. MOVQ SP, R15
  209. VPXOR Y13, Y13, Y13
  210. VMOVDQU Y13, (R15)
  211. CLD
  212. // Initialize the state.
  213. MOVQ nonce+72(FP), R8
  214. MOVQ key+80(FP), R9
  215. INIT_STATE(R8, R9)
  216. // Absorb the AD.
  217. MOVQ a+48(FP), R8 // &a[0] -> R8
  218. MOVQ a+56(FP), R9 // len(a) -> R9
  219. ABSORB_BLOCKS(R8, R9, R15)
  220. // Decrypt the data.
  221. MOVQ c+24(FP), R8 // &c[0] -> R8
  222. MOVQ c+32(FP), R9 // len(c) -> R9
  223. MOVQ m+0(FP), R10 // &m[0] -> R10
  224. MOVQ R9, AX
  225. SHRQ $5, AX
  226. JZ decryptPartial
  227. loopDecryptFull:
  228. VMOVDQU (R8), M0
  229. VPERMQ $57, S1, Y6
  230. VPXOR S0, Y6, Y6
  231. VPAND S2, S3, Y7
  232. VPXOR Y6, Y7, Y6
  233. VPXOR M0, Y6, M0
  234. VMOVDQU M0, (R10)
  235. STATE_UPDATE()
  236. ADDQ $32, R8
  237. ADDQ $32, R10
  238. SUBQ $1, AX
  239. JNZ loopDecryptFull
  240. decryptPartial:
  241. ANDQ $31, R9
  242. JZ decryptDone
  243. VMOVDQU Y13, (R15)
  244. COPY(R15, R8, R9)
  245. VMOVDQU (R15), M0
  246. VPERMQ $57, S1, Y6
  247. VPXOR S0, Y6, Y6
  248. VPAND S2, S3, Y7
  249. VPXOR Y6, Y7, Y6
  250. VPXOR M0, Y6, M0
  251. VMOVDQU M0, (R15)
  252. COPY(R10, R15, R9)
  253. MOVQ $0, AX
  254. MOVQ R15, DI
  255. MOVQ $32, CX
  256. SUBQ R9, CX
  257. ADDQ R9, DI
  258. REP
  259. STOSB
  260. VMOVDQU (R15), M0
  261. STATE_UPDATE()
  262. decryptDone:
  263. // Finalize and write the tag.
  264. MOVQ a+56(FP), R8 // len(a) -> R8
  265. MOVQ m+32(FP), R9 // len(m) -> R9
  266. MOVQ tag+88(FP), R14 // tag -> R14
  267. VMOVDQU Y13, (R15)
  268. FINALIZE(R14, R8, R9, R15)
  269. VMOVDQU Y13, (R15)
  270. VZEROUPPER
  271. RET