round_bitsliced64.go 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
// round_bitsliced64.go - 64-bit constant-time AES round function.
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to aez, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
  7. package aez
  8. import "git.schwanenlied.me/yawning/bsaes.git/ct64"
  9. type roundB64 struct {
  10. skey [32]uint64 // I, J, L, 0
  11. }
  12. func newRoundB64(extractedKey *[extractedKeySize]byte) aesImpl {
  13. r := new(roundB64)
  14. for i := 0; i < 3; i++ {
  15. ct64.RkeyOrtho(r.skey[i*8:], extractedKey[i*16:])
  16. }
  17. return r
  18. }
  19. func (r *roundB64) Reset() {
  20. memwipeU64(r.skey[:])
  21. }
  22. func (r *roundB64) AES4(j, i, l *[blockSize]byte, src []byte, dst *[blockSize]byte) {
  23. var q [8]uint64
  24. xorBytes4x16(j[:], i[:], l[:], src, dst[:])
  25. ct64.Load4xU32(&q, dst[:])
  26. r.round(&q, r.skey[8:]) // J
  27. r.round(&q, r.skey[0:]) // I
  28. r.round(&q, r.skey[16:]) // L
  29. r.round(&q, r.skey[24:]) // zero
  30. ct64.Store4xU32(dst[:], &q)
  31. memwipeU64(q[:])
  32. }
  33. func (r *roundB64) aes4x4(
  34. j0, i0, l0 *[blockSize]byte, src0 []byte, dst0 *[blockSize]byte,
  35. j1, i1, l1 *[blockSize]byte, src1 []byte, dst1 *[blockSize]byte,
  36. j2, i2, l2 *[blockSize]byte, src2 []byte, dst2 *[blockSize]byte,
  37. j3, i3, l3 *[blockSize]byte, src3 []byte, dst3 *[blockSize]byte) {
  38. var q [8]uint64
  39. xorBytes4x16(j0[:], i0[:], l0[:], src0, dst0[:])
  40. xorBytes4x16(j1[:], i1[:], l1[:], src1, dst1[:])
  41. xorBytes4x16(j2[:], i2[:], l2[:], src2, dst2[:])
  42. xorBytes4x16(j3[:], i3[:], l3[:], src3, dst3[:])
  43. ct64.Load16xU32(&q, dst0[:], dst1[:], dst2[:], dst3[:])
  44. r.round(&q, r.skey[8:]) // J
  45. r.round(&q, r.skey[0:]) // I
  46. r.round(&q, r.skey[16:]) // L
  47. r.round(&q, r.skey[24:]) // zero
  48. ct64.Store16xU32(dst0[:], dst1[:], dst2[:], dst3[:], &q)
  49. memwipeU64(q[:])
  50. }
  51. func (r *roundB64) AES10(l *[blockSize]byte, src []byte, dst *[blockSize]byte) {
  52. var q [8]uint64
  53. xorBytes1x16(src, l[:], dst[:])
  54. ct64.Load4xU32(&q, dst[:])
  55. for i := 0; i < 3; i++ {
  56. r.round(&q, r.skey[0:]) // I
  57. r.round(&q, r.skey[8:]) // J
  58. r.round(&q, r.skey[16:]) // L
  59. }
  60. r.round(&q, r.skey[0:]) // I
  61. ct64.Store4xU32(dst[:], &q)
  62. memwipeU64(q[:])
  63. }
  64. func (r *roundB64) round(q *[8]uint64, k []uint64) {
  65. ct64.Sbox(q)
  66. ct64.ShiftRows(q)
  67. ct64.MixColumns(q)
  68. ct64.AddRoundKey(q, k)
  69. }
  70. func (r *roundB64) aezCorePass1(e *eState, in, out []byte, X *[blockSize]byte, sz int) {
  71. var tmp0, tmp1, tmp2, tmp3, I [blockSize]byte
  72. copy(I[:], e.I[1][:])
  73. i := 1
  74. // Process 8 * 16 bytes at a time in a loop.
  75. for mult := false; sz >= 8*blockSize; mult = !mult {
  76. r.aes4x4(&e.J[0], &I, &e.L[(i+0)%8], in[blockSize:], &tmp0,
  77. &e.J[0], &I, &e.L[(i+1)%8], in[blockSize*3:], &tmp1,
  78. &e.J[0], &I, &e.L[(i+2)%8], in[blockSize*5:], &tmp2,
  79. &e.J[0], &I, &e.L[(i+3)%8], in[blockSize*7:], &tmp3) // E(1,i) ... E(1,i+3)
  80. xorBytes1x16(in[:], tmp0[:], out[:])
  81. xorBytes1x16(in[blockSize*2:], tmp1[:], out[blockSize*2:])
  82. xorBytes1x16(in[blockSize*4:], tmp2[:], out[blockSize*4:])
  83. xorBytes1x16(in[blockSize*6:], tmp3[:], out[blockSize*6:])
  84. r.aes4x4(&zero, &e.I[0], &e.L[0], out[:], &tmp0,
  85. &zero, &e.I[0], &e.L[0], out[blockSize*2:], &tmp1,
  86. &zero, &e.I[0], &e.L[0], out[blockSize*4:], &tmp2,
  87. &zero, &e.I[0], &e.L[0], out[blockSize*6:], &tmp3) // E(0,0) x4
  88. xorBytes1x16(in[blockSize:], tmp0[:], out[blockSize:])
  89. xorBytes1x16(in[blockSize*3:], tmp1[:], out[blockSize*3:])
  90. xorBytes1x16(in[blockSize*5:], tmp2[:], out[blockSize*5:])
  91. xorBytes1x16(in[blockSize*7:], tmp3[:], out[blockSize*7:])
  92. xorBytes1x16(out[blockSize:], X[:], X[:])
  93. xorBytes1x16(out[blockSize*3:], X[:], X[:])
  94. xorBytes1x16(out[blockSize*5:], X[:], X[:])
  95. xorBytes1x16(out[blockSize*7:], X[:], X[:])
  96. sz -= 8 * blockSize
  97. in, out = in[128:], out[128:]
  98. if mult { // Multiply every other pass.
  99. doubleBlock(&I)
  100. }
  101. i += 4
  102. }
  103. // XXX/performance: 4 * 16 bytes at a time.
  104. for sz > 0 {
  105. r.AES4(&e.J[0], &I, &e.L[i%8], in[blockSize:], &tmp0) // E(1,i)
  106. xorBytes1x16(in[:], tmp0[:], out[:])
  107. r.AES4(&zero, &e.I[0], &e.L[0], out[:], &tmp0) // E(0,0)
  108. xorBytes1x16(in[blockSize:], tmp0[:], out[blockSize:])
  109. xorBytes1x16(out[blockSize:], X[:], X[:])
  110. sz -= 2 * blockSize
  111. in, out = in[32:], out[32:]
  112. if i%8 == 0 {
  113. doubleBlock(&I)
  114. }
  115. i++
  116. }
  117. memwipe(tmp0[:])
  118. memwipe(tmp1[:])
  119. memwipe(tmp2[:])
  120. memwipe(tmp3[:])
  121. memwipe(I[:])
  122. }
  123. func (r *roundB64) aezCorePass2(e *eState, out []byte, Y, S *[blockSize]byte, sz int) {
  124. var tmp0, tmp1, tmp2, tmp3, I [blockSize]byte
  125. copy(I[:], e.I[1][:])
  126. i := 1
  127. // Process 8 * 16 bytes at a time in a loop.
  128. for mult := false; sz >= 8*blockSize; mult = !mult {
  129. r.aes4x4(&e.J[1], &I, &e.L[(i+0)%8], S[:], &tmp0,
  130. &e.J[1], &I, &e.L[(i+1)%8], S[:], &tmp1,
  131. &e.J[1], &I, &e.L[(i+2)%8], S[:], &tmp2,
  132. &e.J[1], &I, &e.L[(i+3)%8], S[:], &tmp3) // E(2,i) .. E(2,i+3)
  133. xorBytes1x16(out, tmp0[:], out[:])
  134. xorBytes1x16(out[blockSize*2:], tmp1[:], out[blockSize*2:])
  135. xorBytes1x16(out[blockSize*4:], tmp2[:], out[blockSize*4:])
  136. xorBytes1x16(out[blockSize*6:], tmp3[:], out[blockSize*6:])
  137. xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
  138. xorBytes1x16(out[blockSize*3:], tmp1[:], out[blockSize*3:])
  139. xorBytes1x16(out[blockSize*5:], tmp2[:], out[blockSize*5:])
  140. xorBytes1x16(out[blockSize*7:], tmp3[:], out[blockSize*7:])
  141. xorBytes1x16(out, Y[:], Y[:])
  142. xorBytes1x16(out[blockSize*2:], Y[:], Y[:])
  143. xorBytes1x16(out[blockSize*4:], Y[:], Y[:])
  144. xorBytes1x16(out[blockSize*6:], Y[:], Y[:])
  145. r.aes4x4(&zero, &e.I[0], &e.L[0], out[blockSize:], &tmp0,
  146. &zero, &e.I[0], &e.L[0], out[blockSize*3:], &tmp1,
  147. &zero, &e.I[0], &e.L[0], out[blockSize*5:], &tmp2,
  148. &zero, &e.I[0], &e.L[0], out[blockSize*7:], &tmp3) // E(0,0)x4
  149. xorBytes1x16(out, tmp0[:], out[:])
  150. xorBytes1x16(out[blockSize*2:], tmp1[:], out[blockSize*2:])
  151. xorBytes1x16(out[blockSize*4:], tmp2[:], out[blockSize*4:])
  152. xorBytes1x16(out[blockSize*6:], tmp3[:], out[blockSize*6:])
  153. r.aes4x4(&e.J[0], &I, &e.L[(i+0)%8], out[:], &tmp0,
  154. &e.J[0], &I, &e.L[(i+1)%8], out[blockSize*2:], &tmp1,
  155. &e.J[0], &I, &e.L[(i+2)%8], out[blockSize*4:], &tmp2,
  156. &e.J[0], &I, &e.L[(i+3)%8], out[blockSize*6:], &tmp3) // E(1,i) ... E(1,i+3)
  157. xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
  158. xorBytes1x16(out[blockSize*3:], tmp1[:], out[blockSize*3:])
  159. xorBytes1x16(out[blockSize*5:], tmp2[:], out[blockSize*5:])
  160. xorBytes1x16(out[blockSize*7:], tmp3[:], out[blockSize*7:])
  161. swapBlocks(&tmp0, out)
  162. swapBlocks(&tmp0, out[blockSize*2:])
  163. swapBlocks(&tmp0, out[blockSize*4:])
  164. swapBlocks(&tmp0, out[blockSize*6:])
  165. sz -= 8 * blockSize
  166. out = out[128:]
  167. if mult { // Multiply every other pass.
  168. doubleBlock(&I)
  169. }
  170. i += 4
  171. }
  172. // XXX/performance: 4 * 16 bytes at a time.
  173. for sz > 0 {
  174. r.AES4(&e.J[1], &I, &e.L[i%8], S[:], &tmp0) // E(2,i)
  175. xorBytes1x16(out, tmp0[:], out[:])
  176. xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
  177. xorBytes1x16(out, Y[:], Y[:])
  178. r.AES4(&zero, &e.I[0], &e.L[0], out[blockSize:], &tmp0) // E(0,0)
  179. xorBytes1x16(out, tmp0[:], out[:])
  180. r.AES4(&e.J[0], &I, &e.L[i%8], out[:], &tmp0) // E(1,i)
  181. xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
  182. swapBlocks(&tmp0, out)
  183. sz -= 2 * blockSize
  184. out = out[32:]
  185. if i%8 == 0 {
  186. doubleBlock(&I)
  187. }
  188. i++
  189. }
  190. memwipe(tmp0[:])
  191. memwipe(tmp1[:])
  192. memwipe(tmp2[:])
  193. memwipe(tmp3[:])
  194. memwipe(I[:])
  195. }
  196. func memwipeU64(s []uint64) {
  197. for i := range s {
  198. s[i] = 0
  199. }
  200. }