  1. // The MIT License (MIT)
  2. //
  3. // Copyright (c) 2011 Stanford University.
  4. // Copyright (c) 2014-2015 Cryptography Research, Inc.
  5. // Copyright (c) 2015 Yawning Angel.
  6. //
  7. // Permission is hereby granted, free of charge, to any person obtaining a copy
  8. // of this software and associated documentation files (the "Software"), to deal
  9. // in the Software without restriction, including without limitation the rights
  10. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. // copies of the Software, and to permit persons to whom the Software is
  12. // furnished to do so, subject to the following conditions:
  13. //
  14. // The above copyright notice and this permission notice shall be included in
  15. // all copies or substantial portions of the Software.
  16. //
  17. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23. // THE SOFTWARE.
  24. package x448
// This should really use 64 bit limbs, but Go lacks a native 128 bit
// integer type to hold the products, so 32 bit limbs it is, at a hefty
// performance penalty. Closing the gap would require hand-written
// assembly for the multiply.
//
// This is equivalent to the non-unrolled reference code, though the compiler
// is free to unroll as it sees fit. If performance is horrendous the hot
// loops can be unrolled manually.
const (
	wBits     = 32               // bits per machine word
	lBits     = (wBits * 7 / 8)  // 28 payload bits per limb
	x448Limbs = (448 / lBits)    // 16 limbs per field element
	lMask     = (1 << lBits) - 1 // low lBits bits set
)
// limbUint and limbSint are the unsigned/signed views of a limb, for
// call sites where the signedness of an intermediate matters.
type limbUint uint32
type limbSint int32
// gf is a field element in reduced-radix representation: x448Limbs (16)
// limbs of lBits (28) bits each, stored little-endian in 32 bit words.
type gf struct {
	limb [x448Limbs]uint32
}
// zero and one are the field constants 0 and 1.
var zero = gf{[x448Limbs]uint32{0}}
var one = gf{[x448Limbs]uint32{1}}

// p is the field prime 2^448 - 2^224 - 1 in limb form: all limbs are
// lMask except limb 8 (the 2^224 position), which is lMask - 1.
var p = gf{[x448Limbs]uint32{
	lMask, lMask, lMask, lMask, lMask, lMask, lMask, lMask,
	lMask - 1, lMask, lMask, lMask, lMask, lMask, lMask, lMask,
}}
  50. // cpy copies x = y.
  51. func (x *gf) cpy(y *gf) {
  52. // for i, v := range y.limb {
  53. // x.limb[i] = v
  54. // }
  55. copy(x.limb[:], y.limb[:])
  56. }
// mul multiplies c = a * b with weak (mod p) reduction. (PERF)
//
// c may alias a or b: a is copied into aa up front, b is only ever read,
// and c is written only at the very end.
func (c *gf) mul(a, b *gf) {
	var aa gf
	aa.cpy(a)
	//
	// This is *by far* the most CPU intensive routine in the code.
	//
	// Rolled reference form of the schoolbook multiply below:
	//
	// var accum [x448Limbs]uint64
	// for i, bv := range b.limb {
	// for j, aav := range aa.limb {
	// accum[(i+j)%x448Limbs] += (uint64)(bv) * (uint64)(aav)
	// }
	// aa.limb[(x448Limbs-1-i)^(x448Limbs/2)] += aa.limb[x448Limbs-1-i]
	// }
	//
	// Manually unrolled because it is a measurable performance win over
	// the loop form. Each "row" i accumulates b.limb[i] * aa.limb[j]
	// into accum[(i+j) % 16], then folds the reduction term into aa
	// (aa.limb[(15-i)^8] += aa.limb[15-i]) for the next row.
	var accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7, accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15 uint64
	var bv uint64
	bv = (uint64)(b.limb[0])
	accum0 += bv * (uint64)(aa.limb[0])
	accum1 += bv * (uint64)(aa.limb[1])
	accum2 += bv * (uint64)(aa.limb[2])
	accum3 += bv * (uint64)(aa.limb[3])
	accum4 += bv * (uint64)(aa.limb[4])
	accum5 += bv * (uint64)(aa.limb[5])
	accum6 += bv * (uint64)(aa.limb[6])
	accum7 += bv * (uint64)(aa.limb[7])
	accum8 += bv * (uint64)(aa.limb[8])
	accum9 += bv * (uint64)(aa.limb[9])
	accum10 += bv * (uint64)(aa.limb[10])
	accum11 += bv * (uint64)(aa.limb[11])
	accum12 += bv * (uint64)(aa.limb[12])
	accum13 += bv * (uint64)(aa.limb[13])
	accum14 += bv * (uint64)(aa.limb[14])
	accum15 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-0)^(x448Limbs/2)] += aa.limb[x448Limbs-1-0]
	bv = (uint64)(b.limb[1])
	accum1 += bv * (uint64)(aa.limb[0])
	accum2 += bv * (uint64)(aa.limb[1])
	accum3 += bv * (uint64)(aa.limb[2])
	accum4 += bv * (uint64)(aa.limb[3])
	accum5 += bv * (uint64)(aa.limb[4])
	accum6 += bv * (uint64)(aa.limb[5])
	accum7 += bv * (uint64)(aa.limb[6])
	accum8 += bv * (uint64)(aa.limb[7])
	accum9 += bv * (uint64)(aa.limb[8])
	accum10 += bv * (uint64)(aa.limb[9])
	accum11 += bv * (uint64)(aa.limb[10])
	accum12 += bv * (uint64)(aa.limb[11])
	accum13 += bv * (uint64)(aa.limb[12])
	accum14 += bv * (uint64)(aa.limb[13])
	accum15 += bv * (uint64)(aa.limb[14])
	accum0 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-1)^(x448Limbs/2)] += aa.limb[x448Limbs-1-1]
	bv = (uint64)(b.limb[2])
	accum2 += bv * (uint64)(aa.limb[0])
	accum3 += bv * (uint64)(aa.limb[1])
	accum4 += bv * (uint64)(aa.limb[2])
	accum5 += bv * (uint64)(aa.limb[3])
	accum6 += bv * (uint64)(aa.limb[4])
	accum7 += bv * (uint64)(aa.limb[5])
	accum8 += bv * (uint64)(aa.limb[6])
	accum9 += bv * (uint64)(aa.limb[7])
	accum10 += bv * (uint64)(aa.limb[8])
	accum11 += bv * (uint64)(aa.limb[9])
	accum12 += bv * (uint64)(aa.limb[10])
	accum13 += bv * (uint64)(aa.limb[11])
	accum14 += bv * (uint64)(aa.limb[12])
	accum15 += bv * (uint64)(aa.limb[13])
	accum0 += bv * (uint64)(aa.limb[14])
	accum1 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-2)^(x448Limbs/2)] += aa.limb[x448Limbs-1-2]
	bv = (uint64)(b.limb[3])
	accum3 += bv * (uint64)(aa.limb[0])
	accum4 += bv * (uint64)(aa.limb[1])
	accum5 += bv * (uint64)(aa.limb[2])
	accum6 += bv * (uint64)(aa.limb[3])
	accum7 += bv * (uint64)(aa.limb[4])
	accum8 += bv * (uint64)(aa.limb[5])
	accum9 += bv * (uint64)(aa.limb[6])
	accum10 += bv * (uint64)(aa.limb[7])
	accum11 += bv * (uint64)(aa.limb[8])
	accum12 += bv * (uint64)(aa.limb[9])
	accum13 += bv * (uint64)(aa.limb[10])
	accum14 += bv * (uint64)(aa.limb[11])
	accum15 += bv * (uint64)(aa.limb[12])
	accum0 += bv * (uint64)(aa.limb[13])
	accum1 += bv * (uint64)(aa.limb[14])
	accum2 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-3)^(x448Limbs/2)] += aa.limb[x448Limbs-1-3]
	bv = (uint64)(b.limb[4])
	accum4 += bv * (uint64)(aa.limb[0])
	accum5 += bv * (uint64)(aa.limb[1])
	accum6 += bv * (uint64)(aa.limb[2])
	accum7 += bv * (uint64)(aa.limb[3])
	accum8 += bv * (uint64)(aa.limb[4])
	accum9 += bv * (uint64)(aa.limb[5])
	accum10 += bv * (uint64)(aa.limb[6])
	accum11 += bv * (uint64)(aa.limb[7])
	accum12 += bv * (uint64)(aa.limb[8])
	accum13 += bv * (uint64)(aa.limb[9])
	accum14 += bv * (uint64)(aa.limb[10])
	accum15 += bv * (uint64)(aa.limb[11])
	accum0 += bv * (uint64)(aa.limb[12])
	accum1 += bv * (uint64)(aa.limb[13])
	accum2 += bv * (uint64)(aa.limb[14])
	accum3 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-4)^(x448Limbs/2)] += aa.limb[x448Limbs-1-4]
	bv = (uint64)(b.limb[5])
	accum5 += bv * (uint64)(aa.limb[0])
	accum6 += bv * (uint64)(aa.limb[1])
	accum7 += bv * (uint64)(aa.limb[2])
	accum8 += bv * (uint64)(aa.limb[3])
	accum9 += bv * (uint64)(aa.limb[4])
	accum10 += bv * (uint64)(aa.limb[5])
	accum11 += bv * (uint64)(aa.limb[6])
	accum12 += bv * (uint64)(aa.limb[7])
	accum13 += bv * (uint64)(aa.limb[8])
	accum14 += bv * (uint64)(aa.limb[9])
	accum15 += bv * (uint64)(aa.limb[10])
	accum0 += bv * (uint64)(aa.limb[11])
	accum1 += bv * (uint64)(aa.limb[12])
	accum2 += bv * (uint64)(aa.limb[13])
	accum3 += bv * (uint64)(aa.limb[14])
	accum4 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-5)^(x448Limbs/2)] += aa.limb[x448Limbs-1-5]
	bv = (uint64)(b.limb[6])
	accum6 += bv * (uint64)(aa.limb[0])
	accum7 += bv * (uint64)(aa.limb[1])
	accum8 += bv * (uint64)(aa.limb[2])
	accum9 += bv * (uint64)(aa.limb[3])
	accum10 += bv * (uint64)(aa.limb[4])
	accum11 += bv * (uint64)(aa.limb[5])
	accum12 += bv * (uint64)(aa.limb[6])
	accum13 += bv * (uint64)(aa.limb[7])
	accum14 += bv * (uint64)(aa.limb[8])
	accum15 += bv * (uint64)(aa.limb[9])
	accum0 += bv * (uint64)(aa.limb[10])
	accum1 += bv * (uint64)(aa.limb[11])
	accum2 += bv * (uint64)(aa.limb[12])
	accum3 += bv * (uint64)(aa.limb[13])
	accum4 += bv * (uint64)(aa.limb[14])
	accum5 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-6)^(x448Limbs/2)] += aa.limb[x448Limbs-1-6]
	bv = (uint64)(b.limb[7])
	accum7 += bv * (uint64)(aa.limb[0])
	accum8 += bv * (uint64)(aa.limb[1])
	accum9 += bv * (uint64)(aa.limb[2])
	accum10 += bv * (uint64)(aa.limb[3])
	accum11 += bv * (uint64)(aa.limb[4])
	accum12 += bv * (uint64)(aa.limb[5])
	accum13 += bv * (uint64)(aa.limb[6])
	accum14 += bv * (uint64)(aa.limb[7])
	accum15 += bv * (uint64)(aa.limb[8])
	accum0 += bv * (uint64)(aa.limb[9])
	accum1 += bv * (uint64)(aa.limb[10])
	accum2 += bv * (uint64)(aa.limb[11])
	accum3 += bv * (uint64)(aa.limb[12])
	accum4 += bv * (uint64)(aa.limb[13])
	accum5 += bv * (uint64)(aa.limb[14])
	accum6 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-7)^(x448Limbs/2)] += aa.limb[x448Limbs-1-7]
	bv = (uint64)(b.limb[8])
	accum8 += bv * (uint64)(aa.limb[0])
	accum9 += bv * (uint64)(aa.limb[1])
	accum10 += bv * (uint64)(aa.limb[2])
	accum11 += bv * (uint64)(aa.limb[3])
	accum12 += bv * (uint64)(aa.limb[4])
	accum13 += bv * (uint64)(aa.limb[5])
	accum14 += bv * (uint64)(aa.limb[6])
	accum15 += bv * (uint64)(aa.limb[7])
	accum0 += bv * (uint64)(aa.limb[8])
	accum1 += bv * (uint64)(aa.limb[9])
	accum2 += bv * (uint64)(aa.limb[10])
	accum3 += bv * (uint64)(aa.limb[11])
	accum4 += bv * (uint64)(aa.limb[12])
	accum5 += bv * (uint64)(aa.limb[13])
	accum6 += bv * (uint64)(aa.limb[14])
	accum7 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-8)^(x448Limbs/2)] += aa.limb[x448Limbs-1-8]
	bv = (uint64)(b.limb[9])
	accum9 += bv * (uint64)(aa.limb[0])
	accum10 += bv * (uint64)(aa.limb[1])
	accum11 += bv * (uint64)(aa.limb[2])
	accum12 += bv * (uint64)(aa.limb[3])
	accum13 += bv * (uint64)(aa.limb[4])
	accum14 += bv * (uint64)(aa.limb[5])
	accum15 += bv * (uint64)(aa.limb[6])
	accum0 += bv * (uint64)(aa.limb[7])
	accum1 += bv * (uint64)(aa.limb[8])
	accum2 += bv * (uint64)(aa.limb[9])
	accum3 += bv * (uint64)(aa.limb[10])
	accum4 += bv * (uint64)(aa.limb[11])
	accum5 += bv * (uint64)(aa.limb[12])
	accum6 += bv * (uint64)(aa.limb[13])
	accum7 += bv * (uint64)(aa.limb[14])
	accum8 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-9)^(x448Limbs/2)] += aa.limb[x448Limbs-1-9]
	bv = (uint64)(b.limb[10])
	accum10 += bv * (uint64)(aa.limb[0])
	accum11 += bv * (uint64)(aa.limb[1])
	accum12 += bv * (uint64)(aa.limb[2])
	accum13 += bv * (uint64)(aa.limb[3])
	accum14 += bv * (uint64)(aa.limb[4])
	accum15 += bv * (uint64)(aa.limb[5])
	accum0 += bv * (uint64)(aa.limb[6])
	accum1 += bv * (uint64)(aa.limb[7])
	accum2 += bv * (uint64)(aa.limb[8])
	accum3 += bv * (uint64)(aa.limb[9])
	accum4 += bv * (uint64)(aa.limb[10])
	accum5 += bv * (uint64)(aa.limb[11])
	accum6 += bv * (uint64)(aa.limb[12])
	accum7 += bv * (uint64)(aa.limb[13])
	accum8 += bv * (uint64)(aa.limb[14])
	accum9 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-10)^(x448Limbs/2)] += aa.limb[x448Limbs-1-10]
	bv = (uint64)(b.limb[11])
	accum11 += bv * (uint64)(aa.limb[0])
	accum12 += bv * (uint64)(aa.limb[1])
	accum13 += bv * (uint64)(aa.limb[2])
	accum14 += bv * (uint64)(aa.limb[3])
	accum15 += bv * (uint64)(aa.limb[4])
	accum0 += bv * (uint64)(aa.limb[5])
	accum1 += bv * (uint64)(aa.limb[6])
	accum2 += bv * (uint64)(aa.limb[7])
	accum3 += bv * (uint64)(aa.limb[8])
	accum4 += bv * (uint64)(aa.limb[9])
	accum5 += bv * (uint64)(aa.limb[10])
	accum6 += bv * (uint64)(aa.limb[11])
	accum7 += bv * (uint64)(aa.limb[12])
	accum8 += bv * (uint64)(aa.limb[13])
	accum9 += bv * (uint64)(aa.limb[14])
	accum10 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-11)^(x448Limbs/2)] += aa.limb[x448Limbs-1-11]
	bv = (uint64)(b.limb[12])
	accum12 += bv * (uint64)(aa.limb[0])
	accum13 += bv * (uint64)(aa.limb[1])
	accum14 += bv * (uint64)(aa.limb[2])
	accum15 += bv * (uint64)(aa.limb[3])
	accum0 += bv * (uint64)(aa.limb[4])
	accum1 += bv * (uint64)(aa.limb[5])
	accum2 += bv * (uint64)(aa.limb[6])
	accum3 += bv * (uint64)(aa.limb[7])
	accum4 += bv * (uint64)(aa.limb[8])
	accum5 += bv * (uint64)(aa.limb[9])
	accum6 += bv * (uint64)(aa.limb[10])
	accum7 += bv * (uint64)(aa.limb[11])
	accum8 += bv * (uint64)(aa.limb[12])
	accum9 += bv * (uint64)(aa.limb[13])
	accum10 += bv * (uint64)(aa.limb[14])
	accum11 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-12)^(x448Limbs/2)] += aa.limb[x448Limbs-1-12]
	bv = (uint64)(b.limb[13])
	accum13 += bv * (uint64)(aa.limb[0])
	accum14 += bv * (uint64)(aa.limb[1])
	accum15 += bv * (uint64)(aa.limb[2])
	accum0 += bv * (uint64)(aa.limb[3])
	accum1 += bv * (uint64)(aa.limb[4])
	accum2 += bv * (uint64)(aa.limb[5])
	accum3 += bv * (uint64)(aa.limb[6])
	accum4 += bv * (uint64)(aa.limb[7])
	accum5 += bv * (uint64)(aa.limb[8])
	accum6 += bv * (uint64)(aa.limb[9])
	accum7 += bv * (uint64)(aa.limb[10])
	accum8 += bv * (uint64)(aa.limb[11])
	accum9 += bv * (uint64)(aa.limb[12])
	accum10 += bv * (uint64)(aa.limb[13])
	accum11 += bv * (uint64)(aa.limb[14])
	accum12 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-13)^(x448Limbs/2)] += aa.limb[x448Limbs-1-13]
	bv = (uint64)(b.limb[14])
	accum14 += bv * (uint64)(aa.limb[0])
	accum15 += bv * (uint64)(aa.limb[1])
	accum0 += bv * (uint64)(aa.limb[2])
	accum1 += bv * (uint64)(aa.limb[3])
	accum2 += bv * (uint64)(aa.limb[4])
	accum3 += bv * (uint64)(aa.limb[5])
	accum4 += bv * (uint64)(aa.limb[6])
	accum5 += bv * (uint64)(aa.limb[7])
	accum6 += bv * (uint64)(aa.limb[8])
	accum7 += bv * (uint64)(aa.limb[9])
	accum8 += bv * (uint64)(aa.limb[10])
	accum9 += bv * (uint64)(aa.limb[11])
	accum10 += bv * (uint64)(aa.limb[12])
	accum11 += bv * (uint64)(aa.limb[13])
	accum12 += bv * (uint64)(aa.limb[14])
	accum13 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-14)^(x448Limbs/2)] += aa.limb[x448Limbs-1-14]
	bv = (uint64)(b.limb[15])
	accum15 += bv * (uint64)(aa.limb[0])
	accum0 += bv * (uint64)(aa.limb[1])
	accum1 += bv * (uint64)(aa.limb[2])
	accum2 += bv * (uint64)(aa.limb[3])
	accum3 += bv * (uint64)(aa.limb[4])
	accum4 += bv * (uint64)(aa.limb[5])
	accum5 += bv * (uint64)(aa.limb[6])
	accum6 += bv * (uint64)(aa.limb[7])
	accum7 += bv * (uint64)(aa.limb[8])
	accum8 += bv * (uint64)(aa.limb[9])
	accum9 += bv * (uint64)(aa.limb[10])
	accum10 += bv * (uint64)(aa.limb[11])
	accum11 += bv * (uint64)(aa.limb[12])
	accum12 += bv * (uint64)(aa.limb[13])
	accum13 += bv * (uint64)(aa.limb[14])
	accum14 += bv * (uint64)(aa.limb[15])
	aa.limb[(x448Limbs-1-15)^(x448Limbs/2)] += aa.limb[x448Limbs-1-15]
	// Fold the carry out of the top limb back into the 2^224 position,
	// then run one full carry-propagation pass. Rolled form:
	// accum[x448Limbs-1] += accum[x448Limbs-2] >> lBits
	// accum[x448Limbs-2] &= lMask
	// accum[x448Limbs/2] += accum[x448Limbs-1] >> lBits
	accum15 += accum14 >> lBits
	accum14 &= lMask
	accum8 += accum15 >> lBits
	// for j := uint(0); j < x448Limbs; j++ {
	// accum[j] += accum[(j-1)%x448Limbs] >> lBits
	// accum[(j-1)%x448Limbs] &= lMask
	// }
	accum0 += accum15 >> lBits
	accum15 &= lMask
	accum1 += accum0 >> lBits
	accum0 &= lMask
	accum2 += accum1 >> lBits
	accum1 &= lMask
	accum3 += accum2 >> lBits
	accum2 &= lMask
	accum4 += accum3 >> lBits
	accum3 &= lMask
	accum5 += accum4 >> lBits
	accum4 &= lMask
	accum6 += accum5 >> lBits
	accum5 &= lMask
	accum7 += accum6 >> lBits
	accum6 &= lMask
	accum8 += accum7 >> lBits
	accum7 &= lMask
	accum9 += accum8 >> lBits
	accum8 &= lMask
	accum10 += accum9 >> lBits
	accum9 &= lMask
	accum11 += accum10 >> lBits
	accum10 &= lMask
	accum12 += accum11 >> lBits
	accum11 &= lMask
	accum13 += accum12 >> lBits
	accum12 &= lMask
	accum14 += accum13 >> lBits
	accum13 &= lMask
	accum15 += accum14 >> lBits
	accum14 &= lMask
	// Write the (weakly reduced) result; each accum now fits a limb.
	// for j, accv := range accum {
	// c.limb[j] = (uint32)(accv)
	// }
	c.limb[0] = (uint32)(accum0)
	c.limb[1] = (uint32)(accum1)
	c.limb[2] = (uint32)(accum2)
	c.limb[3] = (uint32)(accum3)
	c.limb[4] = (uint32)(accum4)
	c.limb[5] = (uint32)(accum5)
	c.limb[6] = (uint32)(accum6)
	c.limb[7] = (uint32)(accum7)
	c.limb[8] = (uint32)(accum8)
	c.limb[9] = (uint32)(accum9)
	c.limb[10] = (uint32)(accum10)
	c.limb[11] = (uint32)(accum11)
	c.limb[12] = (uint32)(accum12)
	c.limb[13] = (uint32)(accum13)
	c.limb[14] = (uint32)(accum14)
	c.limb[15] = (uint32)(accum15)
}
// sqr squares (c = x * x). Just calls multiply; a dedicated squaring
// routine could skip roughly half the partial products. (PERF)
func (c *gf) sqr(x *gf) {
	c.mul(x, x)
}
  428. // isqrt inverse square roots (y = 1/sqrt(x)), using an addition chain.
  429. func (y *gf) isqrt(x *gf) {
  430. var a, b, c gf
  431. c.sqr(x)
  432. // XXX/Yawning, could unroll, but this is called only once.
  433. // STEP(b,x,1);
  434. b.mul(x, &c)
  435. c.cpy(&b)
  436. for i := 0; i < 1; i++ {
  437. c.sqr(&c)
  438. }
  439. // STEP(b,x,3);
  440. b.mul(x, &c)
  441. c.cpy(&b)
  442. for i := 0; i < 3; i++ {
  443. c.sqr(&c)
  444. }
  445. //STEP(a,b,3);
  446. a.mul(&b, &c)
  447. c.cpy(&a)
  448. for i := 0; i < 3; i++ {
  449. c.sqr(&c)
  450. }
  451. // STEP(a,b,9);
  452. a.mul(&b, &c)
  453. c.cpy(&a)
  454. for i := 0; i < 9; i++ {
  455. c.sqr(&c)
  456. }
  457. // STEP(b,a,1);
  458. b.mul(&a, &c)
  459. c.cpy(&b)
  460. for i := 0; i < 1; i++ {
  461. c.sqr(&c)
  462. }
  463. // STEP(a,x,18);
  464. a.mul(x, &c)
  465. c.cpy(&a)
  466. for i := 0; i < 18; i++ {
  467. c.sqr(&c)
  468. }
  469. // STEP(a,b,37);
  470. a.mul(&b, &c)
  471. c.cpy(&a)
  472. for i := 0; i < 37; i++ {
  473. c.sqr(&c)
  474. }
  475. // STEP(b,a,37);
  476. b.mul(&a, &c)
  477. c.cpy(&b)
  478. for i := 0; i < 37; i++ {
  479. c.sqr(&c)
  480. }
  481. // STEP(b,a,111);
  482. b.mul(&a, &c)
  483. c.cpy(&b)
  484. for i := 0; i < 111; i++ {
  485. c.sqr(&c)
  486. }
  487. // STEP(a,b,1);
  488. a.mul(&b, &c)
  489. c.cpy(&a)
  490. for i := 0; i < 1; i++ {
  491. c.sqr(&c)
  492. }
  493. // STEP(b,x,223);
  494. b.mul(x, &c)
  495. c.cpy(&b)
  496. for i := 0; i < 223; i++ {
  497. c.sqr(&c)
  498. }
  499. y.mul(&a, &c)
  500. }
  501. // inv inverses (y = 1/x).
  502. func (y *gf) inv(x *gf) {
  503. var z, w gf
  504. z.sqr(x) // x^2
  505. w.isqrt(&z) // +- 1/sqrt(x^2) = +- 1/x
  506. z.sqr(&w) // 1/x^2
  507. w.mul(x, &z) // 1/x
  508. y.cpy(&w)
  509. }
// reduce weakly reduces mod p: the carry out of the top limb is folded
// into limb 8 (the 2^224 position), then one full carry chain restores
// the 28-bit bound on every limb. The result may still be >= p; use
// canon for a unique representative.
func (x *gf) reduce() {
	x.limb[x448Limbs/2] += x.limb[x448Limbs-1] >> lBits
	// Unrolled carry chain; rolled form:
	// for j := uint(0); j < x448Limbs; j++ {
	// x.limb[j] += x.limb[(j-1)%x448Limbs] >> lBits
	// x.limb[(j-1)%x448Limbs] &= lMask
	// }
	x.limb[0] += x.limb[15] >> lBits
	x.limb[15] &= lMask
	x.limb[1] += x.limb[0] >> lBits
	x.limb[0] &= lMask
	x.limb[2] += x.limb[1] >> lBits
	x.limb[1] &= lMask
	x.limb[3] += x.limb[2] >> lBits
	x.limb[2] &= lMask
	x.limb[4] += x.limb[3] >> lBits
	x.limb[3] &= lMask
	x.limb[5] += x.limb[4] >> lBits
	x.limb[4] &= lMask
	x.limb[6] += x.limb[5] >> lBits
	x.limb[5] &= lMask
	x.limb[7] += x.limb[6] >> lBits
	x.limb[6] &= lMask
	x.limb[8] += x.limb[7] >> lBits
	x.limb[7] &= lMask
	x.limb[9] += x.limb[8] >> lBits
	x.limb[8] &= lMask
	x.limb[10] += x.limb[9] >> lBits
	x.limb[9] &= lMask
	x.limb[11] += x.limb[10] >> lBits
	x.limb[10] &= lMask
	x.limb[12] += x.limb[11] >> lBits
	x.limb[11] &= lMask
	x.limb[13] += x.limb[12] >> lBits
	x.limb[12] &= lMask
	x.limb[14] += x.limb[13] >> lBits
	x.limb[13] &= lMask
	x.limb[15] += x.limb[14] >> lBits
	x.limb[14] &= lMask
}
// add adds mod p (x = y + z). Conservatively always weak-reduces. (PERF)
//
// Limbs carry 28 bits in 32-bit words, so a single addition cannot
// overflow; reduce restores the limb bound afterwards.
func (x *gf) add(y, z *gf) {
	// Unrolled form of:
	// for i, yv := range y.limb {
	// x.limb[i] = yv + z.limb[i]
	// }
	x.limb[0] = y.limb[0] + z.limb[0]
	x.limb[1] = y.limb[1] + z.limb[1]
	x.limb[2] = y.limb[2] + z.limb[2]
	x.limb[3] = y.limb[3] + z.limb[3]
	x.limb[4] = y.limb[4] + z.limb[4]
	x.limb[5] = y.limb[5] + z.limb[5]
	x.limb[6] = y.limb[6] + z.limb[6]
	x.limb[7] = y.limb[7] + z.limb[7]
	x.limb[8] = y.limb[8] + z.limb[8]
	x.limb[9] = y.limb[9] + z.limb[9]
	x.limb[10] = y.limb[10] + z.limb[10]
	x.limb[11] = y.limb[11] + z.limb[11]
	x.limb[12] = y.limb[12] + z.limb[12]
	x.limb[13] = y.limb[13] + z.limb[13]
	x.limb[14] = y.limb[14] + z.limb[14]
	x.limb[15] = y.limb[15] + z.limb[15]
	x.reduce()
}
// sub subtracts mod p (x = y - z + 2p). Adding 2p per limb keeps every
// limb non-negative without branching on the sign; reduce restores the
// limb bound. Conservatively always weak-reduces. (PERF)
func (x *gf) sub(y, z *gf) {
	// Unrolled form of:
	// for i, yv := range y.limb {
	// x.limb[i] = yv - z.limb[i] + 2*p.limb[i]
	// }
	x.limb[0] = y.limb[0] - z.limb[0] + 2*lMask
	x.limb[1] = y.limb[1] - z.limb[1] + 2*lMask
	x.limb[2] = y.limb[2] - z.limb[2] + 2*lMask
	x.limb[3] = y.limb[3] - z.limb[3] + 2*lMask
	x.limb[4] = y.limb[4] - z.limb[4] + 2*lMask
	x.limb[5] = y.limb[5] - z.limb[5] + 2*lMask
	x.limb[6] = y.limb[6] - z.limb[6] + 2*lMask
	x.limb[7] = y.limb[7] - z.limb[7] + 2*lMask
	// Limb 8 is the 2^224 position, where p.limb[8] = lMask - 1.
	x.limb[8] = y.limb[8] - z.limb[8] + 2*(lMask-1)
	x.limb[9] = y.limb[9] - z.limb[9] + 2*lMask
	x.limb[10] = y.limb[10] - z.limb[10] + 2*lMask
	x.limb[11] = y.limb[11] - z.limb[11] + 2*lMask
	x.limb[12] = y.limb[12] - z.limb[12] + 2*lMask
	x.limb[13] = y.limb[13] - z.limb[13] + 2*lMask
	x.limb[14] = y.limb[14] - z.limb[14] + 2*lMask
	x.limb[15] = y.limb[15] - z.limb[15] + 2*lMask
	x.reduce()
}
// condSwap swaps x and y in constant time. swap is used as an AND mask,
// so it must be either 0 (no swap) or all ones (swap) — the XOR-swap of
// each limb pair is masked by it.
func (x *gf) condSwap(y *gf, swap limbUint) {
	// Unrolled form of:
	// for i, xv := range x.limb {
	// s := (xv ^ y.limb[i]) & (uint32)(swap)
	// x.limb[i] ^= s
	// y.limb[i] ^= s
	// }
	var s uint32
	s = (x.limb[0] ^ y.limb[0]) & (uint32)(swap)
	x.limb[0] ^= s
	y.limb[0] ^= s
	s = (x.limb[1] ^ y.limb[1]) & (uint32)(swap)
	x.limb[1] ^= s
	y.limb[1] ^= s
	s = (x.limb[2] ^ y.limb[2]) & (uint32)(swap)
	x.limb[2] ^= s
	y.limb[2] ^= s
	s = (x.limb[3] ^ y.limb[3]) & (uint32)(swap)
	x.limb[3] ^= s
	y.limb[3] ^= s
	s = (x.limb[4] ^ y.limb[4]) & (uint32)(swap)
	x.limb[4] ^= s
	y.limb[4] ^= s
	s = (x.limb[5] ^ y.limb[5]) & (uint32)(swap)
	x.limb[5] ^= s
	y.limb[5] ^= s
	s = (x.limb[6] ^ y.limb[6]) & (uint32)(swap)
	x.limb[6] ^= s
	y.limb[6] ^= s
	s = (x.limb[7] ^ y.limb[7]) & (uint32)(swap)
	x.limb[7] ^= s
	y.limb[7] ^= s
	s = (x.limb[8] ^ y.limb[8]) & (uint32)(swap)
	x.limb[8] ^= s
	y.limb[8] ^= s
	s = (x.limb[9] ^ y.limb[9]) & (uint32)(swap)
	x.limb[9] ^= s
	y.limb[9] ^= s
	s = (x.limb[10] ^ y.limb[10]) & (uint32)(swap)
	x.limb[10] ^= s
	y.limb[10] ^= s
	s = (x.limb[11] ^ y.limb[11]) & (uint32)(swap)
	x.limb[11] ^= s
	y.limb[11] ^= s
	s = (x.limb[12] ^ y.limb[12]) & (uint32)(swap)
	x.limb[12] ^= s
	y.limb[12] ^= s
	s = (x.limb[13] ^ y.limb[13]) & (uint32)(swap)
	x.limb[13] ^= s
	y.limb[13] ^= s
	s = (x.limb[14] ^ y.limb[14]) & (uint32)(swap)
	x.limb[14] ^= s
	y.limb[14] ^= s
	s = (x.limb[15] ^ y.limb[15]) & (uint32)(swap)
	x.limb[15] ^= s
	y.limb[15] ^= s
}
  653. // mlw multiplies by a signed int. NOT CONSTANT TIME wrt the sign of the int,
  654. // but that's ok because it's only ever called with w = -edwardsD. Just uses
  655. // a full multiply. (PERF)
  656. func (a *gf) mlw(b *gf, w int) {
  657. if w > 0 {
  658. ww := gf{[x448Limbs]uint32{(uint32)(w)}}
  659. a.mul(b, &ww)
  660. } else {
  661. // This branch is *NEVER* taken with the current code.
  662. panic("mul called with negative w")
  663. ww := gf{[x448Limbs]uint32{(uint32)(-w)}}
  664. a.mul(b, &ww)
  665. a.sub(&zero, a)
  666. }
  667. }
// canon canonicalizes a into the unique representative in [0, p),
// in constant time.
func (a *gf) canon() {
	a.reduce()
	// Subtract p with borrow.
	var carry int64
	for i, v := range a.limb {
		carry = carry + (int64)(v) - (int64)(p.limb[i])
		a.limb[i] = (uint32)(carry & lMask)
		carry >>= lBits
	}
	// addback is 0 if a >= p (keep the subtraction) or -1 if the
	// subtraction borrowed; uint32(-1) is an all-ones mask, so the loop
	// below adds back either p or 0 without branching.
	addback := carry
	carry = 0
	// Add it back.
	for i, v := range a.limb {
		carry = carry + (int64)(v) + (int64)(p.limb[i]&(uint32)(addback))
		a.limb[i] = uint32(carry & lMask)
		carry >>= lBits
	}
}
// deser deserializes a little-endian byte string into the 28-bit limb
// representation.
func (s *gf) deser(ser *[x448Bytes]byte) {
	var buf uint64  // bit-staging buffer
	bits := uint(0) // number of valid bits currently in buf
	k := 0          // next limb to fill
	for i, v := range ser {
		buf |= (uint64)(v) << bits
		// Drain whole lBits-sized limbs out of the staging buffer
		// (plus whatever remains after the final byte).
		for bits += 8; (bits >= lBits || i == x448Bytes-1) && k < x448Limbs; bits, buf = bits-lBits, buf>>lBits {
			s.limb[k] = (uint32)(buf & lMask)
			k++
		}
	}
}
// ser serializes into the little-endian byte representation,
// canonicalizing first so the output is unique.
func (a *gf) ser(ser *[x448Bytes]byte) {
	a.canon()
	k := 0          // next output byte to write
	bits := uint(0) // number of valid bits currently in buf
	var buf uint64  // bit-staging buffer
	for i, v := range a.limb {
		buf |= (uint64)(v) << bits
		// Drain whole bytes out of the staging buffer (plus whatever
		// remains after the final limb).
		for bits += lBits; (bits >= 8 || i == x448Limbs-1) && k < x448Bytes; bits, buf = bits-8, buf>>8 {
			ser[k] = (byte)(buf)
			k++
		}
	}
}
  714. func init() {
  715. if x448Limbs != 16 {
  716. panic("x448Limbs != 16, unrolled loops likely broken")
  717. }
  718. }