First pass at making it go fast.

The optimizations in this commit are worth roughly a 5x performance
increase, bringing the runtime of a scalar mult down to about 1 ms on my
mobile Haswell.
Yawning Angel, 4 years ago
commit 6c2f60b02b

1 changed file with 528 additions and 44 deletions:
  x448_ref.go (+528 -44)
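
The timing claim above is easy to re-check with Go's benchmark harness. A minimal sketch follows; ScalarMult is a hypothetical stand-in for whatever entry point this package actually exports, so adjust the name and signature to match.

	// In a _test.go file. ScalarMult and its signature are assumptions,
	// not this package's confirmed API.
	package x448

	import "testing"

	func BenchmarkScalarMult(b *testing.B) {
		var out, scalar, base [x448Bytes]byte
		scalar[0] = 3 // arbitrary scalar bytes
		base[0] = 9   // arbitrary point encoding

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ScalarMult(&out, &scalar, &base) // hypothetical name and signature
		}
	}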

@@ -56,9 +56,11 @@ var p = gf{[x448Limbs]uint32{
 
 // cpy copies x = y.
 func (x *gf) cpy(y *gf) {
-	for i, v := range y.limb { // XXX: Unroll
-		x.limb[i] = v
-	}
+	// for i, v := range y.limb {
+	//	x.limb[i] = v
+	// }
+
+	copy(x.limb[:], y.limb[:])
 }
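
The gain here comes from the builtin: for a fixed-size array, copy typically lowers to a single bulk memory move instead of a bounds-checked element loop. A micro-benchmark sketch of the two variants, with illustrative names, in a _test.go file:

	package x448

	import "testing"

	const nLimbs = 16 // matches x448Limbs

	func copyRange(dst, src *[nLimbs]uint32) {
		for i, v := range src { // per-element loop, as before this commit
			dst[i] = v
		}
	}

	func copyBuiltin(dst, src *[nLimbs]uint32) {
		copy(dst[:], src[:]) // bulk move, as after this commit
	}

	func BenchmarkCopyRange(b *testing.B) {
		var dst, src [nLimbs]uint32
		for i := 0; i < b.N; i++ {
			copyRange(&dst, &src)
		}
	}

	func BenchmarkCopyBuiltin(b *testing.B) {
		var dst, src [nLimbs]uint32
		for i := 0; i < b.N; i++ {
			copyBuiltin(&dst, &src)
		}
	}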
 
 // mul multiplies c = a * b. (PERF)
@@ -66,24 +68,389 @@ func (c *gf) mul(a, b *gf) {
 	var aa gf
 	aa.cpy(a)
 
-	var accum [x448Limbs]uint64
-	for i, bv := range b.limb { // XXX: Unroll
-		for j, aav := range aa.limb { // XXX: Unroll
-			accum[(i+j)%x448Limbs] += (uint64)(bv) * (uint64)(aav)
-		}
-		aa.limb[(x448Limbs-1-i)^(x448Limbs/2)] += aa.limb[x448Limbs-1-i]
-	}
-
-	accum[x448Limbs-1] += accum[x448Limbs-2] >> lBits
-	accum[x448Limbs-2] &= lMask
-	accum[x448Limbs/2] += accum[x448Limbs-1] >> lBits
-	for j := uint(0); j < x448Limbs; j++ { // XXX: Unroll
-		accum[j] += accum[(j-1)%x448Limbs] >> lBits
-		accum[(j-1)%x448Limbs] &= lMask
-	}
-	for j, accv := range accum { // XXX: Unroll
-		c.limb[j] = (uint32)(accv)
-	}
+	//
+	// This is *by far* the most CPU intensive routine in the code.
+	//
+
+	// var accum [x448Limbs]uint64
+	// for i, bv := range b.limb {
+	//	for j, aav := range aa.limb {
+	//		accum[(i+j)%x448Limbs] += (uint64)(bv) * (uint64)(aav)
+	//	}
+	//	aa.limb[(x448Limbs-1-i)^(x448Limbs/2)] += aa.limb[x448Limbs-1-i]
+	// }
+
+	// Absurd as it seems, manually unrolling all of this is a fairly massive gain.
+	var accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7, accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15 uint64
+	var bv uint64
+
+	bv = (uint64)(b.limb[0])
+	accum0 += bv * (uint64)(aa.limb[0])
+	accum1 += bv * (uint64)(aa.limb[1])
+	accum2 += bv * (uint64)(aa.limb[2])
+	accum3 += bv * (uint64)(aa.limb[3])
+	accum4 += bv * (uint64)(aa.limb[4])
+	accum5 += bv * (uint64)(aa.limb[5])
+	accum6 += bv * (uint64)(aa.limb[6])
+	accum7 += bv * (uint64)(aa.limb[7])
+	accum8 += bv * (uint64)(aa.limb[8])
+	accum9 += bv * (uint64)(aa.limb[9])
+	accum10 += bv * (uint64)(aa.limb[10])
+	accum11 += bv * (uint64)(aa.limb[11])
+	accum12 += bv * (uint64)(aa.limb[12])
+	accum13 += bv * (uint64)(aa.limb[13])
+	accum14 += bv * (uint64)(aa.limb[14])
+	accum15 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-0)^(x448Limbs/2)] += aa.limb[x448Limbs-1-0]
+
+	bv = (uint64)(b.limb[1])
+	accum1 += bv * (uint64)(aa.limb[0])
+	accum2 += bv * (uint64)(aa.limb[1])
+	accum3 += bv * (uint64)(aa.limb[2])
+	accum4 += bv * (uint64)(aa.limb[3])
+	accum5 += bv * (uint64)(aa.limb[4])
+	accum6 += bv * (uint64)(aa.limb[5])
+	accum7 += bv * (uint64)(aa.limb[6])
+	accum8 += bv * (uint64)(aa.limb[7])
+	accum9 += bv * (uint64)(aa.limb[8])
+	accum10 += bv * (uint64)(aa.limb[9])
+	accum11 += bv * (uint64)(aa.limb[10])
+	accum12 += bv * (uint64)(aa.limb[11])
+	accum13 += bv * (uint64)(aa.limb[12])
+	accum14 += bv * (uint64)(aa.limb[13])
+	accum15 += bv * (uint64)(aa.limb[14])
+	accum0 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-1)^(x448Limbs/2)] += aa.limb[x448Limbs-1-1]
+
+	bv = (uint64)(b.limb[2])
+	accum2 += bv * (uint64)(aa.limb[0])
+	accum3 += bv * (uint64)(aa.limb[1])
+	accum4 += bv * (uint64)(aa.limb[2])
+	accum5 += bv * (uint64)(aa.limb[3])
+	accum6 += bv * (uint64)(aa.limb[4])
+	accum7 += bv * (uint64)(aa.limb[5])
+	accum8 += bv * (uint64)(aa.limb[6])
+	accum9 += bv * (uint64)(aa.limb[7])
+	accum10 += bv * (uint64)(aa.limb[8])
+	accum11 += bv * (uint64)(aa.limb[9])
+	accum12 += bv * (uint64)(aa.limb[10])
+	accum13 += bv * (uint64)(aa.limb[11])
+	accum14 += bv * (uint64)(aa.limb[12])
+	accum15 += bv * (uint64)(aa.limb[13])
+	accum0 += bv * (uint64)(aa.limb[14])
+	accum1 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-2)^(x448Limbs/2)] += aa.limb[x448Limbs-1-2]
+
+	bv = (uint64)(b.limb[3])
+	accum3 += bv * (uint64)(aa.limb[0])
+	accum4 += bv * (uint64)(aa.limb[1])
+	accum5 += bv * (uint64)(aa.limb[2])
+	accum6 += bv * (uint64)(aa.limb[3])
+	accum7 += bv * (uint64)(aa.limb[4])
+	accum8 += bv * (uint64)(aa.limb[5])
+	accum9 += bv * (uint64)(aa.limb[6])
+	accum10 += bv * (uint64)(aa.limb[7])
+	accum11 += bv * (uint64)(aa.limb[8])
+	accum12 += bv * (uint64)(aa.limb[9])
+	accum13 += bv * (uint64)(aa.limb[10])
+	accum14 += bv * (uint64)(aa.limb[11])
+	accum15 += bv * (uint64)(aa.limb[12])
+	accum0 += bv * (uint64)(aa.limb[13])
+	accum1 += bv * (uint64)(aa.limb[14])
+	accum2 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-3)^(x448Limbs/2)] += aa.limb[x448Limbs-1-3]
+
+	bv = (uint64)(b.limb[4])
+	accum4 += bv * (uint64)(aa.limb[0])
+	accum5 += bv * (uint64)(aa.limb[1])
+	accum6 += bv * (uint64)(aa.limb[2])
+	accum7 += bv * (uint64)(aa.limb[3])
+	accum8 += bv * (uint64)(aa.limb[4])
+	accum9 += bv * (uint64)(aa.limb[5])
+	accum10 += bv * (uint64)(aa.limb[6])
+	accum11 += bv * (uint64)(aa.limb[7])
+	accum12 += bv * (uint64)(aa.limb[8])
+	accum13 += bv * (uint64)(aa.limb[9])
+	accum14 += bv * (uint64)(aa.limb[10])
+	accum15 += bv * (uint64)(aa.limb[11])
+	accum0 += bv * (uint64)(aa.limb[12])
+	accum1 += bv * (uint64)(aa.limb[13])
+	accum2 += bv * (uint64)(aa.limb[14])
+	accum3 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-4)^(x448Limbs/2)] += aa.limb[x448Limbs-1-4]
+
+	bv = (uint64)(b.limb[5])
+	accum5 += bv * (uint64)(aa.limb[0])
+	accum6 += bv * (uint64)(aa.limb[1])
+	accum7 += bv * (uint64)(aa.limb[2])
+	accum8 += bv * (uint64)(aa.limb[3])
+	accum9 += bv * (uint64)(aa.limb[4])
+	accum10 += bv * (uint64)(aa.limb[5])
+	accum11 += bv * (uint64)(aa.limb[6])
+	accum12 += bv * (uint64)(aa.limb[7])
+	accum13 += bv * (uint64)(aa.limb[8])
+	accum14 += bv * (uint64)(aa.limb[9])
+	accum15 += bv * (uint64)(aa.limb[10])
+	accum0 += bv * (uint64)(aa.limb[11])
+	accum1 += bv * (uint64)(aa.limb[12])
+	accum2 += bv * (uint64)(aa.limb[13])
+	accum3 += bv * (uint64)(aa.limb[14])
+	accum4 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-5)^(x448Limbs/2)] += aa.limb[x448Limbs-1-5]
+
+	bv = (uint64)(b.limb[6])
+	accum6 += bv * (uint64)(aa.limb[0])
+	accum7 += bv * (uint64)(aa.limb[1])
+	accum8 += bv * (uint64)(aa.limb[2])
+	accum9 += bv * (uint64)(aa.limb[3])
+	accum10 += bv * (uint64)(aa.limb[4])
+	accum11 += bv * (uint64)(aa.limb[5])
+	accum12 += bv * (uint64)(aa.limb[6])
+	accum13 += bv * (uint64)(aa.limb[7])
+	accum14 += bv * (uint64)(aa.limb[8])
+	accum15 += bv * (uint64)(aa.limb[9])
+	accum0 += bv * (uint64)(aa.limb[10])
+	accum1 += bv * (uint64)(aa.limb[11])
+	accum2 += bv * (uint64)(aa.limb[12])
+	accum3 += bv * (uint64)(aa.limb[13])
+	accum4 += bv * (uint64)(aa.limb[14])
+	accum5 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-6)^(x448Limbs/2)] += aa.limb[x448Limbs-1-6]
+
+	bv = (uint64)(b.limb[7])
+	accum7 += bv * (uint64)(aa.limb[0])
+	accum8 += bv * (uint64)(aa.limb[1])
+	accum9 += bv * (uint64)(aa.limb[2])
+	accum10 += bv * (uint64)(aa.limb[3])
+	accum11 += bv * (uint64)(aa.limb[4])
+	accum12 += bv * (uint64)(aa.limb[5])
+	accum13 += bv * (uint64)(aa.limb[6])
+	accum14 += bv * (uint64)(aa.limb[7])
+	accum15 += bv * (uint64)(aa.limb[8])
+	accum0 += bv * (uint64)(aa.limb[9])
+	accum1 += bv * (uint64)(aa.limb[10])
+	accum2 += bv * (uint64)(aa.limb[11])
+	accum3 += bv * (uint64)(aa.limb[12])
+	accum4 += bv * (uint64)(aa.limb[13])
+	accum5 += bv * (uint64)(aa.limb[14])
+	accum6 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-7)^(x448Limbs/2)] += aa.limb[x448Limbs-1-7]
+
+	bv = (uint64)(b.limb[8])
+	accum8 += bv * (uint64)(aa.limb[0])
+	accum9 += bv * (uint64)(aa.limb[1])
+	accum10 += bv * (uint64)(aa.limb[2])
+	accum11 += bv * (uint64)(aa.limb[3])
+	accum12 += bv * (uint64)(aa.limb[4])
+	accum13 += bv * (uint64)(aa.limb[5])
+	accum14 += bv * (uint64)(aa.limb[6])
+	accum15 += bv * (uint64)(aa.limb[7])
+	accum0 += bv * (uint64)(aa.limb[8])
+	accum1 += bv * (uint64)(aa.limb[9])
+	accum2 += bv * (uint64)(aa.limb[10])
+	accum3 += bv * (uint64)(aa.limb[11])
+	accum4 += bv * (uint64)(aa.limb[12])
+	accum5 += bv * (uint64)(aa.limb[13])
+	accum6 += bv * (uint64)(aa.limb[14])
+	accum7 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-8)^(x448Limbs/2)] += aa.limb[x448Limbs-1-8]
+
+	bv = (uint64)(b.limb[9])
+	accum9 += bv * (uint64)(aa.limb[0])
+	accum10 += bv * (uint64)(aa.limb[1])
+	accum11 += bv * (uint64)(aa.limb[2])
+	accum12 += bv * (uint64)(aa.limb[3])
+	accum13 += bv * (uint64)(aa.limb[4])
+	accum14 += bv * (uint64)(aa.limb[5])
+	accum15 += bv * (uint64)(aa.limb[6])
+	accum0 += bv * (uint64)(aa.limb[7])
+	accum1 += bv * (uint64)(aa.limb[8])
+	accum2 += bv * (uint64)(aa.limb[9])
+	accum3 += bv * (uint64)(aa.limb[10])
+	accum4 += bv * (uint64)(aa.limb[11])
+	accum5 += bv * (uint64)(aa.limb[12])
+	accum6 += bv * (uint64)(aa.limb[13])
+	accum7 += bv * (uint64)(aa.limb[14])
+	accum8 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-9)^(x448Limbs/2)] += aa.limb[x448Limbs-1-9]
+
+	bv = (uint64)(b.limb[10])
+	accum10 += bv * (uint64)(aa.limb[0])
+	accum11 += bv * (uint64)(aa.limb[1])
+	accum12 += bv * (uint64)(aa.limb[2])
+	accum13 += bv * (uint64)(aa.limb[3])
+	accum14 += bv * (uint64)(aa.limb[4])
+	accum15 += bv * (uint64)(aa.limb[5])
+	accum0 += bv * (uint64)(aa.limb[6])
+	accum1 += bv * (uint64)(aa.limb[7])
+	accum2 += bv * (uint64)(aa.limb[8])
+	accum3 += bv * (uint64)(aa.limb[9])
+	accum4 += bv * (uint64)(aa.limb[10])
+	accum5 += bv * (uint64)(aa.limb[11])
+	accum6 += bv * (uint64)(aa.limb[12])
+	accum7 += bv * (uint64)(aa.limb[13])
+	accum8 += bv * (uint64)(aa.limb[14])
+	accum9 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-10)^(x448Limbs/2)] += aa.limb[x448Limbs-1-10]
+
+	bv = (uint64)(b.limb[11])
+	accum11 += bv * (uint64)(aa.limb[0])
+	accum12 += bv * (uint64)(aa.limb[1])
+	accum13 += bv * (uint64)(aa.limb[2])
+	accum14 += bv * (uint64)(aa.limb[3])
+	accum15 += bv * (uint64)(aa.limb[4])
+	accum0 += bv * (uint64)(aa.limb[5])
+	accum1 += bv * (uint64)(aa.limb[6])
+	accum2 += bv * (uint64)(aa.limb[7])
+	accum3 += bv * (uint64)(aa.limb[8])
+	accum4 += bv * (uint64)(aa.limb[9])
+	accum5 += bv * (uint64)(aa.limb[10])
+	accum6 += bv * (uint64)(aa.limb[11])
+	accum7 += bv * (uint64)(aa.limb[12])
+	accum8 += bv * (uint64)(aa.limb[13])
+	accum9 += bv * (uint64)(aa.limb[14])
+	accum10 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-11)^(x448Limbs/2)] += aa.limb[x448Limbs-1-11]
+
+	bv = (uint64)(b.limb[12])
+	accum12 += bv * (uint64)(aa.limb[0])
+	accum13 += bv * (uint64)(aa.limb[1])
+	accum14 += bv * (uint64)(aa.limb[2])
+	accum15 += bv * (uint64)(aa.limb[3])
+	accum0 += bv * (uint64)(aa.limb[4])
+	accum1 += bv * (uint64)(aa.limb[5])
+	accum2 += bv * (uint64)(aa.limb[6])
+	accum3 += bv * (uint64)(aa.limb[7])
+	accum4 += bv * (uint64)(aa.limb[8])
+	accum5 += bv * (uint64)(aa.limb[9])
+	accum6 += bv * (uint64)(aa.limb[10])
+	accum7 += bv * (uint64)(aa.limb[11])
+	accum8 += bv * (uint64)(aa.limb[12])
+	accum9 += bv * (uint64)(aa.limb[13])
+	accum10 += bv * (uint64)(aa.limb[14])
+	accum11 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-12)^(x448Limbs/2)] += aa.limb[x448Limbs-1-12]
+
+	bv = (uint64)(b.limb[13])
+	accum13 += bv * (uint64)(aa.limb[0])
+	accum14 += bv * (uint64)(aa.limb[1])
+	accum15 += bv * (uint64)(aa.limb[2])
+	accum0 += bv * (uint64)(aa.limb[3])
+	accum1 += bv * (uint64)(aa.limb[4])
+	accum2 += bv * (uint64)(aa.limb[5])
+	accum3 += bv * (uint64)(aa.limb[6])
+	accum4 += bv * (uint64)(aa.limb[7])
+	accum5 += bv * (uint64)(aa.limb[8])
+	accum6 += bv * (uint64)(aa.limb[9])
+	accum7 += bv * (uint64)(aa.limb[10])
+	accum8 += bv * (uint64)(aa.limb[11])
+	accum9 += bv * (uint64)(aa.limb[12])
+	accum10 += bv * (uint64)(aa.limb[13])
+	accum11 += bv * (uint64)(aa.limb[14])
+	accum12 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-13)^(x448Limbs/2)] += aa.limb[x448Limbs-1-13]
+
+	bv = (uint64)(b.limb[14])
+	accum14 += bv * (uint64)(aa.limb[0])
+	accum15 += bv * (uint64)(aa.limb[1])
+	accum0 += bv * (uint64)(aa.limb[2])
+	accum1 += bv * (uint64)(aa.limb[3])
+	accum2 += bv * (uint64)(aa.limb[4])
+	accum3 += bv * (uint64)(aa.limb[5])
+	accum4 += bv * (uint64)(aa.limb[6])
+	accum5 += bv * (uint64)(aa.limb[7])
+	accum6 += bv * (uint64)(aa.limb[8])
+	accum7 += bv * (uint64)(aa.limb[9])
+	accum8 += bv * (uint64)(aa.limb[10])
+	accum9 += bv * (uint64)(aa.limb[11])
+	accum10 += bv * (uint64)(aa.limb[12])
+	accum11 += bv * (uint64)(aa.limb[13])
+	accum12 += bv * (uint64)(aa.limb[14])
+	accum13 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-14)^(x448Limbs/2)] += aa.limb[x448Limbs-1-14]
+
+	bv = (uint64)(b.limb[15])
+	accum15 += bv * (uint64)(aa.limb[0])
+	accum0 += bv * (uint64)(aa.limb[1])
+	accum1 += bv * (uint64)(aa.limb[2])
+	accum2 += bv * (uint64)(aa.limb[3])
+	accum3 += bv * (uint64)(aa.limb[4])
+	accum4 += bv * (uint64)(aa.limb[5])
+	accum5 += bv * (uint64)(aa.limb[6])
+	accum6 += bv * (uint64)(aa.limb[7])
+	accum7 += bv * (uint64)(aa.limb[8])
+	accum8 += bv * (uint64)(aa.limb[9])
+	accum9 += bv * (uint64)(aa.limb[10])
+	accum10 += bv * (uint64)(aa.limb[11])
+	accum11 += bv * (uint64)(aa.limb[12])
+	accum12 += bv * (uint64)(aa.limb[13])
+	accum13 += bv * (uint64)(aa.limb[14])
+	accum14 += bv * (uint64)(aa.limb[15])
+	aa.limb[(x448Limbs-1-15)^(x448Limbs/2)] += aa.limb[x448Limbs-1-15]
+
+	// accum[x448Limbs-1] += accum[x448Limbs-2] >> lBits
+	// accum[x448Limbs-2] &= lMask
+	// accum[x448Limbs/2] += accum[x448Limbs-1] >> lBits
+	accum15 += accum14 >> lBits
+	accum14 &= lMask
+	accum8 += accum15 >> lBits
+
+	// for j := uint(0); j < x448Limbs; j++ {
+	//	accum[j] += accum[(j-1)%x448Limbs] >> lBits
+	//	accum[(j-1)%x448Limbs] &= lMask
+	// }
+	accum0 += accum15 >> lBits
+	accum15 &= lMask
+	accum1 += accum0 >> lBits
+	accum0 &= lMask
+	accum2 += accum1 >> lBits
+	accum1 &= lMask
+	accum3 += accum2 >> lBits
+	accum2 &= lMask
+	accum4 += accum3 >> lBits
+	accum3 &= lMask
+	accum5 += accum4 >> lBits
+	accum4 &= lMask
+	accum6 += accum5 >> lBits
+	accum5 &= lMask
+	accum7 += accum6 >> lBits
+	accum6 &= lMask
+	accum8 += accum7 >> lBits
+	accum7 &= lMask
+	accum9 += accum8 >> lBits
+	accum8 &= lMask
+	accum10 += accum9 >> lBits
+	accum9 &= lMask
+	accum11 += accum10 >> lBits
+	accum10 &= lMask
+	accum12 += accum11 >> lBits
+	accum11 &= lMask
+	accum13 += accum12 >> lBits
+	accum12 &= lMask
+	accum14 += accum13 >> lBits
+	accum13 &= lMask
+	accum15 += accum14 >> lBits
+	accum14 &= lMask
+
+	// for j, accv := range accum {
+	//	c.limb[j] = (uint32)(accv)
+	// }
+	c.limb[0] = (uint32)(accum0)
+	c.limb[1] = (uint32)(accum1)
+	c.limb[2] = (uint32)(accum2)
+	c.limb[3] = (uint32)(accum3)
+	c.limb[4] = (uint32)(accum4)
+	c.limb[5] = (uint32)(accum5)
+	c.limb[6] = (uint32)(accum6)
+	c.limb[7] = (uint32)(accum7)
+	c.limb[8] = (uint32)(accum8)
+	c.limb[9] = (uint32)(accum9)
+	c.limb[10] = (uint32)(accum10)
+	c.limb[11] = (uint32)(accum11)
+	c.limb[12] = (uint32)(accum12)
+	c.limb[13] = (uint32)(accum13)
+	c.limb[14] = (uint32)(accum14)
+	c.limb[15] = (uint32)(accum15)
 }
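
For reference, the wrap-around indexing (accum[(i+j)%x448Limbs] in the original loop, the rotating accumNN assignments above) is sound because of the shape of the prime: p = 2^448 - 2^224 - 1, so 2^448 ≡ 2^224 + 1 (mod p), and a partial product that overflows past limb 15 folds back into limbs 0 and 8; the aa.limb[(15-i)^8] += aa.limb[15-i] updates carry the extra 2^224 contribution along as the rows advance. A standalone math/big check of that congruence:

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		one := big.NewInt(1)

		// p = 2^448 - 2^224 - 1 (the Goldilocks prime).
		p := new(big.Int).Lsh(one, 448)
		p.Sub(p, new(big.Int).Lsh(one, 224))
		p.Sub(p, one)

		// 2^448 mod p should equal 2^224 + 1.
		lhs := new(big.Int).Lsh(one, 448)
		lhs.Mod(lhs, p)
		rhs := new(big.Int).Add(new(big.Int).Lsh(one, 224), one)

		fmt.Println(lhs.Cmp(rhs) == 0) // prints true
	}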
 
 // sqr squares (c = x * x).  Just calls multiply. (PERF)
@@ -191,35 +558,153 @@ func (y *gf) inv(x *gf) {
 // reduce weakly reduces mod p
 func (x *gf) reduce() {
 	x.limb[x448Limbs/2] += x.limb[x448Limbs-1] >> lBits
-	for j := uint(0); j < x448Limbs; j++ { // XXX: Unroll
-		x.limb[j] += x.limb[(j-1)%x448Limbs] >> lBits
-		x.limb[(j-1)%x448Limbs] &= lMask
-	}
+
+	// for j := uint(0); j < x448Limbs; j++ {
+	//	x.limb[j] += x.limb[(j-1)%x448Limbs] >> lBits
+	//	x.limb[(j-1)%x448Limbs] &= lMask
+	// }
+	x.limb[0] += x.limb[15] >> lBits
+	x.limb[15] &= lMask
+	x.limb[1] += x.limb[0] >> lBits
+	x.limb[0] &= lMask
+	x.limb[2] += x.limb[1] >> lBits
+	x.limb[1] &= lMask
+	x.limb[3] += x.limb[2] >> lBits
+	x.limb[2] &= lMask
+	x.limb[4] += x.limb[3] >> lBits
+	x.limb[3] &= lMask
+	x.limb[5] += x.limb[4] >> lBits
+	x.limb[4] &= lMask
+	x.limb[6] += x.limb[5] >> lBits
+	x.limb[5] &= lMask
+	x.limb[7] += x.limb[6] >> lBits
+	x.limb[6] &= lMask
+	x.limb[8] += x.limb[7] >> lBits
+	x.limb[7] &= lMask
+	x.limb[9] += x.limb[8] >> lBits
+	x.limb[8] &= lMask
+	x.limb[10] += x.limb[9] >> lBits
+	x.limb[9] &= lMask
+	x.limb[11] += x.limb[10] >> lBits
+	x.limb[10] &= lMask
+	x.limb[12] += x.limb[11] >> lBits
+	x.limb[11] &= lMask
+	x.limb[13] += x.limb[12] >> lBits
+	x.limb[12] &= lMask
+	x.limb[14] += x.limb[13] >> lBits
+	x.limb[13] &= lMask
+	x.limb[15] += x.limb[14] >> lBits
+	x.limb[14] &= lMask
 }
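
The unrolled chain is the same rotate-by-one carry propagation as the commented loop: each step moves the bits of limb j-1 above lBits into limb j, then masks limb j-1 back into range. One step in isolation, as a standalone sketch assuming this file's sixteen 28-bit limbs (lBits = 28):

	package main

	import "fmt"

	const (
		lBits = 28               // assumed limb width: 16 * 28 = 448
		lMask = (1 << lBits) - 1 // low 28 bits
	)

	// carryStep performs one link of the chain: limb j-1's overflow moves
	// into limb j, and limb j-1 is masked back into [0, 2^28).
	func carryStep(cur, next uint32) (uint32, uint32) {
		next += cur >> lBits
		cur &= lMask
		return cur, next
	}

	func main() {
		cur, next := carryStep(0x1fffffff, 0) // 29-bit value: carries 1 upward
		fmt.Printf("%#x %#x\n", cur, next)    // 0xfffffff 0x1
	}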
 
 // add adds mod p. Conservatively always weak-reduces. (PERF)
 func (x *gf) add(y, z *gf) {
-	for i, yv := range y.limb { // XXX: Unroll
-		x.limb[i] = yv + z.limb[i]
-	}
+	// for i, yv := range y.limb {
+	//	x.limb[i] = yv + z.limb[i]
+	// }
+	x.limb[0] = y.limb[0] + z.limb[0]
+	x.limb[1] = y.limb[1] + z.limb[1]
+	x.limb[2] = y.limb[2] + z.limb[2]
+	x.limb[3] = y.limb[3] + z.limb[3]
+	x.limb[4] = y.limb[4] + z.limb[4]
+	x.limb[5] = y.limb[5] + z.limb[5]
+	x.limb[6] = y.limb[6] + z.limb[6]
+	x.limb[7] = y.limb[7] + z.limb[7]
+	x.limb[8] = y.limb[8] + z.limb[8]
+	x.limb[9] = y.limb[9] + z.limb[9]
+	x.limb[10] = y.limb[10] + z.limb[10]
+	x.limb[11] = y.limb[11] + z.limb[11]
+	x.limb[12] = y.limb[12] + z.limb[12]
+	x.limb[13] = y.limb[13] + z.limb[13]
+	x.limb[14] = y.limb[14] + z.limb[14]
+	x.limb[15] = y.limb[15] + z.limb[15]
+
 	x.reduce()
 }
 
 // sub subtracts mod p.  Conservatively always weak-reduces. (PERF)
 func (x *gf) sub(y, z *gf) {
-	for i, yv := range y.limb { // XXX: Unroll
-		x.limb[i] = yv - z.limb[i] + 2*p.limb[i]
-	}
+	// for i, yv := range y.limb {
+	//	x.limb[i] = yv - z.limb[i] + 2*p.limb[i]
+	// }
+	x.limb[0] = y.limb[0] - z.limb[0] + 2*lMask
+	x.limb[1] = y.limb[1] - z.limb[1] + 2*lMask
+	x.limb[2] = y.limb[2] - z.limb[2] + 2*lMask
+	x.limb[3] = y.limb[3] - z.limb[3] + 2*lMask
+	x.limb[4] = y.limb[4] - z.limb[4] + 2*lMask
+	x.limb[5] = y.limb[5] - z.limb[5] + 2*lMask
+	x.limb[6] = y.limb[6] - z.limb[6] + 2*lMask
+	x.limb[7] = y.limb[7] - z.limb[7] + 2*lMask
+	x.limb[8] = y.limb[8] - z.limb[8] + 2*(lMask-1)
+	x.limb[9] = y.limb[9] - z.limb[9] + 2*lMask
+	x.limb[10] = y.limb[10] - z.limb[10] + 2*lMask
+	x.limb[11] = y.limb[11] - z.limb[11] + 2*lMask
+	x.limb[12] = y.limb[12] - z.limb[12] + 2*lMask
+	x.limb[13] = y.limb[13] - z.limb[13] + 2*lMask
+	x.limb[14] = y.limb[14] - z.limb[14] + 2*lMask
+	x.limb[15] = y.limb[15] - z.limb[15] + 2*lMask
+
 	x.reduce()
 }
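
The hardcoded constants replace the loop's 2*p.limb[i]: splitting p = 2^448 - 2^224 - 1 into sixteen 28-bit limbs gives lMask in every position except limb 8, where the -2^224 term lands and the value drops to lMask-1, hence the single 2*(lMask-1) at index 8. A standalone math/big sketch of that decomposition:

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		one := big.NewInt(1)
		p := new(big.Int).Lsh(one, 448)
		p.Sub(p, new(big.Int).Lsh(one, 224))
		p.Sub(p, one)

		mask := big.NewInt(1<<28 - 1)
		for i := 0; i < 16; i++ {
			limb := new(big.Int).And(p, mask)
			fmt.Printf("limb[%d] = %#x\n", i, limb) // 0xfffffff everywhere except limb[8] = 0xffffffe
			p.Rsh(p, 28)
		}
	}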
 
 // condSwap swaps x and y in constant time.
 func (x *gf) condSwap(y *gf, swap limbUint) {
-	for i, xv := range x.limb { // XXX: Unroll
-		s := (xv ^ y.limb[i]) & (uint32)(swap) // Sort of dumb, oh well.
-		x.limb[i] ^= s
-		y.limb[i] ^= s
-	}
+	// for i, xv := range x.limb {
+	//	s := (xv ^ y.limb[i]) & (uint32)(swap) // Sort of dumb, oh well.
+	//	x.limb[i] ^= s
+	//	y.limb[i] ^= s
+	// }
+
+	var s uint32
+
+	s = (x.limb[0] ^ y.limb[0]) & (uint32)(swap)
+	x.limb[0] ^= s
+	y.limb[0] ^= s
+	s = (x.limb[1] ^ y.limb[1]) & (uint32)(swap)
+	x.limb[1] ^= s
+	y.limb[1] ^= s
+	s = (x.limb[2] ^ y.limb[2]) & (uint32)(swap)
+	x.limb[2] ^= s
+	y.limb[2] ^= s
+	s = (x.limb[3] ^ y.limb[3]) & (uint32)(swap)
+	x.limb[3] ^= s
+	y.limb[3] ^= s
+	s = (x.limb[4] ^ y.limb[4]) & (uint32)(swap)
+	x.limb[4] ^= s
+	y.limb[4] ^= s
+	s = (x.limb[5] ^ y.limb[5]) & (uint32)(swap)
+	x.limb[5] ^= s
+	y.limb[5] ^= s
+	s = (x.limb[6] ^ y.limb[6]) & (uint32)(swap)
+	x.limb[6] ^= s
+	y.limb[6] ^= s
+	s = (x.limb[7] ^ y.limb[7]) & (uint32)(swap)
+	x.limb[7] ^= s
+	y.limb[7] ^= s
+	s = (x.limb[8] ^ y.limb[8]) & (uint32)(swap)
+	x.limb[8] ^= s
+	y.limb[8] ^= s
+	s = (x.limb[9] ^ y.limb[9]) & (uint32)(swap)
+	x.limb[9] ^= s
+	y.limb[9] ^= s
+	s = (x.limb[10] ^ y.limb[10]) & (uint32)(swap)
+	x.limb[10] ^= s
+	y.limb[10] ^= s
+	s = (x.limb[11] ^ y.limb[11]) & (uint32)(swap)
+	x.limb[11] ^= s
+	y.limb[11] ^= s
+	s = (x.limb[12] ^ y.limb[12]) & (uint32)(swap)
+	x.limb[12] ^= s
+	y.limb[12] ^= s
+	s = (x.limb[13] ^ y.limb[13]) & (uint32)(swap)
+	x.limb[13] ^= s
+	y.limb[13] ^= s
+	s = (x.limb[14] ^ y.limb[14]) & (uint32)(swap)
+	x.limb[14] ^= s
+	y.limb[14] ^= s
+	s = (x.limb[15] ^ y.limb[15]) & (uint32)(swap)
+	x.limb[15] ^= s
+	y.limb[15] ^= s
 }
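
condSwap relies on swap being either all-zero or all-one bits, so the masked XORs either leave both operands untouched or exchange them, with no secret-dependent branch. A sketch of deriving such a mask from a single selection bit:

	// maskFromBit expands a selection bit into the all-zeros/all-ones mask
	// condSwap expects: 0 -> 0x00000000, 1 -> 0xffffffff.
	func maskFromBit(bit uint32) uint32 {
		return -bit // unsigned negation: two's complement of 1 is all ones
	}

With s := (x ^ y) & mask, the pair x ^= s; y ^= s then swaps exactly when the bit was set.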
 
 // mlw multiplies by a signed int.  NOT CONSTANT TIME wrt the sign of the int,
@@ -262,7 +747,7 @@ func (a *gf) canon() {
 }
 
 // deser deserializes into the limb representation.
-func (s *gf) deser(ser *[x448Bytes]byte) int64 {
+func (s *gf) deser(ser *[x448Bytes]byte) {
 	var buf uint64
 	bits := uint(0)
 	k := 0
@@ -274,13 +759,6 @@ func (s *gf) deser(ser *[x448Bytes]byte) int64 {
 			k++
 		}
 	}
-
-	// XXX: Return value never used, this can be omitted.
-	var accum int64
-	for i, v := range s.limb {
-		accum = (accum + (int64)(v) - (int64)(p.limb[i])) >> wBits
-	}
-	return accum
 }
 
 // ser serializes into byte representation.
@@ -297,3 +775,9 @@ func (a *gf) ser(ser *[x448Bytes]byte) {
 		}
 	}
 }
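
With deser's unused return value gone, a round-trip property test is still a cheap guard on the bit packing in both directions. A sketch against this package's internals, in a _test.go file:

	package x448

	import "testing"

	func TestDeserSerRoundTrip(t *testing.T) {
		var in, out [x448Bytes]byte
		in[0] = 0x42 // small value, trivially canonical (< p)

		var g gf
		g.deser(&in)
		g.ser(&out)

		if in != out {
			t.Fatalf("round trip mismatch:\n in: %x\nout: %x", in, out)
		}
	}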
+
+func init() {
+	if x448Limbs != 16 {
+		panic("x448Limbs != 16, unrolled loops likely broken")
+	}
+}