Browse Source

Carve off the implementation into a separate file.

The 32 bit implementation is still faster since Go doesn't support
a uint128 datatype yet.
Yawning Angel 4 years ago
parent
commit
d999746350
4 changed files with 254 additions and 244 deletions
  1. 2 6
      README.md
  2. 19 235
      poly1305.go
  3. 230 0
      poly1305_32.go
  4. 3 3
      poly1305_test.go

+ 2 - 6
README.md

@@ -5,12 +5,8 @@ Poly1305 implements the Poly1305 MAC algorithm, exposing a saner interface than
 the one provided by golang.org/x/crypto/poly1305.  In particular it exposes a
 object that implements a hash.Hash interface.
 
-The implementation is based on the Public Domain poly1305-donna-32 by Andrew
-Moon, as a cgo based poly1305-donna-64 implementation didn't appear to perform
-much faster.
-
-The package uses "unsafe" to skip the byteswapping, since the Go compiler is
-utterly terrible at inlining.
+The implementation is based on the Public Domain poly1305-donna by Andrew
+Moon.
 
 | Implementation       | 64 byte      | 1024 byte   |
 | -------------------- | ------------ | ----------- |

+ 19 - 235
poly1305.go

@@ -4,13 +4,11 @@
 // golang.org/x/crypto implementation in that it exports a hash.Hash interface
 // to support incremental updates.
 //
-// The implementation is based on Andrew Moon's poly1305-donna-32, as it is
-// the most performant variant implementatble in pure Go.
+// The implementation is based on Andrew Moon's poly1305-donna.
 package poly1305
 
 import (
 	"crypto/subtle"
-	"encoding/binary"
 	"errors"
 	"hash"
 	"unsafe"
@@ -36,19 +34,21 @@ var (
 	// encountered.
 	ErrInvalidMacSize = errors.New("poly1305: invalid mac size")
 
-	// useUnsafe is set at package load time when the current CPU does not
-	// require the byteswap operations.
-	useUnsafe bool
+	isLittleEndian = false
 )
 
+type implInterface interface {
+	init(key []byte)
+	clear()
+	blocks(m []byte, bytes int, isFinal bool)
+	finish(mac *[Size]byte)
+}
+
 // Poly1305 is an instance of the Poly1305 MAC algorithm.
 type Poly1305 struct {
-	r        [5]uint32
-	h        [5]uint32
-	pad      [4]uint32
+	impl     implState
 	leftover int
 	buffer   [BlockSize]byte
-	final    bool
 }
 
 // Write adds more data to the running hash.  It never returns an error.
@@ -75,14 +75,14 @@ func (st *Poly1305) Write(p []byte) (n int, err error) {
 		if st.leftover < BlockSize {
 			return len(p), nil
 		}
-		st.blocks(st.buffer[:], BlockSize)
+		st.impl.blocks(st.buffer[:], BlockSize, false)
 		st.leftover = 0
 	}
 
 	// process full blocks
 	if bytes >= BlockSize {
 		want := bytes & (^(BlockSize - 1))
-		st.blocks(m, want)
+		st.impl.blocks(m, want, false)
 		m = m[want:]
 		bytes -= want
 	}
@@ -132,242 +132,27 @@ func (st *Poly1305) Init(key []byte) {
 		panic(ErrInvalidKeySize)
 	}
 
-	//
-	// poly1305-donna-32.h:poly1305_init()
-	//
-
-	// r &= 0xffffffc0ffffffc0ffffffc0fffffff
-	if useUnsafe {
-		st.r[0] = *(*uint32)(unsafe.Pointer(&key[0])) & 0x3ffffff
-		st.r[1] = (*(*uint32)(unsafe.Pointer(&key[3])) >> 2) & 0x3ffff03
-		st.r[2] = (*(*uint32)(unsafe.Pointer(&key[6])) >> 4) & 0x3ffc0ff
-		st.r[3] = (*(*uint32)(unsafe.Pointer(&key[9])) >> 6) & 0x3f03fff
-		st.r[4] = (*(*uint32)(unsafe.Pointer(&key[12])) >> 8) & 0x00fffff
-	} else {
-		st.r[0] = binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff
-		st.r[1] = (binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03
-		st.r[2] = (binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff
-		st.r[3] = (binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff
-		st.r[4] = (binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff
-	}
-
-	// h = 0
-	for i := range st.h {
-		st.h[i] = 0
-	}
-
-	// save pad for later
-	if useUnsafe {
-		padArr := (*[4]uint32)(unsafe.Pointer(&key[16]))
-		st.pad[0] = padArr[0]
-		st.pad[1] = padArr[1]
-		st.pad[2] = padArr[2]
-		st.pad[3] = padArr[3]
-	} else {
-		st.pad[0] = binary.LittleEndian.Uint32(key[16:])
-		st.pad[1] = binary.LittleEndian.Uint32(key[20:])
-		st.pad[2] = binary.LittleEndian.Uint32(key[24:])
-		st.pad[3] = binary.LittleEndian.Uint32(key[28:])
-	}
-
+	st.impl.init(key)
 	st.leftover = 0
-	st.final = false
 }
 
 // Clear purges the sensitive material in hash's internal state.
 func (st *Poly1305) Clear() {
-	for i := range st.h {
-		st.h[i] = 0
-	}
-	for i := range st.r {
-		st.r[i] = 0
-	}
-	for i := range st.pad {
-		st.pad[i] = 0
-	}
-}
-
-func (st *Poly1305) blocks(m []byte, bytes int) {
-	//
-	// poly1305-donna-32.h:poly1305_blocks()
-	//
-
-	var hibit uint32
-	var d0, d1, d2, d3, d4 uint64
-	var c uint32
-	if !st.final {
-		hibit = 1 << 24 // 1 << 128
-	}
-	r0, r1, r2, r3, r4 := st.r[0], st.r[1], st.r[2], st.r[3], st.r[4]
-	s1, s2, s3, s4 := r1*5, r2*5, r3*5, r4*5
-	h0, h1, h2, h3, h4 := st.h[0], st.h[1], st.h[2], st.h[3], st.h[4]
-
-	for bytes >= BlockSize {
-		// h += m[i]
-		if useUnsafe {
-			h0 += *(*uint32)(unsafe.Pointer(&m[0])) & 0x3ffffff
-			h1 += (*(*uint32)(unsafe.Pointer(&m[3])) >> 2) & 0x3ffffff
-			h2 += (*(*uint32)(unsafe.Pointer(&m[6])) >> 4) & 0x3ffffff
-			h3 += (*(*uint32)(unsafe.Pointer(&m[9])) >> 6) & 0x3ffffff
-			h4 += (*(*uint32)(unsafe.Pointer(&m[12])) >> 8) | hibit
-		} else {
-			h0 += binary.LittleEndian.Uint32(m[0:]) & 0x3ffffff
-			h1 += (binary.LittleEndian.Uint32(m[3:]) >> 2) & 0x3ffffff
-			h2 += (binary.LittleEndian.Uint32(m[6:]) >> 4) & 0x3ffffff
-			h3 += (binary.LittleEndian.Uint32(m[9:]) >> 6) & 0x3ffffff
-			h4 += (binary.LittleEndian.Uint32(m[12:]) >> 8) | hibit
-		}
-
-		// h *= r
-		d0 = (uint64(h0) * uint64(r0)) + (uint64(h1) * uint64(s4)) + (uint64(h2) * uint64(s3)) + (uint64(h3) * uint64(s2)) + (uint64(h4) * uint64(s1))
-		d1 = (uint64(h0) * uint64(r1)) + (uint64(h1) * uint64(r0)) + (uint64(h2) * uint64(s4)) + (uint64(h3) * uint64(s3)) + (uint64(h4) * uint64(s2))
-		d2 = (uint64(h0) * uint64(r2)) + (uint64(h1) * uint64(r1)) + (uint64(h2) * uint64(r0)) + (uint64(h3) * uint64(s4)) + (uint64(h4) * uint64(s3))
-		d3 = (uint64(h0) * uint64(r3)) + (uint64(h1) * uint64(r2)) + (uint64(h2) * uint64(r1)) + (uint64(h3) * uint64(r0)) + (uint64(h4) * uint64(s4))
-		d4 = (uint64(h0) * uint64(r4)) + (uint64(h1) * uint64(r3)) + (uint64(h2) * uint64(r2)) + (uint64(h3) * uint64(r1)) + (uint64(h4) * uint64(r0))
-
-		// (partial) h %= p
-		c = uint32(d0 >> 26)
-		h0 = uint32(d0) & 0x3ffffff
-
-		d1 += uint64(c)
-		c = uint32(d1 >> 26)
-		h1 = uint32(d1) & 0x3ffffff
-
-		d2 += uint64(c)
-		c = uint32(d2 >> 26)
-		h2 = uint32(d2) & 0x3ffffff
-
-		d3 += uint64(c)
-		c = uint32(d3 >> 26)
-		h3 = uint32(d3) & 0x3ffffff
-
-		d4 += uint64(c)
-		c = uint32(d4 >> 26)
-		h4 = uint32(d4) & 0x3ffffff
-
-		h0 += c * 5
-		c = h0 >> 26
-		h0 = h0 & 0x3ffffff
-
-		h1 += c
-
-		m = m[BlockSize:]
-		bytes -= BlockSize
-	}
-
-	st.h[0], st.h[1], st.h[2], st.h[3], st.h[4] = h0, h1, h2, h3, h4
+	st.impl.clear()
 }
 
 func (st *Poly1305) finish(mac *[Size]byte) {
-	//
-	// poly1305-donna-32.h:poly1305_finish()
-	//
-
-	var c uint32
-	var g0, g1, g2, g3, g4 uint32
-	var f uint64
-	var mask uint32
-
 	// process the remaining block
 	if st.leftover > 0 {
 		st.buffer[st.leftover] = 1
 		for i := st.leftover + 1; i < BlockSize; i++ {
 			st.buffer[i] = 0
 		}
-		st.final = true
-		st.blocks(st.buffer[:], BlockSize)
-	}
-
-	// fully carry h
-	h0, h1, h2, h3, h4 := st.h[0], st.h[1], st.h[2], st.h[3], st.h[4]
-	c = h1 >> 26
-	h1 &= 0x3ffffff
-
-	h2 += c
-	c = h2 >> 26
-	h2 &= 0x3ffffff
-
-	h3 += c
-	c = h3 >> 26
-	h3 &= 0x3ffffff
-
-	h4 += c
-	c = h4 >> 26
-	h4 &= 0x3ffffff
-
-	h0 += c * 5
-	c = h0 >> 26
-	h0 &= 0x3ffffff
-
-	h1 += c
-
-	// compute h + -p
-	g0 = h0 + 5
-	c = g0 >> 26
-	g0 &= 0x3ffffff
-
-	g1 = h1 + c
-	c = g1 >> 26
-	g1 &= 0x3ffffff
-
-	g2 = h2 + c
-	c = g2 >> 26
-	g2 &= 0x3ffffff
-
-	g3 = h3 + c
-	c = g3 >> 26
-	g3 &= 0x3ffffff
-
-	g4 = h4 + c - (1 << 26)
-
-	// select h if h < p, or h + -p if h >= p
-	mask = (g4 >> ((4 * 8) - 1)) - 1
-	g0 &= mask
-	g1 &= mask
-	g2 &= mask
-	g3 &= mask
-	g4 &= mask
-	mask = ^mask
-	h0 = (h0 & mask) | g0
-	h1 = (h1 & mask) | g1
-	h2 = (h2 & mask) | g2
-	h3 = (h3 & mask) | g3
-	h4 = (h4 & mask) | g4
-
-	// h = h % (2^128)
-	h0 = ((h0) | (h1 << 26)) & 0xffffffff
-	h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff
-	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff
-	h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff
-
-	// mac = (h + pad) % (2^128)
-	f = uint64(h0) + uint64(st.pad[0])
-	h0 = uint32(f)
-
-	f = uint64(h1) + uint64(st.pad[1]) + (f >> 32)
-	h1 = uint32(f)
-
-	f = uint64(h2) + uint64(st.pad[2]) + (f >> 32)
-	h2 = uint32(f)
-
-	f = uint64(h3) + uint64(st.pad[3]) + (f >> 32)
-	h3 = uint32(f)
-
-	if useUnsafe {
-		macArr := (*[4]uint32)(unsafe.Pointer(&mac[0]))
-		macArr[0] = h0
-		macArr[1] = h1
-		macArr[2] = h2
-		macArr[3] = h3
-	} else {
-		binary.LittleEndian.PutUint32(mac[0:], h0)
-		binary.LittleEndian.PutUint32(mac[4:], h1)
-		binary.LittleEndian.PutUint32(mac[8:], h2)
-		binary.LittleEndian.PutUint32(mac[12:], h3)
+		st.impl.blocks(st.buffer[:], BlockSize, true)
 	}
 
-	// zero out the state
-	st.Clear()
+	st.impl.finish(mac)
+	st.impl.clear()
 }
 
 // New returns a new Poly1305 instance keyed with the supplied key.
@@ -398,14 +183,13 @@ func Verify(mac *[Size]byte, m []byte, key *[KeySize]byte) bool {
 
 func init() {
 	// Use the UTF-32 (UCS-4) Byte Order Mark to detect host byte order,
-	// which enables the further use of 'unsafe' to work around the Go
-	// compiler's piss-poor inlining.  Gotta Go Fast.
+	// which enables the further use of 'unsafe' for added performance.
 	const bomLE = 0x0000feff
 	bom := [4]byte{0xff, 0xfe, 0x00, 0x00}
 
 	bomHost := *(*uint32)(unsafe.Pointer(&bom[0]))
 	if bomHost == 0x0000feff { // Little endian, use unsafe.
-		useUnsafe = true
+		isLittleEndian = true
 	}
 }
 

+ 230 - 0
poly1305_32.go

@@ -0,0 +1,230 @@
+// poly1305_32.go: 32->64 bit multiplies, 64 bit additions
+
+package poly1305
+
+import (
+	"encoding/binary"
+	"unsafe"
+)
+
+type implState struct {
+	r   [5]uint32
+	h   [5]uint32
+	pad [4]uint32
+}
+
+func (impl *implState) init(key []byte) {
+	//
+	// poly1305-donna-32.h:poly1305_init()
+	//
+
+	// r &= 0xffffffc0ffffffc0ffffffc0fffffff
+	if isLittleEndian {
+		impl.r[0] = *(*uint32)(unsafe.Pointer(&key[0])) & 0x3ffffff
+		impl.r[1] = (*(*uint32)(unsafe.Pointer(&key[3])) >> 2) & 0x3ffff03
+		impl.r[2] = (*(*uint32)(unsafe.Pointer(&key[6])) >> 4) & 0x3ffc0ff
+		impl.r[3] = (*(*uint32)(unsafe.Pointer(&key[9])) >> 6) & 0x3f03fff
+		impl.r[4] = (*(*uint32)(unsafe.Pointer(&key[12])) >> 8) & 0x00fffff
+	} else {
+		impl.r[0] = binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff
+		impl.r[1] = (binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03
+		impl.r[2] = (binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff
+		impl.r[3] = (binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff
+		impl.r[4] = (binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff
+	}
+
+	// h = 0
+	for i := range impl.h {
+		impl.h[i] = 0
+	}
+
+	// save pad for later
+	impl.pad[0] = binary.LittleEndian.Uint32(key[16:])
+	impl.pad[1] = binary.LittleEndian.Uint32(key[20:])
+	impl.pad[2] = binary.LittleEndian.Uint32(key[24:])
+	impl.pad[3] = binary.LittleEndian.Uint32(key[28:])
+}
+
+func (impl *implState) clear() {
+	for i := range impl.h {
+		impl.h[i] = 0
+	}
+	for i := range impl.r {
+		impl.r[i] = 0
+	}
+	for i := range impl.pad {
+		impl.pad[i] = 0
+	}
+}
+
+func (impl *implState) blocks(m []byte, bytes int, isFinal bool) {
+	//
+	// poly1305-donna-32.h:poly1305_blocks()
+	//
+
+	var hibit uint32
+	var d0, d1, d2, d3, d4 uint64
+	var c uint32
+	if !isFinal {
+		hibit = 1 << 24 // 1 << 128
+	}
+	r0, r1, r2, r3, r4 := impl.r[0], impl.r[1], impl.r[2], impl.r[3], impl.r[4]
+	s1, s2, s3, s4 := r1*5, r2*5, r3*5, r4*5
+	h0, h1, h2, h3, h4 := impl.h[0], impl.h[1], impl.h[2], impl.h[3], impl.h[4]
+
+	for bytes >= BlockSize {
+		// h += m[i]
+		if isLittleEndian {
+			h0 += *(*uint32)(unsafe.Pointer(&m[0])) & 0x3ffffff
+			h1 += (*(*uint32)(unsafe.Pointer(&m[3])) >> 2) & 0x3ffffff
+			h2 += (*(*uint32)(unsafe.Pointer(&m[6])) >> 4) & 0x3ffffff
+			h3 += (*(*uint32)(unsafe.Pointer(&m[9])) >> 6) & 0x3ffffff
+			h4 += (*(*uint32)(unsafe.Pointer(&m[12])) >> 8) | hibit
+		} else {
+			h0 += binary.LittleEndian.Uint32(m[0:]) & 0x3ffffff
+			h1 += (binary.LittleEndian.Uint32(m[3:]) >> 2) & 0x3ffffff
+			h2 += (binary.LittleEndian.Uint32(m[6:]) >> 4) & 0x3ffffff
+			h3 += (binary.LittleEndian.Uint32(m[9:]) >> 6) & 0x3ffffff
+			h4 += (binary.LittleEndian.Uint32(m[12:]) >> 8) | hibit
+		}
+
+		// h *= r
+		d0 = (uint64(h0) * uint64(r0)) + (uint64(h1) * uint64(s4)) + (uint64(h2) * uint64(s3)) + (uint64(h3) * uint64(s2)) + (uint64(h4) * uint64(s1))
+		d1 = (uint64(h0) * uint64(r1)) + (uint64(h1) * uint64(r0)) + (uint64(h2) * uint64(s4)) + (uint64(h3) * uint64(s3)) + (uint64(h4) * uint64(s2))
+		d2 = (uint64(h0) * uint64(r2)) + (uint64(h1) * uint64(r1)) + (uint64(h2) * uint64(r0)) + (uint64(h3) * uint64(s4)) + (uint64(h4) * uint64(s3))
+		d3 = (uint64(h0) * uint64(r3)) + (uint64(h1) * uint64(r2)) + (uint64(h2) * uint64(r1)) + (uint64(h3) * uint64(r0)) + (uint64(h4) * uint64(s4))
+		d4 = (uint64(h0) * uint64(r4)) + (uint64(h1) * uint64(r3)) + (uint64(h2) * uint64(r2)) + (uint64(h3) * uint64(r1)) + (uint64(h4) * uint64(r0))
+
+		// (partial) h %= p
+		c = uint32(d0 >> 26)
+		h0 = uint32(d0) & 0x3ffffff
+
+		d1 += uint64(c)
+		c = uint32(d1 >> 26)
+		h1 = uint32(d1) & 0x3ffffff
+
+		d2 += uint64(c)
+		c = uint32(d2 >> 26)
+		h2 = uint32(d2) & 0x3ffffff
+
+		d3 += uint64(c)
+		c = uint32(d3 >> 26)
+		h3 = uint32(d3) & 0x3ffffff
+
+		d4 += uint64(c)
+		c = uint32(d4 >> 26)
+		h4 = uint32(d4) & 0x3ffffff
+
+		h0 += c * 5
+		c = h0 >> 26
+		h0 = h0 & 0x3ffffff
+
+		h1 += c
+
+		m = m[BlockSize:]
+		bytes -= BlockSize
+	}
+
+	impl.h[0], impl.h[1], impl.h[2], impl.h[3], impl.h[4] = h0, h1, h2, h3, h4
+}
+
+func (impl *implState) finish(mac *[Size]byte) {
+	//
+	// poly1305-donna-32.h:poly1305_finish()
+	//
+
+	var c uint32
+	var g0, g1, g2, g3, g4 uint32
+	var f uint64
+	var mask uint32
+
+	// fully carry h
+	h0, h1, h2, h3, h4 := impl.h[0], impl.h[1], impl.h[2], impl.h[3], impl.h[4]
+	c = h1 >> 26
+	h1 &= 0x3ffffff
+
+	h2 += c
+	c = h2 >> 26
+	h2 &= 0x3ffffff
+
+	h3 += c
+	c = h3 >> 26
+	h3 &= 0x3ffffff
+
+	h4 += c
+	c = h4 >> 26
+	h4 &= 0x3ffffff
+
+	h0 += c * 5
+	c = h0 >> 26
+	h0 &= 0x3ffffff
+
+	h1 += c
+
+	// compute h + -p
+	g0 = h0 + 5
+	c = g0 >> 26
+	g0 &= 0x3ffffff
+
+	g1 = h1 + c
+	c = g1 >> 26
+	g1 &= 0x3ffffff
+
+	g2 = h2 + c
+	c = g2 >> 26
+	g2 &= 0x3ffffff
+
+	g3 = h3 + c
+	c = g3 >> 26
+	g3 &= 0x3ffffff
+
+	g4 = h4 + c - (1 << 26)
+
+	// select h if h < p, or h + -p if h >= p
+	mask = (g4 >> ((4 * 8) - 1)) - 1
+	g0 &= mask
+	g1 &= mask
+	g2 &= mask
+	g3 &= mask
+	g4 &= mask
+	mask = ^mask
+	h0 = (h0 & mask) | g0
+	h1 = (h1 & mask) | g1
+	h2 = (h2 & mask) | g2
+	h3 = (h3 & mask) | g3
+	h4 = (h4 & mask) | g4
+
+	// h = h % (2^128)
+	h0 = ((h0) | (h1 << 26)) & 0xffffffff
+	h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff
+	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff
+	h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff
+
+	// mac = (h + pad) % (2^128)
+	f = uint64(h0) + uint64(impl.pad[0])
+	h0 = uint32(f)
+
+	f = uint64(h1) + uint64(impl.pad[1]) + (f >> 32)
+	h1 = uint32(f)
+
+	f = uint64(h2) + uint64(impl.pad[2]) + (f >> 32)
+	h2 = uint32(f)
+
+	f = uint64(h3) + uint64(impl.pad[3]) + (f >> 32)
+	h3 = uint32(f)
+
+	if isLittleEndian {
+		macArr := (*[4]uint32)(unsafe.Pointer(&mac[0]))
+		macArr[0] = h0
+		macArr[1] = h1
+		macArr[2] = h2
+		macArr[3] = h3
+	} else {
+		binary.LittleEndian.PutUint32(mac[0:], h0)
+		binary.LittleEndian.PutUint32(mac[4:], h1)
+		binary.LittleEndian.PutUint32(mac[8:], h2)
+		binary.LittleEndian.PutUint32(mac[12:], h3)
+	}
+}
+
+var _ implInterface = (*implState)(nil)

+ 3 - 3
poly1305_test.go

@@ -539,12 +539,12 @@ func TestIETFDraft(t *testing.T) {
 }
 
 func TestIETFDraftForceByteswap(t *testing.T) {
-	if !useUnsafe {
+	if !isLittleEndian {
 		t.Skipf("not little endian, slow path already taken")
 	} else {
-		useUnsafe = false
+		isLittleEndian = false
 		TestIETFDraft(t)
-		useUnsafe = true
+		isLittleEndian = true
 	}
 }