Initial import.

Yawning Angel, 3 years ago
commit e02639ba9f
7 changed files with 1733 additions and 0 deletions
  1. LICENSE (+122, -0)
  2. chacha20.go (+368, -0)
  3. chacha20_amd64.go (+34, -0)
  4. chacha20_amd64.py (+264, -0)
  5. chacha20_amd64.s (+342, -0)
  6. chacha20_ref.go (+233, -0)
  7. chacha20_test.go (+370, -0)

+ 122 - 0
LICENSE

@@ -0,0 +1,122 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
+

+ 368 - 0
chacha20.go

@@ -0,0 +1,368 @@
+// chacha20.go - A ChaCha stream cipher implementation.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+package chacha20
+
+import (
+	"crypto/cipher"
+	"encoding/binary"
+	"errors"
+	"runtime"
+)
+
+const (
+	// KeySize is the ChaCha20 key size in bytes.
+	KeySize = 32
+
+	// NonceSize is the ChaCha20 nonce size in bytes.
+	NonceSize = 8
+
+	// XNonceSize is the XChaCha20 nonce size in bytes.
+	XNonceSize = 24
+
+	// HNonceSize is the HChaCha20 nonce size in bytes.
+	HNonceSize = 16
+
+	// BlockSize is the ChaCha20 block size in bytes.
+	BlockSize = 64
+
+	// The state is 16 words, but the 4 constant words need not be stored.
+	stateSize    = 16 - 4
+	chachaRounds = 20
+
+	// The constant "expand 32-byte k" as little endian uint32s.
+	sigma0 = uint32(0x61707865)
+	sigma1 = uint32(0x3320646e)
+	sigma2 = uint32(0x79622d32)
+	sigma3 = uint32(0x6b206574)
+)
+
+var (
+	// ErrInvalidKey is the error returned when the key is invalid.
+	ErrInvalidKey = errors.New("key length must be KeySize bytes")
+
+	// ErrInvalidNonce is the error returned when the nonce is invalid.
+	ErrInvalidNonce = errors.New("nonce length must be NonceSize/XNonceSize bytes")
+
+	useUnsafe    = false
+	usingVectors = false
+	blocksFn     = blocksRef
+)
+
+// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and
+// nonce.
+type Cipher struct {
+	state [stateSize]uint32
+
+	buf [BlockSize]byte
+	off int
+}
+
+// Reset zeros the key data so that it will no longer appear in the process's
+// memory.
+func (c *Cipher) Reset() {
+	for i := range c.state {
+		c.state[i] = 0
+	}
+	for i := range c.buf {
+		c.buf[i] = 0
+	}
+}
+
+// XORKeyStream sets dst to the result of XORing src with the key stream.  Dst
+// and src may be the same slice but otherwise should not overlap.
+func (c *Cipher) XORKeyStream(dst, src []byte) {
+	if len(dst) < len(src) {
+		src = src[:len(dst)]
+	}
+
+	for remaining := len(src); remaining > 0; {
+		// Process multiple blocks at once.
+		if c.off == BlockSize {
+			nrBlocks := remaining / BlockSize
+			directBytes := nrBlocks * BlockSize
+			if nrBlocks > 0 {
+				blocksFn(&c.state, src, dst, nrBlocks)
+				remaining -= directBytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[directBytes:]
+				src = src[directBytes:]
+			}
+
+			// If there's a partial block, generate 1 block of keystream into
+			// the internal buffer.
+			blocksFn(&c.state, nil, c.buf[:], 1)
+			c.off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		toXor := BlockSize - c.off
+		if remaining < toXor {
+			toXor = remaining
+		}
+		if toXor > 0 {
+			for i, v := range src[:toXor] {
+				dst[i] = v ^ c.buf[c.off+i]
+			}
+			dst = dst[toXor:]
+			src = src[toXor:]
+
+			remaining -= toXor
+			c.off += toXor
+		}
+	}
+}
+
+// KeyStream sets dst to the raw keystream.
+func (c *Cipher) KeyStream(dst []byte) {
+	for remaining := len(dst); remaining > 0; {
+		// Process multiple blocks at once.
+		if c.off == BlockSize {
+			nrBlocks := remaining / BlockSize
+			directBytes := nrBlocks * BlockSize
+			if nrBlocks > 0 {
+				blocksFn(&c.state, nil, dst, nrBlocks)
+				remaining -= directBytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[directBytes:]
+			}
+
+			// If there's a partial block, generate 1 block of keystream into
+			// the internal buffer.
+			blocksFn(&c.state, nil, c.buf[:], 1)
+			c.off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		toCopy := BlockSize - c.off
+		if remaining < toCopy {
+			toCopy = remaining
+		}
+		if toCopy > 0 {
+			copy(dst[:toCopy], c.buf[c.off:c.off+toCopy])
+			dst = dst[toCopy:]
+			remaining -= toCopy
+			c.off += toCopy
+		}
+	}
+}
+
+// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key
+// and nonce.
+func (c *Cipher) ReKey(key, nonce []byte) error {
+	if len(key) != KeySize {
+		return ErrInvalidKey
+	}
+
+	switch len(nonce) {
+	case NonceSize:
+	case XNonceSize:
+		var subkey [KeySize]byte
+		var subnonce [HNonceSize]byte
+		copy(subnonce[:], nonce[0:16])
+		HChaCha(key, &subnonce, &subkey)
+		key = subkey[:]
+		nonce = nonce[16:24]
+		defer func() {
+			for i := range subkey {
+				subkey[i] = 0
+			}
+		}()
+	default:
+		return ErrInvalidNonce
+	}
+
+	c.Reset()
+	c.state[0] = binary.LittleEndian.Uint32(key[0:4])
+	c.state[1] = binary.LittleEndian.Uint32(key[4:8])
+	c.state[2] = binary.LittleEndian.Uint32(key[8:12])
+	c.state[3] = binary.LittleEndian.Uint32(key[12:16])
+	c.state[4] = binary.LittleEndian.Uint32(key[16:20])
+	c.state[5] = binary.LittleEndian.Uint32(key[20:24])
+	c.state[6] = binary.LittleEndian.Uint32(key[24:28])
+	c.state[7] = binary.LittleEndian.Uint32(key[28:32])
+	c.state[8] = 0
+	c.state[9] = 0
+	c.state[10] = binary.LittleEndian.Uint32(nonce[0:4])
+	c.state[11] = binary.LittleEndian.Uint32(nonce[4:8])
+	c.off = BlockSize
+	return nil
+}
+
+// NewCipher returns a new ChaCha20/XChaCha20 instance.
+func NewCipher(key, nonce []byte) (*Cipher, error) {
+	c := new(Cipher)
+	if err := c.ReKey(key, nonce); err != nil {
+		return nil, err
+	}
+	return c, nil
+}
+
+// HChaCha is the HChaCha20 hash function used to make XChaCha.
+func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) {
+	var x [stateSize]uint32
+	x[0] = binary.LittleEndian.Uint32(key[0:4])
+	x[1] = binary.LittleEndian.Uint32(key[4:8])
+	x[2] = binary.LittleEndian.Uint32(key[8:12])
+	x[3] = binary.LittleEndian.Uint32(key[12:16])
+	x[4] = binary.LittleEndian.Uint32(key[16:20])
+	x[5] = binary.LittleEndian.Uint32(key[20:24])
+	x[6] = binary.LittleEndian.Uint32(key[24:28])
+	x[7] = binary.LittleEndian.Uint32(key[28:32])
+	x[8] = binary.LittleEndian.Uint32(nonce[0:4])
+	x[9] = binary.LittleEndian.Uint32(nonce[4:8])
+	x[10] = binary.LittleEndian.Uint32(nonce[8:12])
+	x[11] = binary.LittleEndian.Uint32(nonce[12:16])
+	hChaChaRef(&x, out)
+}
+
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+	for i := chachaRounds; i > 0; i -= 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 16) | (x12 >> 16)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 12) | (x4 >> 20)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 8) | (x12 >> 24)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 7) | (x4 >> 25)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 16) | (x13 >> 16)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 12) | (x5 >> 20)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 8) | (x13 >> 24)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 16) | (x14 >> 16)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 12) | (x6 >> 20)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 8) | (x14 >> 24)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 16) | (x15 >> 16)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 12) | (x7 >> 20)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 8) | (x15 >> 24)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 16) | (x15 >> 16)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 12) | (x5 >> 20)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 8) | (x15 >> 24)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 16) | (x12 >> 16)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 12) | (x6 >> 20)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 8) | (x12 >> 24)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 16) | (x13 >> 16)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 12) | (x7 >> 20)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 8) | (x13 >> 24)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 16) | (x14 >> 16)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 12) | (x4 >> 20)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 8) | (x14 >> 24)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 7) | (x4 >> 25)
+	}
+
+	// HChaCha returns x0...x3 | x12...x15, the state words initialized
+	// from the ChaCha constants and from the IV respectively.
+	binary.LittleEndian.PutUint32(out[0:4], x0)
+	binary.LittleEndian.PutUint32(out[4:8], x1)
+	binary.LittleEndian.PutUint32(out[8:12], x2)
+	binary.LittleEndian.PutUint32(out[12:16], x3)
+	binary.LittleEndian.PutUint32(out[16:20], x12)
+	binary.LittleEndian.PutUint32(out[20:24], x13)
+	binary.LittleEndian.PutUint32(out[24:28], x14)
+	binary.LittleEndian.PutUint32(out[28:32], x15)
+}
+
+func init() {
+	switch runtime.GOARCH {
+	case "386", "amd64":
+		// Abuse unsafe to skip calling binary.LittleEndian.PutUint32
+		// in the critical path.  This is a big boost on systems that are
+		// little endian and not overly picky about alignment.
+		useUnsafe = true
+	}
+}
+
+var _ cipher.Stream = (*Cipher)(nil)
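
The package surface above (NewCipher/ReKey, XORKeyStream, KeyStream, Reset) is self-contained; a minimal usage sketch follows (the import path is hypothetical, and error handling is trimmed to the essentials):

	package main

	import (
		"crypto/rand"
		"fmt"

		"chacha20" // hypothetical import path for this package
	)

	func main() {
		key := make([]byte, chacha20.KeySize)     // 32 bytes
		nonce := make([]byte, chacha20.NonceSize) // 8 bytes; a 24 byte nonce selects XChaCha20
		rand.Read(key)
		rand.Read(nonce)

		c, err := chacha20.NewCipher(key, nonce)
		if err != nil {
			panic(err)
		}
		defer c.Reset() // wipe the key material when done

		msg := []byte("attack at dawn")
		ct := make([]byte, len(msg))
		c.XORKeyStream(ct, msg) // decryption is the same XOR with a fresh Cipher
		fmt.Printf("%x\n", ct)
	}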

+ 34 - 0
chacha20_amd64.go

@@ -0,0 +1,34 @@
+// chacha20_amd64.go - AMD64 optimized chacha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build amd64,!gccgo,!appengine
+
+package chacha20
+
+func blocksAmd64SSE2(sigma, one, x *uint32, in, out *byte, nrBlocks uint)
+
+// One day these won't be parameters when PeachPy fixes issue #11, and they
+// can be made into local data, though leaving them as is isn't horrible
+// since the assembly code doesn't have XMM registers to spare.  Minor gain
+// from being able to ensure they're 16 byte aligned.
+var one = [4]uint32{1, 0, 0, 0}
+var sigma = [4]uint32{sigma0, sigma1, sigma2, sigma3}
+
+func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int) {
+	if in == nil {
+		for i := range out {
+			out[i] = 0
+		}
+		in = out
+	}
+
+	blocksAmd64SSE2(&sigma[0], &one[0], &x[0], &in[0], &out[0], uint(nrBlocks))
+}
+
+func init() {
+	blocksFn = blocksAmd64
+	usingVectors = true
+}
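
Note the nil-`in` convention above: the caller zero-fills the destination and passes it as both input and output, exploiting x ^ 0 == x so that XORing keystream over zeroes leaves raw keystream. A quick sketch of the equivalence through the public API, reusing the key/nonce from the earlier sketch:

	// With identical initialization, XORKeyStream over zeroes
	// produces the same bytes as KeyStream.
	a, _ := chacha20.NewCipher(key, nonce)
	b, _ := chacha20.NewCipher(key, nonce)

	zeros := make([]byte, chacha20.BlockSize)
	viaXor := make([]byte, chacha20.BlockSize)
	a.XORKeyStream(viaXor, zeros)

	direct := make([]byte, chacha20.BlockSize)
	b.KeyStream(direct)
	fmt.Println(bytes.Equal(viaXor, direct)) // true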

+ 264 - 0
chacha20_amd64.py

@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+#
+# To the extent possible under law, Yawning Angel has waived all copyright
+# and related or neighboring rights to chacha20, using the Creative
+# Commons "CC0" public domain dedication. See LICENSE or
+# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+#
+# Ok.  The first revision of this code started off as a cgo version of Ted
+# Krovetz's vec128 ChaCha20 implementation, but cgo sucks because it carves
+# off a separate stack (needed, but expensive), and worse, can allocate an OS
+# thread because it treats all cgo invocations as system calls.
+#
+# For something like a low level cryptography routine, both of these behaviors
+# are just unnecessary overhead, and the latter is flat-out absurd.
+#
+# Since Golang doesn't have SIMD intrinsics, it's either "learn plan 9
+# assembly" or resort to more extreme measures like using a python code
+# generator.  This code obviously opts for the latter.
+#
+# Dependencies: https://github.com/Maratyszcza/PeachPy
+#
+# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
+#
+
+from peachpy import *
+from peachpy.x86_64 import *
+
+sigma = Argument(ptr(const_uint32_t))
+one = Argument(ptr(const_uint32_t))
+x = Argument(ptr(uint32_t))
+inp = Argument(ptr(const_uint8_t))
+outp = Argument(ptr(uint8_t))
+nrBlocks = Argument(ptr(size_t))
+
+def RotV1(x):
+    PSHUFD(x, x, 0x39)
+
+def RotV2(x):
+    PSHUFD(x, x, 0x4e)
+
+def RotV3(x):
+    PSHUFD(x, x, 0x93)
+
+def RotW7(tmp, x):
+    MOVDQA(tmp, x)
+    PSLLD(tmp, 7)
+    PSRLD(x, 25)
+    PXOR(x, tmp)
+
+def RotW8(tmp, x):
+    MOVDQA(tmp, x)
+    PSLLD(tmp, 8)
+    PSRLD(x, 24)
+    PXOR(x, tmp)
+
+def RotW12(tmp, x):
+    MOVDQA(tmp, x)
+    PSLLD(tmp, 12)
+    PSRLD(x, 20)
+    PXOR(x, tmp)
+
+def RotW16(tmp, x):
+    MOVDQA(tmp, x)
+    PSLLD(tmp, 16)
+    PSRLD(x, 16)
+    PXOR(x, tmp)
+
+def DQRoundVectors(tmp, a, b, c, d):
+    # a += b; d ^= a; d = ROTW16(d);
+    PADDD(a, b)
+    PXOR(d, a)
+    RotW16(tmp, d)
+
+    # c += d; b ^= c; b = ROTW12(b);
+    PADDD(c, d)
+    PXOR(b, c)
+    RotW12(tmp, b)
+
+    # a += b; d ^= a; d = ROTW8(d);
+    PADDD(a, b)
+    PXOR(d, a)
+    RotW8(tmp, d)
+
+    # c += d; b ^= c; b = ROTW7(b)
+    PADDD(c, d)
+    PXOR(b, c)
+    RotW7(tmp, b)
+
+    # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+    RotV1(b)
+    RotV2(c)
+    RotV3(d)
+
+    # a += b; d ^= a; d = ROTW16(d);
+    PADDD(a, b)
+    PXOR(d, a)
+    RotW16(tmp, d)
+
+    # c += d; b ^= c; b = ROTW12(b);
+    PADDD(c, d)
+    PXOR(b, c)
+    RotW12(tmp, b)
+
+    # a += b; d ^= a; d = ROTW8(d);
+    PADDD(a, b)
+    PXOR(d, a)
+    RotW8(tmp, d)
+
+    # c += d; b ^= c; b = ROTW7(b);
+    PADDD(c, d)
+    PXOR(b, c)
+    RotW7(tmp, b)
+
+    # b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+    RotV3(b)
+    RotV2(c)
+    RotV1(d)
+
+def WriteXor(tmp, inp, outp, d, v0, v1, v2, v3):
+    MOVDQU(tmp, [inp+d])
+    PXOR(tmp, v0)
+    MOVDQU([outp+d], tmp)
+    MOVDQU(tmp, [inp+d+16])
+    PXOR(tmp, v1)
+    MOVDQU([outp+d+16], tmp)
+    MOVDQU(tmp, [inp+d+32])
+    PXOR(tmp, v2)
+    MOVDQU([outp+d+32], tmp)
+    MOVDQU(tmp, [inp+d+48])
+    PXOR(tmp, v3)
+    MOVDQU([outp+d+48], tmp)
+
+with Function("blocksAmd64SSE2", (sigma, one, x, inp, outp, nrBlocks)):
+    reg_sigma = GeneralPurposeRegister64()
+    reg_one = GeneralPurposeRegister64()
+    reg_x = GeneralPurposeRegister64()
+    reg_inp = GeneralPurposeRegister64()
+    reg_outp = GeneralPurposeRegister64()
+    reg_blocks = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(reg_sigma, sigma)
+    LOAD.ARGUMENT(reg_one, one)
+    LOAD.ARGUMENT(reg_x, x)
+    LOAD.ARGUMENT(reg_inp, inp)
+    LOAD.ARGUMENT(reg_outp, outp)
+    LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+    xmm_tmp = XMMRegister()
+    xmm_s1 = XMMRegister()
+    MOVDQU(xmm_s1, [reg_x])
+    xmm_s2 = XMMRegister()
+    MOVDQU(xmm_s2, [reg_x+16])
+    xmm_s3 = XMMRegister()
+    MOVDQU(xmm_s3, [reg_x+32])
+
+    vector_loop = Loop()
+    serial_loop = Loop()
+
+    xmm_v0 = XMMRegister()
+    xmm_v1 = XMMRegister()
+    xmm_v2 = XMMRegister()
+    xmm_v3 = XMMRegister()
+
+    SUB(reg_blocks, 3)
+    JB(vector_loop.end)
+    with vector_loop:
+        MOVDQU(xmm_v0, [reg_sigma])
+        MOVDQA(xmm_v1, xmm_s1)
+        MOVDQA(xmm_v2, xmm_s2)
+        MOVDQA(xmm_v3, xmm_s3)
+
+        xmm_v4 = XMMRegister()
+        MOVDQU(xmm_v4, [reg_sigma])
+        xmm_v5 = XMMRegister()
+        MOVDQA(xmm_v5, xmm_s1)
+        xmm_v6 = XMMRegister()
+        MOVDQA(xmm_v6, xmm_s2)
+        xmm_v7 = XMMRegister()
+        MOVDQA(xmm_v7, xmm_s3)
+        PADDQ(xmm_v7, [reg_one])
+
+        xmm_v8 = XMMRegister()
+        MOVDQU(xmm_v8, [reg_sigma])
+        xmm_v9 = XMMRegister()
+        MOVDQA(xmm_v9, xmm_s1)
+        xmm_v10 = XMMRegister()
+        MOVDQA(xmm_v10, xmm_s2)
+        xmm_v11 = XMMRegister()
+        MOVDQA(xmm_v11, xmm_v7)
+        PADDQ(xmm_v11, [reg_one])
+
+        reg_rounds = GeneralPurposeRegister64()
+        MOV(reg_rounds, 20)
+        rounds_loop = Loop()
+        with rounds_loop:
+            DQRoundVectors(xmm_tmp, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+            DQRoundVectors(xmm_tmp, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+            DQRoundVectors(xmm_tmp, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop.begin)
+
+        PADDD(xmm_v0, [reg_sigma])
+        PADDD(xmm_v1, xmm_s1)
+        PADDD(xmm_v2, xmm_s2)
+        PADDD(xmm_v3, xmm_s3)
+        WriteXor(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+        PADDQ(xmm_s3, [reg_one])
+
+        PADDD(xmm_v4, [reg_sigma])
+        PADDD(xmm_v5, xmm_s1)
+        PADDD(xmm_v6, xmm_s2)
+        PADDD(xmm_v7, xmm_s3)
+        WriteXor(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+        PADDQ(xmm_s3, [reg_one])
+
+        PADDD(xmm_v8, [reg_sigma])
+        PADDD(xmm_v9, xmm_s1)
+        PADDD(xmm_v10, xmm_s2)
+        PADDD(xmm_v11, xmm_s3)
+        WriteXor(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
+        PADDQ(xmm_s3, [reg_one])
+
+        ADD(reg_inp, 192)
+        ADD(reg_outp, 192)
+
+        SUB(reg_blocks, 3)
+        JAE(vector_loop.begin)
+
+    ADD(reg_blocks, 3)
+    JZ(serial_loop.end)
+
+    with serial_loop:
+        MOVDQU(xmm_v0, [reg_sigma])
+        MOVDQA(xmm_v1, xmm_s1)
+        MOVDQA(xmm_v2, xmm_s2)
+        MOVDQA(xmm_v3, xmm_s3)
+
+        reg_rounds = GeneralPurposeRegister64()
+        MOV(reg_rounds, 20)
+        rounds_loop = Loop()
+        with rounds_loop:
+            DQRoundVectors(xmm_tmp, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop.begin)
+
+        PADDD(xmm_v0, [reg_sigma])
+        PADDD(xmm_v1, xmm_s1)
+        PADDD(xmm_v2, xmm_s2)
+        PADDD(xmm_v3, xmm_s3)
+        WriteXor(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+        PADDQ(xmm_s3, [reg_one])
+
+        ADD(reg_inp, 64)
+        ADD(reg_outp, 64)
+
+        SUB(reg_blocks, 1)
+        JNZ(serial_loop.begin)
+
+    # Write back the updated counter.  Stopping at 2^70 bytes is the user's
+    # problem, not mine.
+    MOVDQU([reg_x+32], xmm_s3)
+
+    RETURN()
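
SSE2 has no packed rotate instruction, which is why the RotW{7,8,12,16} helpers above build one per 32-bit lane out of a left shift, a right shift, and a combine (PXOR works here because the two shifted halves have disjoint bits, so XOR and OR agree). The scalar equivalent, as a sketch:

	// rotl32 is what each RotW* helper computes per lane with
	// PSLLD/PSRLD/PXOR: rotate a 32-bit word left by n bits.
	func rotl32(x uint32, n uint) uint32 {
		return (x << n) | (x >> (32 - n))
	}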

+ 342 - 0
chacha20_amd64.s

@@ -0,0 +1,342 @@
+// Generated by PeachPy 0.2.0 from chacha20_amd64.py
+
+
+// func blocksAmd64SSE2(sigma *uint32, one *uint32, x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64SSE2(SB),4,$0-48
+	MOVQ sigma+0(FP), AX
+	MOVQ one+8(FP), BX
+	MOVQ x+16(FP), CX
+	MOVQ inp+24(FP), DX
+	MOVQ outp+32(FP), DI
+	MOVQ nrBlocks+40(FP), SI
+	MOVOU 0(CX), X1
+	MOVOU 16(CX), X2
+	MOVOU 32(CX), X3
+	SUBQ $3, SI
+	JCS vector_loop_end
+vector_loop_begin:
+		MOVOU 0(AX), X4
+		MOVO X1, X5
+		MOVO X2, X6
+		MOVO X3, X7
+		MOVOU 0(AX), X8
+		MOVO X1, X9
+		MOVO X2, X10
+		MOVO X3, X11
+		PADDQ 0(BX), X11
+		MOVOU 0(AX), X12
+		MOVO X1, X13
+		MOVO X2, X14
+		MOVO X11, X15
+		PADDQ 0(BX), X15
+		MOVQ $20, BP
+rounds_loop0_begin:
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $16, X0
+			PSRLL $16, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $12, X0
+			PSRLL $20, X5
+			PXOR X0, X5
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $8, X0
+			PSRLL $24, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $7, X0
+			PSRLL $25, X5
+			PXOR X0, X5
+			PSHUFL $57, X5, X5
+			PSHUFL $78, X6, X6
+			PSHUFL $147, X7, X7
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $16, X0
+			PSRLL $16, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $12, X0
+			PSRLL $20, X5
+			PXOR X0, X5
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $8, X0
+			PSRLL $24, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $7, X0
+			PSRLL $25, X5
+			PXOR X0, X5
+			PSHUFL $147, X5, X5
+			PSHUFL $78, X6, X6
+			PSHUFL $57, X7, X7
+			PADDL X9, X8
+			PXOR X8, X11
+			MOVO X11, X0
+			PSLLL $16, X0
+			PSRLL $16, X11
+			PXOR X0, X11
+			PADDL X11, X10
+			PXOR X10, X9
+			MOVO X9, X0
+			PSLLL $12, X0
+			PSRLL $20, X9
+			PXOR X0, X9
+			PADDL X9, X8
+			PXOR X8, X11
+			MOVO X11, X0
+			PSLLL $8, X0
+			PSRLL $24, X11
+			PXOR X0, X11
+			PADDL X11, X10
+			PXOR X10, X9
+			MOVO X9, X0
+			PSLLL $7, X0
+			PSRLL $25, X9
+			PXOR X0, X9
+			PSHUFL $57, X9, X9
+			PSHUFL $78, X10, X10
+			PSHUFL $147, X11, X11
+			PADDL X9, X8
+			PXOR X8, X11
+			MOVO X11, X0
+			PSLLL $16, X0
+			PSRLL $16, X11
+			PXOR X0, X11
+			PADDL X11, X10
+			PXOR X10, X9
+			MOVO X9, X0
+			PSLLL $12, X0
+			PSRLL $20, X9
+			PXOR X0, X9
+			PADDL X9, X8
+			PXOR X8, X11
+			MOVO X11, X0
+			PSLLL $8, X0
+			PSRLL $24, X11
+			PXOR X0, X11
+			PADDL X11, X10
+			PXOR X10, X9
+			MOVO X9, X0
+			PSLLL $7, X0
+			PSRLL $25, X9
+			PXOR X0, X9
+			PSHUFL $147, X9, X9
+			PSHUFL $78, X10, X10
+			PSHUFL $57, X11, X11
+			PADDL X13, X12
+			PXOR X12, X15
+			MOVO X15, X0
+			PSLLL $16, X0
+			PSRLL $16, X15
+			PXOR X0, X15
+			PADDL X15, X14
+			PXOR X14, X13
+			MOVO X13, X0
+			PSLLL $12, X0
+			PSRLL $20, X13
+			PXOR X0, X13
+			PADDL X13, X12
+			PXOR X12, X15
+			MOVO X15, X0
+			PSLLL $8, X0
+			PSRLL $24, X15
+			PXOR X0, X15
+			PADDL X15, X14
+			PXOR X14, X13
+			MOVO X13, X0
+			PSLLL $7, X0
+			PSRLL $25, X13
+			PXOR X0, X13
+			PSHUFL $57, X13, X13
+			PSHUFL $78, X14, X14
+			PSHUFL $147, X15, X15
+			PADDL X13, X12
+			PXOR X12, X15
+			MOVO X15, X0
+			PSLLL $16, X0
+			PSRLL $16, X15
+			PXOR X0, X15
+			PADDL X15, X14
+			PXOR X14, X13
+			MOVO X13, X0
+			PSLLL $12, X0
+			PSRLL $20, X13
+			PXOR X0, X13
+			PADDL X13, X12
+			PXOR X12, X15
+			MOVO X15, X0
+			PSLLL $8, X0
+			PSRLL $24, X15
+			PXOR X0, X15
+			PADDL X15, X14
+			PXOR X14, X13
+			MOVO X13, X0
+			PSLLL $7, X0
+			PSRLL $25, X13
+			PXOR X0, X13
+			PSHUFL $147, X13, X13
+			PSHUFL $78, X14, X14
+			PSHUFL $57, X15, X15
+			SUBQ $2, BP
+			JNE rounds_loop0_begin
+		PADDL 0(AX), X4
+		PADDL X1, X5
+		PADDL X2, X6
+		PADDL X3, X7
+		MOVOU 0(DX), X0
+		PXOR X4, X0
+		MOVOU X0, 0(DI)
+		MOVOU 16(DX), X0
+		PXOR X5, X0
+		MOVOU X0, 16(DI)
+		MOVOU 32(DX), X0
+		PXOR X6, X0
+		MOVOU X0, 32(DI)
+		MOVOU 48(DX), X0
+		PXOR X7, X0
+		MOVOU X0, 48(DI)
+		PADDQ 0(BX), X3
+		PADDL 0(AX), X8
+		PADDL X1, X9
+		PADDL X2, X10
+		PADDL X3, X11
+		MOVOU 64(DX), X0
+		PXOR X8, X0
+		MOVOU X0, 64(DI)
+		MOVOU 80(DX), X0
+		PXOR X9, X0
+		MOVOU X0, 80(DI)
+		MOVOU 96(DX), X0
+		PXOR X10, X0
+		MOVOU X0, 96(DI)
+		MOVOU 112(DX), X0
+		PXOR X11, X0
+		MOVOU X0, 112(DI)
+		PADDQ 0(BX), X3
+		PADDL 0(AX), X12
+		PADDL X1, X13
+		PADDL X2, X14
+		PADDL X3, X15
+		MOVOU 128(DX), X0
+		PXOR X12, X0
+		MOVOU X0, 128(DI)
+		MOVOU 144(DX), X0
+		PXOR X13, X0
+		MOVOU X0, 144(DI)
+		MOVOU 160(DX), X0
+		PXOR X14, X0
+		MOVOU X0, 160(DI)
+		MOVOU 176(DX), X0
+		PXOR X15, X0
+		MOVOU X0, 176(DI)
+		PADDQ 0(BX), X3
+		ADDQ $192, DX
+		ADDQ $192, DI
+		SUBQ $3, SI
+		JCC vector_loop_begin
+vector_loop_end:
+	ADDQ $3, SI
+	JEQ serial_loop_end
+serial_loop_begin:
+		MOVOU 0(AX), X4
+		MOVO X1, X5
+		MOVO X2, X6
+		MOVO X3, X7
+		MOVQ $20, BP
+rounds_loop1_begin:
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $16, X0
+			PSRLL $16, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $12, X0
+			PSRLL $20, X5
+			PXOR X0, X5
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $8, X0
+			PSRLL $24, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $7, X0
+			PSRLL $25, X5
+			PXOR X0, X5
+			PSHUFL $57, X5, X5
+			PSHUFL $78, X6, X6
+			PSHUFL $147, X7, X7
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $16, X0
+			PSRLL $16, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $12, X0
+			PSRLL $20, X5
+			PXOR X0, X5
+			PADDL X5, X4
+			PXOR X4, X7
+			MOVO X7, X0
+			PSLLL $8, X0
+			PSRLL $24, X7
+			PXOR X0, X7
+			PADDL X7, X6
+			PXOR X6, X5
+			MOVO X5, X0
+			PSLLL $7, X0
+			PSRLL $25, X5
+			PXOR X0, X5
+			PSHUFL $147, X5, X5
+			PSHUFL $78, X6, X6
+			PSHUFL $57, X7, X7
+			SUBQ $2, BP
+			JNE rounds_loop1_begin
+		PADDL 0(AX), X4
+		PADDL X1, X5
+		PADDL X2, X6
+		PADDL X3, X7
+		MOVOU 0(DX), X0
+		PXOR X4, X0
+		MOVOU X0, 0(DI)
+		MOVOU 16(DX), X0
+		PXOR X5, X0
+		MOVOU X0, 16(DI)
+		MOVOU 32(DX), X0
+		PXOR X6, X0
+		MOVOU X0, 32(DI)
+		MOVOU 48(DX), X0
+		PXOR X7, X0
+		MOVOU X0, 48(DI)
+		PADDQ 0(BX), X3
+		ADDQ $64, DX
+		ADDQ $64, DI
+		SUBQ $1, SI
+		JNE serial_loop_begin
+serial_loop_end:
+	MOVOU X3, 32(CX)
+	RET
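
For readers new to Plan 9 assembly, the TEXT directive above decodes as follows (flag value per Go's textflag.h):

	TEXT ·blocksAmd64SSE2(SB),4,$0-48
	//   ^ the middle dot marks a package-local symbol
	//   4   = NOSPLIT: no stack-split prologue is emitted
	//   $0  = zero bytes of local frame
	//   -48 = argument size: six 8-byte arguments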

+ 233 - 0
chacha20_ref.go

@@ -0,0 +1,233 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+package chacha20
+
+import (
+	"encoding/binary"
+	"unsafe"
+)
+
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int) {
+	for n := 0; n < nrBlocks; n++ {
+		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+		for i := chachaRounds; i > 0; i -= 2 {
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 16) | (x12 >> 16)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 12) | (x4 >> 20)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 8) | (x12 >> 24)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 7) | (x4 >> 25)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 16) | (x13 >> 16)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 12) | (x5 >> 20)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 8) | (x13 >> 24)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 16) | (x14 >> 16)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 12) | (x6 >> 20)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 8) | (x14 >> 24)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 16) | (x15 >> 16)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 12) | (x7 >> 20)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 8) | (x15 >> 24)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 16) | (x15 >> 16)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 12) | (x5 >> 20)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 8) | (x15 >> 24)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 16) | (x12 >> 16)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 12) | (x6 >> 20)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 8) | (x12 >> 24)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 16) | (x13 >> 16)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 12) | (x7 >> 20)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 8) | (x13 >> 24)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 16) | (x14 >> 16)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 12) | (x4 >> 20)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 8) | (x14 >> 24)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 7) | (x4 >> 25)
+		}
+
+		// On amd64 at least, this is a rather big boost.
+		if useUnsafe {
+			if in != nil {
+				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = inArr[0] ^ (x0 + sigma0)
+				outArr[1] = inArr[1] ^ (x1 + sigma1)
+				outArr[2] = inArr[2] ^ (x2 + sigma2)
+				outArr[3] = inArr[3] ^ (x3 + sigma3)
+				outArr[4] = inArr[4] ^ (x4 + x[0])
+				outArr[5] = inArr[5] ^ (x5 + x[1])
+				outArr[6] = inArr[6] ^ (x6 + x[2])
+				outArr[7] = inArr[7] ^ (x7 + x[3])
+				outArr[8] = inArr[8] ^ (x8 + x[4])
+				outArr[9] = inArr[9] ^ (x9 + x[5])
+				outArr[10] = inArr[10] ^ (x10 + x[6])
+				outArr[11] = inArr[11] ^ (x11 + x[7])
+				outArr[12] = inArr[12] ^ (x12 + x[8])
+				outArr[13] = inArr[13] ^ (x13 + x[9])
+				outArr[14] = inArr[14] ^ (x14 + x[10])
+				outArr[15] = inArr[15] ^ (x15 + x[11])
+			} else {
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = x0 + sigma0
+				outArr[1] = x1 + sigma1
+				outArr[2] = x2 + sigma2
+				outArr[3] = x3 + sigma3
+				outArr[4] = x4 + x[0]
+				outArr[5] = x5 + x[1]
+				outArr[6] = x6 + x[2]
+				outArr[7] = x7 + x[3]
+				outArr[8] = x8 + x[4]
+				outArr[9] = x9 + x[5]
+				outArr[10] = x10 + x[6]
+				outArr[11] = x11 + x[7]
+				outArr[12] = x12 + x[8]
+				outArr[13] = x13 + x[9]
+				outArr[14] = x14 + x[10]
+				outArr[15] = x15 + x[11]
+			}
+		} else {
+			// Slow path: either the architecture cares about alignment, or it is not little endian.
+			x4 += x[0]
+			x5 += x[1]
+			x6 += x[2]
+			x7 += x[3]
+			x8 += x[4]
+			x9 += x[5]
+			x10 += x[6]
+			x11 += x[7]
+			x12 += x[8]
+			x13 += x[9]
+			x14 += x[10]
+			x15 += x[11]
+			if in != nil {
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^(x0+sigma0))
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^(x1+sigma1))
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^(x2+sigma2))
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^(x3+sigma3))
+				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+				in = in[BlockSize:]
+			} else {
+				binary.LittleEndian.PutUint32(out[0:4], x0+sigma0)
+				binary.LittleEndian.PutUint32(out[4:8], x1+sigma1)
+				binary.LittleEndian.PutUint32(out[8:12], x2+sigma2)
+				binary.LittleEndian.PutUint32(out[12:16], x3+sigma3)
+				binary.LittleEndian.PutUint32(out[16:20], x4)
+				binary.LittleEndian.PutUint32(out[20:24], x5)
+				binary.LittleEndian.PutUint32(out[24:28], x6)
+				binary.LittleEndian.PutUint32(out[28:32], x7)
+				binary.LittleEndian.PutUint32(out[32:36], x8)
+				binary.LittleEndian.PutUint32(out[36:40], x9)
+				binary.LittleEndian.PutUint32(out[40:44], x10)
+				binary.LittleEndian.PutUint32(out[44:48], x11)
+				binary.LittleEndian.PutUint32(out[48:52], x12)
+				binary.LittleEndian.PutUint32(out[52:56], x13)
+				binary.LittleEndian.PutUint32(out[56:60], x14)
+				binary.LittleEndian.PutUint32(out[60:64], x15)
+			}
+			out = out[BlockSize:]
+		}
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+		ctr := uint64(x[9])<<32 | uint64(x[8])
+		ctr++
+		x[8] = uint32(ctr)
+		x[9] = uint32(ctr >> 32)
+	}
+}
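
The block counter occupies x[8] (low word) and x[9] (high word), so each nonce can index 2^64 blocks of 64 bytes before wrapping; that is where the 2^70-byte figure in the comment comes from. The arithmetic as a runnable sketch (math/big, since 2^70 overflows uint64):

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		// 2^64 blocks per nonce times 64 = 2^6 bytes per block.
		bytesPerNonce := new(big.Int).Lsh(big.NewInt(1), 64+6)
		fmt.Println(bytesPerNonce) // 1180591620717411303424, i.e. 2^70
	}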

+ 370 - 0
chacha20_test.go

@@ -0,0 +1,370 @@
+// chacha20_test.go - ChaCha stream cipher implementation tests.
+//
+// To the extent possible under law, Yawning Angel waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+package chacha20
+
+import (
+	"bytes"
+	"crypto/rand"
+	"testing"
+)
+
+// Test vectors taken from:
+// https://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-01
+var draftTestVectors = []struct {
+	name   string
+	key    []byte
+	iv     []byte
+	stream []byte
+}{
+	{
+		name: "IETF Draft: TC1: All zero key and IV.",
+		key: []byte{
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		},
+		iv: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		stream: []byte{
+			0x76, 0xb8, 0xe0, 0xad, 0xa0, 0xf1, 0x3d, 0x90,
+			0x40, 0x5d, 0x6a, 0xe5, 0x53, 0x86, 0xbd, 0x28,
+			0xbd, 0xd2, 0x19, 0xb8, 0xa0, 0x8d, 0xed, 0x1a,
+			0xa8, 0x36, 0xef, 0xcc, 0x8b, 0x77, 0x0d, 0xc7,
+			0xda, 0x41, 0x59, 0x7c, 0x51, 0x57, 0x48, 0x8d,
+			0x77, 0x24, 0xe0, 0x3f, 0xb8, 0xd8, 0x4a, 0x37,
+			0x6a, 0x43, 0xb8, 0xf4, 0x15, 0x18, 0xa1, 0x1c,
+			0xc3, 0x87, 0xb6, 0x69, 0xb2, 0xee, 0x65, 0x86,
+			0x9f, 0x07, 0xe7, 0xbe, 0x55, 0x51, 0x38, 0x7a,
+			0x98, 0xba, 0x97, 0x7c, 0x73, 0x2d, 0x08, 0x0d,
+			0xcb, 0x0f, 0x29, 0xa0, 0x48, 0xe3, 0x65, 0x69,
+			0x12, 0xc6, 0x53, 0x3e, 0x32, 0xee, 0x7a, 0xed,
+			0x29, 0xb7, 0x21, 0x76, 0x9c, 0xe6, 0x4e, 0x43,
+			0xd5, 0x71, 0x33, 0xb0, 0x74, 0xd8, 0x39, 0xd5,
+			0x31, 0xed, 0x1f, 0x28, 0x51, 0x0a, 0xfb, 0x45,
+			0xac, 0xe1, 0x0a, 0x1f, 0x4b, 0x79, 0x4d, 0x6f,
+		},
+	},
+	{
+		name: "IETF Draft: TC2: Single bit in key set. All zero IV.",
+		key: []byte{
+			0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		},
+		iv: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		stream: []byte{
+			0xc5, 0xd3, 0x0a, 0x7c, 0xe1, 0xec, 0x11, 0x93,
+			0x78, 0xc8, 0x4f, 0x48, 0x7d, 0x77, 0x5a, 0x85,
+			0x42, 0xf1, 0x3e, 0xce, 0x23, 0x8a, 0x94, 0x55,
+			0xe8, 0x22, 0x9e, 0x88, 0x8d, 0xe8, 0x5b, 0xbd,
+			0x29, 0xeb, 0x63, 0xd0, 0xa1, 0x7a, 0x5b, 0x99,
+			0x9b, 0x52, 0xda, 0x22, 0xbe, 0x40, 0x23, 0xeb,
+			0x07, 0x62, 0x0a, 0x54, 0xf6, 0xfa, 0x6a, 0xd8,
+			0x73, 0x7b, 0x71, 0xeb, 0x04, 0x64, 0xda, 0xc0,
+			0x10, 0xf6, 0x56, 0xe6, 0xd1, 0xfd, 0x55, 0x05,
+			0x3e, 0x50, 0xc4, 0x87, 0x5c, 0x99, 0x30, 0xa3,
+			0x3f, 0x6d, 0x02, 0x63, 0xbd, 0x14, 0xdf, 0xd6,
+			0xab, 0x8c, 0x70, 0x52, 0x1c, 0x19, 0x33, 0x8b,
+			0x23, 0x08, 0xb9, 0x5c, 0xf8, 0xd0, 0xbb, 0x7d,
+			0x20, 0x2d, 0x21, 0x02, 0x78, 0x0e, 0xa3, 0x52,
+			0x8f, 0x1c, 0xb4, 0x85, 0x60, 0xf7, 0x6b, 0x20,
+			0xf3, 0x82, 0xb9, 0x42, 0x50, 0x0f, 0xce, 0xac,
+		},
+	},
+	{
+		name: "IETF Draft: TC3: Single bit in IV set. All zero key.",
+		key: []byte{
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		},
+		iv: []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		stream: []byte{
+			0xef, 0x3f, 0xdf, 0xd6, 0xc6, 0x15, 0x78, 0xfb,
+			0xf5, 0xcf, 0x35, 0xbd, 0x3d, 0xd3, 0x3b, 0x80,
+			0x09, 0x63, 0x16, 0x34, 0xd2, 0x1e, 0x42, 0xac,
+			0x33, 0x96, 0x0b, 0xd1, 0x38, 0xe5, 0x0d, 0x32,
+			0x11, 0x1e, 0x4c, 0xaf, 0x23, 0x7e, 0xe5, 0x3c,
+			0xa8, 0xad, 0x64, 0x26, 0x19, 0x4a, 0x88, 0x54,
+			0x5d, 0xdc, 0x49, 0x7a, 0x0b, 0x46, 0x6e, 0x7d,
+			0x6b, 0xbd, 0xb0, 0x04, 0x1b, 0x2f, 0x58, 0x6b,
+			0x53, 0x05, 0xe5, 0xe4, 0x4a, 0xff, 0x19, 0xb2,
+			0x35, 0x93, 0x61, 0x44, 0x67, 0x5e, 0xfb, 0xe4,
+			0x40, 0x9e, 0xb7, 0xe8, 0xe5, 0xf1, 0x43, 0x0f,
+			0x5f, 0x58, 0x36, 0xae, 0xb4, 0x9b, 0xb5, 0x32,
+			0x8b, 0x01, 0x7c, 0x4b, 0x9d, 0xc1, 0x1f, 0x8a,
+			0x03, 0x86, 0x3f, 0xa8, 0x03, 0xdc, 0x71, 0xd5,
+			0x72, 0x6b, 0x2b, 0x6b, 0x31, 0xaa, 0x32, 0x70,
+			0x8a, 0xfe, 0x5a, 0xf1, 0xd6, 0xb6, 0x90, 0x58,
+		},
+	},
+	{
+		name: "IETF Draft: TC4: All bits in key and IV are set.",
+		key: []byte{
+			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		},
+		iv: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+		stream: []byte{
+			0xd9, 0xbf, 0x3f, 0x6b, 0xce, 0x6e, 0xd0, 0xb5,
+			0x42, 0x54, 0x55, 0x77, 0x67, 0xfb, 0x57, 0x44,
+			0x3d, 0xd4, 0x77, 0x89, 0x11, 0xb6, 0x06, 0x05,
+			0x5c, 0x39, 0xcc, 0x25, 0xe6, 0x74, 0xb8, 0x36,
+			0x3f, 0xea, 0xbc, 0x57, 0xfd, 0xe5, 0x4f, 0x79,
+			0x0c, 0x52, 0xc8, 0xae, 0x43, 0x24, 0x0b, 0x79,
+			0xd4, 0x90, 0x42, 0xb7, 0x77, 0xbf, 0xd6, 0xcb,
+			0x80, 0xe9, 0x31, 0x27, 0x0b, 0x7f, 0x50, 0xeb,
+			0x5b, 0xac, 0x2a, 0xcd, 0x86, 0xa8, 0x36, 0xc5,
+			0xdc, 0x98, 0xc1, 0x16, 0xc1, 0x21, 0x7e, 0xc3,
+			0x1d, 0x3a, 0x63, 0xa9, 0x45, 0x13, 0x19, 0xf0,
+			0x97, 0xf3, 0xb4, 0xd6, 0xda, 0xb0, 0x77, 0x87,
+			0x19, 0x47, 0x7d, 0x24, 0xd2, 0x4b, 0x40, 0x3a,
+			0x12, 0x24, 0x1d, 0x7c, 0xca, 0x06, 0x4f, 0x79,
+			0x0f, 0x1d, 0x51, 0xcc, 0xaf, 0xf6, 0xb1, 0x66,
+			0x7d, 0x4b, 0xbc, 0xa1, 0x95, 0x8c, 0x43, 0x06,
+		},
+	},
+	{
+		name: "IETF Draft: TC5: Every even bit set in key and IV.",
+		key: []byte{
+			0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+			0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+			0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+			0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		},
+		iv: []byte{0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55},
+		stream: []byte{
+			0xbe, 0xa9, 0x41, 0x1a, 0xa4, 0x53, 0xc5, 0x43,
+			0x4a, 0x5a, 0xe8, 0xc9, 0x28, 0x62, 0xf5, 0x64,
+			0x39, 0x68, 0x55, 0xa9, 0xea, 0x6e, 0x22, 0xd6,
+			0xd3, 0xb5, 0x0a, 0xe1, 0xb3, 0x66, 0x33, 0x11,
+			0xa4, 0xa3, 0x60, 0x6c, 0x67, 0x1d, 0x60, 0x5c,
+			0xe1, 0x6c, 0x3a, 0xec, 0xe8, 0xe6, 0x1e, 0xa1,
+			0x45, 0xc5, 0x97, 0x75, 0x01, 0x7b, 0xee, 0x2f,
+			0xa6, 0xf8, 0x8a, 0xfc, 0x75, 0x80, 0x69, 0xf7,
+			0xe0, 0xb8, 0xf6, 0x76, 0xe6, 0x44, 0x21, 0x6f,
+			0x4d, 0x2a, 0x34, 0x22, 0xd7, 0xfa, 0x36, 0xc6,
+			0xc4, 0x93, 0x1a, 0xca, 0x95, 0x0e, 0x9d, 0xa4,
+			0x27, 0x88, 0xe6, 0xd0, 0xb6, 0xd1, 0xcd, 0x83,
+			0x8e, 0xf6, 0x52, 0xe9, 0x7b, 0x14, 0x5b, 0x14,
+			0x87, 0x1e, 0xae, 0x6c, 0x68, 0x04, 0xc7, 0x00,
+			0x4d, 0xb5, 0xac, 0x2f, 0xce, 0x4c, 0x68, 0xc7,
+			0x26, 0xd0, 0x04, 0xb1, 0x0f, 0xca, 0xba, 0x86,
+		},
+	},
+	{
+		name: "IETF Draft: TC6: Every odd bit set in key and IV.",
+		key: []byte{
+			0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+			0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+			0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+			0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+		},
+		iv: []byte{0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa},
+		stream: []byte{
+			0x9a, 0xa2, 0xa9, 0xf6, 0x56, 0xef, 0xde, 0x5a,
+			0xa7, 0x59, 0x1c, 0x5f, 0xed, 0x4b, 0x35, 0xae,
+			0xa2, 0x89, 0x5d, 0xec, 0x7c, 0xb4, 0x54, 0x3b,
+			0x9e, 0x9f, 0x21, 0xf5, 0xe7, 0xbc, 0xbc, 0xf3,
+			0xc4, 0x3c, 0x74, 0x8a, 0x97, 0x08, 0x88, 0xf8,
+			0x24, 0x83, 0x93, 0xa0, 0x9d, 0x43, 0xe0, 0xb7,
+			0xe1, 0x64, 0xbc, 0x4d, 0x0b, 0x0f, 0xb2, 0x40,
+			0xa2, 0xd7, 0x21, 0x15, 0xc4, 0x80, 0x89, 0x06,
+			0x72, 0x18, 0x44, 0x89, 0x44, 0x05, 0x45, 0xd0,
+			0x21, 0xd9, 0x7e, 0xf6, 0xb6, 0x93, 0xdf, 0xe5,
+			0xb2, 0xc1, 0x32, 0xd4, 0x7e, 0x6f, 0x04, 0x1c,
+			0x90, 0x63, 0x65, 0x1f, 0x96, 0xb6, 0x23, 0xe6,
+			0x2a, 0x11, 0x99, 0x9a, 0x23, 0xb6, 0xf7, 0xc4,
+			0x61, 0xb2, 0x15, 0x30, 0x26, 0xad, 0x5e, 0x86,
+			0x6a, 0x2e, 0x59, 0x7e, 0xd0, 0x7b, 0x84, 0x01,
+			0xde, 0xc6, 0x3a, 0x09, 0x34, 0xc6, 0xb2, 0xa9,
+		},
+	},
+	{
+		name: "IETF Draft: TC7: Sequence patterns in key and IV.",
+		key: []byte{
+			0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+			0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+			0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88,
+			0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00,
+		},
+		iv: []byte{0x0f, 0x1e, 0x2d, 0x3c, 0x4b, 0x5a, 0x69, 0x78},
+		stream: []byte{
+			0x9f, 0xad, 0xf4, 0x09, 0xc0, 0x08, 0x11, 0xd0,
+			0x04, 0x31, 0xd6, 0x7e, 0xfb, 0xd8, 0x8f, 0xba,
+			0x59, 0x21, 0x8d, 0x5d, 0x67, 0x08, 0xb1, 0xd6,
+			0x85, 0x86, 0x3f, 0xab, 0xbb, 0x0e, 0x96, 0x1e,
+			0xea, 0x48, 0x0f, 0xd6, 0xfb, 0x53, 0x2b, 0xfd,
+			0x49, 0x4b, 0x21, 0x51, 0x01, 0x50, 0x57, 0x42,
+			0x3a, 0xb6, 0x0a, 0x63, 0xfe, 0x4f, 0x55, 0xf7,
+			0xa2, 0x12, 0xe2, 0x16, 0x7c, 0xca, 0xb9, 0x31,
+			0xfb, 0xfd, 0x29, 0xcf, 0x7b, 0xc1, 0xd2, 0x79,
+			0xed, 0xdf, 0x25, 0xdd, 0x31, 0x6b, 0xb8, 0x84,
+			0x3d, 0x6e, 0xde, 0xe0, 0xbd, 0x1e, 0xf1, 0x21,
+			0xd1, 0x2f, 0xa1, 0x7c, 0xbc, 0x2c, 0x57, 0x4c,
+			0xcc, 0xab, 0x5e, 0x27, 0x51, 0x67, 0xb0, 0x8b,
+			0xd6, 0x86, 0xf8, 0xa0, 0x9d, 0xf8, 0x7e, 0xc3,
+			0xff, 0xb3, 0x53, 0x61, 0xb9, 0x4e, 0xbf, 0xa1,
+			0x3f, 0xec, 0x0e, 0x48, 0x89, 0xd1, 0x8d, 0xa5,
+		},
+	},
+	{
+		name: "IETF Draft: TC8: key: 'All your base are belong to us!', IV: 'IETF2013'",
+		key: []byte{
+			0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
+			0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
+			0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
+			0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d,
+		},
+		iv: []byte{0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21},
+		stream: []byte{
+			0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
+			0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
+			0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
+			0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
+			0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
+			0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
+			0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
+			0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
+			0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
+			0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
+			0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
+			0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
+			0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
+			0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
+			0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
+			0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33, 0x32,
+		},
+	},
+	{
+		name: "XChaCha20 Test",
+		key: []byte{
+			0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4,
+			0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7,
+			0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2,
+			0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89,
+		},
+		iv: []byte{
+			0x69, 0x69, 0x6e, 0xe9, 0x55, 0xb6, 0x2b, 0x73,
+			0xcd, 0x62, 0xbd, 0xa8, 0x75, 0xfc, 0x73, 0xd6,
+			0x82, 0x19, 0xe0, 0x03, 0x6b, 0x7a, 0x0b, 0x37,
+		},
+		stream: []byte{
+			0x4f, 0xeb, 0xf2, 0xfe, 0x4b, 0x35, 0x9c, 0x50,
+			0x8d, 0xc5, 0xe8, 0xb5, 0x98, 0x0c, 0x88, 0xe3,
+			0x89, 0x46, 0xd8, 0xf1, 0x8f, 0x31, 0x34, 0x65,
+			0xc8, 0x62, 0xa0, 0x87, 0x82, 0x64, 0x82, 0x48,
+			0x01, 0x8d, 0xac, 0xdc, 0xb9, 0x04, 0x17, 0x88,
+			0x53, 0xa4, 0x6d, 0xca, 0x3a, 0x0e, 0xaa, 0xee,
+			0x74, 0x7c, 0xba, 0x97, 0x43, 0x4e, 0xaf, 0xfa,
+			0xd5, 0x8f, 0xea, 0x82, 0x22, 0x04, 0x7e, 0x0d,
+			0xe6, 0xc3, 0xa6, 0x77, 0x51, 0x06, 0xe0, 0x33,
+			0x1a, 0xd7, 0x14, 0xd2, 0xf2, 0x7a, 0x55, 0x64,
+			0x13, 0x40, 0xa1, 0xf1, 0xdd, 0x9f, 0x94, 0x53,
+			0x2e, 0x68, 0xcb, 0x24, 0x1c, 0xbd, 0xd1, 0x50,
+			0x97, 0x0d, 0x14, 0xe0, 0x5c, 0x5b, 0x17, 0x31,
+			0x93, 0xfb, 0x14, 0xf5, 0x1c, 0x41, 0xf3, 0x93,
+			0x83, 0x5b, 0xf7, 0xf4, 0x16, 0xa7, 0xe0, 0xbb,
+			0xa8, 0x1f, 0xfb, 0x8b, 0x13, 0xaf, 0x0e, 0x21,
+			0x69, 0x1d, 0x7e, 0xce, 0xc9, 0x3b, 0x75, 0xe6,
+			0xe4, 0x18, 0x3a,
+		},
+	},
+}
+
+func TestChaCha20(t *testing.T) {
+	for _, v := range draftTestVectors {
+		c, err := NewCipher(v.key, v.iv)
+		if err != nil {
+			t.Errorf("[%s]: NewCipher(k, iv) returned: %s", v.name, err)
+			continue
+		}
+		out := make([]byte, len(v.stream))
+		c.XORKeyStream(out, out)
+		if !bytes.Equal(out, v.stream) {
+			t.Errorf("[%s]: out != stream (%x != %x)", v.name, out, v.stream)
+		}
+	}
+}
+
+func TestChaCha20Vectorized(t *testing.T) {
+	if !usingVectors {
+		t.Skip("vectorized ChaCha20 support not compiled in")
+	}
+
+	// Save the batch blocks processing routine so we can mess with it, and
+	// restore it when we're done.
+	oldBlocksFn := blocksFn
+	defer func() {
+		blocksFn = oldBlocksFn
+	}()
+
+	// Generate a random key, nonce and input.
+	var key [KeySize]byte
+	var nonce [NonceSize]byte
+	var input [1024 * 1024]byte
+	rand.Read(key[:])
+	rand.Read(nonce[:])
+	rand.Read(input[:])
+
+	// Encrypt with the vectorized implementation.
+	c, err := NewCipher(key[:], nonce[:])
+	if err != nil {
+		t.Fatal(err)
+	}
+	var vecOut [1024 * 1024]byte
+	c.XORKeyStream(vecOut[:], input[:])
+
+	c, err = NewCipher(key[:], nonce[:])
+	if err != nil {
+		t.Fatal(err)
+	}
+	var refOut [1024 * 1024]byte
+	blocksFn = blocksRef
+	c.XORKeyStream(refOut[:], input[:])
+	if !bytes.Equal(refOut[:], vecOut[:]) {
+		for i, v := range refOut {
+			if vecOut[i] != v {
+				t.Errorf("mismatch at offset: %d", i)
+				break
+			}
+		}
+		t.Errorf("refOut != vecOut")
+	}
+}
+
+func doBenchN(b *testing.B, n int) {
+	var key [KeySize]byte
+	var nonce [NonceSize]byte
+	s := make([]byte, n)
+	c, err := NewCipher(key[:], nonce[:])
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(n))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		c.XORKeyStream(s, s)
+	}
+}
+
+func BenchmarkChaCha20_16(b *testing.B) {
+	doBenchN(b, 16)
+}
+
+func BenchmarkChaCha20_1k(b *testing.B) {
+	doBenchN(b, 1024)
+}
+
+func BenchmarkChaCha20_64k(b *testing.B) {
+	doBenchN(b, 65536)
+}
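
The vectors and benchmarks above run with the standard Go tooling, e.g.:

	go test -v -bench .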