
X25519: Add a 64 bit optimized implementation.

Based on curve25519-donna-64bit.h by Andrew Moon.  One day LDC, at least,
will have ucent implemented and all the LLVM IR can go away, but this
works fine until then.
Yawning Angel, 3 years ago
commit bf0c5c62b4

benchmarks/bench.d (+13 -5)

@@ -8,7 +8,7 @@
 
 import core.memory : GC;
 import core.time : Duration, MonoTime, hnsecs, seconds, to;
-import std.stdio : writeln, writefln;
+import std.stdio : write, writeln, writefln;
 import deuterium.random : getrandom;
 
 private {
@@ -108,19 +108,27 @@ private void benchSipHash24() {
 }
 
 private void benchX25519() {
-    import deuterium.ecc.x25519 : x25519, g_keySize;
+    import deuterium.ecc.x25519 : scalarBaseMult, scalarMult, g_keySize;
 
     writeln("X25519:");
     ubyte[g_keySize] key, bp, ss;
     bp[0] = 9;
     key.getrandom();
 
-    void oneIter() {
-        x25519(ss, key, bp);
+    void oneScalarMultIter() {
+        scalarMult(ss, key, bp);
     }
+    auto elapsed = benchmark(&oneScalarMultIter);
+    write(" scalarMult\t");
+    printTimedResult(elapsed);
 
-    auto elapsed = benchmark(&oneIter);
+    void oneScalarBaseMultIter() {
+        scalarBaseMult(ss, key);
+    }
+    elapsed = benchmark(&oneScalarBaseMultIter);
+    write(" scalarBaseMult\t");
     printTimedResult(elapsed);
+
     writeln();
 }
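
The benchmark now exercises both new public entry points. For reference, a
typical key-exchange round trip through this API would look something like the
sketch below (it uses only the deuterium modules imported above; the function
name keyExchangeExample is illustrative, not part of the commit):

    import deuterium.ecc.x25519 : scalarBaseMult, scalarMult, g_keySize;
    import deuterium.random : getrandom;

    void keyExchangeExample() {
        // Each party generates a random secret scalar ...
        ubyte[g_keySize] aliceSecret, bobSecret;
        aliceSecret.getrandom();
        bobSecret.getrandom();

        // ... and derives the public u-coordinate to send to the peer.
        ubyte[g_keySize] alicePublic, bobPublic;
        scalarBaseMult(alicePublic, aliceSecret);
        scalarBaseMult(bobPublic, bobSecret);

        // Both sides compute the same shared secret from the peer's public key.
        ubyte[g_keySize] aliceShared, bobShared;
        scalarMult(aliceShared, aliceSecret, bobPublic);
        scalarMult(bobShared, bobSecret, alicePublic);
        assert(aliceShared == bobShared);
    }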
 

source/deuterium/ecc/x25519.d (+290 -39)

@@ -20,7 +20,7 @@ module deuterium.ecc.x25519;
 import deuterium.memory : explicitBzero;
 
 import std.bitmanip : peek;
-import std.stdint : int32_t, uint32_t, uint64_t;
+import std.stdint : int32_t, int64_t, uint32_t, uint64_t;
 import std.system : Endian;
 
 immutable {
@@ -30,21 +30,19 @@ immutable {
     private {
         uint32_t reduce_mask_26 = (1 << 26) - 1;
         uint32_t reduce_mask_25 = (1 << 25) - 1;
+
+        uint64_t reduce_mask_51 = (1UL << 51) - 1;
+        // uint64_t reduce_mask_52 = (1UL << 52) - 1;
+        uint64_t two54m152 = (1UL << 54) - 152;
+        uint64_t two54m8 = (1UL << 54) - 8;
+
+        ubyte[g_keySize] g_basepoint = [
+            9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
     }
 }
 
-/**
- * The templated function abstracting away the underlying scalar multiply
- * implementation.
- *
- * Most users should use the `x25519` alias instead.
- *
- * Params:
- *   mypublic  = The resulting u-coordinate.
- *   secret    = The scalar.
- *   basepoint = The u-coordinate.
- */
-void scalarmult(T)(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] secret, in ref ubyte[g_keySize] basepoint) pure nothrow @nogc @safe {
+private void scalarMultImpl(T)(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] secret, in ref ubyte[g_keySize] basepoint) pure nothrow @nogc @safe {
     ubyte[g_keySize] e;
     scope(exit) explicitBzero(e);
 
@@ -52,10 +50,10 @@ void scalarmult(T)(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] secret
     e[0] &= 0xf8;
     e[31] &= 0x7f;
     e[31] |= 0x40;
-    scalarmult_donna!T(mypublic, e, basepoint);
+    scalarMultDonna!T(mypublic, e, basepoint);
 }
 
-private void scalarmult_donna(T)(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] n, in ref ubyte[g_keySize] basepoint) pure nothrow @nogc @safe {
+private void scalarMultDonna(T)(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] n, in ref ubyte[g_keySize] basepoint) pure nothrow @nogc @safe {
     T nqpqx, nqpqz, nqz;
     T qx, qpqx, qqx, zzz, zmone;
 
@@ -470,6 +468,7 @@ package struct BignumImpl32 {
         this[9] = r9;
     }
 
+    // this = a ^ (2 ^ count)
     void squareTimes(in ref BignumImpl32 a, int count) pure nothrow @nogc @safe {
         uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
         uint32_t d6, d7, d8, d9;
@@ -578,42 +577,294 @@ package struct BignumImpl32 {
     uint32_t[10] m_limb;
 }
 
-/// X25519 scalar multiply.
-alias x25519 = scalarmult!BignumImpl32;
+version (LDC) {
+    // The 64 bit version should work on anything, as long as the appropriate
+    // compiler features are present, but on systems that lack a fast
+    // 64 bit * 64 bit = 128 bit multiply, benchmarking is required to see
+    // if it is worth enabling over the 32 bit code.
+
+    version (X86_64) {
+        version = useLLVM_IR;
+        version = bignum25519_64;
+    }
+
+    version (useLLVM_IR) {
+        pure:
+        nothrow:
+        @nogc:
+        @safe:
+        pragma(LDC_inline_ir) R inlineIR(string s, R, P...)(P);
+    }
+}
+
+version (bignum25519_64) {
+    package struct BignumImpl64 {
+        // this = rhs
+        this(in ref BignumImpl64 rhs) pure nothrow @nogc @safe {
+            m_limb[] = rhs.m_limb;
+        }
+
+        // this = expand(rhs), where rhs is a little-endian 32-byte number.
+        this(in ref ubyte[32] rhs) pure nothrow @nogc @trusted {
+            auto r = rhs[0 .. $];
+            auto x0 = peek!(uint64_t, Endian.littleEndian)(r, 0);
+            auto x1 = peek!(uint64_t, Endian.littleEndian)(r, 8);
+            auto x2 = peek!(uint64_t, Endian.littleEndian)(r, 16);
+            auto x3 = peek!(uint64_t, Endian.littleEndian)(r, 24);
+
+            this[0] = x0 & reduce_mask_51; x0 = (x0 >> 51) | (x1 << 13);
+            this[1] = x0 & reduce_mask_51; x1 = (x1 >> 38) | (x2 << 26);
+            this[2] = x1 & reduce_mask_51; x2 = (x2 >> 25) | (x3 << 39);
+            this[3] = x2 & reduce_mask_51; x3 = (x3 >> 12);
+            this[4] = x3 & reduce_mask_51; // ignore the top bit
+        }
+
+        // lhs = contract(this), where this is a fully reduced polynomial form
+        // number, and the output is a little-endian 32-byte number.
+        void contract(ref ubyte[32] lhs) const pure nothrow @nogc @safe {
+            uint64_t[5] t;
+
+            t[0] = this[0];
+            t[1] = this[1];
+            t[2] = this[2];
+            t[3] = this[3];
+            t[4] = this[4];
+
+            enum contractCarry = `
+                t[1] += t[0] >> 51; t[0] &= reduce_mask_51;
+                t[2] += t[1] >> 51; t[1] &= reduce_mask_51;
+                t[3] += t[2] >> 51; t[2] &= reduce_mask_51;
+                t[4] += t[3] >> 51; t[3] &= reduce_mask_51;`;
+            enum contractCarryFull = contractCarry ~ `
+                t[0] += 19 * (t[4] >> 51); t[4] &= reduce_mask_51;`;
+            enum contractCarryFinal = contractCarry ~ `
+                t[4] &= reduce_mask_51;`;
+
+            mixin(contractCarryFull);
+            mixin(contractCarryFull);
+
+            // now t is between 0 and 2^255-1, properly carried.
+            // case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1.
+            t[0] += 19;
+            mixin(contractCarryFull);
+
+            // now between 19 and 2^255-1 in both cases, and offset by 19.
+            t[0] += 0x8000000000000 - 19;
+            t[1] += 0x8000000000000 - 1;
+            t[2] += 0x8000000000000 - 1;
+            t[3] += 0x8000000000000 - 1;
+            t[4] += 0x8000000000000 - 1;
+
+            // now between 2^255 and 2^256-20, and offset by 2^255.
+            mixin(contractCarryFinal);
+
+            size_t idx = 0;
+            void write51(size_t n) {
+                pragma(inline, true);
+
+                immutable size_t shift = 13 * n;
+                uint64_t f = ((t[n] >> shift) | (t[n+1] << (51 - shift)));
+                for (size_t i = 0; i < 8; i++, f >>= 8) {
+                    lhs[idx] = cast(ubyte)f;
+                    idx++;
+                }
+            }
+
+            write51(0);
+            write51(1);
+            write51(2);
+            write51(3);
+        }
+
+        // this = a + b
+        void add(in ref BignumImpl64 a, in ref BignumImpl64 b) pure nothrow @nogc @safe {
+            this[0] = a[0] + b[0];
+            this[1] = a[1] + b[1];
+            this[2] = a[2] + b[2];
+            this[3] = a[3] + b[3];
+            this[4] = a[4] + b[4];
+        }
+
+        // this = a - b
+        void sub(in ref BignumImpl64 a, in ref BignumImpl64 b) pure nothrow @nogc @safe {
+            this[0] = a[0] + two54m152 - b[0];
+            this[1] = a[1] + two54m8 - b[1];
+            this[2] = a[2] + two54m8 - b[2];
+            this[3] = a[3] + two54m8 - b[3];
+            this[4] = a[4] + two54m8 - b[4];
+        }
+
+        // this = rhs * scalar
+        void scalarProduct(in ref BignumImpl64 rhs, immutable uint64_t scalar) pure nothrow @nogc @safe {
+            version (useLLVM_IR) {
+                enum scalarProductIR = import("curve25519_donna64_scalarProduct.ll");
+                inlineIR!(scalarProductIR, void)(&rhs.m_limb[0], &this.m_limb[0], scalar);
+            }
+        }
+
+        // this = a * b
+        void mul(in ref BignumImpl64 a, in ref BignumImpl64 b) pure nothrow @nogc @safe {
+            version (useLLVM_IR) {
+                enum mulIR = import("curve25519_donna64_mul.ll");
+                inlineIR!(mulIR, void)(&a.m_limb[0], &b.m_limb[0], &this.m_limb[0]);
+            }
+        }
+
+        // this = a * a
+        void square(in ref BignumImpl64 a) pure nothrow @nogc @trusted {
+            uint64_t r0, r1, r2, r3, r4;
+
+            r0 = a[0];
+            r1 = a[1];
+            r2 = a[2];
+            r3 = a[3];
+            r4 = a[4];
+
+            version (useLLVM_IR) {
+                enum squareIR = import("curve25519_donna64_square.ll");
+                inlineIR!(squareIR, void)(&r0, &r1, &r2, &r3, &r4);
+            }
+
+            this[0] = r0;
+            this[1] = r1;
+            this[2] = r2;
+            this[3] = r3;
+            this[4] = r4;
+        }
+
+        // this = a ^ (2 ^ count)
+        void squareTimes(in ref BignumImpl64 a, int count) pure nothrow @nogc @trusted {
+            uint64_t r0, r1, r2, r3, r4;
+
+            r0 = a[0];
+            r1 = a[1];
+            r2 = a[2];
+            r3 = a[3];
+            r4 = a[4];
+
+            do {
+                version (useLLVM_IR) {
+                    enum squareIR = import("curve25519_donna64_square.ll");
+                    inlineIR!(squareIR, void)(&r0, &r1, &r2, &r3, &r4);
+                }
+            } while (--count);
+
+            this[0] = r0;
+            this[1] = r1;
+            this[2] = r2;
+            this[3] = r3;
+            this[4] = r4;
+        }
+
+        // this, x = x, this iff iswap != 0.
+        void swapConditional(ref BignumImpl64 x, immutable uint64_t iswap) pure nothrow @nogc @safe {
+            immutable uint64_t swap = cast(uint64_t)(-cast(int64_t)iswap);
+            uint64_t x0, x1, x2, x3, x4;
+
+            x0 = swap & (this[0] ^ x[0]); this[0] ^= x0; x[0] ^= x0;
+            x1 = swap & (this[1] ^ x[1]); this[1] ^= x1; x[1] ^= x1;
+            x2 = swap & (this[2] ^ x[2]); this[2] ^= x2; x[2] ^= x2;
+            x3 = swap & (this[3] ^ x[3]); this[3] ^= x3; x[3] ^= x3;
+            x4 = swap & (this[4] ^ x[4]); this[4] ^= x4; x[4] ^= x4;
+        }
+
+        uint64_t opIndex(size_t i) const pure nothrow @nogc @safe {
+            pragma(inline, true);
+            return m_limb[i];
+        }
+
+        uint64_t opIndexAssign(in uint64_t value, size_t i) pure nothrow @nogc @safe {
+            m_limb[i] = value;
+            return value;
+        }
+
+        uint64_t opIndexOpAssign(string op)(uint64_t value, size_t i) pure nothrow @nogc @safe {
+            return opIndexAssign(mixin(`opIndex(i)` ~ op ~ `value`), i);
+        }
+
+        uint64_t[5] m_limb;
+    }
+}
+
+/**
+ * The X25519 scalar multiply.
+ *
+ * Params:
+ *   mypublic  = The resulting u-coordinate.
+ *   secret    = The scalar.
+ *   basepoint = The u-coordinate.
+ */
+void scalarMult(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] secret, in ref ubyte[g_keySize] basepoint) pure nothrow @nogc @safe {
+    version (bignum25519_64) {
+        scalarMultImpl!BignumImpl64(mypublic, secret, basepoint);
+    } else {
+        scalarMultImpl!BignumImpl32(mypublic, secret, basepoint);
+    }
+}
+
+/**
+ * The X25519 scalar basepoint multiply.
+ *
+ * Params:
+ *  mypublic = The resulting u-coordinate.
+ *  secret   = The scalar.
+ */
+void scalarBaseMult(ref ubyte[g_keySize] mypublic, in ref ubyte[g_keySize] secret) pure nothrow @nogc @safe {
+    scalarMult(mypublic, secret, g_basepoint);
+}
 
 @system unittest {
     import deuterium.hex : decode;
     import deuterium.memory : ctIsEqual;
-
-    ubyte[32] newU, k, u;
-
-    k[0] = 9;
-    u[0] = 9;
+    import deuterium.random : getrandom;
 
     // The 1 million invocation test takes too long to be part of my workflow,
-    // especially when this is built with DMD.
+    // especially when this is built with DMD, but it does pass with both LDC
+    // and DMD.
     auto expected1 = decode("422c8e7a6227d7bca1350b3e2bb7279f7897b87bb6854b783c60e80311ae3079");
     auto expected1k = decode("684cf59ba83309552800ef566f2f4d3c1c3887c49360e3875f2eb94d99532c51");
     // auto expected1m = decode("7c3911e0ab2586fd864497297e575e6f3bc601c0883c30df5f4dd2d24f665424");
 
-    foreach (size_t i; 0 .. 1000 /* 000 */) {
-        scalarmult!BignumImpl32(newU, k, u);
-        u[] = k;
-        k[] = newU;
-        switch (i) {
-        case 0:
-            assert(ctIsEqual(newU, expected1), "Mismatch at 1 iter");
-            break;
-        case 999:
-            assert(ctIsEqual(newU, expected1k), "Mismatch at 1k iter");
-            break;
+    void iterativeKAT(T)() {
+        ubyte[g_keySize] newU, k, u;
+
+        k[0] = 9;
+        u[0] = 9;
+
+        foreach (size_t i; 0 .. 1000 /+ 000 +/) {
+            scalarMultImpl!T(newU, k, u);
+            u[] = k;
+            k[] = newU;
+            switch (i) {
+            case 0:
+                assert(ctIsEqual(newU, expected1), "Mismatch at 1 iter");
+                break;
+            case 999:
+                assert(ctIsEqual(newU, expected1k), "Mismatch at 1k iter");
+                break;
     /+
-        case 999999:
-            assert(ctIsEqual(newU, expected1m), "Mismatch at 1m iter");
-            break;
+            case 999999:
+                assert(ctIsEqual(newU, expected1m), "Mismatch at 1m iter");
+                break;
      +/
-        default:
-            break;
+            default:
+                break;
+            }
         }
     }
+
+    iterativeKAT!BignumImpl32();
+    version (bignum25519_64) {
+        iterativeKAT!BignumImpl64();
+    }
+
+    // Sanity check the scalar basepoint multiply.
+    ubyte[g_keySize] s, p, pCheck;
+
+    getrandom(s);
+    scalarBaseMult(p, s);
+    scalarMult(pCheck, s, g_basepoint);
+
+    assert(ctIsEqual(p, pCheck), "scalarBaseMult/scalarMult mismatch");
 }

views/curve25519_donna64_mul.ll (+162 -0)

@@ -0,0 +1,162 @@
+; curve25519_donna64_mul.ll
+;
+; A straightforward port of the curve25519_mul routine to LLVM IR.
+;
+; Inputs:
+;  %0 i64* a
+;  %1 i64* b
+;  %2 i64* out
+;
+; Constants:
+;  (1 << 51) - 1      = 2251799813685247
+
+%a0addr = getelementptr i64, i64* %0, i64 0
+%a1addr = getelementptr i64, i64* %0, i64 1
+%a2addr = getelementptr i64, i64* %0, i64 2
+%a3addr = getelementptr i64, i64* %0, i64 3
+%a4addr = getelementptr i64, i64* %0, i64 4
+
+%b0addr = getelementptr i64, i64* %1, i64 0
+%b1addr = getelementptr i64, i64* %1, i64 1
+%b2addr = getelementptr i64, i64* %1, i64 2
+%b3addr = getelementptr i64, i64* %1, i64 3
+%b4addr = getelementptr i64, i64* %1, i64 4
+
+%4 = load i64, i64* %b0addr
+%5 = load i64, i64* %b1addr
+%6 = load i64, i64* %b2addr
+%7 = load i64, i64* %b3addr
+%8 = load i64, i64* %b4addr
+
+%r0_0 = zext i64 %4 to i128
+%r1_0 = zext i64 %5 to i128
+%r2_0 = zext i64 %6 to i128
+%r3_0 = zext i64 %7 to i128
+%r4_0 = zext i64 %8 to i128
+
+%9 = load i64, i64* %a0addr
+%10 = load i64, i64* %a1addr
+%11 = load i64, i64* %a2addr
+%12 = load i64, i64* %a3addr
+%13 = load i64, i64* %a4addr
+
+%s0 = zext i64 %9 to i128
+%s1 = zext i64 %10 to i128
+%s2 = zext i64 %11 to i128
+%s3 = zext i64 %12 to i128
+%s4 = zext i64 %13 to i128
+
+%14 = mul nuw i128 %r0_0, %s0 ; t[0] = r0 * s0  (t[0])
+
+%15 = mul nuw i128 %r0_0, %s1 ; t[1] = r0 * s1
+%16 = mul nuw i128 %r1_0, %s0 ;      + r1 * s0
+%17 = add nuw i128 %15, %16   ;                 (t[1])
+
+%18 = mul nuw i128 %r0_0, %s2 ; t[2] = r0 * s2
+%19 = mul nuw i128 %r2_0, %s0 ;      + r2 * s0
+%20 = add nuw i128 %18, %19   ;
+%21 = mul nuw i128 %r1_0, %s1 ;      + r1 * s1
+%22 = add nuw i128 %20, %21   ;                 (t[2])
+
+%23 = mul nuw i128 %r0_0, %s3 ; t[3] = r0 * s3
+%24 = mul nuw i128 %r3_0, %s0 ;      + r3 * s0
+%25 = add nuw i128 %23, %24   ;
+%26 = mul nuw i128 %r1_0, %s2 ;      + r1 * s2
+%27 = add nuw i128 %25, %26   ;
+%28 = mul nuw i128 %r2_0, %s1 ;      + r2 * s1
+%29 = add nuw i128 %27, %28   ;                 (t[3])
+
+%30 = mul nuw i128 %r0_0, %s4 ; t[4] = r0 * s4
+%31 = mul nuw i128 %r4_0, %s0 ;      + r4 * s0
+%32 = add nuw i128 %30, %31   ;
+%33 = mul nuw i128 %r3_0, %s1 ;      + r3 * s1
+%34 = add nuw i128 %32, %33   ;
+%35 = mul nuw i128 %r1_0, %s3 ;      + r1 * s3
+%36 = add nuw i128 %34, %35   ;
+%37 = mul nuw i128 %r2_0, %s2 ;      + r2 * s2
+%38 = add nuw i128 %36, %37   ;                 (t[4])
+
+%39 = mul nuw i64 %5, 19      ; r1 *= 19
+%40 = mul nuw i64 %6, 19      ; r2 *= 19
+%41 = mul nuw i64 %7, 19      ; r3 *= 19
+%42 = mul nuw i64 %8, 19      ; r4 *= 19
+
+%r1_1 = zext i64 %39 to i128
+%r2_1 = zext i64 %40 to i128
+%r3_1 = zext i64 %41 to i128
+%r4_1 = zext i64 %42 to i128
+
+%43 = mul nuw i128 %r4_1, %s1 ; t[0] += r4 * s1
+%44 = add nuw i128 %14, %43   ;
+%45 = mul nuw i128 %r1_1, %s4 ;       + r1 * s4
+%46 = add nuw i128 %44, %45   ;
+%47 = mul nuw i128 %r2_1, %s3 ;       + r2 * s3
+%48 = add nuw i128 %46, %47   ;
+%49 = mul nuw i128 %r3_1, %s2 ;       + r3 * s2
+%t0 = add nuw i128 %48, %49   ;                 (t[0], named)
+
+%50 = mul nuw i128 %r4_1, %s2 ; t[1] += r4 * s2
+%51 = add nuw i128 %17, %50   ;
+%52 = mul nuw i128 %r2_1, %s4 ;       + r2 * s4
+%53 = add nuw i128 %51, %52   ;
+%54 = mul nuw i128 %r3_1, %s3 ;       + r3 * s3
+%55 = add nuw i128 %53, %54   ;                 (t[1])
+
+%56 = mul nuw i128 %r4_1, %s3 ; t[2] += r4 * s3
+%57 = add nuw i128 %22, %56   ;
+%58 = mul nuw i128 %r3_1, %s4 ;       + r3 * s4
+%59 = add nuw i128 %57, %58   ;                 (t[2])
+
+%60 = mul nuw i128 %r4_1, %s4 ; t[3] += r4 * s4
+%61 = add nuw i128 %29, %60   ;                 (t[3])
+
+%62 = trunc i128 %t0 to i64         ;
+%63 = and i64 %62, 2251799813685247 ; r0 = lo128(t[0]) & reduce_mask_51   <- r0
+%64 = lshr i128 %t0, 51             ;
+%65 = trunc i128 %64 to i64         ; shr128(c, t[0], 51)
+
+%66 = zext i64 %65 to i128          ;
+%t1 = add nuw i128 %55, %66;        ; t[1] += c
+%67 = trunc i128 %t1 to i64         ;
+%68 = and i64 %67, 2251799813685247 ; r1 = lo128(t[1]) & reduce_mask_51   <- r1
+%69 = lshr i128 %t1, 51             ;
+%70 = trunc i128 %69 to i64         ; shr128(c, t[1], 51)
+
+%71 = zext i64 %70 to i128          ;
+%t2 = add nuw i128 %59, %71         ; t[2] += c
+%72 = trunc i128 %t2 to i64         ;
+%r2 = and i64 %72, 2251799813685247 ; r2 = lo128(t[2]) & reduce_mask_51
+%73 = lshr i128 %t2, 51             ;
+%74 = trunc i128 %73 to i64         ; shr128(c, t[2], 51)
+
+%75 = zext i64 %74 to i128          ;
+%t3 = add nuw i128 %61, %75         ; t[3] += c
+%76 = trunc i128 %t3 to i64         ;
+%r3 = and i64 %76, 2251799813685247 ; r3 = lo128(t[3]) & reduce_mask_51
+%77 = lshr i128 %t3, 51             ;
+%78 = trunc i128 %77 to i64         ; shr128(c, t[3], 51)
+
+%79 = zext i64 %78 to i128          ;
+%t4 = add nuw i128 %38, %79         ; t[4] += c
+%80 = trunc i128 %t4 to i64         ;
+%r4 = and i64 %80, 2251799813685247 ; r4 = lo128(t[4]) & reduce_mask_51
+%81 = lshr i128 %t4, 51             ;
+%82 = trunc i128 %81 to i64         ; shr128(c, t[4], 51)
+
+%83 = mul nuw i64 %82, 19           ;
+%84 = add nuw i64 %63, %83          ; r0 += c * 19
+%85 = lshr i64 %84, 51              ; c = r0 >> 51
+%r0 = and i64 %84, 2251799813685247 ; r0 = r0 & reduce_mask_51
+%r1 = add nuw i64 %68, %85          ; r1 += c
+
+%out0addr = getelementptr i64, i64* %2, i64 0
+%out1addr = getelementptr i64, i64* %2, i64 1
+%out2addr = getelementptr i64, i64* %2, i64 2
+%out3addr = getelementptr i64, i64* %2, i64 3
+%out4addr = getelementptr i64, i64* %2, i64 4
+
+store i64 %r0, i64* %out0addr
+store i64 %r1, i64* %out1addr
+store i64 %r2, i64* %out2addr
+store i64 %r3, i64* %out3addr
+store i64 %r4, i64* %out4addr
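
Every multiply-by-19 in this routine comes from the shape of the prime: with
p = 2^255 - 19, anything of weight 2^255 or more can be folded back down by
multiplying by 19. A quick check of that identity (a std.bigint sketch, not
part of the commit):

    import std.bigint : BigInt;

    unittest {
        auto p = (BigInt(1) << 255) - 19;

        // 2^255 is congruent to 19 modulo p ...
        assert((BigInt(1) << 255) % p == 19);

        // ... so a term of limb weight 2^(51 * 5) = 2^255 (e.g. the r4 * s1
        // cross term) folds down to limb position 0, multiplied by 19.
        auto x = BigInt(12345);
        assert((x << 255) % p == (x * 19) % p);
    }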

views/curve25519_donna64_scalarProduct.ll (+83 -0)

@@ -0,0 +1,83 @@
+; curve25519_donna64_scalarProduct.ll
+;
+; A straightforward port of the curve25519_scalar_product routine to LLVM IR.
+;
+; Inputs:
+;  %0 i64* in
+;  %1 i64* out
+;  %2 i64 scalar
+;
+; Constants:
+;  (1 << 51) - 1      = 2251799813685247
+
+%in0addr = getelementptr i64, i64* %0, i64 0
+%in1addr = getelementptr i64, i64* %0, i64 1
+%in2addr = getelementptr i64, i64* %0, i64 2
+%in3addr = getelementptr i64, i64* %0, i64 3
+%in4addr = getelementptr i64, i64* %0, i64 4
+
+%4 = load i64, i64* %in0addr
+%5 = load i64, i64* %in1addr
+%6 = load i64, i64* %in2addr
+%7 = load i64, i64* %in3addr
+%8 = load i64, i64* %in4addr
+
+%in0 = zext i64 %4 to i128
+%in1 = zext i64 %5 to i128
+%in2 = zext i64 %6 to i128
+%in3 = zext i64 %7 to i128
+%in4 = zext i64 %8 to i128
+%scalar = zext i64 %2 to i128
+
+%9 = mul nuw i128 %in0, %scalar          ; a = ((uint128_t) in[0]) * scalar
+%10 = trunc i128 %9 to i64               ;
+%11 = and i64 %10, 2251799813685247      ; a & reduce_mask_51 (out[0], pre-carry)
+%12 = lshr i128 %9, 51                   ;
+%13 = trunc i128 %12 to i64              ; c = LO(a >> 51)
+
+%14 = mul nuw i128 %in1, %scalar         ;
+%15 = zext i64 %13 to i128               ;
+%16 = add nuw i128 %14, %15              ; a = ((uint128_t) in[1]) * scalar + c
+%17 = trunc i128 %16 to i64              ;
+%out1 = and i64 %17, 2251799813685247    ; out[1] = a & reduce_mask_51
+%18 = lshr i128 %16, 51                  ;
+%19 = trunc i128 %18 to i64              ; c = LO(a >> 51)
+
+%20 = mul nuw i128 %in2, %scalar         ;
+%21 = zext i64 %19 to i128               ;
+%22 = add nuw i128 %20, %21              ; a = ((uint128_t) in[2]) * scalar + c
+%23 = trunc i128 %22 to i64              ;
+%out2 = and i64 %23, 2251799813685247    ; out[2] = a & reduce_mask_51
+%24 = lshr i128 %22, 51                  ;
+%25 = trunc i128 %24 to i64              ; c = LO(a >> 51)
+
+%26 = mul nuw i128 %in3, %scalar         ;
+%27 = zext i64 %25 to i128               ;
+%28 = add nuw i128 %26, %27              ; a = ((uint128_t) in[3]) * scalar + c
+%29 = trunc i128 %28 to i64              ;
+%out3 = and i64 %29, 2251799813685247    ; out[3] = a & reduce_mask_51
+%30 = lshr i128 %28, 51                  ;
+%31 = trunc i128 %30 to i64              ; c = LO(a >> 51)
+
+%32 = mul nuw i128 %in4, %scalar         ;
+%33 = zext i64 %31 to i128               ;
+%34 = add nuw i128 %32, %33              ; a = ((uint128_t) in[4]) * scalar + c
+%35 = trunc i128 %34 to i64              ;
+%out4 = and i64 %35, 2251799813685247    ; out[4] = a & reduce_mask_51
+%36 = lshr i128 %34, 51                  ;
+%37 = trunc i128 %36 to i64              ; c = LO(a >> 51)
+
+%38 = mul nuw i64 %37, 19                ;
+%out0 = add nuw i64 %11, %38             ; out[0] += c * 19
+
+%out0addr = getelementptr i64, i64* %1, i64 0
+%out1addr = getelementptr i64, i64* %1, i64 1
+%out2addr = getelementptr i64, i64* %1, i64 2
+%out3addr = getelementptr i64, i64* %1, i64 3
+%out4addr = getelementptr i64, i64* %1, i64 4
+
+store i64 %out0, i64* %out0addr
+store i64 %out1, i64* %out1addr
+store i64 %out2, i64* %out2addr
+store i64 %out3, i64* %out3addr
+store i64 %out4, i64* %out4addr
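
In donna-style ladders this routine's only job is the single multiplication by
the small curve constant 121665 = (486662 - 2) / 4, so the scalar is far below
2^51 and each in[i] * scalar + c product fits comfortably in 128 bits. A
BigInt reference model of the carry chain, usable for cross-checking (a
sketch; scalarProductRef is not part of the commit):

    import std.bigint : BigInt;

    // Multiply five 51-bit limbs by a small scalar, propagating 51-bit
    // carries, then fold the final carry with 2^255 == 19 (mod p).
    ulong[5] scalarProductRef(const ulong[5] input, ulong scalar) {
        auto radix = BigInt(1) << 51;
        BigInt c = 0;
        ulong[5] r;
        foreach (i; 0 .. 5) {
            BigInt a = BigInt(input[i]) * scalar + c;
            r[i] = cast(ulong)(a % radix).toLong(); // a & reduce_mask_51
            c = a >> 51;                            // c = a >> 51
        }
        r[0] += cast(ulong)c.toLong() * 19;         // out[0] += c * 19
        return r;
    }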

views/curve25519_donna64_square.ll (+120 -0)

@@ -0,0 +1,120 @@
+; curve25519_donna64_square.ll
+;
+; A straightforward port of the curve25519_square routine to LLVM IR.  The
+; parameters are passed as pointers to allow this routine to be reused for
+; the body of the curve25519_square_times loop as well.
+;
+; Inputs:
+;  %0 i64* r0
+;  %1 i64* r1
+;  %2 i64* r2
+;  %3 i64* r3
+;  %4 i64* r4
+;
+; Constants:
+;  (1 << 51) - 1      = 2251799813685247
+
+%6 = load i64, i64* %0
+%7 = load i64, i64* %1
+%8 = load i64, i64* %2
+%9 = load i64, i64* %3
+%10 = load i64, i64* %4
+
+%r0 = zext i64 %6 to i128
+%r1 = zext i64 %7 to i128
+%r2 = zext i64 %8 to i128
+%r3 = zext i64 %9 to i128
+%r4 = zext i64 %10 to i128
+
+%11 = mul nuw i64 %6, 2       ;
+%d0 = zext i64 %11 to i128    ; d0 = r0 * 2
+
+%12 = mul nuw i64 %7, 2       ;
+%d1 = zext i64 %12 to i128    ; d1 = r1 * 2
+
+%13 = mul nuw i64 %8, 38      ;
+%d2 = zext i64 %13 to i128    ; d2 = r2 * 2 * 19
+
+%14 = mul nuw i64 %9, 19      ;
+%d319 = zext i64 %14 to i128  ; d319 = r3 * 19
+
+%15 = mul nuw i64 %10, 19     ;
+%d419 = zext i64 %15 to i128  ; d419 = r4 * 19
+
+%16 = mul nuw i64 %15, 2      ;
+%d4 = zext i64 %16 to i128    ; d4 = d419 * 2
+
+%17 = mul nuw i128 %r0, %r0   ; t[0] = r0 * r0
+%18 = mul nuw i128 %d4, %r1   ;
+%19 = add nuw i128 %17, %18   ;      + d4 * r1
+%20 = mul nuw i128 %d2, %r3   ;
+%t0 = add nuw i128 %19, %20   ;      + d2 * r3
+
+%21 = mul nuw i128 %d0, %r1   ; t[1] = d0 * r1
+%22 = mul nuw i128 %d4, %r2   ;
+%23 = add nuw i128 %21, %22   ;      + d4 * r2
+%24 = mul nuw i128 %r3, %d319 ;
+%t1 = add nuw i128 %23, %24   ;      + r3 * (r3 * 19) aka d319
+
+%25 = mul nuw i128 %d0, %r2   ; t[2] = d0 * r2
+%26 = mul nuw i128 %r1, %r1   ;
+%27 = add nuw i128 %25, %26   ;      + r1 * r1
+%28 = mul nuw i128 %d4, %r3   ;
+%t2 = add nuw i128 %27, %28   ;      + d4 * r3
+
+%29 = mul nuw i128 %d0, %r3   ; t[3] = d0 * r3
+%30 = mul nuw i128 %d1, %r2   ;
+%31 = add nuw i128 %29, %30   ;      + d1 * r2
+%32 = mul nuw i128 %r4, %d419 ;
+%t3 = add nuw i128 %31, %32   ;      + r4 * d419
+
+%33 = mul nuw i128 %d0, %r4   ; t[4] = d0 * r4
+%34 = mul nuw i128 %d1, %r3   ;
+%35 = add nuw i128 %33, %34   ;      + d1 * r3
+%36 = mul nuw i128 %r2, %r2   ;
+%t4 = add nuw i128 %35, %36   ;      + r2 * r2
+
+%37 = trunc i128 %t0 to i64          ;
+%38 = and i64 %37, 2251799813685247  ; r0 = lo128(t[0]) & reduce_mask_51   <- r0
+%39 = lshr i128 %t0, 51              ;
+%40 = trunc i128 %39 to i64          ; shr128(c, t[0], 51)
+
+%41 = zext i64 %40 to i128           ;
+%t1c = add nuw i128 %t1, %41         ; add128_64(t[1], c)
+%42 = trunc i128 %t1c to i64         ;
+%43 = and i64 %42, 2251799813685247  ; r1 = lo128(t[1]) & reduce_mask_51   <- r1
+%44 = lshr i128 %t1c, 51             ;
+%45 = trunc i128 %44 to i64          ; shr128(c, t[1], 51)
+
+%46 = zext i64 %45 to i128           ;
+%t2c = add nuw i128 %t2, %46         ; add128_64(t[2], c)
+%47 = trunc i128 %t2c to i64         ;
+%r2o = and i64 %47, 2251799813685247 ; r2 = lo128(t[2]) & reduce_mask_51
+%48 = lshr i128 %t2c, 51             ;
+%49 = trunc i128 %48 to i64          ; shr128(c, t[2], 51);
+
+%50 = zext i64 %49 to i128           ;
+%t3c = add nuw i128 %t3, %50         ; add128_64(t[3], c)
+%51 = trunc i128 %t3c to i64         ;
+%r3o = and i64 %51, 2251799813685247 ; r3 = lo128(t[3]) & reduce_mask_51
+%52 = lshr i128 %t3c, 51             ;
+%53 = trunc i128 %52 to i64          ; shr128(c, t[3], 51)
+
+%54 = zext i64 %53 to i128           ;
+%t4c = add nuw i128 %t4, %54         ; add128_64(t[4], c)
+%55 = trunc i128 %t4c to i64         ;
+%r4o = and i64 %55, 2251799813685247 ; r4 = lo128(t[4]) & reduce_mask_51
+%56 = lshr i128 %t4c, 51             ;
+%57 = trunc i128 %56 to i64          ; shr128(c, t[4], 51);
+
+%58 = mul nuw i64 %57, 19            ;
+%59 = add nuw i64 %38, %58           ; r0 += c * 19
+%60 = lshr i64 %59, 51               ; c = r0 >> 51
+%r0o = and i64 %59, 2251799813685247 ; r0 = r0 & reduce_mask_51
+%r1o = add nuw i64 %43, %60          ; r1 += c
+
+store i64 %r0o, i64* %0
+store i64 %r1o, i64* %1
+store i64 %r2o, i64* %2
+store i64 %r3o, i64* %3
+store i64 %r4o, i64* %4
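
This routine doubles as the body of the squareTimes loop above, which computes
a ^ (2 ^ count), not a ^ (2 * count): every pass squares the running value, so
the exponent doubles each iteration. In BigInt terms (a sketch, not part of
the commit):

    import std.bigint : BigInt;

    unittest {
        auto p = (BigInt(1) << 255) - 19;

        // Five squarings take the exponent from 1 to 2^5 = 32.
        BigInt v = 3;
        foreach (_; 0 .. 5)
            v = (v * v) % p;
        assert(v == BigInt(3) ^^ 32);
    }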