493 lines
11 KiB
D
493 lines
11 KiB
D
module angel.utils.cryptography.gcm.multiplier;
|
|
|
|
package:
|
|
|
|
import angel.utils.cryptography.gcm.galoisfield;
|
|
import std.algorithm: swap;
|
|
|
|
// TODO Dynamically make use of intel pclmulqdq instruction for fast multiplication.
|
|
|
|
/// test if T is a GCM multiplier
|
|
@safe
|
|
template isGCMMultiplier(T)
|
|
{
|
|
enum bool isGCMMultiplier =
|
|
is(T == struct) &&
|
|
is(typeof(
|
|
{
|
|
ubyte[16] block;
|
|
T m = void;
|
|
m.init(block);
|
|
m.multiply(block);
|
|
}));
|
|
}
|
|
|
|
/// This struct provides schoolbook multiplication in GF(2^128).
|
|
@safe
|
|
struct GCMBasicMultiplier
|
|
{
|
|
|
|
private {
|
|
ubyte[16] H;
|
|
}
|
|
|
|
this(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
init(H);
|
|
}
|
|
|
|
nothrow @nogc {
|
|
/**
|
|
* initialize the multiplicator
|
|
*/
|
|
void init(in ubyte[] H)
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
this.H[] = H[];
|
|
}
|
|
|
|
/// Multiply x by H and store result in x.
|
|
///
|
|
/// Params:
|
|
/// x = 16 byte block
|
|
void multiply(ubyte[] x)
|
|
in {
|
|
assert(x.length == 16, "x: invalid length.");
|
|
}
|
|
body {
|
|
GF128.multiply(x, H);
|
|
}
|
|
}
|
|
|
|
/// test multiplication using schoolbook multiplication
|
|
unittest {
|
|
|
|
immutable ubyte[16] testH = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
|
|
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
|
|
|
|
GCMBasicMultiplier mult = GCMBasicMultiplier(testH);
|
|
|
|
mult.multiply(X1);
|
|
|
|
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
|
|
}
|
|
|
|
}
|
|
|
|
/// This struct provides table driven multiplication in GF(2^128).
|
|
@safe
|
|
struct GCMMultiplier8kTable
|
|
{
|
|
|
|
private {
|
|
ubyte[16][16][32] M;
|
|
}
|
|
|
|
this(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
init(H);
|
|
}
|
|
|
|
nothrow @nogc {
|
|
/**
|
|
* initialize the multiplicator
|
|
*/
|
|
void init(in ubyte[] H) {
|
|
tableSetup(H);
|
|
}
|
|
|
|
/// Multiply x by H and store result in x.
|
|
///
|
|
/// Params:
|
|
/// x = 16 byte block
|
|
void multiply(ubyte[] x)
|
|
in {
|
|
assert(x.length == 16, "x: invalid length.");
|
|
}
|
|
body {
|
|
|
|
ubyte[16] z;
|
|
|
|
for(uint i = 0; i < 16; ++i) {
|
|
z[] ^= M[2*i][x[i]>>4][];
|
|
z[] ^= M[2*i+1][x[i]&0xF][];
|
|
}
|
|
|
|
x[] = z[];
|
|
}
|
|
}
|
|
|
|
/// test multiplication using 8k table
|
|
unittest {
|
|
|
|
immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
|
|
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
|
|
|
|
GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
|
|
|
|
mult.multiply(X1);
|
|
|
|
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
|
|
}
|
|
|
|
private void tableSetup(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
ubyte[16] Pi;
|
|
Pi[0] = 0x80;
|
|
ubyte[1] oneByte;
|
|
for(int i = 0; i < 32; ++i) {
|
|
for(uint j = 0; j < 16; ++j) {
|
|
M[i][j] = H;
|
|
oneByte[0] = cast(ubyte) (j<<4);
|
|
GF128.multiply(M[i][j], oneByte);
|
|
GF128.multiply(M[i][j], Pi);
|
|
}
|
|
multiplyP4(Pi);
|
|
}
|
|
}
|
|
|
|
private void multiplyP4(ubyte[] x) nothrow @nogc {
|
|
foreach(i;0..4){
|
|
GF128.multiplyP(x);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/// This class provides table driven multiplication in GF(2^128).
|
|
/// The 64k table is rather large and probably won't fit into the cache.
|
|
/// Use the 8k table to avoid timing based leaks.
|
|
@safe
|
|
struct GCMMultiplier64kTable
|
|
{
|
|
|
|
private {
|
|
ubyte[16][256][16] M;
|
|
}
|
|
|
|
this(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
init(H);
|
|
}
|
|
|
|
nothrow @nogc {
|
|
|
|
/// initialize the multiplicator
|
|
void init(in ubyte[] H) {
|
|
tableSetup(H);
|
|
}
|
|
|
|
/// Multiply x by H and store result in x.
|
|
///
|
|
/// Params:
|
|
/// x = 16 byte block
|
|
void multiply(ubyte[] x)
|
|
in {
|
|
assert(x.length == 16, "x: invalid length.");
|
|
}
|
|
body {
|
|
|
|
ubyte[16] z;
|
|
|
|
for(uint i = 0; i < 16; ++i) {
|
|
z[] ^= M[i][x[i]][];
|
|
}
|
|
|
|
x[] = z[];
|
|
}
|
|
}
|
|
|
|
/// test multiplication using 64k table
|
|
unittest {
|
|
immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
|
|
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
|
|
|
|
GCMMultiplier64kTable mult = GCMMultiplier64kTable(H);
|
|
|
|
mult.multiply(X1);
|
|
|
|
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 64k table failed!");
|
|
}
|
|
|
|
private void tableSetup(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
ubyte[16] P;
|
|
P[0] = 0x80;
|
|
ubyte[1] oneByte;
|
|
for(int i = 0; i < 16; ++i) {
|
|
for(uint j = 0; j <= 255; ++j) {
|
|
M[i][j] = H;
|
|
oneByte[0] = cast(ubyte) j;
|
|
GF128.multiply(M[i][j], oneByte);
|
|
GF128.multiply(M[i][j], P);
|
|
}
|
|
GF128.multiplyP8(P);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/// This struct provides hardware accelerated multiplication in GF(2^128)
|
|
/// using the Intel PCLMULQDQ instruction.
|
|
///
|
|
/// See: https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
|
|
@safe
|
|
struct GCMPCLMULQDQMultiplier
|
|
{
|
|
|
|
private {
|
|
ubyte[16] H;
|
|
}
|
|
|
|
this(in ubyte[] H) nothrow @nogc
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
init(H);
|
|
}
|
|
|
|
nothrow @nogc {
|
|
/**
|
|
* initialize the multiplicator
|
|
*/
|
|
void init(in ubyte[] H)
|
|
in {
|
|
assert(H.length == 16, "H: invalid length");
|
|
}
|
|
body {
|
|
this.H[] = H[];
|
|
}
|
|
|
|
/// Multiply x by H and store result in x.
|
|
///
|
|
/// Params:
|
|
/// x = 16 byte block
|
|
void multiply(ubyte[] x)
|
|
in {
|
|
assert(x.length == 16, "x: invalid length.");
|
|
}
|
|
body {
|
|
//GF128.multiply(x, H);
|
|
gfmul(x, H);
|
|
}
|
|
}
|
|
|
|
/// Multiplies a with b, result is stored in a.
|
|
@trusted
|
|
private void gfmul(ubyte[] a, in ubyte[] b) nothrow @nogc
|
|
in {
|
|
assert(a.length == 16, "Invalid length of input. Must be 16 bytes.");
|
|
assert(b.length == 16, "Invalid length of input. Must be 16 bytes.");
|
|
}
|
|
body {
|
|
auto aLength = a.length;
|
|
foreach (i; 0 .. aLength / 2) {
|
|
swap(a[i], a[aLength - 1 - i]);
|
|
}
|
|
ubyte[16] revB = b;
|
|
foreach (i; 0 .. revB.length / 2) {
|
|
auto bLen = revB.length;
|
|
swap(revB[i], revB[bLen - 1 - i]);
|
|
}
|
|
|
|
version(D_InlineAsm_X86_64) {
|
|
__vector(ubyte[16]) va = *cast(__vector(ubyte[16])*)a.ptr;
|
|
__vector(ubyte[16]) vb = *cast(__vector(ubyte[16])*)revB.ptr;
|
|
|
|
__vector(ubyte[16]) r0 = __pclmulqdq(va, vb, 0x00); // a0 * b0
|
|
__vector(ubyte[16]) r1 = __pclmulqdq(va, vb, 0x10); // a0 * b1
|
|
__vector(ubyte[16]) r2 = __pclmulqdq(va, vb, 0x01); // a1 * b0
|
|
__vector(ubyte[16]) r3 = __pclmulqdq(va, vb, 0x11); // a1 * b1
|
|
|
|
__vector(ubyte[16]) t1 = r1 ^ r2;
|
|
__vector(ubyte[16]) t2 = __shiftright(t1, 8);
|
|
__vector(ubyte[16]) t3 = __shiftleft(t1, 8);
|
|
__vector(ubyte[16]) t4 = r0 ^ t2;
|
|
__vector(ubyte[16]) t5 = r3 ^ t3;
|
|
|
|
__vector(ubyte[16]) t6 = __shiftleft(t4, 1);
|
|
__vector(ubyte[16]) t7 = __shiftleft(t5, 1);
|
|
__vector(ubyte[16]) t8 = __shiftright(t4, 31);
|
|
__vector(ubyte[16]) t9 = __shiftright(t5, 31);
|
|
__vector(ubyte[16]) t10 = __shiftleft(t8, 4);
|
|
__vector(ubyte[16]) t11 = __shiftleft(t9, 4);
|
|
__vector(ubyte[16]) t12 = __shiftright(t8, 12);
|
|
__vector(ubyte[16]) t13 = t6 ^ t10;
|
|
__vector(ubyte[16]) t14 = t7 ^ t11;
|
|
__vector(ubyte[16]) t15 = t14 ^ t12;
|
|
|
|
*cast(__vector(ubyte[16])*)a.ptr = t13 ^ t15;
|
|
}
|
|
|
|
foreach (i; 0 .. a.length / 2) {
|
|
auto aLen = cast(int)a.length;
|
|
swap(a[i], a[aLen - 1 - i]);
|
|
}
|
|
}
|
|
|
|
// test pclmulqdq instruction with multiplication by 1
|
|
@trusted
|
|
unittest {
|
|
import core.cpuid;
|
|
version(D_InlineAsm_X86_64) {
|
|
if(aes) {
|
|
|
|
ubyte[16] a = cast(const ubyte[16]) x"12345678000000000000000000000000";
|
|
ubyte[16] b = cast(const ubyte[16]) x"01000000000000000000000000000000";
|
|
ubyte[16] c;
|
|
|
|
asm {
|
|
movdqu xmm1, [RBP + a];
|
|
movdqu xmm3, [EBP + b];
|
|
|
|
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0
|
|
|
|
movdqu [EBP + c], xmm3;
|
|
}
|
|
|
|
assert(c == x"12345678000000000000000000000000");
|
|
}
|
|
}
|
|
}
|
|
|
|
/// test pclmulqdq instruction with test vectors from
|
|
/// https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
|
|
@trusted
|
|
unittest {
|
|
import core.cpuid;
|
|
|
|
version(D_InlineAsm_X86_64) {
|
|
if(aes) {
|
|
|
|
/// Python code to convert test vectors into little endian format.
|
|
/// Reverses the string by bytes (not by hexits):
|
|
///
|
|
/// import binascii
|
|
/// def conv(xmmstr):
|
|
/// bytearr=bytearray.fromhex(xmmstr)[::-1]
|
|
/// return binascii.hexlify(bytearr)
|
|
///
|
|
/// conv('7b5b54657374566563746f725d53475d')
|
|
/// conv('48692853686179295b477565726f6e5d')
|
|
/// conv('1d4d84c85c3440c0929633d5d36f0451')
|
|
///
|
|
|
|
ubyte[16] a = cast(const ubyte[16]) x"5d47535d726f74636556747365545b7b"; // xxm1 high: 7b5b546573745665 low: 63746f725d53475d
|
|
ubyte[16] b = cast(const ubyte[16]) x"5d6e6f726575475b2979616853286948"; // 4869285368617929 5b477565726f6e5d
|
|
ubyte[16] c;
|
|
|
|
asm {
|
|
movdqu xmm1, [RBP + a];
|
|
movdqu xmm3, [EBP + b];
|
|
|
|
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0
|
|
|
|
movdqu [EBP + c], xmm3;
|
|
}
|
|
assert(c == x"51046fd3d5339692c040345cc8844d1d");
|
|
|
|
asm {
|
|
movdqu xmm1, [RBP + a];
|
|
movdqu xmm3, [EBP + b];
|
|
|
|
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x01;
|
|
|
|
movdqu [EBP + c], xmm3;
|
|
}
|
|
assert(c == x"1513282aac40a57fa1b56a558d7cd11b");
|
|
|
|
asm {
|
|
movdqu xmm1, [RBP + a];
|
|
movdqu xmm3, [EBP + b];
|
|
|
|
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x10;
|
|
|
|
movdqu [EBP + c], xmm3;
|
|
}
|
|
assert(c == x"c9d5b7f42d26bfba2f86303adbf62b1a");
|
|
|
|
asm {
|
|
movdqu xmm1, [RBP + a];
|
|
movdqu xmm3, [EBP + b];
|
|
|
|
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x11;
|
|
|
|
movdqu [EBP + c], xmm3;
|
|
}
|
|
assert(c == x"edd40f413ee06ed6457c2e592c1f1e1d");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// /// test hardware accelerated multiplication (pclmulqdq)
|
|
// unittest {
|
|
//
|
|
// immutable ubyte[16] H = cast(immutable ubyte[16]) x"00000000000000000000000000000080"; // neutral element
|
|
// ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
|
|
//
|
|
// GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H);
|
|
//
|
|
// mult.multiply(X1);
|
|
//
|
|
// assert(X1 == x"0388dace60b6a392f328c2b971b2fe78", "GF128 multiplication with pclmulqdq failed!");
|
|
// }
|
|
|
|
/// test hardware accelerated multiplication (pclmulqdq)
|
|
unittest {
|
|
|
|
import std.algorithm: reverse;
|
|
|
|
ubyte[16] testH = cast(immutable ubyte[16]) x"952b2a56a5604ac0b32b6656a05b40b6";
|
|
ubyte[16] X1 = cast(immutable ubyte[16]) x"dfa6bf4ded81db03ffcaff95f830f061";
|
|
|
|
ubyte[16] expected = cast(immutable ubyte[16]) x"da53eb0ad2c55bb64fc4802cc3feda60";
|
|
|
|
// reverse(H[]);
|
|
// reverse(X1[]);
|
|
// reverse(expected[]);
|
|
|
|
//GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
|
|
GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(testH);
|
|
|
|
mult.multiply(X1);
|
|
|
|
assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
|
|
}
|
|
|
|
// /// test hardware accelerated multiplication (pclmulqdq)
|
|
// unittest {
|
|
//
|
|
// ulong[2] H = [0xb32b6656a05b40b6, 0x952b2a56a5604ac0];
|
|
// ulong[2] X1 = [0xffcaff95f830f061, 0xdfa6bf4ded81db03];
|
|
//
|
|
// ulong[2] expected = [0x4fc4802cc3feda60, 0xda53eb0ad2c55bb6];
|
|
//
|
|
// //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
|
|
// GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(cast(ubyte[16])H);
|
|
//
|
|
// mult.multiply(cast(ubyte[16])X1);
|
|
//
|
|
// assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
|
|
// }
|
|
|
|
} |