module angel.utils.cryptography.gcm.multiplier; package: import angel.utils.cryptography.gcm.galoisfield; import std.algorithm: swap; // TODO Dynamically make use of intel pclmulqdq instruction for fast multiplication. /// test if T is a GCM multiplier @safe template isGCMMultiplier(T) { enum bool isGCMMultiplier = is(T == struct) && is(typeof( { ubyte[16] block; T m = void; m.init(block); m.multiply(block); })); } /// This struct provides schoolbook multiplication in GF(2^128). @safe struct GCMBasicMultiplier { private { ubyte[16] H; } this(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { init(H); } nothrow @nogc { /** * initialize the multiplicator */ void init(in ubyte[] H) in { assert(H.length == 16, "H: invalid length"); } body { this.H[] = H[]; } /// Multiply x by H and store result in x. /// /// Params: /// x = 16 byte block void multiply(ubyte[] x) in { assert(x.length == 16, "x: invalid length."); } body { GF128.multiply(x, H); } } /// test multiplication using schoolbook multiplication unittest { immutable ubyte[16] testH = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; GCMBasicMultiplier mult = GCMBasicMultiplier(testH); mult.multiply(X1); assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!"); } } /// This struct provides table driven multiplication in GF(2^128). @safe struct GCMMultiplier8kTable { private { ubyte[16][16][32] M; } this(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { init(H); } nothrow @nogc { /** * initialize the multiplicator */ void init(in ubyte[] H) { tableSetup(H); } /// Multiply x by H and store result in x. /// /// Params: /// x = 16 byte block void multiply(ubyte[] x) in { assert(x.length == 16, "x: invalid length."); } body { ubyte[16] z; for(uint i = 0; i < 16; ++i) { z[] ^= M[2*i][x[i]>>4][]; z[] ^= M[2*i+1][x[i]&0xF][]; } x[] = z[]; } } /// test multiplication using 8k table unittest { immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); mult.multiply(X1); assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!"); } private void tableSetup(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { ubyte[16] Pi; Pi[0] = 0x80; ubyte[1] oneByte; for(int i = 0; i < 32; ++i) { for(uint j = 0; j < 16; ++j) { M[i][j] = H; oneByte[0] = cast(ubyte) (j<<4); GF128.multiply(M[i][j], oneByte); GF128.multiply(M[i][j], Pi); } multiplyP4(Pi); } } private void multiplyP4(ubyte[] x) nothrow @nogc { foreach(i;0..4){ GF128.multiplyP(x); } } } /// This class provides table driven multiplication in GF(2^128). /// The 64k table is rather large and probably won't fit into the cache. /// Use the 8k table to avoid timing based leaks. @safe struct GCMMultiplier64kTable { private { ubyte[16][256][16] M; } this(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { init(H); } nothrow @nogc { /// initialize the multiplicator void init(in ubyte[] H) { tableSetup(H); } /// Multiply x by H and store result in x. /// /// Params: /// x = 16 byte block void multiply(ubyte[] x) in { assert(x.length == 16, "x: invalid length."); } body { ubyte[16] z; for(uint i = 0; i < 16; ++i) { z[] ^= M[i][x[i]][]; } x[] = z[]; } } /// test multiplication using 64k table unittest { immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; GCMMultiplier64kTable mult = GCMMultiplier64kTable(H); mult.multiply(X1); assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 64k table failed!"); } private void tableSetup(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { ubyte[16] P; P[0] = 0x80; ubyte[1] oneByte; for(int i = 0; i < 16; ++i) { for(uint j = 0; j <= 255; ++j) { M[i][j] = H; oneByte[0] = cast(ubyte) j; GF128.multiply(M[i][j], oneByte); GF128.multiply(M[i][j], P); } GF128.multiplyP8(P); } } } /// This struct provides hardware accelerated multiplication in GF(2^128) /// using the Intel PCLMULQDQ instruction. /// /// See: https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf @safe struct GCMPCLMULQDQMultiplier { private { ubyte[16] H; } this(in ubyte[] H) nothrow @nogc in { assert(H.length == 16, "H: invalid length"); } body { init(H); } nothrow @nogc { /** * initialize the multiplicator */ void init(in ubyte[] H) in { assert(H.length == 16, "H: invalid length"); } body { this.H[] = H[]; } /// Multiply x by H and store result in x. /// /// Params: /// x = 16 byte block void multiply(ubyte[] x) in { assert(x.length == 16, "x: invalid length."); } body { //GF128.multiply(x, H); gfmul(x, H); } } /// Multiplies a with b, result is stored in a. @trusted private void gfmul(ubyte[] a, in ubyte[] b) nothrow @nogc in { assert(a.length == 16, "Invalid length of input. Must be 16 bytes."); assert(b.length == 16, "Invalid length of input. Must be 16 bytes."); } body { auto aLength = a.length; foreach (i; 0 .. aLength / 2) { swap(a[i], a[aLength - 1 - i]); } ubyte[16] revB = b; foreach (i; 0 .. revB.length / 2) { auto bLen = revB.length; swap(revB[i], revB[bLen - 1 - i]); } version(D_InlineAsm_X86_64) { __vector(ubyte[16]) va = *cast(__vector(ubyte[16])*)a.ptr; __vector(ubyte[16]) vb = *cast(__vector(ubyte[16])*)revB.ptr; __vector(ubyte[16]) r0 = __pclmulqdq(va, vb, 0x00); // a0 * b0 __vector(ubyte[16]) r1 = __pclmulqdq(va, vb, 0x10); // a0 * b1 __vector(ubyte[16]) r2 = __pclmulqdq(va, vb, 0x01); // a1 * b0 __vector(ubyte[16]) r3 = __pclmulqdq(va, vb, 0x11); // a1 * b1 __vector(ubyte[16]) t1 = r1 ^ r2; __vector(ubyte[16]) t2 = __shiftright(t1, 8); __vector(ubyte[16]) t3 = __shiftleft(t1, 8); __vector(ubyte[16]) t4 = r0 ^ t2; __vector(ubyte[16]) t5 = r3 ^ t3; __vector(ubyte[16]) t6 = __shiftleft(t4, 1); __vector(ubyte[16]) t7 = __shiftleft(t5, 1); __vector(ubyte[16]) t8 = __shiftright(t4, 31); __vector(ubyte[16]) t9 = __shiftright(t5, 31); __vector(ubyte[16]) t10 = __shiftleft(t8, 4); __vector(ubyte[16]) t11 = __shiftleft(t9, 4); __vector(ubyte[16]) t12 = __shiftright(t8, 12); __vector(ubyte[16]) t13 = t6 ^ t10; __vector(ubyte[16]) t14 = t7 ^ t11; __vector(ubyte[16]) t15 = t14 ^ t12; *cast(__vector(ubyte[16])*)a.ptr = t13 ^ t15; } foreach (i; 0 .. a.length / 2) { auto aLen = cast(int)a.length; swap(a[i], a[aLen - 1 - i]); } } // test pclmulqdq instruction with multiplication by 1 @trusted unittest { import core.cpuid; version(D_InlineAsm_X86_64) { if(aes) { ubyte[16] a = cast(const ubyte[16]) x"12345678000000000000000000000000"; ubyte[16] b = cast(const ubyte[16]) x"01000000000000000000000000000000"; ubyte[16] c; asm { movdqu xmm1, [RBP + a]; movdqu xmm3, [EBP + b]; db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0 movdqu [EBP + c], xmm3; } assert(c == x"12345678000000000000000000000000"); } } } /// test pclmulqdq instruction with test vectors from /// https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf @trusted unittest { import core.cpuid; version(D_InlineAsm_X86_64) { if(aes) { /// Python code to convert test vectors into little endian format. /// Reverses the string by bytes (not by hexits): /// /// import binascii /// def conv(xmmstr): /// bytearr=bytearray.fromhex(xmmstr)[::-1] /// return binascii.hexlify(bytearr) /// /// conv('7b5b54657374566563746f725d53475d') /// conv('48692853686179295b477565726f6e5d') /// conv('1d4d84c85c3440c0929633d5d36f0451') /// ubyte[16] a = cast(const ubyte[16]) x"5d47535d726f74636556747365545b7b"; // xxm1 high: 7b5b546573745665 low: 63746f725d53475d ubyte[16] b = cast(const ubyte[16]) x"5d6e6f726575475b2979616853286948"; // 4869285368617929 5b477565726f6e5d ubyte[16] c; asm { movdqu xmm1, [RBP + a]; movdqu xmm3, [EBP + b]; db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0 movdqu [EBP + c], xmm3; } assert(c == x"51046fd3d5339692c040345cc8844d1d"); asm { movdqu xmm1, [RBP + a]; movdqu xmm3, [EBP + b]; db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x01; movdqu [EBP + c], xmm3; } assert(c == x"1513282aac40a57fa1b56a558d7cd11b"); asm { movdqu xmm1, [RBP + a]; movdqu xmm3, [EBP + b]; db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x10; movdqu [EBP + c], xmm3; } assert(c == x"c9d5b7f42d26bfba2f86303adbf62b1a"); asm { movdqu xmm1, [RBP + a]; movdqu xmm3, [EBP + b]; db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x11; movdqu [EBP + c], xmm3; } assert(c == x"edd40f413ee06ed6457c2e592c1f1e1d"); } } } // /// test hardware accelerated multiplication (pclmulqdq) // unittest { // // immutable ubyte[16] H = cast(immutable ubyte[16]) x"00000000000000000000000000000080"; // neutral element // ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; // // GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H); // // mult.multiply(X1); // // assert(X1 == x"0388dace60b6a392f328c2b971b2fe78", "GF128 multiplication with pclmulqdq failed!"); // } /// test hardware accelerated multiplication (pclmulqdq) unittest { import std.algorithm: reverse; ubyte[16] testH = cast(immutable ubyte[16]) x"952b2a56a5604ac0b32b6656a05b40b6"; ubyte[16] X1 = cast(immutable ubyte[16]) x"dfa6bf4ded81db03ffcaff95f830f061"; ubyte[16] expected = cast(immutable ubyte[16]) x"da53eb0ad2c55bb64fc4802cc3feda60"; // reverse(H[]); // reverse(X1[]); // reverse(expected[]); //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(testH); mult.multiply(X1); assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!"); } // /// test hardware accelerated multiplication (pclmulqdq) // unittest { // // ulong[2] H = [0xb32b6656a05b40b6, 0x952b2a56a5604ac0]; // ulong[2] X1 = [0xffcaff95f830f061, 0xdfa6bf4ded81db03]; // // ulong[2] expected = [0x4fc4802cc3feda60, 0xda53eb0ad2c55bb6]; // // //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); // GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(cast(ubyte[16])H); // // mult.multiply(cast(ubyte[16])X1); // // assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!"); // } }