Files
2025-12-22 16:23:48 +01:00

493 lines
11 KiB
D

module angel.utils.cryptography.gcm.multiplier;
package:
import angel.utils.cryptography.gcm.galoisfield;
import std.algorithm: swap;
// TODO Dynamically make use of intel pclmulqdq instruction for fast multiplication.
/// test if T is a GCM multiplier
@safe
template isGCMMultiplier(T)
{
enum bool isGCMMultiplier =
is(T == struct) &&
is(typeof(
{
ubyte[16] block;
T m = void;
m.init(block);
m.multiply(block);
}));
}
/// This struct provides schoolbook multiplication in GF(2^128).
@safe
struct GCMBasicMultiplier
{
private {
ubyte[16] H;
}
this(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
init(H);
}
nothrow @nogc {
/**
* initialize the multiplicator
*/
void init(in ubyte[] H)
in {
assert(H.length == 16, "H: invalid length");
}
body {
this.H[] = H[];
}
/// Multiply x by H and store result in x.
///
/// Params:
/// x = 16 byte block
void multiply(ubyte[] x)
in {
assert(x.length == 16, "x: invalid length.");
}
body {
GF128.multiply(x, H);
}
}
/// test multiplication using schoolbook multiplication
unittest {
immutable ubyte[16] testH = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
GCMBasicMultiplier mult = GCMBasicMultiplier(testH);
mult.multiply(X1);
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
}
}
/// This struct provides table driven multiplication in GF(2^128).
@safe
struct GCMMultiplier8kTable
{
private {
ubyte[16][16][32] M;
}
this(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
init(H);
}
nothrow @nogc {
/**
* initialize the multiplicator
*/
void init(in ubyte[] H) {
tableSetup(H);
}
/// Multiply x by H and store result in x.
///
/// Params:
/// x = 16 byte block
void multiply(ubyte[] x)
in {
assert(x.length == 16, "x: invalid length.");
}
body {
ubyte[16] z;
for(uint i = 0; i < 16; ++i) {
z[] ^= M[2*i][x[i]>>4][];
z[] ^= M[2*i+1][x[i]&0xF][];
}
x[] = z[];
}
}
/// test multiplication using 8k table
unittest {
immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
mult.multiply(X1);
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
}
private void tableSetup(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
ubyte[16] Pi;
Pi[0] = 0x80;
ubyte[1] oneByte;
for(int i = 0; i < 32; ++i) {
for(uint j = 0; j < 16; ++j) {
M[i][j] = H;
oneByte[0] = cast(ubyte) (j<<4);
GF128.multiply(M[i][j], oneByte);
GF128.multiply(M[i][j], Pi);
}
multiplyP4(Pi);
}
}
private void multiplyP4(ubyte[] x) nothrow @nogc {
foreach(i;0..4){
GF128.multiplyP(x);
}
}
}
/// This class provides table driven multiplication in GF(2^128).
/// The 64k table is rather large and probably won't fit into the cache.
/// Use the 8k table to avoid timing based leaks.
@safe
struct GCMMultiplier64kTable
{
private {
ubyte[16][256][16] M;
}
this(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
init(H);
}
nothrow @nogc {
/// initialize the multiplicator
void init(in ubyte[] H) {
tableSetup(H);
}
/// Multiply x by H and store result in x.
///
/// Params:
/// x = 16 byte block
void multiply(ubyte[] x)
in {
assert(x.length == 16, "x: invalid length.");
}
body {
ubyte[16] z;
for(uint i = 0; i < 16; ++i) {
z[] ^= M[i][x[i]][];
}
x[] = z[];
}
}
/// test multiplication using 64k table
unittest {
immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
GCMMultiplier64kTable mult = GCMMultiplier64kTable(H);
mult.multiply(X1);
assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 64k table failed!");
}
private void tableSetup(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
ubyte[16] P;
P[0] = 0x80;
ubyte[1] oneByte;
for(int i = 0; i < 16; ++i) {
for(uint j = 0; j <= 255; ++j) {
M[i][j] = H;
oneByte[0] = cast(ubyte) j;
GF128.multiply(M[i][j], oneByte);
GF128.multiply(M[i][j], P);
}
GF128.multiplyP8(P);
}
}
}
/// This struct provides hardware accelerated multiplication in GF(2^128)
/// using the Intel PCLMULQDQ instruction.
///
/// See: https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
@safe
struct GCMPCLMULQDQMultiplier
{
private {
ubyte[16] H;
}
this(in ubyte[] H) nothrow @nogc
in {
assert(H.length == 16, "H: invalid length");
}
body {
init(H);
}
nothrow @nogc {
/**
* initialize the multiplicator
*/
void init(in ubyte[] H)
in {
assert(H.length == 16, "H: invalid length");
}
body {
this.H[] = H[];
}
/// Multiply x by H and store result in x.
///
/// Params:
/// x = 16 byte block
void multiply(ubyte[] x)
in {
assert(x.length == 16, "x: invalid length.");
}
body {
//GF128.multiply(x, H);
gfmul(x, H);
}
}
/// Multiplies a with b, result is stored in a.
@trusted
private void gfmul(ubyte[] a, in ubyte[] b) nothrow @nogc
in {
assert(a.length == 16, "Invalid length of input. Must be 16 bytes.");
assert(b.length == 16, "Invalid length of input. Must be 16 bytes.");
}
body {
auto aLength = a.length;
foreach (i; 0 .. aLength / 2) {
swap(a[i], a[aLength - 1 - i]);
}
ubyte[16] revB = b;
foreach (i; 0 .. revB.length / 2) {
auto bLen = revB.length;
swap(revB[i], revB[bLen - 1 - i]);
}
version(D_InlineAsm_X86_64) {
__vector(ubyte[16]) va = *cast(__vector(ubyte[16])*)a.ptr;
__vector(ubyte[16]) vb = *cast(__vector(ubyte[16])*)revB.ptr;
__vector(ubyte[16]) r0 = __pclmulqdq(va, vb, 0x00); // a0 * b0
__vector(ubyte[16]) r1 = __pclmulqdq(va, vb, 0x10); // a0 * b1
__vector(ubyte[16]) r2 = __pclmulqdq(va, vb, 0x01); // a1 * b0
__vector(ubyte[16]) r3 = __pclmulqdq(va, vb, 0x11); // a1 * b1
__vector(ubyte[16]) t1 = r1 ^ r2;
__vector(ubyte[16]) t2 = __shiftright(t1, 8);
__vector(ubyte[16]) t3 = __shiftleft(t1, 8);
__vector(ubyte[16]) t4 = r0 ^ t2;
__vector(ubyte[16]) t5 = r3 ^ t3;
__vector(ubyte[16]) t6 = __shiftleft(t4, 1);
__vector(ubyte[16]) t7 = __shiftleft(t5, 1);
__vector(ubyte[16]) t8 = __shiftright(t4, 31);
__vector(ubyte[16]) t9 = __shiftright(t5, 31);
__vector(ubyte[16]) t10 = __shiftleft(t8, 4);
__vector(ubyte[16]) t11 = __shiftleft(t9, 4);
__vector(ubyte[16]) t12 = __shiftright(t8, 12);
__vector(ubyte[16]) t13 = t6 ^ t10;
__vector(ubyte[16]) t14 = t7 ^ t11;
__vector(ubyte[16]) t15 = t14 ^ t12;
*cast(__vector(ubyte[16])*)a.ptr = t13 ^ t15;
}
foreach (i; 0 .. a.length / 2) {
auto aLen = cast(int)a.length;
swap(a[i], a[aLen - 1 - i]);
}
}
// test pclmulqdq instruction with multiplication by 1
@trusted
unittest {
import core.cpuid;
version(D_InlineAsm_X86_64) {
if(aes) {
ubyte[16] a = cast(const ubyte[16]) x"12345678000000000000000000000000";
ubyte[16] b = cast(const ubyte[16]) x"01000000000000000000000000000000";
ubyte[16] c;
asm {
movdqu xmm1, [RBP + a];
movdqu xmm3, [EBP + b];
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0
movdqu [EBP + c], xmm3;
}
assert(c == x"12345678000000000000000000000000");
}
}
}
/// test pclmulqdq instruction with test vectors from
/// https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
@trusted
unittest {
import core.cpuid;
version(D_InlineAsm_X86_64) {
if(aes) {
/// Python code to convert test vectors into little endian format.
/// Reverses the string by bytes (not by hexits):
///
/// import binascii
/// def conv(xmmstr):
/// bytearr=bytearray.fromhex(xmmstr)[::-1]
/// return binascii.hexlify(bytearr)
///
/// conv('7b5b54657374566563746f725d53475d')
/// conv('48692853686179295b477565726f6e5d')
/// conv('1d4d84c85c3440c0929633d5d36f0451')
///
ubyte[16] a = cast(const ubyte[16]) x"5d47535d726f74636556747365545b7b"; // xxm1 high: 7b5b546573745665 low: 63746f725d53475d
ubyte[16] b = cast(const ubyte[16]) x"5d6e6f726575475b2979616853286948"; // 4869285368617929 5b477565726f6e5d
ubyte[16] c;
asm {
movdqu xmm1, [RBP + a];
movdqu xmm3, [EBP + b];
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq xmm3, xmm1, 0x00; // xmm3 holds a0*b0
movdqu [EBP + c], xmm3;
}
assert(c == x"51046fd3d5339692c040345cc8844d1d");
asm {
movdqu xmm1, [RBP + a];
movdqu xmm3, [EBP + b];
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x01;
movdqu [EBP + c], xmm3;
}
assert(c == x"1513282aac40a57fa1b56a558d7cd11b");
asm {
movdqu xmm1, [RBP + a];
movdqu xmm3, [EBP + b];
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x10;
movdqu [EBP + c], xmm3;
}
assert(c == x"c9d5b7f42d26bfba2f86303adbf62b1a");
asm {
movdqu xmm1, [RBP + a];
movdqu xmm3, [EBP + b];
db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x11;
movdqu [EBP + c], xmm3;
}
assert(c == x"edd40f413ee06ed6457c2e592c1f1e1d");
}
}
}
// /// test hardware accelerated multiplication (pclmulqdq)
// unittest {
//
// immutable ubyte[16] H = cast(immutable ubyte[16]) x"00000000000000000000000000000080"; // neutral element
// ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
//
// GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H);
//
// mult.multiply(X1);
//
// assert(X1 == x"0388dace60b6a392f328c2b971b2fe78", "GF128 multiplication with pclmulqdq failed!");
// }
/// test hardware accelerated multiplication (pclmulqdq)
unittest {
import std.algorithm: reverse;
ubyte[16] testH = cast(immutable ubyte[16]) x"952b2a56a5604ac0b32b6656a05b40b6";
ubyte[16] X1 = cast(immutable ubyte[16]) x"dfa6bf4ded81db03ffcaff95f830f061";
ubyte[16] expected = cast(immutable ubyte[16]) x"da53eb0ad2c55bb64fc4802cc3feda60";
// reverse(H[]);
// reverse(X1[]);
// reverse(expected[]);
//GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(testH);
mult.multiply(X1);
assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
}
// /// test hardware accelerated multiplication (pclmulqdq)
// unittest {
//
// ulong[2] H = [0xb32b6656a05b40b6, 0x952b2a56a5604ac0];
// ulong[2] X1 = [0xffcaff95f830f061, 0xdfa6bf4ded81db03];
//
// ulong[2] expected = [0x4fc4802cc3feda60, 0xda53eb0ad2c55bb6];
//
// //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
// GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(cast(ubyte[16])H);
//
// mult.multiply(cast(ubyte[16])X1);
//
// assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
// }
}