Commit c088a3d3 authored by Santiago Marco-Sola's avatar Santiago Marco-Sola
Browse files

First set of kernels

parent d7913995
################################################################################
# RISCV
################################################################################
EPI_TOOLCHAIN=/shared_folder/llvm-EPI-0.7-development-toolchain-native/
EPI_CC=$(EPI_TOOLCHAIN)/bin/clang
CC=$(EPI_CC) -O2 -mepi -fno-vectorize -Wall -Rpass=loop-vectorize
################################################################################
# HOST
################################################################################
#CC=gcc
#FLAGS=-O3 -march=native -fopt-info-vec-optimized -g
################################################################################
# FLAGS
################################################################################
LDFLAGS=-lm
################################################################################
# MODULES
################################################################################
MODULES=hpc.matmul \
pqs.mceliece \
wfa.compute \
wfe.align
SRCS=$(addsuffix .c, $(MODULES))
OBJS=$(SRCS:.c=.o)
all: $(MODULES)
$(MODULES): %: %.c
$(CC) $(FLAGS) $< -o $@
clean:
rm -f $(MODULES)
.PHONY: all clean
\ No newline at end of file
################################################################################
# [MA-RISCV] Kernels
################################################################################
############################################################
1. Matrix Multiplication
############################################################
DESCRIPTION:
Vectorial matrix multiplication
EXECUTE:
> ./hpc.matmul
############################################################
2. Wavefront Alignment Edit
############################################################
DESCRIPTION:
Aligns two strings of 512 characters each using the Wavefront
Alignment Algorithm (WFA) for edit distance.
EXECUTE:
> ./wfe.align <REPETITIONS>
> ./wfe.align 100000
############################################################
3. Wavefront Alignment Gap-Affine
############################################################
DESCRIPTION:
Executes the compute-next() step for string alignment using
the Wavefront Alignment algorithm (WFA) for Gap-Affine distance.
EXECUTE:
> ./wfa.compute <REPETITIONS> <VECTOR_LENGTH>
> ./wfa.compute 100000 10000
############################################################
4. Post-Quantum Computing. MCeliece Criptosystem
############################################################
DESCRIPTION:
In cryptography, the McEliece cryptosystem is an asymmetric encryption
algorithm developed in 1978 by Robert McEliece. It was the first such
scheme to use randomization in the encryption process. The algorithm has
never gained much acceptance in the cryptographic community, but is a
candidate for "post-quantum cryptography", as it is immune to attacks
using Shor's algorithm and – more generally – measuring coset states
using Fourier sampling.
EXECUTE:
> ./pqs.mceliece <REPETITIONS>
> ./pqs.mceliece 10
\ No newline at end of file
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>
static void matmul_builtins(int n, int bi, int bj, int bk, double (* restrict c)[n],
double (* restrict a)[n], double (* restrict b)[n]) {
unsigned vl = bi;
for (int ii = 0; ii < n; ii += bi) {
for (int kk = 0; kk < n; kk += bk) {
for (int jj = 0; jj < n; jj += bj) {
// c[ii:ii+bi-1][jj:jj+bj-1] += a[ii:ii+bi-1][kk:kk+bk-1] *
// b[kk:kk+bk-1][jj:jj+bj-1];
for (int i = ii; i < ii + bi; i++) {
__epi_1xf64 v1 = __builtin_epi_vload_1xf64(&c[i][jj], vl);
__epi_1xf64 v2 = __builtin_epi_vload_1xf64(&a[i][kk], vl);
for (int k = kk; k < kk + bk; k++) {
__epi_1xf64 v3 = __builtin_epi_vload_1xf64(&b[k][jj], vl);
// v1 = v2 * v3 + v1
v1 = __builtin_epi_vfmacc_1xf64(v1, v2, v3, vl);
}
__builtin_epi_vstore_1xf64(&c[i][jj], v1, vl);
}
}
}
}
}
static void matmul_reference(int n, double (*c)[n], double (*a)[n], double (*b)[n]) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
}
typedef enum {
INIT_ALL_ONES,
INIT_IOTA,
} init_mode_t;
static void init_matrix(int n, double (*a)[n], init_mode_t init_mode) {
switch (init_mode) {
case INIT_ALL_ONES:
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
a[i][j] = 1.0;
}
}
break;
case INIT_IOTA: {
double v = 1.0;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
a[i][j] = v;
v += 1.0;
}
}
break;
}
}
}
static void zero_matrix(int n, double (*a)[n]) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
a[i][j] = 0.0;
}
}
}
static void compare_results(int n, double (*ref)[n], double (*c)[n]) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
if (fabs(ref[i][j] - c[i][j]) > 1e-9) {
printf("ERROR: reference [%d][%d] == %e vs computed [%d][%d] == %e\n", i,
j, ref[i][j], i, j, c[i][j]);
exit(EXIT_FAILURE);
}
}
}
printf("RESULT OK\n");
}
int main(int argc, char *argv[])
{
// FIXME: Make this a parameter read from the command line
int n = 64;
double (*ref)[n] = malloc(sizeof(*ref) * n);
double (*c)[n] = malloc(sizeof(*c) * n);
double (*a)[n] = malloc(sizeof(*a) * n);
double (*b)[n] = malloc(sizeof(*b) * n);
zero_matrix(n, c);
zero_matrix(n, ref);
// FIXME: Allow other ways of initializing
init_matrix(n, a, INIT_ALL_ONES);
init_matrix(n, b, INIT_ALL_ONES);
matmul_reference(n, ref, a, b);
unsigned long vl = __builtin_epi_vsetvl(n, __epi_e64, __epi_m1);
int bi = vl;//64;
int bj = vl;// 64;
int bk = vl;// 64;
matmul_builtins(n, bi, bj, bk, c, a, b);
compare_results(n, ref, c);
free(b);
free(a);
free(c);
free(ref);
return 0;
}
/*
* PROJECT: Post-Quantum Computing. MCeliece Criptosystem
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#define GFBITS 13
#define SYS_N 6688
#define SYS_T 128
#define GFMASK ((1 << GFBITS) - 1)
#define PK_NROWS (SYS_T*GFBITS)
#define PK_NCOLS (SYS_N - PK_NROWS)
#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)
typedef char bit;
typedef uint16_t gf;
/*
* Bit manipulations
*/
static bit is_smaller_63b(uint64_t a, uint64_t b) {
uint64_t ret = 0;
ret = a - b;
ret >>= 63;
return ret;
}
static void cswap_63b(uint64_t *x,uint64_t *y,bit swap) {
uint64_t m;
uint64_t d;
m = swap;
m = 0 - m;
d = (*x ^ *y);
d &= m;
*x ^= d;
*y ^= d;
}
static void minmax_63b(uint64_t *x, uint64_t *y) {
bit m;
m = is_smaller_63b(*y, *x);
cswap_63b(x, y, m);
}
static void merge_63b(int n,uint64_t x[n],int step) {
int i;
if (n == 1)
minmax_63b(&x[0],&x[step]);
else {
merge_63b(n / 2,x,step * 2);
merge_63b(n / 2,x + step,step * 2);
for (i = 1;i < 2*n-1;i += 2)
minmax_63b(&x[i * step],&x[(i + 1) * step]);
}
}
void sort_63b(int n, uint64_t x[n]) {
if (n <= 1) return;
sort_63b(n/2,x);
sort_63b(n/2,x + n/2);
merge_63b(n/2,x,1);
}
gf bitrev(gf a) {
a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8);
a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4);
a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2);
a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1);
return a >> 3;
}
uint16_t load2(const unsigned char *src) {
uint16_t a;
a = src[1];
a <<= 8;
a |= src[0];
return a & GFMASK;
}
/*
* GF Mul
*/
gf gf_mul(gf in0, gf in1) {
int i;
uint64_t tmp;
uint64_t t0;
uint64_t t1;
uint64_t t;
t0 = in0;
t1 = in1;
tmp = t0 * (t1 & 1);
for (i = 1; i < GFBITS; i++)
tmp ^= (t0 * (t1 & (1 << i)));
//
t = tmp & 0x1FF0000;
tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
t = tmp & 0x000E000;
tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
return tmp & GFMASK;
}
gf gf_sq2mul(gf in, gf m) {
int i;
uint64_t x;
uint64_t t0;
uint64_t t1;
uint64_t t;
const uint64_t M[] = {0x1FF0000000000000,
0x000FF80000000000,
0x000007FC00000000,
0x00000003FE000000,
0x0000000001FE0000,
0x000000000001E000};
t0 = in;
t1 = m;
x = (t1 << 18) * (t0 & (1 << 6));
t0 ^= (t0 << 21);
x ^= (t1 * (t0 & (0x010000001)));
x ^= (t1 * (t0 & (0x020000002))) << 3;
x ^= (t1 * (t0 & (0x040000004))) << 6;
x ^= (t1 * (t0 & (0x080000008))) << 9;
x ^= (t1 * (t0 & (0x100000010))) << 12;
x ^= (t1 * (t0 & (0x200000020))) << 15;
for (i = 0; i < 6; i++)
{
t = x & M[i];
x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
}
return x & GFMASK;
}
gf gf_sqmul(gf in, gf m) {
int i;
uint64_t x;
uint64_t t0;
uint64_t t1;
uint64_t t;
const uint64_t M[] = {0x0000001FF0000000,
0x000000000FF80000,
0x000000000007E000};
t0 = in;
t1 = m;
x = (t1 << 6) * (t0 & (1 << 6));
t0 ^= (t0 << 7);
x ^= (t1 * (t0 & (0x04001)));
x ^= (t1 * (t0 & (0x08002))) << 1;
x ^= (t1 * (t0 & (0x10004))) << 2;
x ^= (t1 * (t0 & (0x20008))) << 3;
x ^= (t1 * (t0 & (0x40010))) << 4;
x ^= (t1 * (t0 & (0x80020))) << 5;
for (i = 0; i < 3; i++)
{
t = x & M[i];
x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
}
return x & GFMASK;
}
gf gf_sq2(gf in) {
int i;
const uint64_t B[] = {0x1111111111111111,
0x0303030303030303,
0x000F000F000F000F,
0x000000FF000000FF};
const uint64_t M[] = {0x0001FF0000000000,
0x000000FF80000000,
0x000000007FC00000,
0x00000000003FE000};
uint64_t x = in;
uint64_t t;
x = (x | (x << 24)) & B[3];
x = (x | (x << 12)) & B[2];
x = (x | (x << 6)) & B[1];
x = (x | (x << 3)) & B[0];
for (i = 0; i < 4; i++)
{
t = x & M[i];
x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
}
return x & GFMASK;
}
gf gf_frac(gf den, gf num) {
gf tmp_11;
gf tmp_1111;
gf out;
tmp_11 = gf_sqmul(den, den); // ^11
tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111
out = gf_sq2(tmp_1111);
out = gf_sq2mul(out, tmp_1111); // ^11111111
out = gf_sq2(out);
out = gf_sq2mul(out, tmp_1111); // ^111111111111
return gf_sqmul(out, num); // ^1111111111110 = ^-1
}
gf gf_inv(gf den) {
return gf_frac(den, ((gf) 1));
}
/*
* Polynomial Root
* input: polynomial f and field element a
* return f(a)
*/
void root(gf *f, gf *a, gf* out) {
int i, j, k;
gf r, r_tmp;
uint32_t tmp;
uint32_t t0;
uint32_t t1;
uint32_t t;
for (k = 0; k < SYS_N; k++) {
r = f[ SYS_T ];
for (i = SYS_T-1; i >= 0; i--) {
t0 = r;
t1 = a[k];
tmp = t0 * (t1 & 1);
for (j = 1; j < GFBITS; j++)
tmp ^= (t0 * (t1 & (1 << j)));
t = tmp & 0x1FF0000;
tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
t = tmp & 0x000E000;
tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
r_tmp = tmp & ((1 << GFBITS)-1);
r = r_tmp ^ f[i];
}
out[k]=r;
}
}
/*
* Public Key Generation
* input: secret key sk
* output: public key pk
*/
int pk_gen(
unsigned char * pk,
unsigned char * sk,
uint32_t * perm) {
int i, j, k;
int row, c;
uint64_t buf[ 1 << GFBITS ];
unsigned char mat[ GFBITS * SYS_T ][ SYS_N/8 ];
unsigned char mask;
unsigned char b;
gf g[ SYS_T+1 ]; // Goppa polynomial
gf L[ SYS_N ]; // support
gf inv[ SYS_N ];
g[ SYS_T ] = 1;
for (i = 0; i < SYS_T; i++) {
g[i] = load2(sk);
g[i] &= GFMASK;
sk += 2;
}
for (i = 0; i < (1 << GFBITS); i++) {
buf[i] = perm[i];
buf[i] <<= 31;
buf[i] |= i;
}
sort_63b(1 << GFBITS, buf);
for (i = 0; i < (1 << GFBITS); i++) perm[i] = buf[i] & GFMASK;
for (i = 0; i < SYS_N; i++) L[i] = bitrev(perm[i]);
// filling the matrix
root(g, L, inv);
for (i = 0; i < SYS_N; i++) {
inv[i] = gf_inv(inv[i]);
}
for (i = 0; i < PK_NROWS; i++) {
for (j = 0; j < SYS_N/8; j++) {
mat[i][j] = 0;
}
}
for (i = 0; i < SYS_T; i++) {
for (j = 0; j < SYS_N; j+=8) {
for (k = 0; k < GFBITS; k++) {
b = (inv[j+7] >> k) & 1; b <<= 1;
b |= (inv[j+6] >> k) & 1; b <<= 1;
b |= (inv[j+5] >> k) & 1; b <<= 1;
b |= (inv[j+4] >> k) & 1; b <<= 1;
b |= (inv[j+3] >> k) & 1; b <<= 1;
b |= (inv[j+2] >> k) & 1; b <<= 1;
b |= (inv[j+1] >> k) & 1; b <<= 1;
b |= (inv[j+0] >> k) & 1;
mat[ i*GFBITS + k ][ j/8 ] = b;
}
}
for (j = 0; j < SYS_N; j++) {
inv[j] = gf_mul(inv[j], L[j]);
}
}
// gaussian elimination
for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) {
for (j = 0; j < 8; j++) {
row = i*8 + j;
if (row >= GFBITS * SYS_T) break;
for (k = row + 1; k < GFBITS * SYS_T; k++)
{
mask = mat[ row ][ i ] ^ mat[ k ][ i ];
mask >>= j;
mask &= 1;
mask = -mask;
for (c = 0; c < SYS_N/8; c++)
mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
}
if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
return -1;
}
for (k = 0; k < GFBITS * SYS_T; k++)
{
if (k != row)
{
mask = mat[ k ][ i ] >> j;
mask &= 1;
mask = -mask;
#pragma clang loop vectorize(enable)
for (c = 0; c < SYS_N/8; c++)
mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
}
}
}
}
for (i = 0; i < PK_NROWS; i++)
memcpy(pk + i*PK_ROW_BYTES, mat[i] + PK_NROWS/8, PK_ROW_BYTES);
return 0;
}
/*
* Init
*/
void random_init(
uint8_t* const buffer,
const int buffer_size) {
int k;
for (k=0;k<buffer_size;++k) {
buffer[k] = rand() % 255; // [0,RAND_MAX]
}
}
/*
* Main