Skip to content

Commit

Permalink
Add benchmarks for intermediate calculations
Browse files Browse the repository at this point in the history
Add NIBBLING_MAYO.md
  • Loading branch information
bhess committed Mar 15, 2024
1 parent 0e3fafb commit 7354542
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ env:

jobs:
build_test:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
mayo_build_type: [ref, opt, avx2]
Expand Down
87 changes: 87 additions & 0 deletions NIBBLING_MAYO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Nibbling-MAYO Artifact

AVX2 implementations of MAYO, a multivariate quadratic signature scheme, as described in the paper **Nibbling MAYO: Optimized Implementations for AVX2 and Cortex-M4** available [here](https://eprint.iacr.org/2023/1683.pdf).

It consists of two variants:

1. A slower version compatible with the round-1 specification of MAYO using bitsliced representation. This version is contained in the [main](https://github.com/PQCMayo/MAYO-C/tree/main) branch of this repository.
2. A faster version that changes representation of keys and PRNG output to nibble-sliced representation. This version is contained in the [nibbling-mayo branch](https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo) of this repository.

All implementations implement the following parameter sets:

| Parameter Set | NIST Security Level | n | m | o | k | q | sk size | pk size | sig size |
| --- | ---- | -- | -- | -- | -- | -- | -- | -- | -- |
| MAYO_1 | 1 | 66 | 64 | 8 | 9 | 16 | 24 B | 1168 B | 321 B |
| MAYO_2 | 1 | 78 | 64 | 18 | 4 | 16 | 24 B | 5488 B | 180 B |
| MAYO_3 | 3 | 99 | 96 | 10 | 11 | 16 | 32 B | 2656 B | 577 B |
| MAYO_5 | 5 | 133 | 128 | 12 | 12 | 16 | 40 B | 5008 B | 838 B |

This file contains information about the build environment, build instructions and benchmarking programs to reproduce the results of the paper.

## Build Requirements

- CMake (version 3.5 or later)
- C99 compatible compiler
- Intel Haswell or later for AVX2 optimizations

## Evaluation environment used in the paper

The following CPUs were used:
- Intel Xeon E3-1245 v5 (Skylake)
- Intel Xeon Gold 6338 (Ice Lake)

Operating system: Ubuntu 22.04.3 LTS

Compiler: Ubuntu clang version 14.0.0-1ubuntu1.1

Turbo Boost was disabled in the UEFI.

## Build

The following steps build MAYO-C with AVX2 optimizations using clang:

- `git checkout main` or `git checkout nibbling-mayo`
- `mkdir -p build`
- `cd build`
- `cmake -DMAYO_BUILD_TYPE=avx2 -DCMAKE_C_COMPILER=clang ..`
- `make -j`

Other options for `MAYO_BUILD_TYPE` are `opt` and `ref`.

## Running benchmarks

Benchmarking programs are available in folder `test` after a successful build. To reproduce the results of Table 1, run:

- `test/mayo_bench_MAYO_1 <repetitions>`
- `test/mayo_bench_MAYO_2 <repetitions>`
- `test/mayo_bench_MAYO_3 <repetitions>`
- `test/mayo_bench_MAYO_5 <repetitions>`

The benchmarks report the median and the average cycle count from the specified number of repetitions (10000 in the paper), for the following MAYO operations:

- KeyGen (mayo_keygen)
- ExpandSK (mayo_expand_sk)
- ExpandPK (mayo_expand_pk)
- ExpandSK + Sign (mayo_sign)
- ExpandPK + Verify (mayo_verify)

To reproduce the results of Table 3, run:

- `test/mayo_bench_table3_MAYO_1 <repetitions>`
- `test/mayo_bench_table3_MAYO_2 <repetitions>`
- `test/mayo_bench_table3_MAYO_3 <repetitions>`
- `test/mayo_bench_table3_MAYO_5 <repetitions>`

## KAT and self-tests

KAT and self-tests are available after a successful build. To run them, execute one of the following commands in the build folder:

- `make test` or `ctest`

## Detailed user guide

A more detailed [README](README.md) with all configuration options of the MAYO-C library is available.

## License

The implementations are licensed under Apache-2.0, see the LICENSE and NOTICE files.
3 changes: 3 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ else()
target_link_libraries(mayo_test_scheme_${MVARIANT} ${MVARIANT_LOWER})
target_include_directories(mayo_test_scheme_${MVARIANT} PUBLIC ../src/common ${INC_PLATFORM})

add_executable(mayo_bench_table3_${MVARIANT} bench_mayo_table3.c)
target_link_libraries(mayo_bench_table3_${MVARIANT} ${MVARIANT_LOWER})

add_test(${MVARIANT}_KAT mayo_test_kat_${MVARIANT} ${MVARIANT})
add_test(${MVARIANT}_SELFTEST mayo_test_scheme_${MVARIANT} ${MVARIANT})
ENDFOREACH()
Expand Down
173 changes: 173 additions & 0 deletions test/bench_mayo_table3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// SPDX-License-Identifier: Apache-2.0

#include <mayo.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <inttypes.h>
#include <limits.h>
#include <stdalign.h>
#include <rng.h>


// <time.h> is only included on Unix builds for ARM, ARM64 and generic
// ("other") targets.
#if defined(TARGET_OS_UNIX) && (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_OTHER))
#include <time.h>
#endif
// print_unit prints the measurement unit on its own line: "nsec" on ARM,
// ARM64, s390x and generic targets, "cycles" on every other target.
// The expansion already ends in ';'.
#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
#define print_unit printf("nsec\n");
#else
#define print_unit printf("cycles\n");
#endif

// Runs the benchmarks for parameter set p with `runs` repetitions per
// measurement; a non-zero csv selects comma-separated output.
static int bench_sig(const mayo_params_t *p, int runs, int csv);
// Returns the current timestamp used to time the benchmarked code.
static inline int64_t cpucycles(void);

// Parses a strictly positive decimal repetition count that fits in an int.
// Returns 0 and stores the value in *runs on success, non-zero if the text is
// empty, has trailing characters, or is out of range.
static int parse_runs(const char *arg, int *runs) {
    char *end = NULL;
    long value = strtol(arg, &end, 10);

    if (end == arg || *end != '\0' || value <= 0 || value > INT_MAX) {
        return 1;
    }
    *runs = (int) value;
    return 0;
}

// Entry point of the Table 3 benchmark.
//
// With ENABLE_PARAMS_DYNAMIC the arguments are <parameter set> <repetitions>,
// where the parameter set is one of MAYO_1, MAYO_2, MAYO_3 or MAYO_5.
// Otherwise the single argument is <repetitions> and the compiled-in
// MAYO_VARIANT is benchmarked.
//
// Returns the result of bench_sig(), or 1 if the arguments are missing,
// the repetition count is not a positive integer, or the parameter set
// name is unknown.
int main(int argc, char *argv[]) {
    int rc = 0;
    int runs = 0;

#ifdef ENABLE_PARAMS_DYNAMIC
    if (argc < 3) {
        printf("Two arguments needed\n");
        return 1;
    }
    // The count later sizes the sample buffer and divides the average, so it
    // must be validated before it reaches bench_sig().
    if (parse_runs(argv[2], &runs) != 0) {
        printf("Number of repetitions must be a positive integer\n");
        return 1;
    }
    if (!strcmp(argv[1], "MAYO_1")) {
        rc = bench_sig(&MAYO_1, runs, 0);
    } else if (!strcmp(argv[1], "MAYO_2")) {
        rc = bench_sig(&MAYO_2, runs, 0);
    } else if (!strcmp(argv[1], "MAYO_3")) {
        rc = bench_sig(&MAYO_3, runs, 0);
    } else if (!strcmp(argv[1], "MAYO_5")) {
        rc = bench_sig(&MAYO_5, runs, 0);
    } else {
        // Previously an unknown name fell through and exited with success
        // without benchmarking anything.
        printf("Unknown parameter set: %s\n", argv[1]);
        rc = 1;
    }
#else
    if (argc < 2) {
        printf("One argument needed\n");
        return 1;
    }
    if (parse_runs(argv[1], &runs) != 0) {
        printf("Number of repetitions must be a positive integer\n");
        return 1;
    }
    rc = bench_sig(&MAYO_VARIANT, runs, 0);
#endif

    return rc;
}

// Unit label printed after each measurement. Targets without a cycle counter
// are timed in nanoseconds; TARGET_OTHER is included because cpucycles() falls
// back to clock_gettime() there, matching the target list used by print_unit.
#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
#define BENCH_UNITS "nsec"
#else
#define BENCH_UNITS "cycles"
#endif

// qsort() comparator that orders int64_t timing samples in ascending order.
//
// Returns a negative value, zero or a positive value if *a is less than,
// equal to or greater than *b. The comparison is done explicitly rather than
// by subtraction: truncating a 64-bit difference to int yields the wrong sign
// (or a false "equal") whenever the difference does not fit in an int.
int cmpfunc (const void *a, const void *b) {
    const int64_t lhs = *(const int64_t *) a;
    const int64_t rhs = *(const int64_t *) b;

    return (lhs > rhs) - (lhs < rhs);
}

// Opens a timed benchmark loop: resets the running total `cycles`, starts a
// for-loop with `i` running from 0 to r-1 and stores the start timestamp of
// the current iteration in `cycles1`. The code to be measured follows the
// macro; the loop is left open and must be closed by BENCH_CODE_2.
// Expects `cycles`, `cycles1` and `i` to be declared in the enclosing scope.
#define BENCH_CODE_1(r) \
cycles = 0; \
for (i = 0; i < (r); ++i) { \
cycles1 = cpucycles();

// Closes the loop opened by BENCH_CODE_1 and prints the result.
// Inside the loop it takes the end timestamp, records the duration of the
// iteration in cycles_list (only for the first LIST_SIZE iterations) and adds
// it to the running total `cycles`. After the loop it sorts the recorded
// durations and prints the element at the middle index as the median; in
// non-csv mode it also prints `name`, the average (cycles / runs) and the unit.
// Note: the sample count and the average use the variable `runs` from the
// enclosing scope, not a macro parameter.
#define BENCH_CODE_2(name, csv) \
cycles2 = cpucycles(); \
if(i < LIST_SIZE) \
cycles_list[i] = (cycles2 - cycles1);\
cycles = cycles + (cycles2 - cycles1); \
} \
qsort(cycles_list, (runs < LIST_SIZE)? runs : LIST_SIZE, sizeof(uint64_t), cmpfunc);\
if (csv) \
printf("%2" PRId64 ",", cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2]); \
else { \
printf(" %-35s -> median: %2" PRId64 ", average: %2" PRId64 " ", name, \
cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2], (cycles / runs)); \
printf("%s\n", BENCH_UNITS); \
}

// Maximum number of per-iteration samples recorded for the median.
#define LIST_SIZE 10000

// Prototypes of the library routines timed by bench_sig(). They are declared
// here as extern; presumably they are not exposed through the headers included
// above -- TODO confirm against the library sources.
extern void P1_times_O(const mayo_params_t* p, const uint32_t* P1, const unsigned char* O, uint32_t* acc);
extern void mul_add_mat_trans_x_bitsliced_m_mat(int m_legs, const unsigned char *mat, const uint32_t *bs_mat, uint32_t *acc, int mat_rows, int mat_cols, int bs_mat_cols);
extern void mul_add_mat_x_bitsliced_m_mat(int m_legs, const unsigned char *mat, const uint32_t *bs_mat, uint32_t *acc, int mat_rows, int mat_cols, int bs_mat_cols);
extern void P1_times_Vt(const mayo_params_t* p, const uint32_t* P1, const unsigned char* V, uint32_t* acc);
extern void bitsliced_m_calculate_PS_SPS(const uint32_t *bitsliced_P1, const uint32_t *bitsliced_P2, const uint32_t *bitsliced_P3, const unsigned char *S,
const int m, const int v, const int o, const int k, uint32_t *bitsliced_SPS);

// Benchmarks the three intermediate computations labelled "Tab.3/Col 1..3"
// on dummy input data and prints median and average timings.
//
// p     parameter set; its name is printed and it is passed on to the
//       P1_times_O / P1_times_Vt routines.
// runs  number of timed repetitions per measurement; must be positive.
// csv   non-zero prints one comma-separated line (name, then one median per
//       measurement); zero prints a human-readable line per measurement.
//
// Returns 0 on success, 1 if runs is not positive or the sample buffer
// cannot be allocated.
static int bench_sig(const mayo_params_t *p, int runs, int csv) {

    int rc = 0;
    int i;

    int64_t cycles, cycles1, cycles2;
    int64_t *cycles_list;

    // A non-positive count would give an empty sample list and a division by
    // zero when the average is computed.
    if (runs <= 0) {
        fprintf(stderr, "bench_sig: number of runs must be positive\n");
        return 1;
    }

    // The sample buffer lives on the heap: its size depends on user input and
    // a variable-length array could overflow the stack for large counts.
    cycles_list = calloc((size_t) runs, sizeof *cycles_list);
    if (cycles_list == NULL) {
        fprintf(stderr, "bench_sig: out of memory\n");
        return 1;
    }

    // fill variables with dummy data
    // (each initializer sets the first element only; the rest is zero)
    alignas (32) uint32_t bitsliced_P[(P1_BYTES_MAX + P2_BYTES_MAX) / 4] = {1};
    alignas (32) uint32_t bitsliced_P3[O_MAX * O_MAX * M_MAX / 8] = {2};
    unsigned char Vdec[N_MINUS_O_MAX * K_MAX] = {3};
    uint32_t bitsliced_M[K_MAX * O_MAX * M_MAX / 8] = {4};
    unsigned char O[(N_MINUS_O_MAX)*O_MAX] = {5};
    unsigned char s[K_MAX * N_MAX] = {6};

    // P1 occupies the front of bitsliced_P; P2, L and P1*O + P2 all alias the
    // region that follows it.
    uint32_t *bitsliced_P1 = bitsliced_P;
    uint32_t *bitsliced_P2 = bitsliced_P + (P1_BYTES_MAX / 4);
    uint32_t *bitsliced_L = bitsliced_P + (P1_BYTES_MAX/4);
    uint32_t *bitsliced_P1O_P2 = bitsliced_P2;

    if (csv) {
        printf("%s,", p->name);
    } else {
        printf("Benchmarking %s\n", p->name);
    }

    BENCH_CODE_1(runs);
    P1_times_O(p, bitsliced_P1, O, bitsliced_P1O_P2);
    mul_add_mat_trans_x_bitsliced_m_mat(M_MAX/32, O, bitsliced_P1O_P2, bitsliced_P3,
                                        V_MAX, O_MAX, O_MAX);
    BENCH_CODE_2("Tab.3/Col 1: -O^t * (P1*O + P2)", csv);

    BENCH_CODE_1(runs);
    // The accumulators are declared inside the timed loop, so they are
    // re-zeroed on every iteration and the zeroing is part of the measurement.
    // compute all the v_i^t * P^(1) * v_j
    alignas (32) uint32_t bitsliced_Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 8] = {0};
    alignas (32) uint32_t bitsliced_vPv[K_MAX * K_MAX * M_MAX / 8] = {0};
    // compute all the v_i^T * L matrices.
    mul_add_mat_x_bitsliced_m_mat(M_MAX / 32, Vdec, bitsliced_L, bitsliced_M,
                                  K_MAX, N_MAX - O_MAX, O_MAX);

    P1_times_Vt(p, bitsliced_P1, Vdec, bitsliced_Pv);
    mul_add_mat_x_bitsliced_m_mat(M_MAX / 32, Vdec, bitsliced_Pv,
                                  bitsliced_vPv, K_MAX, N_MAX - O_MAX,
                                  K_MAX);
    BENCH_CODE_2("Tab.3/Col 2: V*P1*V^t & V*L", csv);

    BENCH_CODE_1(runs);
    alignas (32) uint32_t bitsliced_SPS[K_MAX * K_MAX * M_MAX / 8] = {0};
    bitsliced_m_calculate_PS_SPS(bitsliced_P1, bitsliced_P2, bitsliced_P3, s, M_MAX,
                                 V_MAX, O_MAX, K_MAX, bitsliced_SPS);
    BENCH_CODE_2("Tab.3/Col 3: S*P*S^t", csv);

    if (csv) {
        printf("\n");
    }

    free(cycles_list);
    return rc;
}

// Returns a timestamp for measuring elapsed time between two calls.
//
// - x86 / amd64: the 64-bit value read with the rdtsc instruction.
// - s390x: the value stored by stckf, scaled by 1000 / 4096.
// - all other targets: clock_gettime(CLOCK_REALTIME) in nanoseconds.
//
// Only differences between two return values are meaningful.
static inline int64_t cpucycles(void) {
#if (defined(TARGET_AMD64) || defined(TARGET_X86))
    unsigned int hi, lo;

    asm volatile ("rdtsc" : "=a" (lo), "=d"(hi));
    // Combine the halves as unsigned values so the shift can never overflow a
    // signed type.
    return (int64_t) (((uint64_t) hi << 32) | (uint64_t) lo);
#elif (defined(TARGET_S390X))
    uint64_t tod;
    asm volatile("stckf %0\n" : "=Q" (tod) : : "cc");
    return (tod * 1000 / 4096);
#else
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
    // Integer arithmetic on purpose: seconds since the epoch times 1e9 is
    // beyond the 53-bit precision of a double, which would round the result
    // to a coarse grid of a few hundred nanoseconds.
    return (int64_t) time.tv_sec * INT64_C(1000000000) + (int64_t) time.tv_nsec;
#endif
}

0 comments on commit 7354542

Please sign in to comment.