 // Source listing: FK20-CUDAdocs/fk20benchmark_8cu_source.html

 // bls12_381: Arithmetic for BLS12-381

 // Copyright 2022-2023 Dag Arne Osvik

 // Copyright 2022-2023 Luan Cardoso dos Santos


 #include <bits/getopt_core.h>

 #include <cstring>

 #include <stdio.h>

 #include <stdlib.h>

 #include <time.h>

 #include <unistd.h>


 #include "fr.cuh"

 #include "fk20.cuh"

 #include "g1.cuh"

 #include "test.h"


 // Known-good values generated by the Python implementation


 // Test-vector arrays in managed memory; they are only declared here and are
 // defined in another translation unit.
 // setupMemory() copies polynomial, xext_fft, hext_fft and h_fft into the b_* workspace.
 extern __managed__ fr_t polynomial[4096];

 extern __managed__ g1p_t setup[4097];

 extern __managed__ g1p_t xext_fft[16][512];

 extern __managed__ fr_t toeplitz_coefficients[16][512];

 extern __managed__ fr_t toeplitz_coefficients_fft[16][512];

 extern __managed__ g1p_t hext_fft[512];

 extern __managed__ g1p_t h[512];

 extern __managed__ g1p_t h_fft[512];


 // Number of timed repetitions per benchmark; bounds the BENCH_BEFORE loop and
 // sizes the milliseconds[] arrays. main() overwrites this initial value with 7
 // and then with the -s argument when one is given.
 static int NSAMPLES = 5;


 // Debug printing on stderr with local information;

 // DPRINTF(fmt, ...): printf-style debug message written to stderr, prefixed with
 // "[debug] file:line" and terminated with a newline.
 // Expands to nothing unless DEBUG is defined at compile time.
 #ifdef DEBUG

     #define DPRINTF(fmt, ...) fprintf(stderr, "[debug] %s:%d " fmt "\n", __FILE__, __LINE__,  ##__VA_ARGS__)

 #else

     #define DPRINTF(fmt, ...)

 #endif


 /******************************************************************************/

 /**************************** Workspace variables *****************************/

 /******************************************************************************/


 // Benchmark workspace pointers. The trailing min/max notes give the element
 // counts for 1 row and for 512 rows. Only b_polynomial, b_xext_fft, b_hext_fft
 // and b_h_fft are allocated and filled by setupMemory(); the allocations for
 // b_toeplitz_coefficients, b_toeplitz_coefficients_fft and b_h are commented
 // out there, so those three stay NULL.
 fr_t  *b_polynomial = NULL; // min [4096]; max [512*4096]

 g1p_t *b_xext_fft = NULL; // min [16][512]; max [16][512] (size does not depend on rows)

 fr_t  *b_toeplitz_coefficients = NULL; // min [16][512]; max [512*16][512]

 fr_t  *b_toeplitz_coefficients_fft = NULL; // min [16][512]; max [512*16][512]

 g1p_t *b_hext_fft = NULL; // min [512]; max [512*512]

 g1p_t *b_h = NULL; // min [512]; max [512*512]

 g1p_t *b_h_fft = NULL; // min [512]; max [512*512]


 // Result pointers: scratch buffers the kernels write into.
 // setupMemory() allocates rows*16*512 fr_t and rows*512 g1p_t elements respectively.

 fr_t  *b_fr_tmp;

 g1p_t *b_g1p_tmp;

 // NOTE(review): preBenchTest() passes rows*512 as the element count together with
 // this 16*512-entry array; that looks too small when rows > 16 — TODO confirm
 // against g1p_eq_wrapper and the CMPCHECK/clearRes macros in test.h.
 __managed__ uint8_t cmp[16*512]; // Comparison array written by GPU


 /******************************************************************************/

 /*********************************** Macros ***********************************/

 /******************************************************************************/


 // The necessary shared memory is larger than what we can allocate statically, hence it is

 // allocated dynamically in the kernel call. We set the maximum allowed size using this macro.

 // SET_SHAREDMEM(SZ, FN): raise the maximum dynamic shared memory size of kernel FN
 // to SZ bytes, then report (but do not abort on) a failure of that request.
 //
 // Requires a `cudaError_t err` variable in the calling scope; it receives the
 // result of cudaFuncSetAttribute.
 //
 // The expansion is a single braced compound statement, so the macro stays in one
 // piece when used under an if/for, and it works both with and without a trailing
 // semicolon at the call site (both spellings occur in this file).
 #define SET_SHAREDMEM(SZ, FN)                                                                                          \
     {                                                                                                                  \
         err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ);                               \
         cudaDeviceSynchronize();                                                                                       \
         if (err != cudaSuccess) {                                                                                      \
             printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n",                                               \
                    __FILE__, __LINE__, err, cudaGetErrorName(err));                                                    \
         }                                                                                                              \
     }


 // COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE): write NCOPIES back-to-back copies of
 // the SIZE-element array SRC (elements of type TYPE) into DEST.
 //
 // DEST must be a TYPE pointer with room for NCOPIES*SIZE elements.
 // Every argument is parenthesized so that expressions such as `a+b` can be passed
 // as SIZE or NCOPIES, and the loop is wrapped in do/while(0) so the macro is a
 // single statement; call it with a trailing semicolon.
 #define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE)                                                     \
     do {                                                                                             \
         for (int counter = 0; counter < (NCOPIES); counter++)                                        \
             memcpy((DEST) + (size_t)counter * (SIZE), (SRC), (size_t)(SIZE) * sizeof(TYPE));         \
     } while (0)


 // Synchronizes the device, making sure that the kernel has finished executing.

 // Checks for any errors, and reports if errors are found.

 // CUDASYNC(fmt, ...): wait for the device to finish all outstanding work and
 // report any error returned by that synchronization.
 //
 // Requires a `cudaError_t err` variable in the calling scope; it receives the
 // result of cudaDeviceSynchronize().
 // fmt/... form a printf-style label printed between the file:line prefix and the
 // error code. The variadic arguments are passed right after __LINE__ because that
 // is where fmt sits in the combined format string.
 // Wrapped in do/while(0) so it is a single statement; call it with a trailing semicolon.
 #define CUDASYNC(fmt, ...)                                                                                             \
     do {                                                                                                               \
         err = cudaDeviceSynchronize();                                                                                 \
         if (err != cudaSuccess)                                                                                        \
             printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, ##__VA_ARGS__, err, cudaGetErrorName(err));   \
     } while (0)


 // BENCH_BEFORE opens the timing loop: it starts a for loop over NSAMPLES samples
 // (loop variable `i`) and records the `start` event. The loop body is left OPEN on
 // purpose; it must be closed by a matching BENCH_AFTER in the same scope.
 // Requires a `cudaEvent_t start` in scope. Write a semicolon after the macro.
 #define BENCH_BEFORE \

 for(int i=0; i<NSAMPLES; i++){\

     cudaEventRecord(start)


 // COL(N): string literal for the terminal escape sequence ESC [ N G, used by
 // BENCH_AFTER to start the timing figures at a fixed column (presumably the
 // "cursor to column N" control sequence — confirm for the target terminal).
 #define COL(N) "\x1B["#N"G"


 // BENCH_AFTER(FNAME) closes the loop opened by BENCH_BEFORE and prints the result.
 // Inside the loop it records `stop`, waits for it, reports a synchronization error,
 // and stores the elapsed start->stop time in milliseconds[i]. After the loop it
 // sorts the samples with compare() and prints FNAME followed by the element at
 // index NSAMPLES/2 (the median for odd NSAMPLES) and the smallest and largest sample.
 // Requires in scope: err, start, stop, milliseconds[NSAMPLES], median.
 // FNAME must be a string literal (it is concatenated with COL(25)).
 // NSAMPLES must be at least 1, since milliseconds[NSAMPLES-1] is read.
 #define BENCH_AFTER(FNAME)\

     cudaEventRecord(stop); \

         err = cudaEventSynchronize(stop);\

         if (err != cudaSuccess) printf("%s:%d  Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));\

         cudaEventElapsedTime(&milliseconds[i], start, stop);\

     }\

     qsort(milliseconds, NSAMPLES, sizeof(milliseconds[0]), compare);\

     median = milliseconds[NSAMPLES/2];\

     printf(FNAME COL(25) " %8.3f ms [%8.3f - %8.3f]\n", median, milliseconds[0], milliseconds[NSAMPLES-1]);


 /******************************************************************************/

 /********************************* Prototypes *********************************/

 /******************************************************************************/


 // Workspace management: allocate/fill the b_* buffers for `rows` rows, and release them.
 void setupMemory(unsigned rows);

 void freeMemory();


 // Correctness pre-check followed by the three benchmark groups.
 // Note: preBenchTest and benchFull take `int rows`, the others `unsigned rows`.
 bool preBenchTest(int rows);

 void benchFull(int rows);

 void benchSteps(unsigned rows);

 void benchModules(unsigned rows);


 // qsort() comparator for float samples, and the informational banner.
 int compare(const void *  a, const void *  b);

 void printHeader(unsigned rows);


 /**
  * Entry point of the FK20 benchmark.
  *
  * Parses the command line, prints the hardware banner, allocates the workspace,
  * runs the correctness pre-check (a failure only produces a warning) and then
  * the full, per-step and per-module benchmarks.
  *
  * Options:
  *   -r #   number of rows, clamped to the range [1, 512]
  *   -s #   number of timing samples per benchmark, at least 1
  *   -h     print usage and exit
  *
  * @return 0 on success or after -h, 1 on an unknown option or missing argument.
  */
 int main(int argc, char **argv) {
     const unsigned defaultRows = 32;
     const int defaultSamples = 7;

     unsigned rows = defaultRows;
     NSAMPLES = defaultSamples;
     int opt;

     while ((opt = getopt(argc, argv, "r:s:h")) != -1) {
         switch (opt) {
             case 'r':
                 rows = abs(atoi(optarg));
                 // Upper bound: the workspace is documented for at most 512 rows.
                 if (rows > 512) rows = 512;
                 // Lower bound: rows is used directly as the grid size of every
                 // kernel launch, and a grid of zero blocks is not a valid launch.
                 if (rows < 1) rows = 1;
                 break;
             case 's':
                 NSAMPLES = abs(atoi(optarg));
                 // BENCH_AFTER reads milliseconds[NSAMPLES-1]; zero samples would
                 // index before the start of the array.
                 if (NSAMPLES < 1) NSAMPLES = 1;
                 break;
             case 'h':
                 // Print the real defaults, not values changed by earlier options.
                 printf("Usage: %s [-r rows] [-s NSAMPLES] [-h]\n", argv[0]);
                 printf("Options:\n");
                 printf("  -r #     Set the number of rows (default: %u)\n", defaultRows);
                 printf("  -s #     Set the number of samples (default: %d)\n", defaultSamples);
                 printf("  -h       Display this help information\n");
                 return 0;
             case '?':
                 if (optopt == 'r' || optopt == 's')
                     fprintf(stderr, "Option -%c requires an argument.\n", optopt);
                 else
                     fprintf(stderr, "Unknown option `-%c'.\n", optopt);
                 // fall through: a bad option ends the program with status 1
             default:
                 return 1;
         }
     }

     printHeader(rows);
     setupMemory(rows);

     bool pass = preBenchTest(rows);
     if (!pass) {
         // It might be interesting sometimes to have the benchmark run even if the
         // results are incorrect, hence why just a warning instead of halting execution.
         printf("WARNING: An error was detected during the pre-benchmark test! Continuing... \n");
     }

     benchFull(rows);
     benchSteps(rows);
     benchModules(rows);
     freeMemory();
     return 0;
 }


 // Runs the FK20 kernel sequence once with one thread block per row (16 per row
 // for the fr FFT) and compares the result in b_g1p_tmp against the expected
 // values in b_h_fft.
 //
 // rows: number of rows to process; used as the grid size of the launches.
 // Returns the value of `pass`, which starts as true.
 // NOTE(review): `pass` is never assigned in the visible code, so a failure must be
 // signalled by the CMPCHECK macro from test.h — TODO confirm.
 bool preBenchTest(int rows){

     cudaError_t err;

     bool pass = true;


     // Setup: opt in to the dynamic shared memory sizes used by the FFT kernels below.


     SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);

     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);

     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);


     // DPRINTF writes to stderr (and only when DEBUG is defined); the flush is on stdout.
     DPRINTF("Pre-bench test %d rows ", rows); fflush(stdout);


         // The six launches below are issued back to back without intermediate
         // synchronization; each one reads the buffer written by the previous one.
         // The fr FFT and the two g1p transforms operate in place.
         fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);

         fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);

         fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp,  (g1p_t *)xext_fft);

         g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);

         fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);

         g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);


     // clearRes comes from test.h; presumably it resets the cmp array — TODO confirm.
     clearRes;

     // NOTE(review): cmp is declared with 16*512 entries while rows*512 is passed as
     // the element count. If g1p_eq_wrapper writes one byte per compared element,
     // this overruns cmp whenever rows > 16 — TODO confirm and resize cmp if so.
     g1p_eq_wrapper<<<16, 32>>>(cmp, rows*512, b_g1p_tmp, b_h_fft);

     // Wait for all of the above to finish and report any execution error.
     CUDASYNC("g1p_eq_wrapper");

     CMPCHECK(rows*512);

     #ifdef DEBUG

     PRINTPASS(pass);

     #endif

     return pass;

 }


 /**
  * Benchmarks the complete FK20 kernel sequence.
  *
  * All six kernels are launched back to back inside a single timed region, with
  * no host-side synchronization between them. The sequence is repeated NSAMPLES
  * times and the median, minimum and maximum times are printed by BENCH_AFTER.
  *
  * @param rows number of rows; used as the grid size of the launches.
  */
 void benchFull(int rows){
     // BENCH_AFTER reads milliseconds[NSAMPLES-1]; without at least one sample
     // there is nothing to measure and that read would be out of bounds.
     if (NSAMPLES < 1) return;

     cudaError_t err;
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     float milliseconds[NSAMPLES];
     float median;

     // Setup: opt in to the dynamic shared memory sizes used by the FFT kernels.
     SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);

     printf("\n=== Test without stalling on Device\n");fflush(stdout);

     BENCH_BEFORE;
         fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);
         fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);
         fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp,  (g1p_t *)xext_fft);
         g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
         fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);
         g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
     BENCH_AFTER("FK20");

     // Release the timing events; they were previously leaked on every call.
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
 }


 /**
  * Benchmarks each FK20 kernel on its own.
  *
  * Every kernel gets its own timed loop of NSAMPLES launches. The kernels run on
  * whatever the previous loop left in b_fr_tmp / b_g1p_tmp (the transforms work
  * in place), so only the timings are meaningful, not the buffer contents.
  *
  * @param rows number of rows; used as the grid size of the launches.
  */
 void benchSteps(unsigned rows){
     // BENCH_AFTER reads milliseconds[NSAMPLES-1]; without at least one sample
     // there is nothing to measure and that read would be out of bounds.
     if (NSAMPLES < 1) return;

     cudaError_t err;
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     float milliseconds[NSAMPLES];
     float median;

     printf("\n=== Testing FK20 individual steps\n");

     // Opt in to the dynamic shared memory sizes of every FFT kernel launched here.
     // fr_fft_wrapper is included so this function does not depend on
     // preBenchTest()/benchFull() having been called first.
     SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);

     BENCH_BEFORE;
         fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);
     BENCH_AFTER("polynomial -> tc");

     BENCH_BEFORE;
         fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);
     BENCH_AFTER("tc -> tc_fft");

     BENCH_BEFORE;
         fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp,  (g1p_t *)xext_fft);
     BENCH_AFTER("tc_fft -> hext_fft (msm)");

     BENCH_BEFORE;
         g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
     BENCH_AFTER("hext_fft -> hext");

     BENCH_BEFORE;
         fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);
     BENCH_AFTER("hext -> h");

     BENCH_BEFORE;
         g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
     BENCH_AFTER("h -> h_fft");

     // Release the timing events; they were previously leaked on every call.
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
 }


 /**
  * Benchmarks the composite FK20 entry points: the fk20_poly2hext_fft kernel and
  * the host-side fk20_poly2h_fft wrapper.
  *
  * Each one is timed over NSAMPLES repetitions and reported by BENCH_AFTER.
  *
  * @param rows number of rows; used as the grid size / row count of the calls.
  */
 void benchModules(unsigned rows){
     // BENCH_AFTER reads milliseconds[NSAMPLES-1]; without at least one sample
     // there is nothing to measure and that read would be out of bounds.
     if (NSAMPLES < 1) return;

     cudaError_t err;
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     float milliseconds[NSAMPLES];
     float median;

     printf("\n=== Testing FK20 components\n"); // The components you see in fk20test.cu

     SET_SHAREDMEM(g1p_sharedmem, fk20_hext_fft2h_fft);
     // fk20_poly2hext_fft is launched below with fr_sharedmem bytes of dynamic
     // shared memory, so request that size for it as well. This is redundant if
     // fr_sharedmem is within the default per-block limit.
     SET_SHAREDMEM(fr_sharedmem, fk20_poly2hext_fft);

     // Not used right now, may be useful for future optimizations
     //  BENCH_BEFORE;
     //  fk20_hext_fft2h_fft<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_hext_fft);
     //  BENCH_AFTER("fk20_hext_fft2h_fft");

     BENCH_BEFORE;
     fk20_poly2hext_fft<<<rows, 256, fr_sharedmem>>>(b_g1p_tmp, b_polynomial, (const g1p_t *)b_xext_fft);
     BENCH_AFTER("fk20_poly2hext_fft");

     BENCH_BEFORE;
     fk20_poly2h_fft(b_g1p_tmp, b_polynomial, (const g1p_t *)xext_fft, rows);
     BENCH_AFTER("fk20_poly2h_fft");

     // Release the timing events; they were previously leaked on every call.
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
 }


 /**
  * Allocates the managed-memory workspace for `rows` rows and fills the input
  * and reference buffers with repeated copies of the test vector.
  *
  * b_polynomial, b_hext_fft and b_h_fft receive `rows` copies of the matching
  * test-vector array; b_xext_fft receives a single copy. b_g1p_tmp and b_fr_tmp
  * are allocated but left unfilled.
  *
  * If any allocation fails the error is reported on stderr and the program
  * exits, because the copies below would otherwise write through an invalid
  * pointer.
  *
  * NOTE(review): an earlier comment here mentioned an error with more than
  * 193 rows — TODO confirm whether that limit still applies.
  *
  * @param rows number of rows to provision.
  */
 void setupMemory(unsigned rows){
     cudaError_t err;

     // Checks `err` from the preceding allocation. fmt/... label the failing buffer;
     // the variadic arguments follow __LINE__ because that is where fmt sits in the
     // combined format string.
     #define MALLOCSYNC(fmt, ...)                                                                                       \
         do {                                                                                                           \
             if (err != cudaSuccess) {                                                                                  \
                 fprintf(stderr, "%s:%d " fmt " Error: %d (%s)\n",                                                      \
                         __FILE__, __LINE__, ##__VA_ARGS__, err, cudaGetErrorName(err));                                \
                 exit(EXIT_FAILURE);                                                                                    \
             }                                                                                                          \
         } while (0)

     // Sizes are computed in size_t from the start to avoid narrow intermediate products.
     err = cudaMallocManaged(&b_polynomial, (size_t)rows*4096*sizeof(fr_t));
           MALLOCSYNC("b_polynomial");
     err = cudaMallocManaged(&b_xext_fft, (size_t)16*512*sizeof(g1p_t)); // size not dependent on number of rows
           MALLOCSYNC("b_xext_fft");
     // err = cudaMallocManaged(&b_toeplitz_coefficients, rows*16*512*sizeof(fr_t));
     //       MALLOCSYNC("id");
     // err = cudaMallocManaged(&b_toeplitz_coefficients_fft, rows*16*512*sizeof(fr_t));
     //       MALLOCSYNC("id");
     err = cudaMallocManaged(&b_hext_fft, (size_t)rows*512*sizeof(g1p_t));
           MALLOCSYNC("b_hext_fft");
     // err = cudaMallocManaged(&b_h, rows*512*sizeof(g1p_t));
     //       MALLOCSYNC("id");
     err = cudaMallocManaged(&b_h_fft, (size_t)rows*512*sizeof(g1p_t));
           MALLOCSYNC("b_h_fft");
     err = cudaMallocManaged(&b_g1p_tmp, (size_t)rows*512*sizeof(g1p_t));
           MALLOCSYNC("b_g1p_tmp");
     err = cudaMallocManaged(&b_fr_tmp, (size_t)rows*16*512*sizeof(fr_t));
           MALLOCSYNC("b_fr_tmp");

     // Copy data
     COPYMANY(b_polynomial, polynomial, 4096, rows, fr_t);
     COPYMANY(b_xext_fft, xext_fft, 16*512, 1, g1p_t);
     // COPYMANY(b_toeplitz_coefficients, toeplitz_coefficients, 16*512, rows, fr_t);
     // COPYMANY(b_toeplitz_coefficients_fft, toeplitz_coefficients_fft, 16*512, rows, fr_t);
     COPYMANY(b_hext_fft, hext_fft, 512, rows, g1p_t);
     // COPYMANY(b_h, h, 512, rows, g1p_t);
     COPYMANY(b_h_fft, h_fft, 512, rows, g1p_t);

     DPRINTF("Memory setup done");
 }


 /**
  * Releases every buffer allocated by setupMemory(), including the two scratch
  * buffers b_fr_tmp and b_g1p_tmp, and resets the pointers to NULL so a second
  * call is harmless.
  *
  * Pointers that were never allocated are still NULL (they are globals), and
  * cudaFree accepts a NULL pointer, so no per-pointer check is needed.
  */
 void freeMemory(){
     cudaFree(b_polynomial);                b_polynomial = NULL;
     cudaFree(b_xext_fft);                  b_xext_fft = NULL;
     cudaFree(b_toeplitz_coefficients);     b_toeplitz_coefficients = NULL;
     cudaFree(b_toeplitz_coefficients_fft); b_toeplitz_coefficients_fft = NULL;
     cudaFree(b_hext_fft);                  b_hext_fft = NULL;
     cudaFree(b_h);                         b_h = NULL;
     cudaFree(b_h_fft);                     b_h_fft = NULL;
     // Scratch buffers: allocated in setupMemory() but previously never released.
     cudaFree(b_fr_tmp);                    b_fr_tmp = NULL;
     cudaFree(b_g1p_tmp);                   b_g1p_tmp = NULL;
     DPRINTF("Allocated memory freed");
 }


 /**
  * Prints the benchmark banner to stdout: the benchmark parameters followed by
  * a summary of the properties of every CUDA device found.
  *
  * If the device count cannot be queried the error is reported and no device
  * summary is printed; a device whose properties cannot be read is skipped.
  *
  * @param rows number of rows (thread blocks) the benchmark will use.
  */
 void printHeader(unsigned rows){
     const size_t kb = (size_t)1 << 10, mb = (size_t)1 << 20;

     printf("===  FK20 Benchmark: %u thread blocks\n", rows);
     printf("     Reporting median of %d executions as median [lowest | highest] \n", NSAMPLES);

     // Initialized so that a failed query cannot leave a garbage loop bound.
     int devCount = 0;
     cudaError_t err = cudaGetDeviceCount(&devCount);
     if (err != cudaSuccess) {
         printf("%s:%d  Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
         return;
     }

     for(int i=0; i<devCount; i++){
         cudaDeviceProp props;
         err = cudaGetDeviceProperties(&props, i);
         if (err != cudaSuccess) {
             printf("%s:%d  Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
             continue;
         }

         printf("     GPU %d: %s: compute capability %d.%d\n", i, props.name, props.major, props.minor);
         // The memory sizes are size_t, hence %zu.
         printf("     Global memory:   %zuMB\n", props.totalGlobalMem / mb);
         printf("     Shared memory:   %zuKB\n", props.sharedMemPerBlock / kb);
         printf("     Constant memory: %zuKB\n", props.totalConstMem / kb);
         printf("     Registers per block : %d\n", props.regsPerBlock);
         printf("     Multiprocessor count : %d\n\n", props.multiProcessorCount);

         printf("     Warp size:         %d\n", props.warpSize);
         printf("     Threads per block: %d\n", props.maxThreadsPerBlock);
         printf("     Max block dimensions: [ %d, %d, %d ]\n", props.maxThreadsDim[0], props.maxThreadsDim[1], props.maxThreadsDim[2]);
         printf("     Max grid dimensions:  [ %d, %d, %d ]\n", props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
         printf("\n");
     }
 }


 /**
  * Three-way comparison of two floats for qsort().
  *
  * @param a pointer to the first float
  * @param b pointer to the second float
  * @return -1 if *a < *b, 1 if *a > *b, 0 otherwise
  */
 int compare(const void *  a, const void *  b){
     const float lhs = *static_cast<const float *>(a);
     const float rhs = *static_cast<const float *>(b);

     if (lhs < rhs) {
         return -1;
     }
     if (lhs > rhs) {
         return 1;
     }
     return 0;
 }


 // vim: ts=4 et sw=4 si

fk20.cuh

g1p_sharedmem
const size_t g1p_sharedmem
Definition: fk20.cuh:14

fk20_poly2h_fft
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft This function is a wrapper for the full FK20 computation,...
Definition: fk20_poly2h_fft.cu:47

fk20_hext_fft2h_fft
__global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft)
hext_fft -> h_fft
Definition: fk20_hext_fft2h_fft.cu:21

fr_sharedmem
const size_t fr_sharedmem
Definition: fk20.cuh:15

xext_fft
__managed__ g1p_t xext_fft[16][512]
Definition: fk20_testvector.cu:24603

toeplitz_coefficients_fft
__managed__ fr_t toeplitz_coefficients_fft[16][512]
Definition: fk20_testvector.cu:73825

b_toeplitz_coefficients_fft
fr_t * b_toeplitz_coefficients_fft
Definition: fk20benchmark.cu:46

MALLOCSYNC
#define MALLOCSYNC(fmt,...)

toeplitz_coefficients
__managed__ fr_t toeplitz_coefficients[16][512]
Definition: fk20_testvector.cu:65598

main
int main(int argc, char **argv)
Definition: fk20benchmark.cu:131

SET_SHAREDMEM
#define SET_SHAREDMEM(SZ, FN)
Definition: fk20benchmark.cu:62

COPYMANY
#define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE)
Write NCOPIES copies of SRC[SIZE] into DEST.
Definition: fk20benchmark.cu:72

setup
__managed__ g1p_t setup[4097]
Definition: fk20_testvector.cu:4115

freeMemory
void freeMemory()
frees the pointers allocated by setupMemory
Definition: fk20benchmark.cu:378

b_hext_fft
g1p_t * b_hext_fft
Definition: fk20benchmark.cu:47

b_polynomial
fr_t * b_polynomial
Definition: fk20benchmark.cu:43

polynomial
__managed__ fr_t polynomial[4096]
Definition: fk20_testvector.cu:16

h
__managed__ g1p_t h[512]
Definition: fk20_testvector.cu:84615

b_h_fft
g1p_t * b_h_fft
Definition: fk20benchmark.cu:49

h_fft
__managed__ g1p_t h_fft[512]
Definition: fk20_testvector.cu:87178

b_fr_tmp
fr_t * b_fr_tmp
Definition: fk20benchmark.cu:52

preBenchTest
bool preBenchTest(int rows)
Executes a test of FK20 with one block for each row. At the end, compare if the calculated h_fft is t...
Definition: fk20benchmark.cu:188

b_xext_fft
g1p_t * b_xext_fft
Definition: fk20benchmark.cu:44

hext_fft
__managed__ g1p_t hext_fft[512]
Definition: fk20_testvector.cu:82052

benchSteps
void benchSteps(unsigned rows)
Benchmark the components functions separately and report.
Definition: fk20benchmark.cu:255

setupMemory
void setupMemory(unsigned rows)
Initialize the memory for the tests, by filling the memory with copies of the KAT Commented out varia...
Definition: fk20benchmark.cu:333

CUDASYNC
#define CUDASYNC(fmt,...)
Definition: fk20benchmark.cu:77

cmp
__managed__ uint8_t cmp[16 *512]
Definition: fk20benchmark.cu:54

b_toeplitz_coefficients
fr_t * b_toeplitz_coefficients
Definition: fk20benchmark.cu:45

printHeader
void printHeader(unsigned rows)
Prints to STDOUT an informative banner with the current hardware and benchmark parameters.
Definition: fk20benchmark.cu:396

BENCH_AFTER
#define BENCH_AFTER(FNAME)
Definition: fk20benchmark.cu:105

compare
int compare(const void *a, const void *b)
Comparator needed by qsort() from stdlib Simple and quick comparison of two floats.
Definition: fk20benchmark.cu:434

benchFull
void benchFull(int rows)
Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we...
Definition: fk20benchmark.cu:223

benchModules
void benchModules(unsigned rows)
Benchmark the extra components not currently used in FK20.
Definition: fk20benchmark.cu:299

BENCH_BEFORE
#define BENCH_BEFORE
Definition: fk20benchmark.cu:99

b_g1p_tmp
g1p_t * b_g1p_tmp
Definition: fk20benchmark.cu:53

b_h
g1p_t * b_h
Definition: fk20benchmark.cu:48

DPRINTF
#define DPRINTF(fmt,...)
Definition: fk20benchmark.cu:35

fr.cuh

fr_t
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24

fr_fft_wrapper
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
Definition: fr_fft.cu:316

g1.cuh

g1p_fft_wrapper
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:336

g1p_ift_wrapper
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:349

g1p_t
G1 point in projective coordinates.
Definition: g1.cuh:27

test.h

clearRes
#define clearRes
Definition: test.h:87

PRINTPASS
#define PRINTPASS(pass)
Definition: test.h:25

CMPCHECK
#define CMPCHECK(LENGTH)
Definition: test.h:106