FK20 CUDA
fk20benchmark.cu
Go to the documentation of this file.
1 
2 // bls12_381: Arithmetic for BLS12-381
3 // Copyright 2022-2023 Dag Arne Osvik
4 // Copyright 2022-2023 Luan Cardoso dos Santos
5 
6 #include <bits/getopt_core.h>
7 #include <cstring>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <time.h>
11 #include <unistd.h>
12 
13 #include "fr.cuh"
14 #include "fk20.cuh"
15 #include "g1.cuh"
16 #include "test.h"
17 
18 // Known-good values generated by the Python implementation
19 
20 extern __managed__ fr_t polynomial[4096];
21 extern __managed__ g1p_t setup[4097];
22 extern __managed__ g1p_t xext_fft[16][512];
23 extern __managed__ fr_t toeplitz_coefficients[16][512];
24 extern __managed__ fr_t toeplitz_coefficients_fft[16][512];
25 extern __managed__ g1p_t hext_fft[512];
26 extern __managed__ g1p_t h[512];
27 extern __managed__ g1p_t h_fft[512];
28 
29 static int NSAMPLES = 5;
30 
31 // Debug printing on stderr with local information;
32 #ifdef DEBUG
33  #define DPRINTF(fmt, ...) fprintf(stderr, "[debug] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__)
34 #else
35  #define DPRINTF(fmt, ...)
36 #endif
37 
38 /******************************************************************************/
39 /**************************** Workspace variables *****************************/
40 /******************************************************************************/
41 
42 
43 fr_t *b_polynomial = NULL; //min[4096]; max[512*4096]
44 g1p_t *b_xext_fft = NULL; //min[16][512]; max[16][512];
45 fr_t *b_toeplitz_coefficients = NULL; //min[16][512]; max [512*16][512];
46 fr_t *b_toeplitz_coefficients_fft = NULL; //min[16][512]; max [512*16][512];
47 g1p_t *b_hext_fft = NULL; //min[512]; max [512*512];
48 g1p_t *b_h = NULL; //min[512]; max [512*512];
49 g1p_t *b_h_fft = NULL; //min[512]; max [512*512];
50 
51 // Result pointers
54 __managed__ uint8_t cmp[16*512]; // Comparison array written by GPU
55 
56 /******************************************************************************/
57 /*********************************** Macros ***********************************/
58 /******************************************************************************/
59 
60 // The necessary shared memory is larger than what we can allocate statically, hence it is
61 // allocated dynamically in the kernel call. We set the maximum allowed size using this macro.
62 #define SET_SHAREDMEM(SZ, FN) \
63  err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
64  cudaDeviceSynchronize(); \
65  if (err != cudaSuccess) \
66  printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
67 
72 #define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE) \
73  for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE));
74 
75 // Synchronizes the device, making sure that the kernel has finished executing.
76 // Checks for any errors, and reports if errors are found.
77 #define CUDASYNC(fmt, ...) \
78  err = cudaDeviceSynchronize(); \
79  if (err != cudaSuccess) \
80  printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
81 
99 #define BENCH_BEFORE \
100 for(int i=0; i<NSAMPLES; i++){\
101  cudaEventRecord(start)
102 
103 #define COL(N) "\x1B["#N"G"
104 
105 #define BENCH_AFTER(FNAME)\
106  cudaEventRecord(stop); \
107  err = cudaEventSynchronize(stop);\
108  if (err != cudaSuccess) printf("%s:%d Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));\
109  cudaEventElapsedTime(&milliseconds[i], start, stop);\
110  }\
111  qsort(milliseconds, NSAMPLES, sizeof(milliseconds[0]), compare);\
112  median = milliseconds[NSAMPLES/2];\
113  printf(FNAME COL(25) " %8.3f ms [%8.3f - %8.3f]\n", median, milliseconds[0], milliseconds[NSAMPLES-1]);
114 
115 
116 /******************************************************************************/
117 /********************************* Prototypes *********************************/
118 /******************************************************************************/
119 
120 void setupMemory(unsigned rows);
121 void freeMemory();
122 
123 bool preBenchTest(int rows);
124 void benchFull(int rows);
125 void benchSteps(unsigned rows);
126 void benchModules(unsigned rows);
127 
128 int compare(const void * a, const void * b);
129 void printHeader(unsigned rows);
130 
131 int main(int argc, char **argv) {
132  unsigned rows = 32;
133  NSAMPLES = 7;
134  int opt;
135 
136  while((opt = getopt(argc, argv, "r:s:h")) != -1){
137  switch (opt) {
138  case 'r':
139  rows = abs(atoi(optarg));
140  rows = rows>512?512:rows;
141  break;
142  case 's':
143  NSAMPLES = abs(atoi(optarg));
144  break;
145  case 'h':
146  printf("Usage: %s [-r rows] [-s NSAMPLES] [-h]\n", argv[0]);
147  printf("Options:\n");
148  printf(" -r # Set the number of rows (default: %d)\n", rows);
149  printf(" -s # Set the number of samples (default: %d)\n", NSAMPLES);
150  printf(" -h Display this help information\n");
151  return 0;
152  case '?':
153  if (optopt == 'r' || optopt == 's')
154  fprintf(stderr, "Option -%c requires an argument.\n", optopt);
155  else
156  fprintf(stderr, "Unknown option `-%c'.\n", optopt);
157  default:
158  return 1;
159  }
160  }
161 
162  printHeader(rows);
163  setupMemory(rows);
164 
165  bool pass = preBenchTest(rows);
166  if (!pass) {
167  // It might be interesting sometimes to have the benchmark run even if the
168  // results are incorrect, hence why just a warning instead of halting execution.
169  printf("WARNING: An error was detected during the pre-benchmark test! Continuing... \n");
170  }
171 
172  benchFull(rows);
173  benchSteps(rows);
174  benchModules(rows);
175  freeMemory();
176  return 0;
177 }
178 
188 bool preBenchTest(int rows){
189  cudaError_t err;
190  bool pass = true;
191 
192  // Setup
193 
197 
198  DPRINTF("Pre-bench test %d rows ", rows); fflush(stdout);
199 
200  fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);
201  fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);
202  fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp, (g1p_t *)xext_fft);
203  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
204  fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);
205  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
206 
207  clearRes;
208  g1p_eq_wrapper<<<16, 32>>>(cmp, rows*512, b_g1p_tmp, b_h_fft);
209  CUDASYNC("g1p_eq_wrapper");
210  CMPCHECK(rows*512);
211  #ifdef DEBUG
212  PRINTPASS(pass);
213  #endif
214  return pass;
215 }
216 
223 void benchFull(int rows){
224  cudaError_t err;
225  cudaEvent_t start, stop;
226  cudaEventCreate(&start);
227  cudaEventCreate(&stop);
228  float milliseconds[NSAMPLES];
229  float median;
230 
231  // Setup
232 
236 
237  printf("\n=== Test without stalling on Device\n");fflush(stdout);
238 
239  BENCH_BEFORE;
240  fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);
241  fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);
242  fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp, (g1p_t *)xext_fft);
243  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
244  fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);
245  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
246  BENCH_AFTER("FK20");
247 
248 }
249 
255 void benchSteps(unsigned rows){
256  cudaError_t err;
257  cudaEvent_t start, stop;
258  cudaEventCreate(&start);
259  cudaEventCreate(&stop);
260  float milliseconds[NSAMPLES];
261  float median;
262 
263  printf("\n=== Testing FK20 individual steps\n");
264 
267 
268 
269  BENCH_BEFORE;
270  fk20_poly2toeplitz_coefficients<<<rows, 256>>>(b_fr_tmp, b_polynomial);
271  BENCH_AFTER("polynomial -> tc");
272 
273  BENCH_BEFORE;
274  fr_fft_wrapper<<<rows*16, 256, fr_sharedmem>>>(b_fr_tmp, b_fr_tmp);
275  BENCH_AFTER("tc -> tc_fft");
276 
277  BENCH_BEFORE;
278  fk20_msm<<<rows, 256>>>(b_g1p_tmp, b_fr_tmp, (g1p_t *)xext_fft);
279  BENCH_AFTER("tc_fft -> hext_fft (msm)");
280 
281  BENCH_BEFORE;
282  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
283  BENCH_AFTER("hext_fft -> hext");
284 
285  BENCH_BEFORE;
286  fk20_hext2h<<<rows, 256>>>(b_g1p_tmp);
287  BENCH_AFTER("hext -> h");
288 
289  BENCH_BEFORE;
290  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_g1p_tmp);
291  BENCH_AFTER("h -> h_fft");
292 }
293 
299 void benchModules(unsigned rows){
300  cudaError_t err;
301  cudaEvent_t start, stop;
302  cudaEventCreate(&start);
303  cudaEventCreate(&stop);
304  float milliseconds[NSAMPLES];
305  float median;
306 
307  printf("\n=== Testing FK20 components\n"); // The components you see in fk20test.cu
308 
310 
311  // Not used right now, may be useful for future optimizations
312  // BENCH_BEFORE;
313  // fk20_hext_fft2h_fft<<<rows, 256, g1p_sharedmem>>>(b_g1p_tmp, b_hext_fft);
314  // BENCH_AFTER("fk20_hext_fft2h_fft");
315 
316  BENCH_BEFORE;
317  fk20_poly2hext_fft<<<rows, 256, fr_sharedmem>>>(b_g1p_tmp, b_polynomial, (const g1p_t *)b_xext_fft);
318  BENCH_AFTER("fk20_poly2hext_fft");
319 
320  BENCH_BEFORE;
322  BENCH_AFTER("fk20_poly2h_fft");
323 
324 
325 }
326 
333 void setupMemory(unsigned rows){
334  // Allocate memory and copy relevant data from the test vector
335  // check, error on more than 193 rows
336  cudaError_t err;
337  #define MALLOCSYNC(fmt, ...) \
338  if (err != cudaSuccess) \
339  printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
340 
341  err = cudaMallocManaged(&b_polynomial, rows*4096*sizeof(fr_t));
342  MALLOCSYNC("b_polynomial");
343  err = cudaMallocManaged(&b_xext_fft, 16*512*sizeof(g1p_t)); // size not dependant on number of rows
344  MALLOCSYNC("id");
345  // err = cudaMallocManaged(&b_toeplitz_coefficients, rows*16*512*sizeof(fr_t));
346  // MALLOCSYNC("id");
347  // err = cudaMallocManaged(&b_toeplitz_coefficients_fft, rows*16*512*sizeof(fr_t));
348  // MALLOCSYNC("id");
349  err = cudaMallocManaged(&b_hext_fft, rows*512*sizeof(g1p_t));
350  MALLOCSYNC("b_hext_fft");
351  // err = cudaMallocManaged(&b_h, rows*512*sizeof(g1p_t));
352  // MALLOCSYNC("id");
353  err = cudaMallocManaged(&b_h_fft, rows*512*sizeof(g1p_t));
354  MALLOCSYNC("b_h_fft");
355  err = cudaMallocManaged(&b_g1p_tmp, rows*512*sizeof(g1p_t));
356  MALLOCSYNC("b_g1p_tmp");
357  err = cudaMallocManaged(&b_fr_tmp, rows*16*512*sizeof(fr_t));
358  MALLOCSYNC("b_fr_tmp");
359 
360 
361  // Copy data
362  COPYMANY(b_polynomial, polynomial, 4096, rows, fr_t);
363  COPYMANY(b_xext_fft, xext_fft, 16*512, 1, g1p_t);
364  // COPYMANY(b_toeplitz_coefficients, toeplitz_coefficients, 16*512, rows, fr_t);
365  // COPYMANY(b_toeplitz_coefficients_fft, toeplitz_coefficients_fft, 16*512, rows, fr_t);
366  COPYMANY(b_hext_fft, hext_fft, 512, rows, g1p_t);
367  // COPYMANY(b_h, h, 512, rows, g1p_t);
368  COPYMANY(b_h_fft, h_fft, 512, rows, g1p_t);
369 
370 
371  DPRINTF("Memory setup done");
372 }
373 
378 void freeMemory(){
379  // No worries about freeing a NULL pointer, that check is done by cudaFree
380  cudaFree(b_polynomial);
381  cudaFree(b_xext_fft);
382  cudaFree(b_toeplitz_coefficients);
383  cudaFree(b_toeplitz_coefficients_fft);
384  cudaFree(b_hext_fft);
385  cudaFree(b_h);
386  cudaFree(b_h_fft);
387  DPRINTF("Allocated memory freed");
388 }
389 
396 void printHeader(unsigned rows){
397  int kb=1<<10, mb=1<<20;
398 
399  printf("=== FK20 Benchmark: %d thread blocks\n", rows);
400  printf(" Reporting median of %d executions as median [lowest | highest] \n", NSAMPLES);
401 
402  int devCount;
403  cudaGetDeviceCount(&devCount);
404 
405  for(int i=0; i<devCount; i++){
406  cudaDeviceProp props;
407  cudaGetDeviceProperties(&props, i);
408 
409  printf(" GPU %d: %s: compute capability %d.%d\n", i, props.name, props.major, props.minor);
410  printf(" Global memory: %luMB\n", props.totalGlobalMem / mb);
411  printf(" Shared memory: %luKB\n", props.sharedMemPerBlock / kb);
412  printf(" Constant memory: %luKB\n", props.totalConstMem / kb);
413  printf(" Registers per block : %d\n", props.regsPerBlock);
414  printf(" Multiprocessor count : %d\n\n", props.multiProcessorCount);
415 
416  printf(" Warp size: %d\n", props.warpSize);
417  printf(" Threads per block: %d\n", props.maxThreadsPerBlock);
418  printf(" Max block dimensions: [ %d, %d, %d ]\n", props.maxThreadsDim[0], props.maxThreadsDim[1], props.maxThreadsDim[2]);
419  printf(" Max grid dimensions: [ %d, %d, %d ]\n", props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
420  printf("\n");
421  }
422 }
423 
434 int compare(const void * a, const void * b){
435  float fa = *(const float*) a;
436  float fb = *(const float*) b;
437  return (fa > fb) - (fa < fb);
438 }
439 
440 // vim: ts=4 et sw=4 si
const size_t g1p_sharedmem
Definition: fk20.cuh:14
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft This function is a wrapper for the full FK20 computation,...
__global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft)
hext_fft -> h_fft
const size_t fr_sharedmem
Definition: fk20.cuh:15
__managed__ g1p_t xext_fft[16][512]
__managed__ fr_t toeplitz_coefficients_fft[16][512]
fr_t * b_toeplitz_coefficients_fft
#define MALLOCSYNC(fmt,...)
__managed__ fr_t toeplitz_coefficients[16][512]
int main(int argc, char **argv)
#define SET_SHAREDMEM(SZ, FN)
#define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE)
Write NCOPIES copies of SRC[SIZE] into DEST,.
__managed__ g1p_t setup[4097]
void freeMemory()
frees the pointers allocated by setupMemory
g1p_t * b_hext_fft
fr_t * b_polynomial
__managed__ fr_t polynomial[4096]
__managed__ g1p_t h[512]
g1p_t * b_h_fft
__managed__ g1p_t h_fft[512]
fr_t * b_fr_tmp
bool preBenchTest(int rows)
Executes a test of FK20 with one block for each row. At the end, compare if the calculated h_fft is t...
g1p_t * b_xext_fft
__managed__ g1p_t hext_fft[512]
void benchSteps(unsigned rows)
Benchmark the components functions separately and report.
void setupMemory(unsigned rows)
Initialize the memory for the tests, by filling the memory with copies of the KAT Commented out varia...
#define CUDASYNC(fmt,...)
__managed__ uint8_t cmp[16 *512]
fr_t * b_toeplitz_coefficients
void printHeader(unsigned rows)
Prints to STDOUT an informative banner with the current hardware and benchmark parameters.
#define BENCH_AFTER(FNAME)
int compare(const void *a, const void *b)
Comparator needed by qsort() from stdlib Simple and quick comparison of two floats.
void benchFull(int rows)
Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we...
void benchModules(unsigned rows)
Benchmark the for extra components not currently used on FK20.
#define BENCH_BEFORE
g1p_t * b_g1p_tmp
g1p_t * b_h
#define DPRINTF(fmt,...)
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
Definition: fr_fft.cu:316
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:336
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:349
G1 point in projective coordinates.
Definition: g1.cuh:27
#define clearRes
Definition: test.h:87
#define PRINTPASS(pass)
Definition: test.h:25
#define CMPCHECK(LENGTH)
Definition: test.h:106