6 #include <bits/getopt_core.h>
26 extern __managed__
g1p_t h[512];
29 static int NSAMPLES = 5;
33 #define DPRINTF(fmt, ...) fprintf(stderr, "[debug] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__)
35 #define DPRINTF(fmt, ...)
54 __managed__ uint8_t
cmp[16*512];
62 #define SET_SHAREDMEM(SZ, FN) \
63 err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
64 cudaDeviceSynchronize(); \
65 if (err != cudaSuccess) \
66 printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
72 #define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE) \
73 for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE));
77 #define CUDASYNC(fmt, ...) \
78 err = cudaDeviceSynchronize(); \
79 if (err != cudaSuccess) \
80 printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
99 #define BENCH_BEFORE \
100 for(int i=0; i<NSAMPLES; i++){\
101 cudaEventRecord(start)
103 #define COL(N) "\x1B["#N"G"
105 #define BENCH_AFTER(FNAME)\
106 cudaEventRecord(stop); \
107 err = cudaEventSynchronize(stop);\
108 if (err != cudaSuccess) printf("%s:%d Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));\
109 cudaEventElapsedTime(&milliseconds[i], start, stop);\
111 qsort(milliseconds, NSAMPLES, sizeof(milliseconds[0]), compare);\
112 median = milliseconds[NSAMPLES/2];\
113 printf(FNAME COL(25) " %8.3f ms [%8.3f - %8.3f]\n", median, milliseconds[0], milliseconds[NSAMPLES-1]);
128 int compare(
const void * a,
const void * b);
131 int main(
int argc,
char **argv) {
136 while((opt = getopt(argc, argv,
"r:s:h")) != -1){
139 rows = abs(atoi(optarg));
140 rows = rows>512?512:rows;
143 NSAMPLES = abs(atoi(optarg));
146 printf(
"Usage: %s [-r rows] [-s NSAMPLES] [-h]\n", argv[0]);
147 printf(
"Options:\n");
148 printf(
" -r # Set the number of rows (default: %d)\n", rows);
149 printf(
" -s # Set the number of samples (default: %d)\n", NSAMPLES);
150 printf(
" -h Display this help information\n");
153 if (optopt ==
'r' || optopt ==
's')
154 fprintf(stderr,
"Option -%c requires an argument.\n", optopt);
156 fprintf(stderr,
"Unknown option `-%c'.\n", optopt);
169 printf(
"WARNING: An error was detected during the pre-benchmark test! Continuing... \n");
198 DPRINTF(
"Pre-bench test %d rows ", rows); fflush(stdout);
225 cudaEvent_t start, stop;
226 cudaEventCreate(&start);
227 cudaEventCreate(&stop);
228 float milliseconds[NSAMPLES];
237 printf(
"\n=== Test without stalling on Device\n");fflush(stdout);
257 cudaEvent_t start, stop;
258 cudaEventCreate(&start);
259 cudaEventCreate(&stop);
260 float milliseconds[NSAMPLES];
263 printf(
"\n=== Testing FK20 individual steps\n");
301 cudaEvent_t start, stop;
302 cudaEventCreate(&start);
303 cudaEventCreate(&stop);
304 float milliseconds[NSAMPLES];
307 printf(
"\n=== Testing FK20 components\n");
337 #define MALLOCSYNC(fmt, ...) \
338 if (err != cudaSuccess) \
339 printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
353 err = cudaMallocManaged(&
b_h_fft, rows*512*
sizeof(
g1p_t));
357 err = cudaMallocManaged(&
b_fr_tmp, rows*16*512*
sizeof(
fr_t));
387 DPRINTF(
"Allocated memory freed");
397 int kb=1<<10, mb=1<<20;
399 printf(
"=== FK20 Benchmark: %d thread blocks\n", rows);
400 printf(
" Reporting median of %d executions as median [lowest | highest] \n", NSAMPLES);
403 cudaGetDeviceCount(&devCount);
405 for(
int i=0; i<devCount; i++){
406 cudaDeviceProp props;
407 cudaGetDeviceProperties(&props, i);
409 printf(
" GPU %d: %s: compute capability %d.%d\n", i, props.name, props.major, props.minor);
410 printf(
" Global memory: %luMB\n", props.totalGlobalMem / mb);
411 printf(
" Shared memory: %luKB\n", props.sharedMemPerBlock / kb);
412 printf(
" Constant memory: %luKB\n", props.totalConstMem / kb);
413 printf(
" Registers per block : %d\n", props.regsPerBlock);
414 printf(
" Multiprocessor count : %d\n\n", props.multiProcessorCount);
416 printf(
" Warp size: %d\n", props.warpSize);
417 printf(
" Threads per block: %d\n", props.maxThreadsPerBlock);
418 printf(
" Max block dimensions: [ %d, %d, %d ]\n", props.maxThreadsDim[0], props.maxThreadsDim[1], props.maxThreadsDim[2]);
419 printf(
" Max grid dimensions: [ %d, %d, %d ]\n", props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
435 float fa = *(
const float*) a;
436 float fb = *(
const float*) b;
437 return (fa > fb) - (fa < fb);
const size_t g1p_sharedmem
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft This function is a wrapper for the full FK20 computation,...
__global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft)
hext_fft -> h_fft
const size_t fr_sharedmem
__managed__ g1p_t xext_fft[16][512]
__managed__ fr_t toeplitz_coefficients_fft[16][512]
fr_t * b_toeplitz_coefficients_fft
#define MALLOCSYNC(fmt,...)
__managed__ fr_t toeplitz_coefficients[16][512]
int main(int argc, char **argv)
#define SET_SHAREDMEM(SZ, FN)
#define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE)
Write NCOPIES copies of SRC[SIZE] into DEST,.
__managed__ g1p_t setup[4097]
void freeMemory()
frees the pointers allocated by setupMemory
__managed__ fr_t polynomial[4096]
__managed__ g1p_t h_fft[512]
bool preBenchTest(int rows)
Executes a test of FK20 with one block for each row. At the end, compare if the calculated h_fft is t...
__managed__ g1p_t hext_fft[512]
void benchSteps(unsigned rows)
Benchmark the components functions separately and report.
void setupMemory(unsigned rows)
Initialize the memory for the tests, by filling the memory with copies of the KAT Commented out varia...
#define CUDASYNC(fmt,...)
__managed__ uint8_t cmp[16 *512]
fr_t * b_toeplitz_coefficients
void printHeader(unsigned rows)
Prints to STDOUT an informative banner with the current hardware and benchmark parameters.
#define BENCH_AFTER(FNAME)
int compare(const void *a, const void *b)
Comparator needed by qsort() from stdlib Simple and quick comparison of two floats.
void benchFull(int rows)
Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we...
void benchModules(unsigned rows)
Benchmark the for extra components not currently used on FK20.
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
G1 point in projective coordinates.