FK20 CUDA
fk20benchmark.cu File Reference
#include <bits/getopt_core.h>
#include <cstring>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "fr.cuh"
#include "fk20.cuh"
#include "g1.cuh"
#include "test.h"
Include dependency graph for fk20benchmark.cu:

Go to the source code of this file.

Macros

#define DPRINTF(fmt, ...)
 
#define SET_SHAREDMEM(SZ, FN)
 
#define COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE)    for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE));
 Write NCOPIES copies of SRC[SIZE] into DEST. More...
 
#define CUDASYNC(fmt, ...)
 
#define BENCH_BEFORE
 
#define COL(N)   "\x1B["#N"G"
 
#define BENCH_AFTER(FNAME)
 
#define MALLOCSYNC(fmt, ...)
 

Functions

void setupMemory (unsigned rows)
 Initialize the memory for the tests by filling it with copies of the KAT. Commented-out variables are not currently used; uncomment their allocation and data copy for future use. More...
 
void freeMemory ()
 Frees the pointers allocated by setupMemory. More...
 
bool preBenchTest (int rows)
 Executes a test of FK20 with one block for each row. At the end, checks whether the calculated h_fft is the same as the known-good value. This function does not replace the more in-depth tests, but works as a canary to detect errors. More...
 
void benchFull (int rows)
 Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we have to real-world performance. More...
 
void benchSteps (unsigned rows)
 Benchmark the component functions separately and report the results. More...
 
void benchModules (unsigned rows)
 Benchmark the extra components not currently used in FK20. More...
 
int compare (const void *a, const void *b)
 Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats. More...
 
void printHeader (unsigned rows)
 Prints to STDOUT an informative banner with the current hardware and benchmark parameters. More...
 
int main (int argc, char **argv)
 

Variables

__managed__ fr_t polynomial [4096]
 
__managed__ g1p_t setup [4097]
 
__managed__ g1p_t xext_fft [16][512]
 
__managed__ fr_t toeplitz_coefficients [16][512]
 
__managed__ fr_t toeplitz_coefficients_fft [16][512]
 
__managed__ g1p_t hext_fft [512]
 
__managed__ g1p_t h [512]
 
__managed__ g1p_t h_fft [512]
 
fr_t * b_polynomial = NULL
 
g1p_t * b_xext_fft = NULL
 
fr_t * b_toeplitz_coefficients = NULL
 
fr_t * b_toeplitz_coefficients_fft = NULL
 
g1p_t * b_hext_fft = NULL
 
g1p_t * b_h = NULL
 
g1p_t * b_h_fft = NULL
 
fr_t * b_fr_tmp
 
g1p_t * b_g1p_tmp
 
__managed__ uint8_t cmp [16 *512]
 

Macro Definition Documentation

◆ BENCH_AFTER

#define BENCH_AFTER (   FNAME)
Value:
cudaEventRecord(stop); \
err = cudaEventSynchronize(stop);\
if (err != cudaSuccess) printf("%s:%d Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));\
cudaEventElapsedTime(&milliseconds[i], start, stop);\
}\
qsort(milliseconds, NSAMPLES, sizeof(milliseconds[0]), compare);\
median = milliseconds[NSAMPLES/2];\
printf(FNAME COL(25) " %8.3f ms [%8.3f - %8.3f]\n", median, milliseconds[0], milliseconds[NSAMPLES-1]);
#define COL(N)
int compare(const void *a, const void *b)
Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats.

Definition at line 105 of file fk20benchmark.cu.

◆ BENCH_BEFORE

#define BENCH_BEFORE
Value:
for(int i=0; i<NSAMPLES; i++){\
cudaEventRecord(start)

Use these macros to enclose a function and benchmark it.

example:

cudaError_t err; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); float milliseconds[NSAMPLES]; float median;

BENCH_BEFORE; functionToBench(params); BENCH_AFTER("Descriptive name");

Definition at line 99 of file fk20benchmark.cu.

◆ COL

#define COL (   N)    "\x1B["#N"G"

Definition at line 103 of file fk20benchmark.cu.

◆ COPYMANY

#define COPYMANY (   DEST,
  SRC,
  SIZE,
  NCOPIES,
  TYPE 
)     for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE));

Write NCOPIES copies of SRC[SIZE] into DEST.

Definition at line 72 of file fk20benchmark.cu.

◆ CUDASYNC

#define CUDASYNC (   fmt,
  ... 
)
Value:
err = cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)

Definition at line 77 of file fk20benchmark.cu.

◆ DPRINTF

#define DPRINTF (   fmt,
  ... 
)

Definition at line 35 of file fk20benchmark.cu.

◆ MALLOCSYNC

#define MALLOCSYNC (   fmt,
  ... 
)
Value:
if (err != cudaSuccess) \
printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)

◆ SET_SHAREDMEM

#define SET_SHAREDMEM (   SZ,
  FN 
)
Value:
err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));

Definition at line 62 of file fk20benchmark.cu.

Function Documentation

◆ benchFull()

void benchFull ( int  rows)

Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we have to real-world performance.

Parameters
rows: number of rows and CUDA blocks

Definition at line 223 of file fk20benchmark.cu.

◆ benchModules()

void benchModules ( unsigned  rows)

Benchmark the extra components not currently used in FK20.

Parameters
rows: number of rows and CUDA blocks

Definition at line 299 of file fk20benchmark.cu.

◆ benchSteps()

void benchSteps ( unsigned  rows)

Benchmark the component functions separately and report the results.

Parameters
rows: number of rows and CUDA blocks

Definition at line 255 of file fk20benchmark.cu.

◆ compare()

int compare ( const void *  a,
const void *  b 
)

Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats.

Parameters
a
b
Returns
int 1 if a>b
int 0 if a==b
int -1 if a<b

Definition at line 434 of file fk20benchmark.cu.

◆ freeMemory()

void freeMemory ( )

Frees the pointers allocated by setupMemory.

Definition at line 378 of file fk20benchmark.cu.

◆ main()

int main ( int  argc,
char **  argv 
)

Definition at line 131 of file fk20benchmark.cu.

◆ preBenchTest()

bool preBenchTest ( int  rows)

Executes a test of FK20 with one block for each row. At the end, checks whether the calculated h_fft is the same as the known-good value. This function does not replace the more in-depth tests, but works as a canary to detect errors.

Parameters
rows: number of rows and CUDA blocks.
Returns
true if the test passes
false if the test fails

Definition at line 188 of file fk20benchmark.cu.

Here is the call graph for this function:

◆ printHeader()

void printHeader ( unsigned  rows)

Prints to STDOUT an informative banner with the current hardware and benchmark parameters.

Parameters
rows: number of rows and CUDA blocks

Definition at line 396 of file fk20benchmark.cu.

◆ setupMemory()

void setupMemory ( unsigned  rows)

Initialize the memory for the tests by filling it with copies of the KAT. Commented-out variables are not currently used; uncomment their allocation and data copy for future use.

Parameters
rows: number of rows and CUDA blocks

Definition at line 333 of file fk20benchmark.cu.

Variable Documentation

◆ b_fr_tmp

fr_t* b_fr_tmp

Definition at line 52 of file fk20benchmark.cu.

◆ b_g1p_tmp

g1p_t* b_g1p_tmp

Definition at line 53 of file fk20benchmark.cu.

◆ b_h

g1p_t* b_h = NULL

Definition at line 48 of file fk20benchmark.cu.

◆ b_h_fft

g1p_t* b_h_fft = NULL

Definition at line 49 of file fk20benchmark.cu.

◆ b_hext_fft

g1p_t* b_hext_fft = NULL

Definition at line 47 of file fk20benchmark.cu.

◆ b_polynomial

fr_t* b_polynomial = NULL

Definition at line 43 of file fk20benchmark.cu.

◆ b_toeplitz_coefficients

fr_t* b_toeplitz_coefficients = NULL

Definition at line 45 of file fk20benchmark.cu.

◆ b_toeplitz_coefficients_fft

fr_t* b_toeplitz_coefficients_fft = NULL

Definition at line 46 of file fk20benchmark.cu.

◆ b_xext_fft

g1p_t* b_xext_fft = NULL

Definition at line 44 of file fk20benchmark.cu.

◆ cmp

__managed__ uint8_t cmp[16 *512]

Definition at line 54 of file fk20benchmark.cu.

◆ h

__managed__ g1p_t h[512]
extern

Definition at line 84615 of file fk20_testvector.cu.

◆ h_fft

__managed__ g1p_t h_fft[512]
extern

Definition at line 87178 of file fk20_testvector.cu.

◆ hext_fft

__managed__ g1p_t hext_fft[512]
extern

Definition at line 82052 of file fk20_testvector.cu.

◆ polynomial

__managed__ fr_t polynomial[4096]
extern

Managed arrays to hold the valid intermediate values generated by the verified Python implementation of FK20. Used by the test functions.

AVOID MANUAL EDITS. Instead, use the instrumented Python code.

Definition at line 16 of file fk20_testvector.cu.

◆ setup

__managed__ g1p_t setup[4097]
extern

Definition at line 4115 of file fk20_testvector.cu.

◆ toeplitz_coefficients

__managed__ fr_t toeplitz_coefficients[16][512]
extern

Definition at line 65598 of file fk20_testvector.cu.

◆ toeplitz_coefficients_fft

__managed__ fr_t toeplitz_coefficients_fft[16][512]
extern

Definition at line 73825 of file fk20_testvector.cu.

◆ xext_fft

__managed__ g1p_t xext_fft[16][512]
extern

Definition at line 24603 of file fk20_testvector.cu.