#include <bits/getopt_core.h>
#include <cstring>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "fr.cuh"
#include "fk20.cuh"
#include "g1.cuh"
#include "test.h"
Go to the source code of this file.
|
#define | DPRINTF(fmt, ...) |
|
#define | SET_SHAREDMEM(SZ, FN) |
|
#define | COPYMANY(DEST, SRC, SIZE, NCOPIES, TYPE) for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE)); |
| Write NCOPIES copies of SRC[SIZE] into DEST. More...
|
|
#define | CUDASYNC(fmt, ...) |
|
#define | BENCH_BEFORE |
|
#define | COL(N) "\x1B["#N"G" |
|
#define | BENCH_AFTER(FNAME) |
|
#define | MALLOCSYNC(fmt, ...) |
|
|
void | setupMemory (unsigned rows) |
| Initialize the memory for the tests by filling it with copies of the KAT. Commented-out variables are not currently used; uncomment their allocation and data copy for future use. More...
|
|
void | freeMemory () |
| frees the pointers allocated by setupMemory More...
|
|
bool | preBenchTest (int rows) |
| Executes a test of FK20 with one block for each row. At the end, checks whether the calculated h_fft matches the known-good value. This function does not replace the more in-depth tests, but works as a canary to detect errors. More...
|
|
void | benchFull (int rows) |
| Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we have to real-world performance. More...
|
|
void | benchSteps (unsigned rows) |
| Benchmark the component functions separately and report the results. More...
|
|
void | benchModules (unsigned rows) |
| Benchmark the extra components not currently used in FK20. More...
|
|
int | compare (const void *a, const void *b) |
| Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats. More...
|
|
void | printHeader (unsigned rows) |
| Prints to STDOUT an informative banner with the current hardware and benchmark parameters. More...
|
|
int | main (int argc, char **argv) |
|
◆ BENCH_AFTER
#define BENCH_AFTER |
( |
|
FNAME | ) |
|
Value: cudaEventRecord(stop); \
err = cudaEventSynchronize(stop);\
if (err != cudaSuccess) printf("%s:%d Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));\
cudaEventElapsedTime(&milliseconds[i], start, stop);\
}\
qsort(milliseconds, NSAMPLES,
sizeof(milliseconds[0]),
compare);\
median = milliseconds[NSAMPLES/2];\
printf(FNAME
COL(25)
" %8.3f ms [%8.3f - %8.3f]\n", median, milliseconds[0], milliseconds[NSAMPLES-1]);
int compare(const void *a, const void *b)
Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats.
Definition at line 105 of file fk20benchmark.cu.
◆ BENCH_BEFORE
Value:for(int i=0; i<NSAMPLES; i++){\
cudaEventRecord(start)
Use these macros to enclose a function and benchmark it.
example:
cudaError_t err; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); float milliseconds[NSAMPLES]; float median;
BENCH_BEFORE; functionToBench(params); BENCH_AFTER("Descriptive name");
Definition at line 99 of file fk20benchmark.cu.
◆ COL
#define COL |
( |
|
N | ) |
"\x1B["#N"G" |
◆ COPYMANY
#define COPYMANY |
( |
|
DEST, |
|
|
|
SRC, |
|
|
|
SIZE, |
|
|
|
NCOPIES, |
|
|
|
TYPE |
|
) |
| for(int counter=0; counter<NCOPIES; counter++) memcpy(DEST+counter*SIZE, SRC, SIZE*sizeof(TYPE)); |
Write NCOPIES copies of SRC[SIZE] into DEST.
Definition at line 72 of file fk20benchmark.cu.
◆ CUDASYNC
#define CUDASYNC |
( |
|
fmt, |
|
|
|
... |
|
) |
| |
Value: err = cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
Definition at line 77 of file fk20benchmark.cu.
◆ DPRINTF
#define DPRINTF |
( |
|
fmt, |
|
|
|
... |
|
) |
| |
◆ MALLOCSYNC
#define MALLOCSYNC |
( |
|
fmt, |
|
|
|
... |
|
) |
| |
Value: if (err != cudaSuccess) \
printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
◆ SET_SHAREDMEM
#define SET_SHAREDMEM |
( |
|
SZ, |
|
|
|
FN |
|
) |
| |
Value: err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
Definition at line 62 of file fk20benchmark.cu.
◆ benchFull()
void benchFull |
( |
int |
rows | ) |
|
Benchmark full executions of FK20, without GPU stalling between the functions. This is the closest we have to real-world performance.
- Parameters
-
rows | Number of rows and cuda blocks |
Definition at line 223 of file fk20benchmark.cu.
◆ benchModules()
void benchModules |
( |
unsigned |
rows | ) |
|
Benchmark the extra components not currently used in FK20.
- Parameters
-
rows | Number of rows and cuda blocks |
Definition at line 299 of file fk20benchmark.cu.
◆ benchSteps()
void benchSteps |
( |
unsigned |
rows | ) |
|
Benchmark the component functions separately and report the results.
- Parameters
-
rows | number of rows and cuda blocks |
Definition at line 255 of file fk20benchmark.cu.
◆ compare()
int compare |
( |
const void * |
a, |
|
|
const void * |
b |
|
) |
| |
Comparator needed by qsort() from stdlib. Simple and quick comparison of two floats.
- Parameters
-
- Returns
- int 1 if a>b
-
int 0 if a==b
-
int -1 if a<b
Definition at line 434 of file fk20benchmark.cu.
◆ freeMemory()
◆ main()
int main |
( |
int |
argc, |
|
|
char ** |
argv |
|
) |
| |
◆ preBenchTest()
bool preBenchTest |
( |
int |
rows | ) |
|
Executes a test of FK20 with one block for each row. At the end, checks whether the calculated h_fft matches the known-good value. This function does not replace the more in-depth tests, but works as a canary to detect errors.
- Parameters
-
rows | Number of rows and cuda blocks. |
- Returns
- true Test pass
-
false Test Fail
Definition at line 188 of file fk20benchmark.cu.
◆ printHeader()
void printHeader |
( |
unsigned |
rows | ) |
|
Prints to STDOUT an informative banner with the current hardware and benchmark parameters.
- Parameters
-
rows | number of rows and cuda blocks |
Definition at line 396 of file fk20benchmark.cu.
◆ setupMemory()
void setupMemory |
( |
unsigned |
rows | ) |
|
Initialize the memory for the tests by filling it with copies of the KAT. Commented-out variables are not currently used; uncomment their allocation and data copy for future use.
- Parameters
-
rows | number of rows and cuda blocks |
Definition at line 333 of file fk20benchmark.cu.
◆ b_fr_tmp
◆ b_g1p_tmp
◆ b_h
◆ b_h_fft
◆ b_hext_fft
◆ b_polynomial
fr_t* b_polynomial = NULL |
◆ b_toeplitz_coefficients
fr_t* b_toeplitz_coefficients = NULL |
◆ b_toeplitz_coefficients_fft
fr_t* b_toeplitz_coefficients_fft = NULL |
◆ b_xext_fft
◆ cmp
__managed__ uint8_t cmp[16 *512] |
◆ h_fft
__managed__ g1p_t h_fft[512] |
|
extern |
◆ hext_fft
__managed__ g1p_t hext_fft[512] |
|
extern |
◆ polynomial
__managed__ fr_t polynomial[4096] |
|
extern |
Managed arrays to hold the valid intermediate values generated by the verified Python implementation of FK20. Used by the test functions.
AVOID MANUAL EDITS. Instead, use the instrumented Python code.
Definition at line 16 of file fk20_testvector.cu.
◆ setup
__managed__ g1p_t setup[4097] |
|
extern |
◆ toeplitz_coefficients
__managed__ fr_t toeplitz_coefficients[16][512] |
|
extern |
◆ toeplitz_coefficients_fft
__managed__ fr_t toeplitz_coefficients_fft[16][512] |
|
extern |
◆ xext_fft
__managed__ g1p_t xext_fft[16][512] |
|
extern |