FK20 CUDA
|
Go to the source code of this file.
Macros | |
#define | FK20_CUH |
#define | CUDASYNC(fmt, ...) |
#define | SET_SHAREDMEM(SZ, FN) |
Functions | |
__global__ void | fk20_setup2xext_fft (g1p_t xext_fft[8192], const g1p_t *setup) |
__host__ void | fk20_poly2h_fft (g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows) |
polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements each to the same setup. l = 16 is intrinsic to the implementation. More... | |
__global__ void | fk20_poly2toeplitz_coefficients (fr_t *toeplitz_coefficients, const fr_t *polynomial) |
polynomial -> toeplitz_coefficients More... | |
__global__ void | fk20_poly2toeplitz_coefficients_fft (fr_t *toeplitz_coefficients_fft, const fr_t *polynomial) |
polynomial -> toeplitz_coefficients_fft More... | |
__global__ void | fk20_poly2hext_fft (g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192]) |
polynomial + xext_fft -> hext_fft More... | |
__global__ void | fk20_hext_fft2h (g1p_t *h, const g1p_t *hext_fft) |
__global__ void | fk20_h2h_fft (g1p_t *h_fft, const g1p_t *h) |
h -> h_fft More... | |
__global__ void | fk20_hext_fft2h_fft (g1p_t *h_fft, const g1p_t *hext_fft) |
hext_fft -> h_fft More... | |
__global__ void | fk20_hext2h (g1p_t *h) |
hext -> h. Fills the upper half of hext with inf, modifying it in place. More... | |
__global__ void | fk20_msm (g1p_t *hext_fft, const fr_t *toeplitz_coefficients_fft, const g1p_t *xext_fft) |
toeplitz_coefficients_fft + xext_fft -> hext_fft More... | |
Variables | |
const size_t | g1p_sharedmem = 512 * 3 * 6 * 8 |
const size_t | fr_sharedmem = 512 * 4 * 8 |
#define CUDASYNC | ( | fmt, | |
... | |||
) |
#define SET_SHAREDMEM | ( | SZ, | |
FN | |||
) |
__global__ void fk20_hext2h | ( | g1p_t * | h | ) |
hext -> h. Fills the upper half of hext with inf, modifying it in place.
[in,out] | h |
Definition at line 27 of file fk20_poly2h_fft.cu.
hext_fft -> h_fft
Grid must be 1-D, 256 threads per block. Dynamic shared memory: g1p_sharedmem (73728 bytes).
[out] | h_fft | array with dimensions [gridDim.x * 512] |
[in] | hext_fft | array with dimensions [gridDim.x * 512] |
Definition at line 21 of file fk20_hext_fft2h_fft.cu.
toeplitz_coefficients_fft + xext_fft -> hext_fft
Grid must be 1-D, 256 threads per block. WARN: Calling this function with dynamic shared memory introduces unpredictable behavior.
[out] | hext_fft | array with dimensions [gridDim.x * 512] |
[in] | toeplitz_coefficients_fft | array with dimensions [gridDim.x * 16][512] |
[in] | xext_fft | array with dimensions [16][512] |
Definition at line 20 of file fk20_msm.cu.
__host__ void fk20_poly2h_fft | ( | g1p_t * | h_fft, |
const fr_t * | polynomial, | ||
const g1p_t | xext_fft[8192], | ||
unsigned | rows | ||
) |
polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements each to the same setup. l = 16 is intrinsic to the implementation.
[out] | h_fft | array with dimensions [rows * 512] |
[in] | polynomial | array with dimensions [rows * 16 * 512] |
[in] | xext_fft | array with dimensions [16 * 512] |
[in] | rows | number of rows (gridDim.x) |
Definition at line 47 of file fk20_poly2h_fft.cu.
__global__ void fk20_poly2hext_fft | ( | g1p_t * | hext_fft, |
const fr_t * | polynomial, | ||
const g1p_t | xext_fft[8192] | ||
) |
polynomial + xext_fft -> hext_fft
Grid must be 1-D, 256 threads per block. Dynamic shared memory: fr_sharedmem (16384 bytes). The shared memory is used both in the MSM loop and in the FFTs, without conflict.
[out] | hext_fft | array with dimensions [gridDim.x * 16 * 512] |
[in] | polynomial | array with dimensions [gridDim.x * 16 * 512] |
[in] | xext_fft | array with dimensions [16 * 512], computed with fk20_setup2xext_fft() |
Definition at line 24 of file fk20_poly2hext_fft.cu.
__global__ void fk20_poly2toeplitz_coefficients | ( | fr_t * | toeplitz_coefficients, |
const fr_t * | polynomial | ||
) |
polynomial -> toeplitz_coefficients
[out] | toeplitz_coefficients | array with dimensions [4096 * gridDim.x] |
[in] | polynomial | array with dimensions [gridDim.x * 16 * 512] |
Grid must be 1-D, 256 threads per block.
IMPORTANT: This function does not need shared memory. Launching the kernel with a dynamic shared memory allocation is known to cause subtle bugs that do not always show up during normal execution. A similar comment is present in fk20test_poly.cu and fk20_512test_poly.cu. If this function changes and starts needing shared memory, correct the tests in those two files.
Definition at line 23 of file fk20_poly2toeplitz_coefficients.cu.
__global__ void fk20_poly2toeplitz_coefficients_fft | ( | fr_t * | toeplitz_coefficients_fft, |
const fr_t * | polynomial | ||
) |
polynomial -> toeplitz_coefficients_fft
WARN: Usage of this function is deprecated; instead, use fk20_poly2toeplitz_coefficients() followed by fr_fft(). This function is not covered by the test suite. It remains in the repository for future optimizations.
[out] | toeplitz_coefficients_fft | array with 16*512*gridDim.x elements |
[in] | polynomial | array with 16*512*gridDim.x elements |
Definition at line 23 of file fk20_poly2toeplitz_coefficients_fft.cu.