FK20 CUDA
fk20.cuh File Reference
#include <stdint.h>
#include "fr.cuh"
#include "g1.cuh"
Include dependency graph for fk20.cuh:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define FK20_CUH
 
#define CUDASYNC(fmt, ...)
 
#define SET_SHAREDMEM(SZ, FN)
 

Functions

__global__ void fk20_setup2xext_fft (g1p_t xext_fft[8192], const g1p_t *setup)
 
__host__ void fk20_poly2h_fft (g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
 polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements to the same setup. l = 16 is intrinsic to the implementation. More...
 
__global__ void fk20_poly2toeplitz_coefficients (fr_t *toeplitz_coefficients, const fr_t *polynomial)
 polynomial -> toeplitz_coefficients More...
 
__global__ void fk20_poly2toeplitz_coefficients_fft (fr_t *toeplitz_coefficients_fft, const fr_t *polynomial)
 polynomial -> toeplitz_coefficients_fft More...
 
__global__ void fk20_poly2hext_fft (g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192])
 polynomial + xext_fft -> hext_fft More...
 
__global__ void fk20_hext_fft2h (g1p_t *h, const g1p_t *hext_fft)
 
__global__ void fk20_h2h_fft (g1p_t *h_fft, const g1p_t *h)
 h -> h_fft More...
 
__global__ void fk20_hext_fft2h_fft (g1p_t *h_fft, const g1p_t *hext_fft)
 hext_fft -> h_fft More...
 
__global__ void fk20_hext2h (g1p_t *h)
 hext -> h. Fill the upper half of hext with inf, modifying it in place. More...
 
__global__ void fk20_msm (g1p_t *hext_fft, const fr_t *toeplitz_coefficients_fft, const g1p_t *xext_fft)
 toeplitz_coefficients_fft + xext_fft -> hext_fft More...
 

Variables

const size_t g1p_sharedmem = 512 * 3 * 6 * 8
 
const size_t fr_sharedmem = 512 * 4 * 8
 

Macro Definition Documentation

◆ CUDASYNC

#define CUDASYNC (   fmt,
  ... 
)
Value:
err = cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)

Definition at line 39 of file fk20.cuh.

◆ FK20_CUH

#define FK20_CUH

Definition at line 6 of file fk20.cuh.

◆ SET_SHAREDMEM

#define SET_SHAREDMEM (   SZ,
  FN 
)
Value:
err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
cudaDeviceSynchronize(); \
if (err != cudaSuccess) \
printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));

Definition at line 48 of file fk20.cuh.

Function Documentation

◆ fk20_h2h_fft()

__global__ void fk20_h2h_fft ( g1p_t *h_fft,
const g1p_t *h 
)

h -> h_fft

Parameters
[out] h_fft: array with 512*gridDim.x elements
[in] h: array with 512*gridDim.x elements
Returns
void

Definition at line 92 of file fk20.cu.

Here is the call graph for this function:

◆ fk20_hext2h()

__global__ void fk20_hext2h ( g1p_t *h )

hext -> h. Fill the upper half of hext with inf, modifying it in place.

Parameters
[in,out] h
Returns
void

Definition at line 27 of file fk20_poly2h_fft.cu.

Here is the call graph for this function:

◆ fk20_hext_fft2h()

__global__ void fk20_hext_fft2h ( g1p_t *h,
const g1p_t *hext_fft 
)

◆ fk20_hext_fft2h_fft()

__global__ void fk20_hext_fft2h_fft ( g1p_t *h_fft,
const g1p_t *hext_fft 
)

hext_fft -> h_fft

Grid must be 1-D, 256 threads per block. Dynamic shared memory: g1p_sharedmem (73728 bytes).

Parameters
[out] h_fft: array with dimensions [gridDim.x * 512]
[in] hext_fft: array with dimensions [gridDim.x * 512]
Returns
void

Definition at line 21 of file fk20_hext_fft2h_fft.cu.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ fk20_msm()

__global__ void fk20_msm ( g1p_t *he_fft,
const fr_t *tc_fft,
const g1p_t *xe_fft 
)

toeplitz_coefficients_fft + xext_fft -> hext_fft

Grid must be 1-D, 256 threads per block. WARN: Calling this function with dynamic shared memory introduces unpredictable behavior.

Parameters
[out] he_fft: array with dimensions [gridDim.x * 512]
[in] tc_fft: array with dimensions [gridDim.x * 16][512]
[in] xe_fft: array with dimensions [16][512]
Returns
void

Definition at line 20 of file fk20_msm.cu.

Here is the call graph for this function:

◆ fk20_poly2h_fft()

__host__ void fk20_poly2h_fft ( g1p_t *h_fft,
const fr_t *polynomial,
const g1p_t  xext_fft[8192],
unsigned  rows 
)

polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements to the same setup. l = 16 is intrinsic to the implementation.

Parameters
[out] h_fft: array with dimensions [rows * 512]
[in] polynomial: array with dimensions [rows * 16 * 512]
[in] xext_fft: array with dimensions [16 * 512]
[in] rows: number of rows (gridDim.x)
Returns
void

Definition at line 47 of file fk20_poly2h_fft.cu.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ fk20_poly2hext_fft()

__global__ void fk20_poly2hext_fft ( g1p_t *hext_fft,
const fr_t *polynomial,
const g1p_t  xext_fft[8192] 
)

polynomial + xext_fft -> hext_fft

Grid must be 1-D, 256 threads per block. Dynamic shared memory: fr_sharedmem (16384 bytes). Shared memory is used in both the MSM loop and the FFTs, without conflict.

Parameters
[out] hext_fft: array with dimensions [gridDim.x * 16 * 512]
[in] polynomial: array with dimensions [gridDim.x * 16 * 512]
[in] xext_fft: array with dimensions [16 * 512], computed with fk20_setup2xext_fft()
Returns
void

Definition at line 24 of file fk20_poly2hext_fft.cu.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ fk20_poly2toeplitz_coefficients()

__global__ void fk20_poly2toeplitz_coefficients ( fr_t *toeplitz_coefficients,
const fr_t *polynomial 
)

polynomial -> toeplitz_coefficients

Parameters
[out] toeplitz_coefficients: array with dimension [4096 * gridDim.x]
[in] polynomial: array with dimensions [rows * 16 * 512], where rows = gridDim.x
Returns
void

Grid must be 1-D, 256 threads per block.

IMPORTANT: This function does not need shared memory. Making the kernel call with a dynamic shared memory allocation is known to cause subtle bugs that do not always show up during normal execution. A similar comment is present in fk20test_poly.cu and fk20_512test_poly.cu. If this function changes and starts needing shared memory, correct the tests in those two files.

Definition at line 23 of file fk20_poly2toeplitz_coefficients.cu.

Here is the call graph for this function:

◆ fk20_poly2toeplitz_coefficients_fft()

__global__ void fk20_poly2toeplitz_coefficients_fft ( fr_t *toeplitz_coefficients_fft,
const fr_t *polynomial 
)

polynomial -> toeplitz_coefficients_fft

WARN: Usage of this function is deprecated. Instead, use fk20_poly2toeplitz_coefficients() followed by fr_fft(). This function is not covered by the test suite; it remains in the repository for future optimizations.

Parameters
[out] toeplitz_coefficients_fft: array with 16*512*gridDim.x elements
[in] polynomial: array with 16*512*gridDim.x elements
Returns
void

Definition at line 23 of file fk20_poly2toeplitz_coefficients_fft.cu.

◆ fk20_setup2xext_fft()

__global__ void fk20_setup2xext_fft ( g1p_t  xext_fft[8192],
const g1p_t *setup 
)

Variable Documentation

◆ fr_sharedmem

const size_t fr_sharedmem = 512 * 4 * 8

Definition at line 15 of file fk20.cuh.

◆ g1p_sharedmem

const size_t g1p_sharedmem = 512 * 3 * 6 * 8

Definition at line 14 of file fk20.cuh.