#include <stdint.h>
#include "fr.cuh"
#include "g1.cuh"

Include dependency graph for fk20.cuh:

This graph shows which files directly or indirectly include this file:

Macros
#define	FK20_CUH

#define	CUDASYNC(fmt, ...)

#define	SET_SHAREDMEM(SZ, FN)

Functions
__global__ void	fk20_setup2xext_fft (g1p_t xext_fft[8192], const g1p_t *setup)

__host__ void	fk20_poly2h_fft (g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
	polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements to the same setup. l = 16 is intrinsic to the implementation. More...

__global__ void	fk20_poly2toeplitz_coefficients (fr_t *toeplitz_coefficients, const fr_t *polynomial)
	polynomial -> toeplitz_coefficients More...

__global__ void	fk20_poly2toeplitz_coefficients_fft (fr_t *toeplitz_coefficients_fft, const fr_t *polynomial)
	polynomial -> toeplitz_coefficients_fft More...

__global__ void	fk20_poly2hext_fft (g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192])
	polynomial + xext_fft -> hext_fft More...

__global__ void	fk20_hext_fft2h (g1p_t *h, const g1p_t *hext_fft)

__global__ void	fk20_h2h_fft (g1p_t *h_fft, const g1p_t *h)
	h -> h_fft More...

__global__ void	fk20_hext_fft2h_fft (g1p_t *h_fft, const g1p_t *hext_fft)
	hext_fft -> h_fft More...

__global__ void	fk20_hext2h (g1p_t *h)
	hext -> h. Fills the upper half of hext with inf, modifying it in place. More...

__global__ void	fk20_msm (g1p_t *hext_fft, const fr_t *toeplitz_coefficients_fft, const g1p_t *xext_fft)
	toeplitz_coefficients_fft + xext_fft -> hext_fft More...

Variables
const size_t	g1p_sharedmem = 512 * 3 * 6 * 8

const size_t	fr_sharedmem = 512 * 4 * 8

Macro Definition Documentation

◆ CUDASYNC

#define CUDASYNC	(	fmt,
		...
	)

Value:

    err = cudaDeviceSynchronize();                                                                                     \
    if (err != cudaSuccess)                                                                                            \
    printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)

Definition at line 39 of file fk20.cuh.

◆ FK20_CUH

#define FK20_CUH

Definition at line 6 of file fk20.cuh.

◆ SET_SHAREDMEM

#define SET_SHAREDMEM	(	SZ,
		FN
	)

Value:

    err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ);                                   \
    cudaDeviceSynchronize();                                                                                           \
    if (err != cudaSuccess)                                                                                            \
        printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));

Definition at line 48 of file fk20.cuh.

Function Documentation

◆ fk20_h2h_fft()

__global__ void fk20_h2h_fft	(	g1p_t *	h_fft,
		const g1p_t *	h
	)

h -> h_fft

Parameters

[out]	h_fft	array with 512*gridDim.x elements
[in]	h	array with 512*gridDim.x elements

Returns: void

Definition at line 92 of file fk20.cu.

Here is the call graph for this function:

◆ fk20_hext2h()

__global__ void fk20_hext2h ( g1p_t * h )

hext -> h. Fills the upper half of hext with inf, modifying it in place.

Parameters

[in,out] h

Returns: void

Definition at line 27 of file fk20_poly2h_fft.cu.

Here is the call graph for this function:

◆ fk20_hext_fft2h()

__global__ void fk20_hext_fft2h	(	g1p_t *	h,
		const g1p_t *	hext_fft
	)

◆ fk20_hext_fft2h_fft()

__global__ void fk20_hext_fft2h_fft	(	g1p_t *	h_fft,
		const g1p_t *	hext_fft
	)

hext_fft -> h_fft

Grid must be 1-D, 256 threads per block. Dynamic shared memory: g1p_sharedmem (73728 bytes).

Parameters

[out]	h_fft	array with dimensions [gridDim.x * 512]
[in]	hext_fft	array with dimensions [gridDim.x * 512]

Returns: void

Definition at line 21 of file fk20_hext_fft2h_fft.cu.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fk20_msm()

__global__ void fk20_msm	(	g1p_t *	he_fft,
		const fr_t *	tc_fft,
		const g1p_t *	xe_fft
	)

toeplitz_coefficients_fft + xext_fft -> hext_fft

Grid must be 1-D, 256 threads per block. WARN: Calling this function with dynamic shared memory introduces unpredictable behavior.

Parameters

[out]	he_fft	array with dimensions [gridDim.x * 512]
[in]	tc_fft	array with dimensions [gridDim.x * 16][512]
[in]	xe_fft	array with dimensions [16][512]

Returns: void

Definition at line 20 of file fk20_msm.cu.

Here is the call graph for this function:

◆ fk20_poly2h_fft()

__host__ void fk20_poly2h_fft	(	g1p_t *	h_fft,
		const fr_t *	polynomial,
		const g1p_t	xext_fft[8192],
		unsigned	rows
	)

polynomial + xext_fft -> h_fft. This function is a wrapper for the full FK20 computation, i.e. it commits up to 512 polynomials of 4096 elements to the same setup. l = 16 is intrinsic to the implementation.

Parameters

[out]	h_fft	array with dimensions [rows * 512]
[in]	polynomial	array with dimensions [rows * 16 * 512]
[in]	xext_fft	array with dimensions [16 * 512]
[in]	rows	number of rows (gridDim.x)

Returns: void

Definition at line 47 of file fk20_poly2h_fft.cu.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fk20_poly2hext_fft()

__global__ void fk20_poly2hext_fft	(	g1p_t *	hext_fft,
		const fr_t *	polynomial,
		const g1p_t	xext_fft[8192]
	)

polynomial + xext_fft -> hext_fft

Grid must be 1-D, 256 threads per block. Dynamic shared memory: fr_sharedmem (16384 bytes). Shared memory is used in both the MSM loop and the FFTs, without conflict.

Parameters

[out]	hext_fft	array with dimensions [gridDim.x * 16 * 512]
[in]	polynomial	array with dimensions [gridDim.x * 16 * 512]
[in]	xext_fft	array with dimensions [16 * 512], computed with fk20_setup2xext_fft()

Returns: void

Definition at line 24 of file fk20_poly2hext_fft.cu.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fk20_poly2toeplitz_coefficients()

__global__ void fk20_poly2toeplitz_coefficients	(	fr_t *	toeplitz_coefficients,
		const fr_t *	polynomial
	)

polynomial -> toeplitz_coefficients

Parameters

[out]	toeplitz_coefficients	array with dimension [4096 * gridDim.x]
[in]	polynomial	array with dimensions [rows * 16 * 512]

Returns: void

Grid must be 1-D, 256 threads per block.

IMPORTANT: This function does not need shared memory. Making the kernel call with a dynamic shared memory allocation is known to cause subtle bugs that do not always show up during normal execution. A similar comment is present in fk20test_poly.cu and fk20_512test_poly.cu. If this function changes and starts needing shared memory, correct the tests in those two files.

Definition at line 23 of file fk20_poly2toeplitz_coefficients.cu.

Here is the call graph for this function:

◆ fk20_poly2toeplitz_coefficients_fft()

__global__ void fk20_poly2toeplitz_coefficients_fft	(	fr_t *	toeplitz_coefficients_fft,
		const fr_t *	polynomial
	)

polynomial -> toeplitz_coefficients_fft

WARN: Usage of this function is deprecated. Instead, use fk20_poly2toeplitz_coefficients() followed by fr_fft(). This function is not covered by the test suite. It remains in the repository for future optimizations.

Parameters

[out]	toeplitz_coefficients_fft	array with 16*512*gridDim.x elements
[in]	polynomial	array with 16*512*gridDim.x elements

Returns: void

Definition at line 23 of file fk20_poly2toeplitz_coefficients_fft.cu.

◆ fk20_setup2xext_fft()

__global__ void fk20_setup2xext_fft	(	g1p_t	xext_fft[8192],
		const g1p_t *	setup
	)

Variable Documentation

◆ fr_sharedmem

const size_t fr_sharedmem = 512 * 4 * 8

Definition at line 15 of file fk20.cuh.

◆ g1p_sharedmem

const size_t g1p_sharedmem = 512 * 3 * 6 * 8

Definition at line 14 of file fk20.cuh.

Macros

Functions

Variables

Macro Definition Documentation

◆ CUDASYNC

◆ FK20_CUH

◆ SET_SHAREDMEM

Function Documentation

◆ fk20_h2h_fft()

◆ fk20_hext2h()

◆ fk20_hext_fft2h()

◆ fk20_hext_fft2h_fft()

◆ fk20_msm()

◆ fk20_poly2h_fft()

◆ fk20_poly2hext_fft()

◆ fk20_poly2toeplitz_coefficients()

◆ fk20_poly2toeplitz_coefficients_fft()

◆ fk20_setup2xext_fft()

Variable Documentation

◆ fr_sharedmem

◆ g1p_sharedmem