FK20 CUDA
fk20.cuh
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #ifndef FK20_CUH
6 #define FK20_CUH
7 
8 #include <stdint.h>
9 
10 #include "fr.cuh"
11 #include "g1.cuh"
12 
13 // Shared memory sizes
14 const size_t g1p_sharedmem = 512 * 3 * 6 * 8; // 512 points * 3 residues/point * 6 words/residue * 8 bytes/word = 72 KiB
15 const size_t fr_sharedmem = 512 * 4 * 8; // 512 residues * 4 words/residue * 8 bytes/word = 16 KiB
16 
17 // External interface
18 // Consult README file for definitions of nomenclature.
19 
20 __global__ void fk20_setup2xext_fft(g1p_t xext_fft[8192], const g1p_t *setup);
21 
22 __host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows);
23 
26 __global__ void fk20_poly2hext_fft(g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192]);
27 
28 __global__ void fk20_hext_fft2h(g1p_t *h, const g1p_t *hext_fft);
29 __global__ void fk20_h2h_fft(g1p_t *h_fft, const g1p_t *h);
30 __global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft);
31 __global__ void fk20_hext2h(g1p_t *h);
32 __global__ void fk20_msm(g1p_t *hext_fft, const fr_t *toeplitz_coefficients_fft, const g1p_t *xext_fft);
33 
34 // useful macros. Need to have a cudaError_t err variable declared in the caller
35 
36 // Syncronizes the Device, making sure that the kernel has finished the execution. Checks for any errors, and report if
37 // errors are found.
38 #ifndef CUDASYNC
39 #define CUDASYNC(fmt, ...) \
40  err = cudaDeviceSynchronize(); \
41  if (err != cudaSuccess) \
42  printf("%s:%d " fmt " Error: %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err), ##__VA_ARGS__)
43 #endif
44 
45 // The necessary shared memory is larger than what we can statically allocate, hence it is allocated dynamically in the
46 // kernel call. Because cuda, we need to set the maximum allowed size using this macro.
47 #ifndef SET_SHAREDMEM
48 #define SET_SHAREDMEM(SZ, FN) \
49  err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
50  cudaDeviceSynchronize(); \
51  if (err != cudaSuccess) \
52  printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
53 #endif
54 
55 
56 #endif
57 
58 // vim: ts=4 et sw=4 si
__global__ void fk20_hext_fft2h(g1p_t *h, const g1p_t *hext_fft)
const size_t g1p_sharedmem
Definition: fk20.cuh:14
__global__ void fk20_msm(g1p_t *hext_fft, const fr_t *toeplitz_coefficients_fft, const g1p_t *xext_fft)
toeplitz_coefficients_fft + xext_fft -> hext_fft
Definition: fk20_msm.cu:20
__global__ void fk20_poly2toeplitz_coefficients(fr_t *toeplitz_coefficients, const fr_t *polynomial)
polynomial -> toeplitz_coefficients
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft This function is a wrapper for the full FK20 computation,...
__global__ void fk20_poly2hext_fft(g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192])
polynomial + xext_fft -> hext_fft
__global__ void fk20_poly2toeplitz_coefficients_fft(fr_t *toeplitz_coefficients_fft, const fr_t *polynomial)
polynomial -> toeplitz_coefficients_fft
__global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft)
hext_fft -> h_fft
__global__ void fk20_setup2xext_fft(g1p_t xext_fft[8192], const g1p_t *setup)
const size_t fr_sharedmem
Definition: fk20.cuh:15
__global__ void fk20_h2h_fft(g1p_t *h_fft, const g1p_t *h)
h -> h_fft
Definition: fk20.cu:92
__global__ void fk20_hext2h(g1p_t *h)
hext -> h Fill upper half of hext with inf, modifying in place.
__managed__ g1p_t xext_fft[16][512]
__managed__ g1p_t hext_fft[512 *512]
__managed__ g1p_t h_fft[512 *512]
__managed__ fr_t toeplitz_coefficients[512 *16][512]
__managed__ fr_t polynomial[512 *4096]
__managed__ fr_t toeplitz_coefficients_fft[512 *16][512]
__managed__ g1p_t h[512 *512]
__managed__ g1p_t setup[4097]
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
G1 point in projective coordinates.
Definition: g1.cuh:27