FK20 CUDA
fk20_poly2h_fft.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include <time.h>
6 #include <stdio.h> // For reporting sharedmem errors.
7 
8 #include "fr.cuh"
9 #include "g1.cuh"
10 #include "fk20.cuh"
11 
12 // Maximum number of rows covered in the test.
13 #define ROWS 512
14 
// Scratch buffers in managed memory used as intermediate storage by the
// kernel launches in fk20_poly2h_fft(). Both are sized for exactly ROWS rows.
15 static __managed__ fr_t fr[ROWS * 16 * 512]; // 16 * 512 fr_t per row; 256 KiB per threadblock
16 static __managed__ g1p_t g1p[ROWS * 512]; // 512 g1p_t per row; 72 KiB per threadblock
17 
19 
/**
 * @brief hext -> h: fill the upper half of hext with the point at infinity, in place.
 *
 * Every block works on its own 512-element row of `h`, selected by blockIdx.x.
 * Each thread overwrites element 256 + threadIdx.x of that row via g1p_inf().
 * The host code in this file launches the kernel with 256 threads per block,
 * so elements 256..511 of each row are written and elements 0..255 are left
 * untouched.
 *
 * @param[in,out] h  Rows of 512 g1p_t each, one row per thread block.
 */
__global__ void fk20_hext2h(g1p_t *h) {
    g1p_t *row = h + 512 * blockIdx.x; // First element of this block's row

    g1p_inf(row[256 + threadIdx.x]);   // One upper-half element per thread
}
34 
/**
 * @brief Report a failed kernel launch of fk20_poly2h_fft() on stderr.
 *
 * @param[in] err     Status obtained right after the launch.
 * @param[in] kernel  Name of the kernel that was launched, for the message.
 * @return true if err signals a failure (a message was printed), false otherwise.
 */
static bool fk20_poly2h_fft_launch_failed(cudaError_t err, const char *kernel) {
    if (err == cudaSuccess)
        return false;

    fprintf(stderr, "fk20_poly2h_fft: %s: %s\n", kernel, cudaGetErrorString(err));
    return true;
}

/**
 * @brief polynomial + xext_fft -> h_fft
 *
 * Host-side driver that chains the kernel launches
 *   polynomial -> tc -> tc_fft -> hext_fft -> hext -> h -> h_fft
 * using the file-level scratch buffers `fr` and `g1p` for all intermediate
 * results. The launches are only enqueued; this function does not synchronise
 * with the device, so the caller must do so before reading `h_fft`.
 *
 * After each launch the launch status is inspected with cudaPeekAtLastError(),
 * which does not clear the error, so callers that check the CUDA error state
 * themselves still observe it. On a failure a diagnostic is written to stderr
 * and the remaining stages are skipped. Note that an error left pending by
 * earlier CUDA calls of the caller is reported the same way.
 *
 * @param[out] h_fft       Destination of the final g1p_fft_wrapper stage.
 * @param[in]  polynomial  Input of the first stage (fk20_poly2toeplitz_coefficients).
 * @param[in]  xext_fft    Input of the fk20_msm stage.
 * @param[in]  rows        Number of rows to process (grid size of the per-row
 *                         kernels). Must be in 1..ROWS because the scratch
 *                         buffers hold ROWS rows; 0 is a no-op, larger values
 *                         are rejected with a message on stderr.
 */
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows) {
    cudaError_t err;

    // Nothing to compute; also avoids launching with an empty grid, which is
    // an invalid launch configuration.
    if (rows == 0)
        return;

    // fr and g1p are sized for ROWS rows; more rows would make the kernels
    // write past the end of the scratch buffers.
    if (rows > ROWS) {
        fprintf(stderr, "fk20_poly2h_fft: rows = %u exceeds the supported maximum of %d\n", rows, ROWS);
        return;
    }

    // Setup
    // NOTE(review): no setup statements are visible in this listing. If
    // fr_sharedmem or g1p_sharedmem exceed the default dynamic shared memory
    // limit, the kernels below need the opt-in first (presumably what
    // SET_SHAREDMEM from fk20.cuh is for) -- confirm against the full source.

    // polynomial -> tc
    fk20_poly2toeplitz_coefficients<<<rows, 256, fr_sharedmem>>>(fr, polynomial);
    err = cudaPeekAtLastError();
    if (fk20_poly2h_fft_launch_failed(err, "fk20_poly2toeplitz_coefficients"))
        return;

    // tc -> tc_fft (16 blocks per row, transformed in place)
    fr_fft_wrapper<<<rows * 16, 256, fr_sharedmem>>>(fr, fr);
    err = cudaPeekAtLastError();
    if (fk20_poly2h_fft_launch_failed(err, "fr_fft_wrapper"))
        return;

    // tc_fft -> hext_fft
    fk20_msm<<<rows, 256>>>(g1p, fr, xext_fft);
    err = cudaPeekAtLastError();
    if (fk20_poly2h_fft_launch_failed(err, "fk20_msm"))
        return;

    // hext_fft -> hext (in place)
    g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p, g1p);
    err = cudaPeekAtLastError();
    if (fk20_poly2h_fft_launch_failed(err, "g1p_ift_wrapper"))
        return;

    // hext -> h (in place)
    fk20_hext2h<<<rows, 256>>>(g1p);
    err = cudaPeekAtLastError();
    if (fk20_poly2h_fft_launch_failed(err, "fk20_hext2h"))
        return;

    // h -> h_fft
    g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(h_fft, g1p);
    err = cudaPeekAtLastError();
    fk20_poly2h_fft_launch_failed(err, "g1p_fft_wrapper");
}
80 // vim: ts=4 et sw=4 si
const size_t g1p_sharedmem
Definition: fk20.cuh:14
#define SET_SHAREDMEM(SZ, FN)
Definition: fk20.cuh:48
const size_t fr_sharedmem
Definition: fk20.cuh:15
__managed__ g1p_t xext_fft[16][512]
__managed__ g1p_t h_fft[512 *512]
__managed__ fr_t polynomial[512 *4096]
__managed__ g1p_t h[512 *512]
#define ROWS
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft: this function is a wrapper for the full FK20 computation, ...
__global__ void fk20_hext2h(g1p_t *h)
hext -> h: fill the upper half of hext with the point at infinity (inf), modifying it in place.
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
Definition: fr_fft.cu:316
__device__ __host__ void g1p_inf(g1p_t &p)
Set p to the point-at-infinity (0,1,0)
Definition: g1p.cu:93
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:336
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:349
G1 point in projective coordinates.
Definition: g1.cuh:27