FK20-CUDAdocs/fk20__poly2hext__fft_8cu_source.html

 // bls12_381: Arithmetic for BLS12-381

 // Copyright 2022-2023 Dag Arne Osvik

 // Copyright 2022-2023 Luan Cardoso dos Santos


 #include "fr.cuh"

 #include "g1.cuh"

 #include "fk20.cuh"


 // Scratch buffer for the in-place FFT in fk20_poly2hext_fft. Block b works on
 // the 512 elements starting at fr_tmp[512*b], so the array has room for 512 blocks.
 static __device__ fr_t fr_tmp[512*512];     // 16 KiB memory per threadblock


 /**
  * @brief polynomial + xext_fft -> hext_fft
  *
  * One thread block processes one row. For each of 16 passes the block builds a
  * 512-element coefficient vector from the row's polynomial, transforms it in
  * place with fr_fft, and multiply-accumulates it element-wise with the matching
  * 512-element slice of xext_fft. The accumulated points are the row's output.
  *
  * Launch configuration: grid (rows, 1, 1) with rows <= 512, block (256, 1, 1).
  * With any other configuration the kernel returns without writing anything.
  *
  * @param[out] hext_fft   output; row r is written at hext_fft[512*r .. 512*r + 511]
  * @param[in]  polynomial input; row r is read from polynomial[4096*r .. 4096*r + 4095]
  * @param[in]  xext_fft   16 slices of 512 points, read as xext_fft[512*i + j]
  */
 __global__ void fk20_poly2hext_fft(g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192]) {

     // gridDim.x is the number of rows. Each row uses its own 512-element slice
     // of fr_tmp[512*512], so more than 512 rows would index past the end of it.
     if (gridDim.x  >  512) return;
     if (gridDim.y  !=   1) return;
     if (gridDim.z  !=   1) return;
     if (blockDim.x != 256) return;  // k
     if (blockDim.y !=   1) return;
     if (blockDim.z !=   1) return;

     // All guards above depend only on the launch configuration, so either every
     // thread of a block returns or none does; the barriers below stay uniform.

     unsigned tid = threadIdx.x; // Thread number
     unsigned bid = blockIdx.x;  // Block number

     // Accumulators and temporaries in registers or local
     // (thread-interleaved global) memory

     g1p_t a0, a1, t;

     g1p_inf(a0);    // accumulates output element tid
     g1p_inf(a1);    // accumulates output element tid + 256

     // Select this block's row of the input, the output and the scratch buffer

     polynomial += 4096 * bid;
     hext_fft += 512 * bid;

     fr_t *fr = fr_tmp + 512 * bid;

     // Destination slot of this thread's coefficient; the same in every pass.
     // Over tid = 0..255 it covers 257..511 and (for tid == 255) 0.

     const unsigned dst = (tid+257) % 512;

     // MSM Loop

     for (int i=0; i<16; i++) {

         // Copy from the polynomial into half of the coefficient array.
         // Thread 0 contributes a zero (slot 257) instead of a coefficient.

         unsigned src = 16*tid + 15 - i;

         if (tid > 0)
             fr_cpy(fr[dst], polynomial[src]);
         else
             fr_zero(fr[dst]);

         // Zero the other half of coefficients (slots 1..256) before FFT.
         // These slots are disjoint from the dst slots written above, so no
         // barrier is needed between the two stores.

         fr_zero(fr[tid+1]);

         // Compute FFT, in place on the block's 512-element buffer.
         // All 256 threads call fr_fft on the same buffer; the barriers make the
         // stores above visible before it and its output visible after it.

         __syncthreads();
         fr_fft(fr, fr);
         __syncthreads();

         // Multiply and accumulate

         g1p_cpy(t, xext_fft[512*i + tid + 0]);
         g1p_mul(t, fr[tid]);
         // NOTE(review): no data dependency visible here requires this barrier;
         // presumably it keeps the threads in step -- confirm before removing.
         __syncthreads();
         g1p_add(a0, t);

         g1p_cpy(t, xext_fft[512*i + tid + 256]);
         g1p_mul(t, fr[tid+256]);
         // Every thread has finished reading fr[] before the next pass
         // overwrites it.
         __syncthreads();
         g1p_add(a1, t);
     }

     // Store accumulators

     g1p_cpy(hext_fft[tid+  0], a0);
     g1p_cpy(hext_fft[tid+256], a1);
 }


 // vim: ts=4 et sw=4 si

fk20.cuh

xext_fft
__managed__ g1p_t xext_fft[16][512]
Definition: fk20_testvector.cu:24603

hext_fft
__managed__ g1p_t hext_fft[512 *512]
Definition: fk20_testvector.cu:82052

polynomial
__managed__ fr_t polynomial[512 *4096]
Definition: fk20_testvector.cu:16

fk20_poly2hext_fft
__global__ void fk20_poly2hext_fft(g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192])
polynomial + xext_fft -> hext_fft
Definition: fk20_poly2hext_fft.cu:24

fr_zero
__device__ __host__ void fr_zero(fr_t &z)
Sets the value of z to zero.
Definition: fr.cu:15

fr.cuh

fr_t
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24

fr_cpy
__device__ __host__ void fr_cpy(fr_t &z, const fr_t &x)
Copy from x into z.
Definition: fr_cpy.cu:14

fr_fft
__device__ void fr_fft(fr_t *output, const fr_t *input)
FFT over Fr.
Definition: fr_fft.cu:26

g1.cuh

g1p_inf
__device__ __host__ void g1p_inf(g1p_t &p)
Set p to the point-at-infinity (0,1,0)
Definition: g1p.cu:93

g1p_add
__device__ void g1p_add(g1p_t &p, const g1p_t &q)
Computes the sum of the two points p and q, using projective coordinates, and stores the result in p.
Definition: g1p_add.cu:29

g1p_mul
__device__ void g1p_mul(g1p_t &p, const fr_t &x)
p ← x·p. Point multiplication by a scalar, in projective coordinates. The result is stored back into p.
Definition: g1p_mul.cu:19

g1p_cpy
__device__ __host__ void g1p_cpy(g1p_t &p, const g1p_t &q)
Copy from q into p.
Definition: g1p.cu:67

g1p_t
G1 point in projective coordinates.
Definition: g1.cuh:27