FK20 CUDA
fk20_poly2toeplitz_coefficients_fft.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include <stdio.h>
6 
7 #include "fr.cuh"
8 #include "fk20.cuh"
9 
10 static __device__ fr_t fr_tmp[512*16*512]; // FFT scratch space: 8192 fr_t (256 KiB at 32 bytes each) per threadblock, sized for up to 512 threadblocks
11 
24 
25  // gridDim.x is the number of rows
26  if (gridDim.y != 1) return;
27  if (gridDim.z != 1) return;
28  if (blockDim.x != 256) return; // k
29  if (blockDim.y != 1) return;
30  if (blockDim.z != 1) return;
31 
32  unsigned tid = threadIdx.x; // Thread number
33  unsigned bid = blockIdx.x; // Block number
34 
35  // Accumulators and temporaries in registers or local
36  // (thread-interleaved global) memory
37 
38  polynomial += 4096 * bid;
39  toeplitz_coefficients_fft += 8192 * bid;
40 
41  fr_t *fr = fr_tmp + 8192 * bid;
42 
43  __syncthreads();
44 
45  // Loop
46 
47  for (int i=0; i<16; i++) {
48 
49  // Copy from the polynomial into half of the coefficient array
50 
51  unsigned src = tid*16+15-i;
52  unsigned dst = (tid+257)%512;
53 
54  if (tid > 0)
55  fr_cpy(fr[dst], polynomial[src]);
56  else
57  fr_zero(fr[dst]);
58 
59  __syncthreads();
60 
61  // Zero the other half of coefficients before FFT
62 
63  fr_zero(fr[tid+1]);
64 
65  __syncthreads();
66 
67  // Compute FFT
68 
69  fr_fft(fr, fr);
70 
71  __syncthreads();
72 
73  fr_cpy(toeplitz_coefficients_fft[tid], fr[tid]);
74  fr_cpy(toeplitz_coefficients_fft[tid+256], fr[tid+256]);
75 
77  }
78 }
79 
80 // vim: ts=4 et sw=4 si
__managed__ fr_t polynomial[512 *4096]
__managed__ fr_t toeplitz_coefficients_fft[512 *16][512]
__global__ void fk20_poly2toeplitz_coefficients_fft(fr_t *toeplitz_coefficients_fft, const fr_t *polynomial)
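polynomial -> toeplitz_coefficients_fft: each thread block (256 threads) reads its own 4096-element polynomial and, in each of 16 loop iterations, builds a 512-element coefficient vector (half of it zeroed), applies fr_fft to it, and copies the 512 results to toeplitz_coefficients_fft.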
__device__ __host__ void fr_zero(fr_t &z)
Sets the value of z to zero.
Definition: fr.cu:15
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__device__ __host__ void fr_cpy(fr_t &z, const fr_t &x)
Copy from x into z.
Definition: fr_cpy.cu:14
__device__ void fr_fft(fr_t *output, const fr_t *input)
FFT over Fr.
Definition: fr_fft.cu:26