26 if (gridDim.y != 1)
return;
27 if (gridDim.z != 1)
return;
28 if (blockDim.x != 256)
return;
29 if (blockDim.y != 1)
return;
30 if (blockDim.z != 1)
return;
32 unsigned tid = threadIdx.x;
33 unsigned bid = blockIdx.x;
34 unsigned l, r, w, src, dst;
45 asm volatile (
"\n\tbrev.b32 %0, %1;" :
"=r"(dst) :
"r"(src << (32-9)));
70 l = tid + (tid & -2U);
81 l = tid + (tid & -4U);
92 l = tid + (tid & -8U);
103 l = tid + (tid & -16U);
114 l = tid + (tid & -32U);
125 l = tid + (tid & -64U);
135 w = (tid & 127) << 1;
136 l = tid + (tid & -128U);
146 w = (tid & 255) << 0;
147 l = tid + (tid & -256U);
180 if (gridDim.y != 1)
return;
181 if (gridDim.z != 1)
return;
182 if (blockDim.x != 256)
return;
183 if (blockDim.y != 1)
return;
184 if (blockDim.z != 1)
return;
186 unsigned tid = threadIdx.x;
187 unsigned bid = blockIdx.x;
188 unsigned l, r, w, src, dst;
211 w = (tid & 255) << 0;
212 l = tid + (tid & -256U);
222 w = (tid & 127) << 1;
223 l = tid + (tid & -128U);
234 l = tid + (tid & -64U);
245 l = tid + (tid & -32U);
256 l = tid + (tid & -16U);
267 l = tid + (tid & -8U);
278 l = tid + (tid & -4U);
289 l = tid + (tid & -2U);
313 asm volatile (
"\n\tbrev.b32 %0, %1;" :
"=r"(src) :
"r"(dst << (32-9)));
__constant__ fr_t fr_roots[515]
Table for the precomputed root-of-unity values.
__device__ void g1p_addsub(g1p_t &p, g1p_t &q)
Stores the sum and difference of p and q into p and q, respectively. Both points are in projective coordinates: p, q ← p+q, p−q.
__device__ void g1p_mul(g1p_t &p, const fr_t &x)
p ← x·p. Point multiplication by the scalar x, in projective coordinates. The result is stored back into p.
__device__ __host__ void g1p_cpy(g1p_t &p, const g1p_t &q)
Copy from q into p.
__device__ void g1p_ift(g1p_t *output, const g1p_t *input)
Inverse FFT of size 512 over G1 with projective coordinates. Input and output arrays may overlap....
__shared__ g1p_t g1p_tmp[]
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
Wrapper for g1p_fft: FFT for arrays of g1p_t with length 512.
__device__ void g1p_fft(g1p_t *output, const g1p_t *input)
FFT of size 512 over G1 with projective coordinates. Input and output arrays may overlap....
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
Wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512.
G1 point in projective coordinates.