28 unsigned tid = threadIdx.x;
29 unsigned l, r, w, src, dst;
35 asm volatile (
"\n\tbrev.b32 %0, %1;" :
"=r"(dst) :
"r"(src << (32-9)));
60 l = tid + (tid & -2U);
71 l = tid + (tid & -4U);
82 l = tid + (tid & -8U);
93 l = tid + (tid & -16U);
104 l = tid + (tid & -32U);
115 l = tid + (tid & -64U);
125 w = (tid & 127) << 1;
126 l = tid + (tid & -128U);
136 w = (tid & 255) << 0;
137 l = tid + (tid & -256U);
172 unsigned tid = threadIdx.x;
173 unsigned l, r, w, src, dst;
191 w = (tid & 255) << 0;
192 l = tid + (tid & -256U);
202 w = (tid & 127) << 1;
203 l = tid + (tid & -128U);
214 l = tid + (tid & -64U);
225 l = tid + (tid & -32U);
236 l = tid + (tid & -16U);
247 l = tid + (tid & -8U);
258 l = tid + (tid & -4U);
269 l = tid + (tid & -2U);
293 asm volatile (
"\n\tbrev.b32 %0, %1;" :
"=r"(src) :
"r"(dst << (32-9)));
318 if (gridDim.y != 1)
return;
319 if (gridDim.z != 1)
return;
320 if (blockDim.x != 256)
return;
321 if (blockDim.y != 1)
return;
322 if (blockDim.z != 1)
return;
326 unsigned bid = blockIdx.x;
347 if (gridDim.y != 1)
return;
348 if (gridDim.z != 1)
return;
349 if (blockDim.x != 256)
return;
350 if (blockDim.y != 1)
return;
351 if (blockDim.z != 1)
return;
355 unsigned bid = blockIdx.x;
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
__device__ void fr_addsub(fr_t &x, fr_t &y)
Computes the sum and the difference of the arguments, storing back into the arguments: (x,...
__constant__ fr_t fr_roots[515]
Table for the precomputed root-of-unity values.
__device__ __host__ void fr_cpy(fr_t &z, const fr_t &x)
Copy from x into z.
__device__ void fr_mul(fr_t &z, const fr_t &x)
Multiplies two residues z and x modulo r, storing the result back into z.
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
Wrapper for fr_fft: FFT for fr_t[512].
__global__ void fr_ift_wrapper(fr_t *output, const fr_t *input)
Wrapper for fr_ift: inverse FFT for fr_t[512].
__shared__ fr_t fr_smem[]
Workspace in shared memory. Must be 512*sizeof(fr_t) bytes.
__device__ void fr_ift(fr_t *output, const fr_t *input)
Inverse FFT for fr_t[512].
__device__ void fr_fft(fr_t *output, const fr_t *input)
FFT over Fr.