__managed__ fr_t  X[512*512], Y[512*512], Z[512*512];
__managed__ g1p_t P[512*512], Q[512*512], R[512*512], S[512*512], T[512*512];
__managed__ uint8_t cmp[512*512];
// Raises FN's dynamic shared memory limit to SZ bytes, reporting any error.
#define SET_SHAREDMEM(SZ, FN) \
    err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
    cudaDeviceSynchronize(); \
    if (err != cudaSuccess) printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
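// Presumable usage, as a sketch (the actual call sites are elided from this
// excerpt): raise the dynamic shared-memory cap of each FFT kernel before it
// is launched with fr_sharedmem/g1p_sharedmem as the third launch parameter,
// since 512 g1p_t values can exceed the default 48 KiB per-block limit.
//
//     cudaError_t err = cudaSuccess;
//     SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
//     SET_SHAREDMEM(fr_sharedmem,  fr_ift_wrapper);
//     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
//     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);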
    unsigned tid = 0; tid += blockIdx.z;
    tid *= gridDim.y; tid += blockIdx.y;
    tid *= gridDim.x; tid += blockIdx.x;
    tid *= blockDim.z; tid += threadIdx.z;
    tid *= blockDim.y; tid += threadIdx.y;
    tid *= blockDim.x; tid += threadIdx.x;

    unsigned step = gridDim.z * gridDim.y * gridDim.x
                  * blockDim.z * blockDim.y * blockDim.x;

    for (unsigned i=tid; i<count; i+=step) {
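        // The six multiply-accumulate steps above are Horner's rule over the
        // launch dimensions, yielding a unique global thread id for any
        // grid/block shape. For a one-dimensional launch such as
        // <<<rows, 256>>> they reduce to the familiar grid-stride pattern:
        //
        //     tid  = blockIdx.x * blockDim.x + threadIdx.x;
        //     step = gridDim.x * blockDim.x;
        //
        // so each thread processes elements tid, tid+step, tid+2*step, ...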
    // (The same thread-id linearization recurs in the next kernel wrapper.)
    unsigned tid = 0; tid += blockIdx.z;
    tid *= gridDim.y; tid += blockIdx.y;
    tid *= gridDim.x; tid += blockIdx.x;
    tid *= blockDim.z; tid += threadIdx.z;
    tid *= blockDim.y; tid += threadIdx.y;
    tid *= blockDim.x; tid += threadIdx.x;

    unsigned step = gridDim.z * gridDim.y * gridDim.x
                  * blockDim.z * blockDim.y * blockDim.x;

    for (unsigned i=tid; i<count; i+=step) {
    // (And again in a third kernel wrapper.)
    unsigned tid = 0; tid += blockIdx.z;
    tid *= gridDim.y; tid += blockIdx.y;
    tid *= gridDim.x; tid += blockIdx.x;
    tid *= blockDim.z; tid += threadIdx.z;
    tid *= blockDim.y; tid += threadIdx.y;
    tid *= blockDim.x; tid += threadIdx.x;

    unsigned step = gridDim.z * gridDim.y * gridDim.x
                  * blockDim.z * blockDim.y * blockDim.x;

    for (unsigned i=tid; i<count; i+=step) {
// Test for FFT and IFFT of points on the G1 curve.
void G1TestFFT(unsigned rows) {
    const char filename[] = "/dev/urandom";
    cudaError_t err = cudaSuccess;
    FILE *pf;
    size_t result;
    unsigned i;
    bool pass;
    pf = fopen(filename, "r");

    if (!pf) {
        fprintf(stderr, "Error opening %s\n", filename);
    }
    result = fread(X, sizeof(fr_t), rows*512, pf);

    if (result < rows*512) {
        fprintf(stderr, "Only read %zd values\n", result);
    }

    // P[i] ← X[i]·G: map each random scalar to a point on G1.
    g1p_fr2g1p_wrapper<<<32, 256>>>(P, rows*512, X);
    result = fread(Y, sizeof(fr_t), rows*512, pf);

    if (result < rows*512) {
        fprintf(stderr, "Only read %zd values\n", result);
    }

    // Q[i] ← Y[i]·G, so that Q is independent of P.
    g1p_fr2g1p_wrapper<<<32, 256>>>(Q, rows*512, Y);
    CUDASYNC("g1p_fr2g1p_wrapper");

    for (int c=0; c<2; c++) {
        printf("=== RUN IFT(FFT(P)) == P\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;

        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, S); CUDASYNC("g1p_ift_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, P, T);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
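        // The pair above checks the round trip ift(fft(P)) == P elementwise.
        // Zeroing cmp[] beforehand means any element the equality kernel never
        // writes (or writes with anything other than 1) is reported as a
        // failure rather than silently passing.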
        printf("=== RUN FFT(IFT(P)) == P\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;

        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, P); CUDASYNC("g1p_ift_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, T); CUDASYNC("g1p_fft_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, P, S);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
        printf("=== RUN FFT(P+Q) == FFT(P) + FFT(Q)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;

        g1p_add_wrapper<<<rows, 256>>>(R, rows*512, P, Q);   CUDASYNC("g1p_add_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_fft_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(T, Q); CUDASYNC("g1p_fft_wrapper");
        g1p_add_wrapper<<<rows, 256>>>(S, rows*512, S, T);   CUDASYNC("g1p_add_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, R, S);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
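        // The FFT over G1 is linear: it only adds points and multiplies them
        // by fixed root-of-unity scalars, both of which distribute over point
        // addition, so fft(P+Q) == fft(P) + fft(Q). The test computes each
        // side independently (in R and S) and compares elementwise.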
        printf("=== RUN IFT(P+Q) == IFT(P) + IFT(Q)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;

        g1p_add_wrapper<<<rows, 256>>>(R, rows*512, P, Q);   CUDASYNC("g1p_add_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_ift_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_ift_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, Q); CUDASYNC("g1p_ift_wrapper");
        g1p_add_wrapper<<<rows, 256>>>(S, rows*512, S, T);   CUDASYNC("g1p_add_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, R, S);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
        printf("=== RUN FFT(x*P) == x*FFT(P)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;
        for (i=0; i<512*512; i++) fr_cpy(Z[i], Y[0]);   // broadcast one scalar x = Y[0]

        g1p_mul_wrapper<<<rows, 256>>>(R, rows*512, Z, P);   CUDASYNC("g1p_mul_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_fft_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper");
        g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, Z, S);   CUDASYNC("g1p_mul_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, R, S);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
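        // Scalar linearity: with every Z[i] holding the same scalar x = Y[0],
        // fft(x·P) == x·fft(P), since x·(w·p + q) == w·(x·p) + x·q in G1.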
        printf("=== RUN IFT(x*P) == x*IFT(P)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;
        for (i=0; i<512*512; i++) fr_cpy(Z[i], Y[0]);   // broadcast one scalar x = Y[0]

        g1p_mul_wrapper<<<rows, 256>>>(R, rows*512, Z, P);   CUDASYNC("g1p_mul_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_ift_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_ift_wrapper");
        g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, Z, S);   CUDASYNC("g1p_mul_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, R, S);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
        printf("=== RUN FFT(G*X) == G*FFT(X)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;
        for (i=0; i<512*512; i++) g1p_gen(R[i]);    // R[i] ← G, the G1 generator

        g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, X, R);   CUDASYNC("g1p_mul_wrapper");
        g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, S); CUDASYNC("g1p_fft_wrapper");
        fr_fft_wrapper<<<rows, 256, fr_sharedmem>>>(Z, X);   CUDASYNC("fr_fft_wrapper");
        g1p_mul_wrapper<<<rows, 256>>>(T, rows*512, Z, R);   CUDASYNC("g1p_mul_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, S, T);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
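        // Compatibility of the two transforms: k ↦ k·G is a homomorphism from
        // (fr, +) to (G1, +), and the scalar FFT and the point FFT use the
        // same roots of unity, hence fft_g1(X·G) == fft_fr(X)·G.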
        printf("=== RUN IFT(G*X) == G*IFT(X)\n");

        for (i=0; i<512*512; i++) cmp[i] = 0;
        for (i=0; i<512*512; i++) g1p_gen(R[i]);    // R[i] ← G, the G1 generator

        g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, X, R);   CUDASYNC("g1p_mul_wrapper");
        g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, S); CUDASYNC("g1p_ift_wrapper");
        fr_ift_wrapper<<<rows, 256, fr_sharedmem>>>(Z, X);   CUDASYNC("fr_ift_wrapper");
        g1p_mul_wrapper<<<rows, 256>>>(T, rows*512, Z, R);   CUDASYNC("g1p_mul_wrapper");

        g1p_eq_wrapper<<<rows, 256>>>(cmp, rows*512, S, T);  CUDASYNC("g1p_eq_wrapper");

        for (i=0, pass=true; pass && (i<rows*512); i++)
            if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
        printf("Tests below must detect an error at 511\n");
/* Declarations referenced above (collected from the project headers):

   typedef uint64_t fr_t[4];   // subgroup element: 256 bits as four little-endian uint64_t words
   g1p_t                       // G1 point of BLS12-381 in projective coordinates

   const size_t fr_sharedmem;  // dynamic shared-memory size for the fr FFT kernels
   const size_t g1p_sharedmem; // dynamic shared-memory size for the g1p FFT kernels
   #define SET_SHAREDMEM(SZ, FN)   // raise FN's dynamic shared-memory limit to SZ
   #define CUDASYNC(fmt, ...)      // synchronize the device and report any error

   __device__ __host__ void fr_cpy(fr_t &z, const fr_t &x);     // copy x into z
   __device__ __host__ void g1p_cpy(g1p_t &p, const g1p_t &q);  // copy q into p
   __device__ __host__ void g1p_gen(g1p_t &p);                  // set p to the generator point G1 of BLS12-381
   __device__ void g1p_add(g1p_t &p, const g1p_t &q);           // p ← p+q in projective coordinates
   __device__ void g1p_mul(g1p_t &p, const fr_t &x);            // p ← x·p, scalar multiplication in projective coordinates

   __global__ void fr_fft_wrapper(fr_t *output, const fr_t *input);     // wrapper for fr_fft: FFT for fr_t[512]
   __global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input);  // wrapper for g1p_fft: FFT for g1p_t[512]
   __global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input);  // wrapper for g1p_ift: inverse FFT for g1p_t[512]
   __global__ void g1p_fr2g1p_wrapper(g1p_t *g1, int count, const fr_t *fr);                // device multiplication computing k·G from k
   __global__ void g1p_add_wrapper(g1p_t *sum, int count, const g1p_t *x, const g1p_t *y);  // device addition, elementwise
   __global__ void g1p_mul_wrapper(g1p_t *q, int count, const fr_t *x, const g1p_t *p);     // device multiplication, elementwise

   void G1TestFFT(unsigned rows);  // tests FFT and IFFT of points on the G1 curve for self-consistency
*/
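// A minimal driver, as a sketch (not part of this excerpt; the argument choice
// is hypothetical): run the self-test over the full 512×512 grid.
//
//     int main() {
//         G1TestFFT(512);    // 512 rows of 512 points each
//         return 0;
//     }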