FK20 CUDA
g1test_fft.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <unistd.h>
10 
11 #include "fr.cuh"
12 #include "g1.cuh"
13 #include "g1test.cuh"
14 
15 __managed__ fr_t X[512*512], Y[512*512], Z[512*512];
16 __managed__ g1p_t P[512*512], Q[512*512], R[512*512], S[512*512], T[512*512];
17 
18 __managed__ uint8_t cmp[512*512];
19 
21 
22 #define SET_SHAREDMEM(SZ, FN) \
23  err = cudaFuncSetAttribute(FN, cudaFuncAttributeMaxDynamicSharedMemorySize, SZ); \
24  cudaDeviceSynchronize(); \
25  if (err != cudaSuccess) printf("Error cudaFuncSetAttribute: %s:%d, error %d (%s)\n", __FILE__, __LINE__, err, cudaGetErrorName(err));
26 
28 
38 __global__ void g1p_add_wrapper(g1p_t *sum, int count, const g1p_t *x, const g1p_t *y) {
39 
40  unsigned tid = 0; tid += blockIdx.z;
41  tid *= gridDim.y; tid += blockIdx.y;
42  tid *= gridDim.x; tid += blockIdx.x;
43  tid *= blockDim.z; tid += threadIdx.z;
44  tid *= blockDim.y; tid += threadIdx.y;
45  tid *= blockDim.x; tid += threadIdx.x;
46 
47  unsigned step = gridDim.z * gridDim.y * gridDim.x
48  * blockDim.z * blockDim.y * blockDim.x;
49 
50  for (unsigned i=tid; i<count; i+=step) {
51  g1p_t p;
52  g1p_cpy(p, x[i]);
53  g1p_add(p, y[i]);
54  g1p_cpy(sum[i], p);
55  }
56 }
57 
59 
69 __global__ void g1p_mul_wrapper(g1p_t *q, int count, const fr_t *x, const g1p_t *p) {
70 
71  unsigned tid = 0; tid += blockIdx.z;
72  tid *= gridDim.y; tid += blockIdx.y;
73  tid *= gridDim.x; tid += blockIdx.x;
74  tid *= blockDim.z; tid += threadIdx.z;
75  tid *= blockDim.y; tid += threadIdx.y;
76  tid *= blockDim.x; tid += threadIdx.x;
77 
78  unsigned step = gridDim.z * gridDim.y * gridDim.x
79  * blockDim.z * blockDim.y * blockDim.x;
80 
81  for (unsigned i=tid; i<count; i+=step) {
82  g1p_t t;
83  g1p_cpy(t, p[i]);
84  g1p_mul(t, x[i]);
85  g1p_cpy(q[i], t);
86  }
87 }
88 
90 
99 __global__ void g1p_fr2g1p_wrapper(g1p_t *g1, int count, const fr_t *fr) {
100 
101  unsigned tid = 0; tid += blockIdx.z;
102  tid *= gridDim.y; tid += blockIdx.y;
103  tid *= gridDim.x; tid += blockIdx.x;
104  tid *= blockDim.z; tid += threadIdx.z;
105  tid *= blockDim.y; tid += threadIdx.y;
106  tid *= blockDim.x; tid += threadIdx.x;
107 
108  unsigned step = gridDim.z * gridDim.y * gridDim.x
109  * blockDim.z * blockDim.y * blockDim.x;
110 
111  for (unsigned i=tid; i<count; i+=step) {
112  g1p_t p;
113  g1p_gen(p);
114  g1p_mul(p, fr[i]);
115  g1p_cpy(g1[i], p);
116  }
117 }
118 
120 
136 void G1TestFFT(unsigned rows) {
137  const char filename[] = "/dev/urandom";
138  FILE *pf = NULL;
139  size_t result = 0;
140  cudaError_t err = cudaSuccess;
141  bool pass = true;
142  int i;
143 
144  // Shared memory sizes
145 
146  const size_t g1p_sharedmem = 512*3*6*8; // 512 points * 3 residues/point * 6 words/residue * 8 bytes/word = 72 KiB
147  const size_t fr_sharedmem = 512*4*8; // 512 residues * 4 words/residue * 8 bytes/word = 16 KiB
148 
149  // Setup
150 
154 
155  // Initialise X with random values
156 
157  pf = fopen(filename, "r");
158 
159  if (!pf) {
160  fprintf(stderr, "Error opening %s\n", filename);
161  return;
162  }
163 
164  // printf("Initialising X[]\n");
165 
166  result = fread(X, sizeof(fr_t), rows*512, pf);
167 
168  if (result < rows*512) {
169  fprintf(stderr, "Only read %zd values\n", result);
170  goto L_pf;
171  }
172 
173  // printf("Initialising P[]\n");
174 
175  // Initialise P: P[i] = X[i] * G
176 
177  g1p_fr2g1p_wrapper<<<32, 256>>>(P, rows*512, X);
178 
179  // printf("Initialising Y[]\n");
180 
181  result = fread(Y, sizeof(fr_t), rows*512, pf);
182 
183  if (result < rows*512) {
184  fprintf(stderr, "Only read %zd values\n", result);
185  goto L_pf;
186  }
187 
188  // printf("Initialising Q[]\n");
189 
190  // Initialise Q: Q[i] = Y[i] * G
191 
192  g1p_fr2g1p_wrapper<<<32, 256>>>(Q, rows*512, X); CUDASYNC("g1p_fr2g1p_wrapper");
193 
194  for (int c=0; c<2; c++) { // Tests must pass when c==0 and fail when c==1
195 
196  // IFT(FFT(P)) == P
197 
198  printf("=== RUN IFT(FFT(P)) == P\n");
199  for (i=0; i<512*512; i++) cmp[i] = 0;
200 
201  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper");
202  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, S); CUDASYNC("g1p_ift_wrapper");
203  if (c==1)
204  g1p_gen(T[511]);
205 
206  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, P, T); CUDASYNC("g1p_eq_wrapper");
207 
208  for (i=0, pass=true; pass && (i<rows*512); i++)
209  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
210 
211  PRINTPASS(pass^(c==1));
212 
213  // FFT(IFT(P)) == P
214 
215  printf("=== RUN FFT(IFT(P)) == P\n");
216  for (i=0; i<512*512; i++) cmp[i] = 0;
217 
218  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, P); CUDASYNC("g1p_ift_wrapper");
219  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, T); CUDASYNC("g1p_fft_wrapper");
220  if (c==1)
221  g1p_gen(S[511]);
222 
223  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, P, S); CUDASYNC("g1p_eq_wrapper");
224 
225  for (i=0, pass=true; pass && (i<rows*512); i++)
226  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
227 
228  PRINTPASS(pass^(c==1));
229 
230  // FFT(P+Q) == FFT(P) + FFT(Q)
231 
232  printf("=== RUN FFT(P+Q) == FFT(P) + FFT(Q)\n");
233  for (i=0; i<512*512; i++) cmp[i] = 0;
234 
235  g1p_add_wrapper<<<rows, 256>>>(R, rows*512, P, Q); CUDASYNC("g1p_add_wrapper"); // P+Q
236  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_fft_wrapper"); // FFT(P+Q)
237  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper"); // FFT(P)
238  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(T, Q); CUDASYNC("g1p_fft_wrapper"); // FFT(Q)
239  g1p_add_wrapper<<<rows, 256>>>(S, rows*512, S, T); CUDASYNC("g1p_add_wrapper"); // FFT(P)+FFT(Q)
240  if (c==1)
241  g1p_gen(S[511]);
242 
243  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, R, S); CUDASYNC("g1p_eq_wrapper");
244 
245  for (i=0, pass=true; pass && (i<rows*512); i++)
246  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
247 
248  PRINTPASS(pass^(c==1));
249 
250  // IFT(P+Q) == IFT(P) + IFT(Q)
251 
252  printf("=== RUN IFT(P+Q) == IFT(P) + IFT(Q)\n");
253  for (i=0; i<512*512; i++) cmp[i] = 0;
254 
255  g1p_add_wrapper<<<rows, 256>>>(R, rows*512, P, Q); CUDASYNC("g1p_add_wrapper"); // P+Q
256  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_ift_wrapper"); // IFT(P+Q)
257  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_ift_wrapper"); // IFT(P)
258  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(T, Q); CUDASYNC("g1p_ift_wrapper"); // IFT(Q)
259  g1p_add_wrapper<<<rows, 256>>>(S, rows*512, S, T); CUDASYNC("g1p_add_wrapper"); // IFT(P)+IFT(Q)
260  if (c==1)
261  g1p_gen(S[511]);
262 
263  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, R, S); CUDASYNC("g1p_eq_wrapper");
264 
265  for (i=0, pass=true; pass && (i<rows*512); i++)
266  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
267 
268  PRINTPASS(pass^(c==1));
269 
270  // FFT(x*P) == x*FFT(P)
271 
272  printf("=== RUN FFT(x*P) == x*FFT(P)\n");
273  for (i=0; i<512*512; i++) cmp[i] = 0;
274  for (i=0; i<512*512; i++) fr_cpy(Z[i], Y[0]);
275 
276  g1p_mul_wrapper<<<rows, 256>>>(R, rows*512, Z, P); CUDASYNC("g1p_mul_wrapper"); // x*P
277  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_fft_wrapper"); // FFT(x*P)
278  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_fft_wrapper"); // FFT(P)
279  g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, Z, S); CUDASYNC("g1p_mul_wrapper"); // x*FFT(P)
280  if (c==1)
281  g1p_gen(S[511]);
282 
283  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, R, S); CUDASYNC("g1p_eq_wrapper");
284 
285  for (i=0, pass=true; pass && (i<rows*512); i++)
286  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
287 
288  PRINTPASS(pass^(c==1));
289 
290  // IFT(x*P) == x*IFT(P)
291 
292  printf("=== RUN IFT(x*P) == x*IFT(P)\n");
293  for (i=0; i<512*512; i++) cmp[i] = 0;
294  for (i=0; i<512*512; i++) fr_cpy(Z[i], Y[0]);
295 
296  g1p_mul_wrapper<<<rows, 256>>>(R, rows*512, Z, P); CUDASYNC("g1p_mul_wrapper"); // x*P
297  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(R, R); CUDASYNC("g1p_ift_wrapper"); // IFT(x*P)
298  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, P); CUDASYNC("g1p_ift_wrapper"); // IFT(P)
299  g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, Z, S); CUDASYNC("g1p_mul_wrapper"); // x*IFT(P)
300  if (c==1)
301  g1p_gen(S[511]);
302 
303  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, R, S); CUDASYNC("g1p_eq_wrapper");
304 
305  for (i=0, pass=true; pass && (i<rows*512); i++)
306  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
307 
308  PRINTPASS(pass^(c==1));
309 
310  // FFT(G*X) == G*FFT(X) (FFT commutes with mapping from Fr to G1)
311 
312  printf("=== RUN FFT(G*X) == G*FFT(X)\n");
313  for (i=0; i<512*512; i++) cmp[i] = 0;
314  for (i=0; i<512*512; i++) g1p_gen(R[i]);
315 
316  g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, X, R); CUDASYNC("g1p_mul_wrapper"); // G*X
317  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(S, S); CUDASYNC("g1p_fft_wrapper"); // FFT(G*X)
318  fr_fft_wrapper <<<rows, 256, fr_sharedmem>>> (Z, X); CUDASYNC("fr_fft_wrapper"); // FFT(X)
319  g1p_mul_wrapper<<<rows, 256>>>(T, rows*512, Z, R); CUDASYNC("g1p_mul_wrapper"); // G*FFT(X)
320  if (c==1)
321  g1p_gen(T[511]);
322 
323  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, S, T); CUDASYNC("g1p_eq_wrapper");
324 
325  for (i=0, pass=true; pass && (i<rows*512); i++)
326  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
327 
328  PRINTPASS(pass^(c==1));
329 
330  // IFT(G*X) == G*IFT(X) (IFT commutes with mapping from Fr to G1)
331 
332  printf("=== RUN IFT(G*X) == G*IFT(X)\n");
333  for (i=0; i<512*512; i++) cmp[i] = 0;
334  for (i=0; i<512*512; i++) g1p_gen(R[i]);
335 
336  g1p_mul_wrapper<<<rows, 256>>>(S, rows*512, X, R); CUDASYNC("g1p_mul_wrapper"); // G*X
337  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(S, S); CUDASYNC("g1p_ift_wrapper"); // IFT(G*X)
338  fr_ift_wrapper <<<rows, 256, fr_sharedmem>>> (Z, X); CUDASYNC("fr_ift_wrapper"); // IFT(X)
339  g1p_mul_wrapper<<<rows, 256>>>(T, rows*512, Z, R); CUDASYNC("g1p_mul_wrapper"); // G*IFT(X)
340  if (c==1)
341  g1p_gen(T[511]);
342 
343  g1p_eq_wrapper <<<rows, 256>>>(cmp, rows*512, S, T); CUDASYNC("g1p_eq_wrapper");
344 
345  for (i=0, pass=true; pass && (i<rows*512); i++)
346  if (cmp[i] != 1) { fprintf(stderr, "ERROR at %d\n", i); pass = false; }
347 
348  PRINTPASS(pass^(c==1));
349 
350  if (c==0) {
351  printf("Tests below must detect an error at 511\n");
352  }
353  }
354 
355 L_pf:
356  fclose(pf);
357 }
358 
359 // vim: ts=4 et sw=4 si
const size_t g1p_sharedmem
Definition: fk20.cuh:14
#define CUDASYNC(fmt,...)
Definition: fk20.cuh:39
const size_t fr_sharedmem
Definition: fk20.cuh:15
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__device__ __host__ void fr_cpy(fr_t &z, const fr_t &x)
Copy from x into z.
Definition: fr_cpy.cu:14
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
Definition: fr_fft.cu:316
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:336
__device__ void g1p_add(g1p_t &p, const g1p_t &q)
Computes the sum of two points q into p, using projective coordinates. and stores in p.
Definition: g1p_add.cu:29
__device__ __host__ void g1p_gen(g1p_t &p)
Sets p to the generator point G1 of bls12_381.
Definition: g1p.cu:106
__device__ void g1p_mul(g1p_t &p, const fr_t &x)
p ← k·p Point multiplication by scalar, in projective coordinates. That result is stored back into p.
Definition: g1p_mul.cu:19
__device__ __host__ void g1p_cpy(g1p_t &p, const g1p_t &q)
Copy from q into p.
Definition: g1p.cu:67
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:349
__managed__ g1p_t S[512 *512]
Definition: g1test_fft.cu:16
__managed__ uint8_t cmp[512 *512]
Definition: g1test_fft.cu:18
__global__ void g1p_fr2g1p_wrapper(g1p_t *g1, int count, const fr_t *fr)
Kernel wrapper for device multiplication computing k*G from k.
Definition: g1test_fft.cu:99
__managed__ fr_t Y[512 *512]
Definition: g1test_fft.cu:15
__global__ void g1p_mul_wrapper(g1p_t *q, int count, const fr_t *x, const g1p_t *p)
Kernel wrapper for device multiplication.
Definition: g1test_fft.cu:69
#define SET_SHAREDMEM(SZ, FN)
Definition: g1test_fft.cu:22
__managed__ g1p_t P[512 *512]
Definition: g1test_fft.cu:16
__global__ void g1p_add_wrapper(g1p_t *sum, int count, const g1p_t *x, const g1p_t *y)
Kernel wrapper for device addition.
Definition: g1test_fft.cu:38
__managed__ fr_t Z[512 *512]
Definition: g1test_fft.cu:15
__managed__ g1p_t Q[512 *512]
Definition: g1test_fft.cu:16
__managed__ g1p_t R[512 *512]
Definition: g1test_fft.cu:16
__managed__ g1p_t T[512 *512]
Definition: g1test_fft.cu:16
void G1TestFFT(unsigned rows)
Test for FFT and IFFT of points on the G1 curve. Checks self consistency with the following propertie...
Definition: g1test_fft.cu:136
__managed__ fr_t X[512 *512]
Definition: g1test_fft.cu:15
G1 point in projective coordinates.
Definition: g1.cuh:27
#define PRINTPASS(pass)
Definition: test.h:25