FK20 CUDA
fk20test_poly.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 
6 #include <stdio.h>
7 #include <time.h>
8 
9 #include "g1.cuh"
10 #include "fk20.cuh"
11 #include "fk20test.cuh"
12 #include "fk20_testvector.cuh"
13 
14 
15 static __managed__ uint8_t cmp[16*512];
16 static __managed__ fr_t fr_tmp[16*512];
17 static __managed__ g1p_t g1p_tmp[512];
18 
24 void FK20TestPoly() {
25  printf(">>>> Poly Tests\n");
26  //fk20_setup2xext_fft_test(setup, xext_fft); //deprecated
28  //fk20_poly2toeplitz_coefficients_fft_test(polynomial, toeplitz_coefficients_fft); //deprecated
32 
33  fullTest();
35 
36 }
37 
44 void fullTest() {
45  const int rows = 1;
46  cudaError_t err;
47  bool pass = true;
48  CLOCKINIT; //Initializes the time variables
49 
50  // Setup
51 
55 
56  // polynomial -> tc
57  // All steps follow the same format
58  printf("\n>>>>Full integration test\n"); fflush(stdout);
59  printf("polynomial -> tc\n"); fflush(stdout);
60 
61  CLOCKSTART; // Starts a basic timer
62  fk20_poly2toeplitz_coefficients<<<rows, 256>>>(fr_tmp, polynomial);
63  CUDASYNC("fk20_poly2toeplitz_coefficients"); // Ensures the GPU has finished the computation, and check for errors
64  CLOCKEND; // Reports time
65 
66  clearRes;
67  fr_eq_wrapper<<<256, 32>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients);
68  CUDASYNC("fr_eq_wrapper");
69  CMPCHECK(16*512);
70  PRINTPASS(pass);
71 
72  // tc -> tc_fft
73 
74  printf("tc -> tc_fft\n"); fflush(stdout);
75 
76  CLOCKSTART;
77  for(int i=0; i<16; i++){
78  fr_fft_wrapper<<<rows, 256, fr_sharedmem>>>(fr_tmp+512*i, fr_tmp+512*i); // Needs to do 16 of those
79  }
80  CUDASYNC("fr_fft_wrapper");
81  CLOCKEND;
82 
83  clearRes;
84  fr_eq_wrapper<<<256, 32>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients_fft);
85  CUDASYNC("fr_eq_wrapper");
86  CMPCHECK(16*512);
87  PRINTPASS(pass);
88 
89  // tc_fft -> hext_fft
90  printf("tc_fft -> hext_fft\n"); fflush(stdout);
91  CLOCKSTART;
92  fk20_msm<<<rows, 256>>>(g1p_tmp, fr_tmp, (g1p_t *)xext_fft);
93  CUDASYNC("fk20_msm");
94  CLOCKEND;
95  clearRes;
96  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, (g1p_t *)hext_fft);
97  CUDASYNC("g1p_eq_wrapper");
98  CMPCHECK(512);
99  PRINTPASS(pass);
100 
101  // hext_fft -> hext -> h
102 
103  printf("hext_fft -> hext -> h\n"); fflush(stdout);
104 
105  CLOCKSTART;
106  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, g1p_tmp);
107  CUDASYNC("g1p_ift_wrapper");
108  fk20_hext2h<<<rows, 256>>>(g1p_tmp);
109  CLOCKEND;
110  CUDASYNC("fk20_hext2h");
111  clearRes;
112  g1p_eq_wrapper<<<16, 32>>>(cmp, 256, g1p_tmp, (g1p_t *)h);
113  CUDASYNC("g1p_eq_wrapper");
114  CMPCHECK(256);
115  PRINTPASS(pass);
116 
117  // h -> h_fft
118 
119  printf("h -> h_fft\n"); fflush(stdout);
120 
121  CLOCKSTART;
122  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, g1p_tmp);
123  CUDASYNC("g1p_fft_wrapper");
124  CLOCKEND;
125 
126  clearRes;
127  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, h_fft);
128  CUDASYNC("g1p_eq_wrapper");
129  CMPCHECK(512);
130  PRINTPASS(pass);
131 }
132 
141  const int rows = 1;
142  cudaError_t err;
143  bool pass = true;
144  CLOCKINIT;
145 
146  // Setup
147 
151 
152  varMangle((fr_t*)polynomial, 4096, 512); // Non destructively changes polynomial
153 
154  printf("\n>>>>Full integration test\n"); fflush(stdout);
155 
156  // polynomial -> tc
157 
158  printf("polynomial -> tc\n"); fflush(stdout);
159 
160  CLOCKSTART;
161  fk20_poly2toeplitz_coefficients<<<rows, 256, fr_sharedmem>>>(fr_tmp, polynomial);
162  CUDASYNC("fk20_poly2toeplitz_coefficients");
163  CLOCKEND;
164 
165  clearRes;
166  fr_eq_wrapper<<<256, 32>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients);
167  CUDASYNC("fr_eq_wrapper");
168  NEGCMPCHECK(16*512);
169  NEGPRINTPASS(pass);
170 
171  // tc -> tc_fft
172 
173  printf("tc -> tc_fft\n"); fflush(stdout);
174 
175  CLOCKSTART;
176  for(int i=0; i<16; i++){
177  fr_fft_wrapper<<<rows, 256, fr_sharedmem>>>(fr_tmp+512*i, fr_tmp+512*i); // Needs to do 16 of those
178  }
179  CUDASYNC("fr_fft_wrapper");
180  CLOCKEND;
181 
182  clearRes;
183  fr_eq_wrapper<<<256, 32>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients_fft);
184  CUDASYNC("fr_eq_wrapper");
185  NEGCMPCHECK(16*512);
186  NEGPRINTPASS(pass);
187 
188  // tc_fft -> hext_fft
189 
190  printf("tc_fft -> hext_fft\n"); fflush(stdout);
191 
192  CLOCKSTART;
193  fk20_msm<<<rows, 256>>>(g1p_tmp, fr_tmp, (g1p_t *)xext_fft);
194  CUDASYNC("fk20_msm");
195  CLOCKEND;
196 
197  clearRes;
198  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, (g1p_t *)hext_fft);
199  CUDASYNC("g1p_eq_wrapper");
200  NEGCMPCHECK(512);
201  NEGPRINTPASS(pass);
202 
203  // hext_fft -> hext -> h
204 
205  printf("hext_fft -> hext -> h\n"); fflush(stdout);
206 
207  CLOCKSTART;
208  g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, g1p_tmp);
209  CUDASYNC("g1p_ift_wrapper");
210  fk20_hext2h<<<rows, 256>>>(g1p_tmp);
211  CLOCKEND;
212 
213  CUDASYNC("fk20_hext2h");
214  clearRes;
215  g1p_eq_wrapper<<<16, 32>>>(cmp, 256, g1p_tmp, (g1p_t *)h);
216  CUDASYNC("g1p_eq_wrapper");
217  NEGCMPCHECK(256);
218  NEGPRINTPASS(pass);
219 
220  // h -> h_fft
221 
222  printf("h -> h_fft\n"); fflush(stdout);
223 
224  CLOCKSTART;
225  g1p_fft_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, g1p_tmp);
226  CUDASYNC("g1p_fft_wrapper");
227  CLOCKEND;
228 
229  clearRes;
230  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, h_fft);
231  CUDASYNC("g1p_eq_wrapper");
232  NEGCMPCHECK(512);
233  NEGPRINTPASS(pass);
234 
235  varMangle((fr_t*)polynomial, 4096, 512); // Restore polynomial
236 }
237 
238 /*******************************************************************************
239 
240 The testing functions follow an common template, described in ./doc/fk20test.md
241 
242 *******************************************************************************/
243 
244 
251 void fk20_poly2toeplitz_coefficients_test(fr_t polynomial_l[4096], fr_t toeplitz_coefficients_l[16][512]){
252  cudaError_t err;
253  bool pass = true;
254  CLOCKINIT;
255 
256  printf("=== RUN %s\n", "fk20_poly2toeplitz_coefficients: polynomial -> toeplitz_coefficients");
257  memset(fr_tmp, 0xAA,16*512*sizeof(fr_t)); // Pattern on tmp dest.
258  for(int testIDX=0; testIDX<=1; testIDX++){
259 
260  CLOCKSTART;
261  fk20_poly2toeplitz_coefficients<<<1, 256 >>>(fr_tmp, polynomial_l);
262  // IMPORTANT: This function does not need shared memory. Making the kernel call with a dynamic shared
263  // memory allocation is known to cause some subtle bugs, which do not always show during normal execution.
264  CUDASYNC("fk20_poly2toeplitz_coefficients");
265  CLOCKEND;
266 
267  clearRes;
268  fr_eq_wrapper<<<256, 32>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients_l);
269  CUDASYNC("fr_eq_wrapper");
270 
271  // Check result
272  if (testIDX == 0){
273  CMPCHECK(16 * 512)
274  PRINTPASS(pass);
275  }
276  else{
277  NEGCMPCHECK(16*512);
278  NEGPRINTPASS(pass);
279  }
280 
281  varMangle((fr_t*)polynomial_l, 4096, 512);
282  }
283 }
284 
292 void fk20_poly2hext_fft_test(fr_t polynomial_l[4096], g1p_t xext_fft_l[16][512], g1p_t hext_fft_l[512]){
293  cudaError_t err;
294  CLOCKINIT;
295  bool pass = true;
296 
298 
299  printf("=== RUN %s\n", "fk20_poly2hext_fft: polynomial -> hext_fft");
300  memset(g1p_tmp,0xAA,512*sizeof(g1p_t)); // Pattern on tmp dest
301  for(int testIDX=0; testIDX<=1; testIDX++){
302 
303  CLOCKSTART;
304  fk20_poly2hext_fft<<<1, 256, fr_sharedmem>>>(g1p_tmp, polynomial_l, (const g1p_t *)xext_fft_l);
305  CUDASYNC("fk20_poly2hext_fft");
306  CLOCKEND;
307 
308  clearRes;
309  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, (g1p_t *)hext_fft_l);
310  CUDASYNC("g1p_eq_wrapper");
311 
312  // Check result
313  if (testIDX == 0){
314  CMPCHECK(512)
315  PRINTPASS(pass);
316  }
317  else{
318  NEGCMPCHECK(512);
319  NEGPRINTPASS(pass);
320  }
321  varMangle(hext_fft_l, 512, 64);
322  }
323 }
324 
332 void fk20_poly2h_fft_test(fr_t polynomial_l[4096], g1p_t xext_fft_l[16][512], g1p_t h_fft_l[512]){
333  cudaError_t err;
334  CLOCKINIT;
335  bool pass = true;
336 
337  printf("=== RUN %s\n", "fk20_poly2h_fft: polynomial -> h_fft (full computation)");
338  // memset(g1p_tmp,0x88,512*sizeof(g1p_t)); // Pattern on tmp dest
339  memset(g1p_tmp,0,512*sizeof(g1p_t)); // Pattern on tmp dest
340  memset(fr_tmp,0xAA,8192*sizeof(fr_t)); // Pattern on tmp dest
341  for(int testIDX=0; testIDX<=1; testIDX++){
342 
343  CLOCKSTART;
344  fk20_poly2h_fft(g1p_tmp, polynomial_l, (const g1p_t *)xext_fft_l, 1); // This causes memory issues
345  CUDASYNC("fk20_poly2h_fft");
346  CLOCKEND;
347 
348  clearRes;
349  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, (g1p_t *)h_fft_l);
350  CUDASYNC("g1p_eq_wrapper");
351 
352  // Check result
353  if (testIDX == 0){
354  CMPCHECK(512)
355  PRINTPASS(pass);
356  }
357  else{
358  NEGCMPCHECK(512);
359  NEGPRINTPASS(pass);
360  }
361  varMangle(h_fft_l, 512, 64);
362  }
363 }
364 
372 void fk20_msmloop(g1p_t hext_fft_l[512], fr_t toeplitz_coefficients_fft_l[16][512],
373  g1p_t xext_fft_l[16][512]){
374  cudaError_t err;
375  CLOCKINIT;
376  bool pass = true;
377 
378  printf("=== RUN %s\n", "fk20_msm: Toeplitz_coefficients+xext_fft -> hext_fft");
379  memset(g1p_tmp,0x88,512*sizeof(g1p_t)); // Pattern on tmp dest
380  for(int testIDX=0; testIDX<=1; testIDX++){
381 
382  CLOCKSTART;
383  fk20_msm<<<1, 256>>>(g1p_tmp, (const fr_t*)toeplitz_coefficients_fft_l, (const g1p_t*)xext_fft_l);
384  CUDASYNC("fk20_msm");
385  CLOCKEND;
386 
387  clearRes;
388  g1p_eq_wrapper<<<16, 32>>>(cmp, 512, g1p_tmp, (g1p_t *)hext_fft_l);
389  CUDASYNC("g1p_eq_wrapper");
390 
391  // Check result
392  if (testIDX == 0){
393  CMPCHECK(512)
394  PRINTPASS(pass);
395  }
396  else{
397  NEGCMPCHECK(512);
398  NEGPRINTPASS(pass);
399  }
400  varMangle(hext_fft_l, 512, 64);
401  }
402 }
403 
404 // Deprecated function
405 /*
406 void fk20_setup2xext_fft_test(g1p_t setup_l[4097], g1p_t xext_fft_l[16][512]){
407  cudaError_t err;
408  bool pass = true;
409  g1p_t g1ptmp[16*512];
410 
411  CLOCKINIT;
412 
413  printf("=== RUN %s\n", "fk20_setup2xext_fft: setup -> xext_fft");
414  memset(g1ptmp, 0xAA, 16*512*sizeof(g1p_t)); //pattern on tmp dest.
415  SET_SHAREDMEM(g1p_sharedmem, fk20_setup2xext_fft)
416  for(int testIDX=0; testIDX<=1; testIDX++){
417 
418  CLOCKSTART;
419  fk20_setup2xext_fft<<<16, 256, g1p_sharedmem>>>(g1ptmp, setup);
420 
421  CUDASYNC("fk20_setup2xext_fft");
422  CLOCKEND;
423 
424  clearRes;
425  g1p_eq_wrapper<<<256, 32>>>(cmp, 16*512, g1ptmp, (g1p_t*)xext_fft);
426  CUDASYNC("g1p_eq_wrapper");
427 
428  // Check result
429  if (testIDX == 0){
430  CMPCHECK(16 * 512)
431  PRINTPASS(pass);
432  }
433  else{
434  NEGCMPCHECK(16*512);
435  NEGPRINTPASS(pass);
436  }
437 
438  varMangle((g1p_t*)xext_fft_l, 4096, 512);
439  }
440 }
441 */
442 
443 //Deprecated function
444 /*
445 void fk20_poly2toeplitz_coefficients_fft_test(fr_t polynomial_l[4096], fr_t toeplitz_coefficients_fft_l[16][512]){
446  cudaError_t err;
447  CLOCKINIT;
448  bool pass = true;
449 
450  SET_SHAREDMEM(g1p_sharedmem, fk20_poly2toeplitz_coefficients_fft);
451 
452  printf("=== RUN %s\n", "fk20_poly2toeplitz_coefficients_fft: polynomial -> toeplitz_coefficients_fft");
453  memset(fr_tmp, 0xAA,16*512*sizeof(fr_t)); //pattern on tmp dest.
454  CLOCKSTART;
455  fk20_poly2toeplitz_coefficients_fft<<<1, 256, fr_sharedmem>>>(fr_tmp, polynomial_l);
456  err = cudaDeviceSynchronize();
457  end = clock();
458 
459  if (err != cudaSuccess)
460  printf("Error fk20_poly2toeplitz_coefficients_fft: %d (%s)\n", err, cudaGetErrorName(err));
461  else
462  printf(" (%.3f s)\n", (end - start) * (1.0 / CLOCKS_PER_SEC));
463 
464  // Clear comparison results
465 
466  for (int i=0; i<16*512; i++)
467  cmp[i] = 0;
468 
469  fr_eq_wrapper<<<16, 256>>>(cmp, 16*512, fr_tmp, (fr_t *)toeplitz_coefficients_fft_l);
470 
471  err = cudaDeviceSynchronize();
472  if (err != cudaSuccess) printf("Error fr_eq_wrapper: %d (%s)\n", err, cudaGetErrorName(err));
473 
474  // Check result
475 
476  for (int i=0; i<16*512; i++)
477  if (cmp[i] != 1) {
478  printf("poly2tc error %04x\n", i);
479  pass = false;
480  break;
481  }
482 
483  PRINTPASS(pass);
484 }
485 */
486 
487 // vim: ts=4 et sw=4 si
const size_t g1p_sharedmem
Definition: fk20.cuh:14
__host__ void fk20_poly2h_fft(g1p_t *h_fft, const fr_t *polynomial, const g1p_t xext_fft[8192], unsigned rows)
polynomial + xext_fft -> h_fft This function is a wrapper for the full FK20 computation,...
__global__ void fk20_poly2hext_fft(g1p_t *hext_fft, const fr_t *polynomial, const g1p_t xext_fft[8192])
polynomial + xext_fft -> hext_fft
#define SET_SHAREDMEM(SZ, FN)
Definition: fk20.cuh:48
#define CUDASYNC(fmt,...)
Definition: fk20.cuh:39
const size_t fr_sharedmem
Definition: fk20.cuh:15
__managed__ g1p_t xext_fft[16][512]
__managed__ g1p_t hext_fft[512 *512]
__managed__ g1p_t h_fft[512 *512]
__managed__ fr_t toeplitz_coefficients[512 *16][512]
__managed__ fr_t polynomial[512 *4096]
__managed__ fr_t toeplitz_coefficients_fft[512 *16][512]
void varMangle(fr_t *target, size_t size, unsigned step)
swap elements at positions multiple of step. Nondestructive, call a second time to undo the changes
__managed__ g1p_t h[512 *512]
__managed__ uint8_t cmp[16 *512]
void fk20_msmloop(g1p_t hext_fft_l[512], fr_t toeplitz_coefficients_fft_l[16][512], g1p_t xext_fft_l[16][512])
Test for fk20_msm: Toeplitz_coefficients+xext_fft -> hext_fft.
void fullTest()
void FK20TestPoly()
void fk20_poly2hext_fft_test(fr_t polynomial_l[4096], g1p_t xext_fft_l[16][512], g1p_t hext_fft_l[512])
Test for fk20_poly2hext_fft: polynomial -> hext_fft.
void fk20_poly2h_fft_test(fr_t polynomial_l[4096], g1p_t xext_fft_l[16][512], g1p_t h_fft_l[512])
Test for fk20_poly2h_fft: polynomial -> h_fft.
void fullTestFalsifiability()
void fk20_poly2toeplitz_coefficients_test(fr_t polynomial_l[4096], fr_t toeplitz_coefficients_l[16][512])
Test for fk20_poly2toeplitz_coefficients: polynomial -> toeplitz_coefficients.
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__global__ void fr_fft_wrapper(fr_t *output, const fr_t *input)
wrapper for fr_fft: FFT for fr_t[512]
Definition: fr_fft.cu:316
__global__ void g1p_fft_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_fft: FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:336
__global__ void g1p_ift_wrapper(g1p_t *output, const g1p_t *input)
wrapper for g1p_ift: inverse FFT for arrays of g1p_t with length 512
Definition: g1p_fft.cu:349
__shared__ g1p_t g1p_tmp[]
G1 point in projective coordinates.
Definition: g1.cuh:27
#define CLOCKINIT
Definition: test.h:98
#define NEGCMPCHECK(LENGTH)
Definition: test.h:116
#define clearRes
Definition: test.h:87
#define PRINTPASS(pass)
Definition: test.h:25
#define CLOCKEND
Definition: test.h:100
#define CLOCKSTART
Definition: test.h:99
#define NEGPRINTPASS(pass)
Definition: test.h:32
#define CMPCHECK(LENGTH)
Definition: test.h:106