FK20 CUDA
fp_mul.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include "fp.cuh"
6 #include "fp_mul.cuh"
7 #include "fp_reduce12.cuh"
8 
// Multiplies two Fp residues x and y and stores the result in z.
//
// Parameters:
//   z  output residue; words z[0]..z[5] are written.
//   x  first input residue; words x[0]..x[5] are read.
//   y  second input residue; words y[0]..y[5] are read.
//
// Every word of x and y is copied into a local before the asm statement, and
// z is only stored to after the asm statement has produced all six result
// words, so no word of z is written before all input words have been read.
//
// The arithmetic itself is done by the FP_MUL and FP_REDUCE12 PTX macros
// (from the included fp_mul.cuh / fp_reduce12.cuh), which are expanded inside
// the asm string below.
17 __device__ void fp_mul(fp_t &z, const fp_t &x, const fp_t &y) {
18  uint64_t
19  x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5],
20  y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5],
21  z0, z1, z2, z3, z4, z5;
22 
// Operand numbering used inside the asm text (fixed by the operand lists at
// the end of the statement):
//   %0..%5   -> z0..z5 (outputs)
//   %6..%11  -> x0..x5 (inputs)
//   %12..%17 -> y0..y5 (inputs)
23  asm volatile (
// Braces open a PTX scope so the .reg names declared below are local to this
// asm statement.
24  "\n\t{"
// x<6> / y<6> declare PTX registers x0..x5 and y0..y5.
25  "\n\t.reg .u64 x<6>, y<6>;"
// Twelve 64-bit registers: u0..u9 plus ua and ub.
26  "\n\t.reg .u64 u<10>, ua, ub;"
// q0..q7 and r0..r6 are not referenced by the visible text; presumably they
// are scratch registers used by the FP_MUL / FP_REDUCE12 expansions.
// TODO confirm against fp_mul.cuh and fp_reduce12.cuh.
27  "\n\t.reg .u64 q<8>;"
28  "\n\t.reg .u64 r<7>;"
29 
// Copy the six words of x from the input operands into PTX registers x0..x5.
30  "\n\tmov.u64 x0, %6;"
31  "\n\tmov.u64 x1, %7;"
32  "\n\tmov.u64 x2, %8;"
33  "\n\tmov.u64 x3, %9;"
34  "\n\tmov.u64 x4, %10;"
35  "\n\tmov.u64 x5, %11;"
36 
// Copy the six words of y from the input operands into PTX registers y0..y5.
37  "\n\tmov.u64 y0, %12;"
38  "\n\tmov.u64 y1, %13;"
39  "\n\tmov.u64 y2, %14;"
40  "\n\tmov.u64 y3, %15;"
41  "\n\tmov.u64 y4, %16;"
42  "\n\tmov.u64 y5, %17;"
43 
// FP_MUL reads x0..x5 and y0..y5 and writes registers named with the u prefix
// (presumably the full 12-word product in u0..ub -- TODO confirm in
// fp_mul.cuh). FP_REDUCE12 then performs the wide reduction over those 12
// words; only u0..u5 are read back afterwards.
44 FP_MUL(u, x, y)
45 FP_REDUCE12(u)
46 
// Copy u0..u5 to the output operands.
47  "\n\tmov.u64 %0, u0;"
48  "\n\tmov.u64 %1, u1;"
49  "\n\tmov.u64 %2, u2;"
50  "\n\tmov.u64 %3, u3;"
51  "\n\tmov.u64 %4, u4;"
52  "\n\tmov.u64 %5, u5;"
53 
54  "\n\t}"
55  :
// Outputs: "=l" binds each local to a write-only 64-bit register operand.
56  "=l"(z0), "=l"(z1), "=l"(z2), "=l"(z3), "=l"(z4), "=l"(z5)
57  :
// Inputs: "l" binds each local to a 64-bit register operand.
58  "l"(x0), "l"(x1), "l"(x2), "l"(x3), "l"(x4), "l"(x5),
59  "l"(y0), "l"(y1), "l"(y2), "l"(y3), "l"(y4), "l"(y5)
60  );
61 
// Store the six result words to z.
62  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5;
63 }
64 
65 // vim: ts=4 et sw=4 si
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, and stored as a 6-element li...
Definition: fp.cuh:14
__device__ void fp_mul(fp_t &z, const fp_t &x, const fp_t &y)
Multiplies two Fp residues x and y, stores in z.
Definition: fp_mul.cu:17
#define FP_MUL(Z, X, Y)
PTX macro for multiplication of two residues mod p. Reads X0..X5 and Y0..Y5. Writes Z0....
Definition: fp_mul.cuh:8
#define FP_REDUCE12(Z)
Wide reduction over 12 words.
Definition: fp_reduce12.cuh:12