FK20 CUDA
fp_mma.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include "fp.cuh"
6 #include "fp_mul.cuh"
7 #include "fp_reduce12.cuh"
8 
20 __device__ void fp_mma(fp_t &z, const fp_t &v, const fp_t &w, const fp_t &x, const fp_t &y) {
21  uint64_t
22  v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3], v4 = v[4], v5 = v[5],
23  w0 = w[0], w1 = w[1], w2 = w[2], w3 = w[3], w4 = w[4], w5 = w[5],
24  x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5],
25  y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5],
26  z0, z1, z2, z3, z4, z5;
27 
28  asm volatile (
29  "\n\t{"
30  "\n\t.reg .u64 v<6>, w<6>, x<6>, y<6>;"
31  "\n\t.reg .u64 u<10>, ua, ub;"
32  "\n\t.reg .u32 uc;"
33  "\n\t.reg .u64 q<8>;"
34  "\n\t.reg .u64 r<7>;"
35  "\n\t.reg .pred nz;"
36 
37  "\n\tmov.u64 v0, %6;"
38  "\n\tmov.u64 v1, %7;"
39  "\n\tmov.u64 v2, %8;"
40  "\n\tmov.u64 v3, %9;"
41  "\n\tmov.u64 v4, %10;"
42  "\n\tmov.u64 v5, %11;"
43 
44  "\n\tmov.u64 w0, %12;"
45  "\n\tmov.u64 w1, %13;"
46  "\n\tmov.u64 w2, %14;"
47  "\n\tmov.u64 w3, %15;"
48  "\n\tmov.u64 w4, %16;"
49  "\n\tmov.u64 w5, %17;"
50 
51  "\n\tmov.u64 x0, %18;"
52  "\n\tmov.u64 x1, %19;"
53  "\n\tmov.u64 x2, %20;"
54  "\n\tmov.u64 x3, %21;"
55  "\n\tmov.u64 x4, %22;"
56  "\n\tmov.u64 x5, %23;"
57 
58  "\n\tmov.u64 y0, %24;"
59  "\n\tmov.u64 y1, %25;"
60  "\n\tmov.u64 y2, %26;"
61  "\n\tmov.u64 y3, %27;"
62  "\n\tmov.u64 y4, %28;"
63  "\n\tmov.u64 y5, %29;"
64 
65 FP_MUL(u, v, w)
66 
67  "\n\tmov.u64 v0, u0;"
68  "\n\tmov.u64 v1, u1;"
69  "\n\tmov.u64 v2, u2;"
70  "\n\tmov.u64 v3, u3;"
71  "\n\tmov.u64 v4, u4;"
72  "\n\tmov.u64 v5, u5;"
73 
74  "\n\tmov.u64 w0, u6;"
75  "\n\tmov.u64 w1, u7;"
76  "\n\tmov.u64 w2, u8;"
77  "\n\tmov.u64 w3, u9;"
78  "\n\tmov.u64 w4, ua;"
79  "\n\tmov.u64 w5, ub;"
80 
81 FP_MUL(u, x, y)
82 
83  // Double-width addition
84 
85  "\n\tadd.u64.cc u0, u0, v0;"
86  "\n\taddc.u64.cc u1, u1, v1;"
87  "\n\taddc.u64.cc u2, u2, v2;"
88  "\n\taddc.u64.cc u3, u3, v3;"
89  "\n\taddc.u64.cc u4, u4, v4;"
90  "\n\taddc.u64.cc u5, u5, v5;"
91  "\n\taddc.u64.cc u6, u6, w0;"
92  "\n\taddc.u64.cc u7, u7, w1;"
93  "\n\taddc.u64.cc u8, u8, w2;"
94  "\n\taddc.u64.cc u9, u9, w3;"
95  "\n\taddc.u64.cc ua, ua, w4;"
96  "\n\taddc.u64.cc ub, ub, w5;"
97  "\n\taddc.u32 uc, 0, 0;"
98 
99  // Double-width reduction
100 
101  /* if u >= 2^768 then u -= mmu0 * 2^384 */
102 
103  "\n\tsetp.ne.u32 nz, uc, 0;"
104  "\n@nz\tsub.u64.cc u6, u6, 0x89f6fffffffd0003U;"
105  "\n@nz\tsubc.u64.cc u7, u7, 0x140bfff43bf3fffdU;"
106  "\n@nz\tsubc.u64.cc u8, u8, 0xa0b767a8ac38a745U;"
107  "\n@nz\tsubc.u64.cc u9, u9, 0x8831a7ac8fada8baU;"
108  "\n@nz\tsubc.u64.cc ua, ua, 0xa3f8e5685da91392U;"
109  "\n@nz\tsubc.u64.cc ub, ub, 0xea09a13c057f1b6cU;"
110  "\n@nz\tsubc.u32 uc, uc, 0;"
111 
112  /* if u >= 2^768 then u -= mmu0 * 2^384 */
113 
114  "\n\tsetp.ne.u32 nz, uc, 0;"
115  "\n@nz\tsub.u64.cc u6, u6, 0x89f6fffffffd0003U;"
116  "\n@nz\tsubc.u64.cc u7, u7, 0x140bfff43bf3fffdU;"
117  "\n@nz\tsubc.u64.cc u8, u8, 0xa0b767a8ac38a745U;"
118  "\n@nz\tsubc.u64.cc u9, u9, 0x8831a7ac8fada8baU;"
119  "\n@nz\tsubc.u64.cc ua, ua, 0xa3f8e5685da91392U;"
120  "\n@nz\tsubc.u64.cc ub, ub, 0xea09a13c057f1b6cU;"
121 
122 FP_REDUCE12(u)
123 
124  "\n\tmov.u64 %0, u0;"
125  "\n\tmov.u64 %1, u1;"
126  "\n\tmov.u64 %2, u2;"
127  "\n\tmov.u64 %3, u3;"
128  "\n\tmov.u64 %4, u4;"
129  "\n\tmov.u64 %5, u5;"
130 
131  "\n\t}"
132  :
133  "=l"(z0), "=l"(z1), "=l"(z2), "=l"(z3), "=l"(z4), "=l"(z5)
134  :
135  "l"(v0), "l"(v1), "l"(v2), "l"(v3), "l"(v4), "l"(v5),
136  "l"(w0), "l"(w1), "l"(w2), "l"(w3), "l"(w4), "l"(w5),
137  "l"(x0), "l"(x1), "l"(x2), "l"(x3), "l"(x4), "l"(x5),
138  "l"(y0), "l"(y1), "l"(y2), "l"(y3), "l"(y4), "l"(y5)
139  );
140 
141  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5;
142 }
143 
144 // vim: ts=4 et sw=4 si
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, and stored as a 6-element li...
Definition: fp.cuh:14
__device__ void fp_mma(fp_t &z, const fp_t &v, const fp_t &w, const fp_t &x, const fp_t &y)
Fp multiply-multiply-add. Fast execution of z = (v*w + x*y) mod p The double-wide products are added ...
Definition: fp_mma.cu:20
#define FP_MUL(Z, X, Y)
PTX macro for multiplication of two residues mod p Reads X0..X5 and Y0..Y5. Writes Z0....
Definition: fp_mul.cuh:8
#define FP_REDUCE12(Z)
Wide reduction over 12 words.
Definition: fp_reduce12.cuh:12