FK20 CUDA
fr_mul.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include "fr.cuh"
13 __device__ void fr_mul(fr_t &z, const fr_t &x) {
14  uint64_t
15  z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3],
16  x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
17 
18  asm volatile (
19  "\n\t{"
20  "\n\t.reg .u64 u<8>;"
21  "\n\t.reg .u64 q<6>;"
22  "\n\t.reg .u64 r<5>;"
23  "\n\t.reg .pred nz;"
24 
25  // mul
26 
27  "\n\tmul.lo.u64 u1, %0, %5 ; mul.hi.u64 u2, %0, %5 ;"
28  "\n\tmul.lo.u64 u3, %0, %7 ; mul.hi.u64 u4, %0, %7 ;"
29 
30  "\n\tmul.lo.u64 u0, %0, %4 ; mad.hi.u64.cc u1, %0, %4, u1;"
31  "\n\tmadc.lo.u64.cc u2, %0, %6, u2; madc.hi.u64.cc u3, %0, %6, u3;"
32  "\n\taddc.u64 u4, 0, u4;"
33 
34 
35  "\n\tmad.lo.u64.cc u2, %1, %5, u2; madc.hi.u64.cc u3, %1, %5, u3;"
36  "\n\tmadc.lo.u64.cc u4, %1, %7, u4; madc.hi.u64 u5, %1, %7, 0;"
37 
38  "\n\tmad.lo.u64.cc u1, %1, %4, u1; madc.hi.u64.cc u2, %1, %4, u2;"
39  "\n\tmadc.lo.u64.cc u3, %1, %6, u3; madc.hi.u64.cc u4, %1, %6, u4;"
40  "\n\taddc.u64 u5, 0, u5;"
41 
42 
43  "\n\tmad.lo.u64.cc u3, %2, %5, u3; madc.hi.u64.cc u4, %2, %5, u4;"
44  "\n\tmadc.lo.u64.cc u5, %2, %7, u5; madc.hi.u64 u6, %2, %7, 0;"
45 
46  "\n\tmad.lo.u64.cc u2, %2, %4, u2; madc.hi.u64.cc u3, %2, %4, u3;"
47  "\n\tmadc.lo.u64.cc u4, %2, %6, u4; madc.hi.u64.cc u5, %2, %6, u5;"
48  "\n\taddc.u64 u6, 0, u6;"
49 
50 
51  "\n\tmad.lo.u64.cc u4, %3, %5, u4; madc.hi.u64.cc u5, %3, %5, u5;"
52  "\n\tmadc.lo.u64.cc u6, %3, %7, u6; madc.hi.u64 u7, %3, %7, 0;"
53 
54  "\n\tmad.lo.u64.cc u3, %3, %4, u3; madc.hi.u64.cc u4, %3, %4, u4;"
55  "\n\tmadc.lo.u64.cc u5, %3, %6, u5; madc.hi.u64.cc u6, %3, %6, u6;"
56  "\n\taddc.u64 u7, 0, u7;"
57 
58  // reduce8
59 
60  // q2 = q1 * mu; q3 = q2 / 2^320
61 
62  // mu0
63 
64  "\n\tmul.hi.u64 q0, 0x42737A020C0D6393U, u6;"
65 
66  "\n\tmad.lo.u64.cc q0, 0x42737A020C0D6393U, u7, q0;"
67  "\n\tmadc.hi.u64 q1, 0x42737A020C0D6393U, u7, 0;"
68 
69  // mu1
70 
71  "\n\tmad.hi.u64.cc q0, 0x65043EB4BE4BAD71U, u5, q0;"
72  "\n\tmadc.lo.u64.cc q1, 0x65043EB4BE4BAD71U, u7, q1;"
73  "\n\tmadc.hi.u64 q2, 0x65043EB4BE4BAD71U, u7, 0;"
74 
75  "\n\tmad.lo.u64.cc q0, 0x65043EB4BE4BAD71U, u6, q0;"
76  "\n\tmadc.hi.u64.cc q1, 0x65043EB4BE4BAD71U, u6, q1;"
77  "\n\taddc.u64 q2, q2, 0;"
78 
79  // mu2
80 
81  "\n\tmad.lo.u64.cc q0, 0x38B5DCB707E08ED3U, u5, q0;"
82  "\n\tmadc.hi.u64.cc q1, 0x38B5DCB707E08ED3U, u5, q1;"
83  "\n\tmadc.lo.u64.cc q2, 0x38B5DCB707E08ED3U, u7, q2;"
84  "\n\tmadc.hi.u64 q3, 0x38B5DCB707E08ED3U, u7, 0;"
85 
86  "\n\tmad.hi.u64.cc q0, 0x38B5DCB707E08ED3U, u4, q0;"
87  "\n\tmadc.lo.u64.cc q1, 0x38B5DCB707E08ED3U, u6, q1;"
88  "\n\tmadc.hi.u64.cc q2, 0x38B5DCB707E08ED3U, u6, q2;"
89  "\n\taddc.u64 q3, q3, 0;"
90 
91  // mu3
92 
93  "\n\tmad.hi.u64.cc q0, 0x355094EDFEDE377CU, u3, q0;"
94  "\n\tmadc.lo.u64.cc q1, 0x355094EDFEDE377CU, u5, q1;"
95  "\n\tmadc.hi.u64.cc q2, 0x355094EDFEDE377CU, u5, q2;"
96  "\n\tmadc.lo.u64.cc q3, 0x355094EDFEDE377CU, u7, q3;"
97  "\n\tmadc.hi.u64 q4, 0x355094EDFEDE377CU, u7, 0;"
98 
99  "\n\tmad.lo.u64.cc q0, 0x355094EDFEDE377CU, u4, q0;"
100  "\n\tmadc.hi.u64.cc q1, 0x355094EDFEDE377CU, u4, q1;"
101  "\n\tmadc.lo.u64.cc q2, 0x355094EDFEDE377CU, u6, q2;"
102  "\n\tmadc.hi.u64.cc q3, 0x355094EDFEDE377CU, u6, q3;"
103  "\n\taddc.u64 q4, q4, 0;"
104 
105  // mu4
106 
107  "\n\tmad.lo.u64.cc q0, 0x0000000000000002U, u3, q0;"
108  "\n\tmadc.hi.u64.cc q1, 0x0000000000000002U, u3, q1;"
109  "\n\tmadc.lo.u64.cc q2, 0x0000000000000002U, u5, q2;"
110  "\n\tmadc.hi.u64.cc q3, 0x0000000000000002U, u5, q3;"
111  "\n\tmadc.lo.u64.cc q4, 0x0000000000000002U, u7, q4;"
112  "\n\tmadc.hi.u64 q5, 0x0000000000000002U, u7, 0;"
113 
114  "\n\tmad.hi.u64.cc q0, 0x0000000000000002U, u2, q0;"
115  "\n\tmadc.lo.u64.cc q1, 0x0000000000000002U, u4, q1;"
116  "\n\tmadc.hi.u64.cc q2, 0x0000000000000002U, u4, q2;"
117  "\n\tmadc.lo.u64.cc q3, 0x0000000000000002U, u6, q3;"
118  "\n\tmadc.hi.u64.cc q4, 0x0000000000000002U, u6, q4;"
119  "\n\taddc.u64 q5, q5, 0;"
120 
121  // r2 = q3 * m mod 2^320
122  // u contains z^2
123  // q contains q3
124  // produces r2 in r
125 
126  // m3
127 
128  "\n\tmul.lo.u64 r3, 0x73EDA753299D7D48U, q1 ;"
129  "\n\tmul.hi.u64 r4, 0x73EDA753299D7D48U, q1 ;"
130  "\n\tmad.lo.u64 r4, 0x73EDA753299D7D48U, q2, r4;"
131 
132  // m2
133 
134  "\n\tmul.lo.u64 r2, 0x3339D80809A1D805U, q1 ;"
135  "\n\tmad.hi.u64.cc r3, 0x3339D80809A1D805U, q1, r3;"
136  "\n\tmadc.lo.u64 r4, 0x3339D80809A1D805U, q3, r4;"
137 
138  "\n\tmad.lo.u64.cc r3, 0x3339D80809A1D805U, q2, r3;"
139  "\n\tmadc.hi.u64 r4, 0x3339D80809A1D805U, q2, r4;"
140 
141  // m1
142 
143  "\n\tmul.lo.u64 r1, 0x53BDA402FFFE5BFEU, q1 ;"
144  "\n\tmad.hi.u64.cc r2, 0x53BDA402FFFE5BFEU, q1, r2;"
145  "\n\tmadc.lo.u64.cc r3, 0x53BDA402FFFE5BFEU, q3, r3;"
146  "\n\tmadc.hi.u64 r4, 0x53BDA402FFFE5BFEU, q3, r4;"
147 
148  "\n\tmad.lo.u64.cc r2, 0x53BDA402FFFE5BFEU, q2, r2;"
149  "\n\tmadc.hi.u64.cc r3, 0x53BDA402FFFE5BFEU, q2, r3;"
150  "\n\tmadc.lo.u64 r4, 0x53BDA402FFFE5BFEU, q4, r4;"
151 
152  // m0
153 
154  "\n\tmul.lo.u64 r0, 0xFFFFFFFF00000001U, q1 ;"
155  "\n\tmad.hi.u64.cc r1, 0xFFFFFFFF00000001U, q1, r1;"
156  "\n\tmadc.lo.u64.cc r2, 0xFFFFFFFF00000001U, q3, r2;"
157  "\n\tmadc.hi.u64.cc r3, 0xFFFFFFFF00000001U, q3, r3;"
158  "\n\tmadc.lo.u64 r4, 0xFFFFFFFF00000001U, q5, r4;"
159 
160  "\n\tmad.lo.u64.cc r1, 0xFFFFFFFF00000001U, q2, r1;"
161  "\n\tmadc.hi.u64.cc r2, 0xFFFFFFFF00000001U, q2, r2;"
162  "\n\tmadc.lo.u64.cc r3, 0xFFFFFFFF00000001U, q4, r3;"
163  "\n\tmadc.hi.u64 r4, 0xFFFFFFFF00000001U, q4, r4;"
164 
165  // r = r1 - r2
166  // r1 is in u
167  // r2 is in r
168 
169  // z = r1 - r2
170 
171  // Note: 0 <= z < 3m and 2m < 2^256, so z >= 2^256 => 0 < z-m < 2^256
172 
173  "\n\tsub.u64.cc %0, u0, r0;"
174  "\n\tsubc.u64.cc %1, u1, r1;"
175  "\n\tsubc.u64.cc %2, u2, r2;"
176  "\n\tsubc.u64.cc %3, u3, r3;"
177  "\n\tsubc.u64.cc u4, u4, r4;"
178  "\n\tsetp.ne.u64 nz, u4, 0;" // set predicate if z >= 2^256
179 
180  // if predicate is set then z = z - m
181 
182  "\n @nz\tsub.u64.cc %0, %0, 0xFFFFFFFF00000001U;"
183  "\n @nz\tsubc.u64.cc %1, %1, 0x53BDA402FFFE5BFEU;"
184  "\n @nz\tsubc.u64.cc %2, %2, 0x3339D80809A1D805U;"
185  "\n @nz\tsubc.u64 %3, %3, 0x73EDA753299D7D48U;"
186 
187  "\n\t}"
188  : "+l"(z0), "+l"(z1), "+l"(z2), "+l"(z3)
189  : "l"(x0), "l"(x1), "l"(x2), "l"(x3)
190  );
191 
192  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3;
193 }
194 
195 // vim: ts=4 et sw=4 si
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__device__ void fr_mul(fr_t &z, const fr_t &x)
Multiply two residues module r z and x, stores back into z.
Definition: fr_mul.cu:13