FK20 CUDA
fr_sqr.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 
6 #include "fr.cuh"
7 
14 __device__ void fr_sqr(fr_t &z) {
15  uint64_t
16  z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3];
17 
18  asm volatile (
19  "\n\t{"
20  "\n\t.reg .u64 u<8>;"
21  "\n\t.reg .u64 q<6>;"
22  "\n\t.reg .u64 r<5>;"
23  "\n\t.reg .pred nz;"
24 
25  "\n\tmul.lo.u64 u3, %0, %3 ; mul.hi.u64 u4, %0, %3 ;"
26 
27  "\n\tmul.lo.u64 u2, %0, %2 ; mad.hi.u64.cc u3, %0, %2, u3;"
28  "\n\tmadc.lo.u64.cc u4, %1, %3, u4; madc.hi.u64 u5, %1, %3, 0;"
29 
30  "\n\tmul.lo.u64 u1, %0, %1 ; mad.hi.u64.cc u2, %0, %1, u2;"
31  "\n\tmadc.lo.u64.cc u3, %1, %2, u3; madc.hi.u64.cc u4, %1, %2, u4;"
32  "\n\tmadc.lo.u64.cc u5, %2, %3, u5; madc.hi.u64 u6, %2, %3, 0;"
33 
34  "\n\tadd.u64.cc u1, u1, u1;"
35  "\n\taddc.u64.cc u2, u2, u2;"
36  "\n\taddc.u64.cc u3, u3, u3;"
37  "\n\taddc.u64.cc u4, u4, u4;"
38  "\n\taddc.u64.cc u5, u5, u5;"
39  "\n\taddc.u64.cc u6, u6, u6;"
40  "\n\taddc.u64 u7, 0, 0;"
41 
42  "\n\tmul.lo.u64 u0, %0, %0 ; mad.hi.u64.cc u1, %0, %0, u1;"
43  "\n\tmadc.lo.u64.cc u2, %1, %1, u2; madc.hi.u64.cc u3, %1, %1, u3;"
44  "\n\tmadc.lo.u64.cc u4, %2, %2, u4; madc.hi.u64.cc u5, %2, %2, u5;"
45  "\n\tmadc.lo.u64.cc u6, %3, %3, u6; madc.hi.u64 u7, %3, %3, u7;"
46 
47 
48  // reduce8
49 
50  // q2 = q1 * mu; q3 = q2 / 2^320
51 
52  // mu0
53 
54  "\n\tmul.hi.u64 q0, 0x42737A020C0D6393U, u6;"
55 
56  "\n\tmad.lo.u64.cc q0, 0x42737A020C0D6393U, u7, q0;"
57  "\n\tmadc.hi.u64 q1, 0x42737A020C0D6393U, u7, 0;"
58 
59  // mu1
60 
61  "\n\tmad.hi.u64.cc q0, 0x65043EB4BE4BAD71U, u5, q0;"
62  "\n\tmadc.lo.u64.cc q1, 0x65043EB4BE4BAD71U, u7, q1;"
63  "\n\tmadc.hi.u64 q2, 0x65043EB4BE4BAD71U, u7, 0;"
64 
65  "\n\tmad.lo.u64.cc q0, 0x65043EB4BE4BAD71U, u6, q0;"
66  "\n\tmadc.hi.u64.cc q1, 0x65043EB4BE4BAD71U, u6, q1;"
67  "\n\taddc.u64 q2, q2, 0;"
68 
69  // mu2
70 
71  "\n\tmad.lo.u64.cc q0, 0x38B5DCB707E08ED3U, u5, q0;"
72  "\n\tmadc.hi.u64.cc q1, 0x38B5DCB707E08ED3U, u5, q1;"
73  "\n\tmadc.lo.u64.cc q2, 0x38B5DCB707E08ED3U, u7, q2;"
74  "\n\tmadc.hi.u64 q3, 0x38B5DCB707E08ED3U, u7, 0;"
75 
76  "\n\tmad.hi.u64.cc q0, 0x38B5DCB707E08ED3U, u4, q0;"
77  "\n\tmadc.lo.u64.cc q1, 0x38B5DCB707E08ED3U, u6, q1;"
78  "\n\tmadc.hi.u64.cc q2, 0x38B5DCB707E08ED3U, u6, q2;"
79  "\n\taddc.u64 q3, q3, 0;"
80 
81  // mu3
82 
83  "\n\tmad.hi.u64.cc q0, 0x355094EDFEDE377CU, u3, q0;"
84  "\n\tmadc.lo.u64.cc q1, 0x355094EDFEDE377CU, u5, q1;"
85  "\n\tmadc.hi.u64.cc q2, 0x355094EDFEDE377CU, u5, q2;"
86  "\n\tmadc.lo.u64.cc q3, 0x355094EDFEDE377CU, u7, q3;"
87  "\n\tmadc.hi.u64 q4, 0x355094EDFEDE377CU, u7, 0;"
88 
89  "\n\tmad.lo.u64.cc q0, 0x355094EDFEDE377CU, u4, q0;"
90  "\n\tmadc.hi.u64.cc q1, 0x355094EDFEDE377CU, u4, q1;"
91  "\n\tmadc.lo.u64.cc q2, 0x355094EDFEDE377CU, u6, q2;"
92  "\n\tmadc.hi.u64.cc q3, 0x355094EDFEDE377CU, u6, q3;"
93  "\n\taddc.u64 q4, q4, 0;"
94 
95  // mu4
96 
97  "\n\tmad.lo.u64.cc q0, 0x0000000000000002U, u3, q0;"
98  "\n\tmadc.hi.u64.cc q1, 0x0000000000000002U, u3, q1;"
99  "\n\tmadc.lo.u64.cc q2, 0x0000000000000002U, u5, q2;"
100  "\n\tmadc.hi.u64.cc q3, 0x0000000000000002U, u5, q3;"
101  "\n\tmadc.lo.u64.cc q4, 0x0000000000000002U, u7, q4;"
102  "\n\tmadc.hi.u64 q5, 0x0000000000000002U, u7, 0;"
103 
104  "\n\tmad.hi.u64.cc q0, 0x0000000000000002U, u2, q0;"
105  "\n\tmadc.lo.u64.cc q1, 0x0000000000000002U, u4, q1;"
106  "\n\tmadc.hi.u64.cc q2, 0x0000000000000002U, u4, q2;"
107  "\n\tmadc.lo.u64.cc q3, 0x0000000000000002U, u6, q3;"
108  "\n\tmadc.hi.u64.cc q4, 0x0000000000000002U, u6, q4;"
109  "\n\taddc.u64 q5, q5, 0;"
110 
111  // r2 = q3 * m mod 2^320
112  // u contains z^2
113  // q contains q3
114  // produces r2 in r
115 
116  // m3
117 
118  "\n\tmul.lo.u64 r3, 0x73EDA753299D7D48U, q1 ;"
119  "\n\tmul.hi.u64 r4, 0x73EDA753299D7D48U, q1 ;"
120  "\n\tmad.lo.u64 r4, 0x73EDA753299D7D48U, q2, r4;"
121 
122  // m2
123 
124  "\n\tmul.lo.u64 r2, 0x3339D80809A1D805U, q1 ;"
125  "\n\tmad.hi.u64.cc r3, 0x3339D80809A1D805U, q1, r3;"
126  "\n\tmadc.lo.u64 r4, 0x3339D80809A1D805U, q3, r4;"
127 
128  "\n\tmad.lo.u64.cc r3, 0x3339D80809A1D805U, q2, r3;"
129  "\n\tmadc.hi.u64 r4, 0x3339D80809A1D805U, q2, r4;"
130 
131  // m1
132 
133  "\n\tmul.lo.u64 r1, 0x53BDA402FFFE5BFEU, q1 ;"
134  "\n\tmad.hi.u64.cc r2, 0x53BDA402FFFE5BFEU, q1, r2;"
135  "\n\tmadc.lo.u64.cc r3, 0x53BDA402FFFE5BFEU, q3, r3;"
136  "\n\tmadc.hi.u64 r4, 0x53BDA402FFFE5BFEU, q3, r4;"
137 
138  "\n\tmad.lo.u64.cc r2, 0x53BDA402FFFE5BFEU, q2, r2;"
139  "\n\tmadc.hi.u64.cc r3, 0x53BDA402FFFE5BFEU, q2, r3;"
140  "\n\tmadc.lo.u64 r4, 0x53BDA402FFFE5BFEU, q4, r4;"
141 
142  // m0
143 
144  "\n\tmul.lo.u64 r0, 0xFFFFFFFF00000001U, q1 ;"
145  "\n\tmad.hi.u64.cc r1, 0xFFFFFFFF00000001U, q1, r1;"
146  "\n\tmadc.lo.u64.cc r2, 0xFFFFFFFF00000001U, q3, r2;"
147  "\n\tmadc.hi.u64.cc r3, 0xFFFFFFFF00000001U, q3, r3;"
148  "\n\tmadc.lo.u64 r4, 0xFFFFFFFF00000001U, q5, r4;"
149 
150  "\n\tmad.lo.u64.cc r1, 0xFFFFFFFF00000001U, q2, r1;"
151  "\n\tmadc.hi.u64.cc r2, 0xFFFFFFFF00000001U, q2, r2;"
152  "\n\tmadc.lo.u64.cc r3, 0xFFFFFFFF00000001U, q4, r3;"
153  "\n\tmadc.hi.u64 r4, 0xFFFFFFFF00000001U, q4, r4;"
154 
155  // r = r1 - r2
156  // r1 is in u
157  // r2 is in r
158 
159  // z = r1 - r2
160 
161  // Note: 0 <= z < 3m and 2m < 2^256, so z >= 2^256 => 0 < z-m < 2^256
162 
163  "\n\tsub.u64.cc %0, u0, r0;"
164  "\n\tsubc.u64.cc %1, u1, r1;"
165  "\n\tsubc.u64.cc %2, u2, r2;"
166  "\n\tsubc.u64.cc %3, u3, r3;"
167  "\n\tsubc.u64.cc u4, u4, r4;"
168  "\n\tsetp.ne.u64 nz, u4, 0;" // set predicate if z >= 2^256
169 
170  // if predicate is set then z = z - m
171 
172  "\n @nz\tsub.u64.cc %0, %0, 0xFFFFFFFF00000001U;"
173  "\n @nz\tsubc.u64.cc %1, %1, 0x53BDA402FFFE5BFEU;"
174  "\n @nz\tsubc.u64.cc %2, %2, 0x3339D80809A1D805U;"
175  "\n @nz\tsubc.u64 %3, %3, 0x73EDA753299D7D48U;"
176 
177  "\n\t}"
178  : "+l"(z0), "+l"(z1), "+l"(z2), "+l"(z3)
179  );
180 
181  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3;
182 }
183 
184 // vim: ts=4 et sw=4 si
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
Definition: fr.cuh:24
__device__ void fr_sqr(fr_t &z)
Squares the value in z as a residue modulo r, and stores back into z.
Definition: fr_sqr.cu:14