15 z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3],
16 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
27 "\n\tmul.lo.u64 u1, %0, %5 ; mul.hi.u64 u2, %0, %5 ;"
28 "\n\tmul.lo.u64 u3, %0, %7 ; mul.hi.u64 u4, %0, %7 ;"
30 "\n\tmul.lo.u64 u0, %0, %4 ; mad.hi.u64.cc u1, %0, %4, u1;"
31 "\n\tmadc.lo.u64.cc u2, %0, %6, u2; madc.hi.u64.cc u3, %0, %6, u3;"
32 "\n\taddc.u64 u4, 0, u4;"
35 "\n\tmad.lo.u64.cc u2, %1, %5, u2; madc.hi.u64.cc u3, %1, %5, u3;"
36 "\n\tmadc.lo.u64.cc u4, %1, %7, u4; madc.hi.u64 u5, %1, %7, 0;"
38 "\n\tmad.lo.u64.cc u1, %1, %4, u1; madc.hi.u64.cc u2, %1, %4, u2;"
39 "\n\tmadc.lo.u64.cc u3, %1, %6, u3; madc.hi.u64.cc u4, %1, %6, u4;"
40 "\n\taddc.u64 u5, 0, u5;"
43 "\n\tmad.lo.u64.cc u3, %2, %5, u3; madc.hi.u64.cc u4, %2, %5, u4;"
44 "\n\tmadc.lo.u64.cc u5, %2, %7, u5; madc.hi.u64 u6, %2, %7, 0;"
46 "\n\tmad.lo.u64.cc u2, %2, %4, u2; madc.hi.u64.cc u3, %2, %4, u3;"
47 "\n\tmadc.lo.u64.cc u4, %2, %6, u4; madc.hi.u64.cc u5, %2, %6, u5;"
48 "\n\taddc.u64 u6, 0, u6;"
51 "\n\tmad.lo.u64.cc u4, %3, %5, u4; madc.hi.u64.cc u5, %3, %5, u5;"
52 "\n\tmadc.lo.u64.cc u6, %3, %7, u6; madc.hi.u64 u7, %3, %7, 0;"
54 "\n\tmad.lo.u64.cc u3, %3, %4, u3; madc.hi.u64.cc u4, %3, %4, u4;"
55 "\n\tmadc.lo.u64.cc u5, %3, %6, u5; madc.hi.u64.cc u6, %3, %6, u6;"
56 "\n\taddc.u64 u7, 0, u7;"
64 "\n\tmul.hi.u64 q0, 0x42737A020C0D6393U, u6;"
66 "\n\tmad.lo.u64.cc q0, 0x42737A020C0D6393U, u7, q0;"
67 "\n\tmadc.hi.u64 q1, 0x42737A020C0D6393U, u7, 0;"
71 "\n\tmad.hi.u64.cc q0, 0x65043EB4BE4BAD71U, u5, q0;"
72 "\n\tmadc.lo.u64.cc q1, 0x65043EB4BE4BAD71U, u7, q1;"
73 "\n\tmadc.hi.u64 q2, 0x65043EB4BE4BAD71U, u7, 0;"
75 "\n\tmad.lo.u64.cc q0, 0x65043EB4BE4BAD71U, u6, q0;"
76 "\n\tmadc.hi.u64.cc q1, 0x65043EB4BE4BAD71U, u6, q1;"
77 "\n\taddc.u64 q2, q2, 0;"
81 "\n\tmad.lo.u64.cc q0, 0x38B5DCB707E08ED3U, u5, q0;"
82 "\n\tmadc.hi.u64.cc q1, 0x38B5DCB707E08ED3U, u5, q1;"
83 "\n\tmadc.lo.u64.cc q2, 0x38B5DCB707E08ED3U, u7, q2;"
84 "\n\tmadc.hi.u64 q3, 0x38B5DCB707E08ED3U, u7, 0;"
86 "\n\tmad.hi.u64.cc q0, 0x38B5DCB707E08ED3U, u4, q0;"
87 "\n\tmadc.lo.u64.cc q1, 0x38B5DCB707E08ED3U, u6, q1;"
88 "\n\tmadc.hi.u64.cc q2, 0x38B5DCB707E08ED3U, u6, q2;"
89 "\n\taddc.u64 q3, q3, 0;"
93 "\n\tmad.hi.u64.cc q0, 0x355094EDFEDE377CU, u3, q0;"
94 "\n\tmadc.lo.u64.cc q1, 0x355094EDFEDE377CU, u5, q1;"
95 "\n\tmadc.hi.u64.cc q2, 0x355094EDFEDE377CU, u5, q2;"
96 "\n\tmadc.lo.u64.cc q3, 0x355094EDFEDE377CU, u7, q3;"
97 "\n\tmadc.hi.u64 q4, 0x355094EDFEDE377CU, u7, 0;"
99 "\n\tmad.lo.u64.cc q0, 0x355094EDFEDE377CU, u4, q0;"
100 "\n\tmadc.hi.u64.cc q1, 0x355094EDFEDE377CU, u4, q1;"
101 "\n\tmadc.lo.u64.cc q2, 0x355094EDFEDE377CU, u6, q2;"
102 "\n\tmadc.hi.u64.cc q3, 0x355094EDFEDE377CU, u6, q3;"
103 "\n\taddc.u64 q4, q4, 0;"
107 "\n\tmad.lo.u64.cc q0, 0x0000000000000002U, u3, q0;"
108 "\n\tmadc.hi.u64.cc q1, 0x0000000000000002U, u3, q1;"
109 "\n\tmadc.lo.u64.cc q2, 0x0000000000000002U, u5, q2;"
110 "\n\tmadc.hi.u64.cc q3, 0x0000000000000002U, u5, q3;"
111 "\n\tmadc.lo.u64.cc q4, 0x0000000000000002U, u7, q4;"
112 "\n\tmadc.hi.u64 q5, 0x0000000000000002U, u7, 0;"
114 "\n\tmad.hi.u64.cc q0, 0x0000000000000002U, u2, q0;"
115 "\n\tmadc.lo.u64.cc q1, 0x0000000000000002U, u4, q1;"
116 "\n\tmadc.hi.u64.cc q2, 0x0000000000000002U, u4, q2;"
117 "\n\tmadc.lo.u64.cc q3, 0x0000000000000002U, u6, q3;"
118 "\n\tmadc.hi.u64.cc q4, 0x0000000000000002U, u6, q4;"
119 "\n\taddc.u64 q5, q5, 0;"
128 "\n\tmul.lo.u64 r3, 0x73EDA753299D7D48U, q1 ;"
129 "\n\tmul.hi.u64 r4, 0x73EDA753299D7D48U, q1 ;"
130 "\n\tmad.lo.u64 r4, 0x73EDA753299D7D48U, q2, r4;"
134 "\n\tmul.lo.u64 r2, 0x3339D80809A1D805U, q1 ;"
135 "\n\tmad.hi.u64.cc r3, 0x3339D80809A1D805U, q1, r3;"
136 "\n\tmadc.lo.u64 r4, 0x3339D80809A1D805U, q3, r4;"
138 "\n\tmad.lo.u64.cc r3, 0x3339D80809A1D805U, q2, r3;"
139 "\n\tmadc.hi.u64 r4, 0x3339D80809A1D805U, q2, r4;"
143 "\n\tmul.lo.u64 r1, 0x53BDA402FFFE5BFEU, q1 ;"
144 "\n\tmad.hi.u64.cc r2, 0x53BDA402FFFE5BFEU, q1, r2;"
145 "\n\tmadc.lo.u64.cc r3, 0x53BDA402FFFE5BFEU, q3, r3;"
146 "\n\tmadc.hi.u64 r4, 0x53BDA402FFFE5BFEU, q3, r4;"
148 "\n\tmad.lo.u64.cc r2, 0x53BDA402FFFE5BFEU, q2, r2;"
149 "\n\tmadc.hi.u64.cc r3, 0x53BDA402FFFE5BFEU, q2, r3;"
150 "\n\tmadc.lo.u64 r4, 0x53BDA402FFFE5BFEU, q4, r4;"
154 "\n\tmul.lo.u64 r0, 0xFFFFFFFF00000001U, q1 ;"
155 "\n\tmad.hi.u64.cc r1, 0xFFFFFFFF00000001U, q1, r1;"
156 "\n\tmadc.lo.u64.cc r2, 0xFFFFFFFF00000001U, q3, r2;"
157 "\n\tmadc.hi.u64.cc r3, 0xFFFFFFFF00000001U, q3, r3;"
158 "\n\tmadc.lo.u64 r4, 0xFFFFFFFF00000001U, q5, r4;"
160 "\n\tmad.lo.u64.cc r1, 0xFFFFFFFF00000001U, q2, r1;"
161 "\n\tmadc.hi.u64.cc r2, 0xFFFFFFFF00000001U, q2, r2;"
162 "\n\tmadc.lo.u64.cc r3, 0xFFFFFFFF00000001U, q4, r3;"
163 "\n\tmadc.hi.u64 r4, 0xFFFFFFFF00000001U, q4, r4;"
173 "\n\tsub.u64.cc %0, u0, r0;"
174 "\n\tsubc.u64.cc %1, u1, r1;"
175 "\n\tsubc.u64.cc %2, u2, r2;"
176 "\n\tsubc.u64.cc %3, u3, r3;"
177 "\n\tsubc.u64.cc u4, u4, r4;"
178 "\n\tsetp.ne.u64 nz, u4, 0;"
182 "\n @nz\tsub.u64.cc %0, %0, 0xFFFFFFFF00000001U;"
183 "\n @nz\tsubc.u64.cc %1, %1, 0x53BDA402FFFE5BFEU;"
184 "\n @nz\tsubc.u64.cc %2, %2, 0x3339D80809A1D805U;"
185 "\n @nz\tsubc.u64 %3, %3, 0x73EDA753299D7D48U;"
188 :
"+l"(z0),
"+l"(z1),
"+l"(z2),
"+l"(z3)
189 :
"l"(x0),
"l"(x1),
"l"(x2),
"l"(x3)
192 z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3;
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t)....
__device__ void fr_mul(fr_t &z, const fr_t &x)
Multiply two residues module r z and x, stores back into z.