16 z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3];
25 "\n\tmul.lo.u64 u3, %0, %3 ; mul.hi.u64 u4, %0, %3 ;"
27 "\n\tmul.lo.u64 u2, %0, %2 ; mad.hi.u64.cc u3, %0, %2, u3;"
28 "\n\tmadc.lo.u64.cc u4, %1, %3, u4; madc.hi.u64 u5, %1, %3, 0;"
30 "\n\tmul.lo.u64 u1, %0, %1 ; mad.hi.u64.cc u2, %0, %1, u2;"
31 "\n\tmadc.lo.u64.cc u3, %1, %2, u3; madc.hi.u64.cc u4, %1, %2, u4;"
32 "\n\tmadc.lo.u64.cc u5, %2, %3, u5; madc.hi.u64 u6, %2, %3, 0;"
34 "\n\tadd.u64.cc u1, u1, u1;"
35 "\n\taddc.u64.cc u2, u2, u2;"
36 "\n\taddc.u64.cc u3, u3, u3;"
37 "\n\taddc.u64.cc u4, u4, u4;"
38 "\n\taddc.u64.cc u5, u5, u5;"
39 "\n\taddc.u64.cc u6, u6, u6;"
40 "\n\taddc.u64 u7, 0, 0;"
42 "\n\tmul.lo.u64 u0, %0, %0 ; mad.hi.u64.cc u1, %0, %0, u1;"
43 "\n\tmadc.lo.u64.cc u2, %1, %1, u2; madc.hi.u64.cc u3, %1, %1, u3;"
44 "\n\tmadc.lo.u64.cc u4, %2, %2, u4; madc.hi.u64.cc u5, %2, %2, u5;"
45 "\n\tmadc.lo.u64.cc u6, %3, %3, u6; madc.hi.u64 u7, %3, %3, u7;"
54 "\n\tmul.hi.u64 q0, 0x42737A020C0D6393U, u6;"
56 "\n\tmad.lo.u64.cc q0, 0x42737A020C0D6393U, u7, q0;"
57 "\n\tmadc.hi.u64 q1, 0x42737A020C0D6393U, u7, 0;"
61 "\n\tmad.hi.u64.cc q0, 0x65043EB4BE4BAD71U, u5, q0;"
62 "\n\tmadc.lo.u64.cc q1, 0x65043EB4BE4BAD71U, u7, q1;"
63 "\n\tmadc.hi.u64 q2, 0x65043EB4BE4BAD71U, u7, 0;"
65 "\n\tmad.lo.u64.cc q0, 0x65043EB4BE4BAD71U, u6, q0;"
66 "\n\tmadc.hi.u64.cc q1, 0x65043EB4BE4BAD71U, u6, q1;"
67 "\n\taddc.u64 q2, q2, 0;"
71 "\n\tmad.lo.u64.cc q0, 0x38B5DCB707E08ED3U, u5, q0;"
72 "\n\tmadc.hi.u64.cc q1, 0x38B5DCB707E08ED3U, u5, q1;"
73 "\n\tmadc.lo.u64.cc q2, 0x38B5DCB707E08ED3U, u7, q2;"
74 "\n\tmadc.hi.u64 q3, 0x38B5DCB707E08ED3U, u7, 0;"
76 "\n\tmad.hi.u64.cc q0, 0x38B5DCB707E08ED3U, u4, q0;"
77 "\n\tmadc.lo.u64.cc q1, 0x38B5DCB707E08ED3U, u6, q1;"
78 "\n\tmadc.hi.u64.cc q2, 0x38B5DCB707E08ED3U, u6, q2;"
79 "\n\taddc.u64 q3, q3, 0;"
83 "\n\tmad.hi.u64.cc q0, 0x355094EDFEDE377CU, u3, q0;"
84 "\n\tmadc.lo.u64.cc q1, 0x355094EDFEDE377CU, u5, q1;"
85 "\n\tmadc.hi.u64.cc q2, 0x355094EDFEDE377CU, u5, q2;"
86 "\n\tmadc.lo.u64.cc q3, 0x355094EDFEDE377CU, u7, q3;"
87 "\n\tmadc.hi.u64 q4, 0x355094EDFEDE377CU, u7, 0;"
89 "\n\tmad.lo.u64.cc q0, 0x355094EDFEDE377CU, u4, q0;"
90 "\n\tmadc.hi.u64.cc q1, 0x355094EDFEDE377CU, u4, q1;"
91 "\n\tmadc.lo.u64.cc q2, 0x355094EDFEDE377CU, u6, q2;"
92 "\n\tmadc.hi.u64.cc q3, 0x355094EDFEDE377CU, u6, q3;"
93 "\n\taddc.u64 q4, q4, 0;"
97 "\n\tmad.lo.u64.cc q0, 0x0000000000000002U, u3, q0;"
98 "\n\tmadc.hi.u64.cc q1, 0x0000000000000002U, u3, q1;"
99 "\n\tmadc.lo.u64.cc q2, 0x0000000000000002U, u5, q2;"
100 "\n\tmadc.hi.u64.cc q3, 0x0000000000000002U, u5, q3;"
101 "\n\tmadc.lo.u64.cc q4, 0x0000000000000002U, u7, q4;"
102 "\n\tmadc.hi.u64 q5, 0x0000000000000002U, u7, 0;"
104 "\n\tmad.hi.u64.cc q0, 0x0000000000000002U, u2, q0;"
105 "\n\tmadc.lo.u64.cc q1, 0x0000000000000002U, u4, q1;"
106 "\n\tmadc.hi.u64.cc q2, 0x0000000000000002U, u4, q2;"
107 "\n\tmadc.lo.u64.cc q3, 0x0000000000000002U, u6, q3;"
108 "\n\tmadc.hi.u64.cc q4, 0x0000000000000002U, u6, q4;"
109 "\n\taddc.u64 q5, q5, 0;"
118 "\n\tmul.lo.u64 r3, 0x73EDA753299D7D48U, q1 ;"
119 "\n\tmul.hi.u64 r4, 0x73EDA753299D7D48U, q1 ;"
120 "\n\tmad.lo.u64 r4, 0x73EDA753299D7D48U, q2, r4;"
124 "\n\tmul.lo.u64 r2, 0x3339D80809A1D805U, q1 ;"
125 "\n\tmad.hi.u64.cc r3, 0x3339D80809A1D805U, q1, r3;"
126 "\n\tmadc.lo.u64 r4, 0x3339D80809A1D805U, q3, r4;"
128 "\n\tmad.lo.u64.cc r3, 0x3339D80809A1D805U, q2, r3;"
129 "\n\tmadc.hi.u64 r4, 0x3339D80809A1D805U, q2, r4;"
133 "\n\tmul.lo.u64 r1, 0x53BDA402FFFE5BFEU, q1 ;"
134 "\n\tmad.hi.u64.cc r2, 0x53BDA402FFFE5BFEU, q1, r2;"
135 "\n\tmadc.lo.u64.cc r3, 0x53BDA402FFFE5BFEU, q3, r3;"
136 "\n\tmadc.hi.u64 r4, 0x53BDA402FFFE5BFEU, q3, r4;"
138 "\n\tmad.lo.u64.cc r2, 0x53BDA402FFFE5BFEU, q2, r2;"
139 "\n\tmadc.hi.u64.cc r3, 0x53BDA402FFFE5BFEU, q2, r3;"
140 "\n\tmadc.lo.u64 r4, 0x53BDA402FFFE5BFEU, q4, r4;"
144 "\n\tmul.lo.u64 r0, 0xFFFFFFFF00000001U, q1 ;"
145 "\n\tmad.hi.u64.cc r1, 0xFFFFFFFF00000001U, q1, r1;"
146 "\n\tmadc.lo.u64.cc r2, 0xFFFFFFFF00000001U, q3, r2;"
147 "\n\tmadc.hi.u64.cc r3, 0xFFFFFFFF00000001U, q3, r3;"
148 "\n\tmadc.lo.u64 r4, 0xFFFFFFFF00000001U, q5, r4;"
150 "\n\tmad.lo.u64.cc r1, 0xFFFFFFFF00000001U, q2, r1;"
151 "\n\tmadc.hi.u64.cc r2, 0xFFFFFFFF00000001U, q2, r2;"
152 "\n\tmadc.lo.u64.cc r3, 0xFFFFFFFF00000001U, q4, r3;"
153 "\n\tmadc.hi.u64 r4, 0xFFFFFFFF00000001U, q4, r4;"
163 "\n\tsub.u64.cc %0, u0, r0;"
164 "\n\tsubc.u64.cc %1, u1, r1;"
165 "\n\tsubc.u64.cc %2, u2, r2;"
166 "\n\tsubc.u64.cc %3, u3, r3;"
167 "\n\tsubc.u64.cc u4, u4, r4;"
168 "\n\tsetp.ne.u64 nz, u4, 0;"
172 "\n @nz\tsub.u64.cc %0, %0, 0xFFFFFFFF00000001U;"
173 "\n @nz\tsubc.u64.cc %1, %1, 0x53BDA402FFFE5BFEU;"
174 "\n @nz\tsubc.u64.cc %2, %2, 0x3339D80809A1D805U;"
175 "\n @nz\tsubc.u64 %3, %3, 0x73EDA753299D7D48U;"
178 :
"+l"(z0),
"+l"(z1),
"+l"(z2),
"+l"(z3)
181 z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3;
uint64_t fr_t[4]
Subgroup element stored as a 256-bit array (a 4-element little-endian array of uint64_t).
__device__ void fr_sqr(fr_t &z)
Squares the value in z as a residue modulo r and stores the result back into z.