v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3], v4 = v[4], v5 = v[5],
w0 = w[0], w1 = w[1], w2 = w[2], w3 = w[3], w4 = w[4], w5 = w[5],
x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5],
y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5],
z0, z1, z2, z3, z4, z5;
30 "\n\t.reg .u64 v<6>, w<6>, x<6>, y<6>;"
31 "\n\t.reg .u64 u<10>, ua, ub;"
41 "\n\tmov.u64 v4, %10;"
42 "\n\tmov.u64 v5, %11;"
44 "\n\tmov.u64 w0, %12;"
45 "\n\tmov.u64 w1, %13;"
46 "\n\tmov.u64 w2, %14;"
47 "\n\tmov.u64 w3, %15;"
48 "\n\tmov.u64 w4, %16;"
49 "\n\tmov.u64 w5, %17;"
51 "\n\tmov.u64 x0, %18;"
52 "\n\tmov.u64 x1, %19;"
53 "\n\tmov.u64 x2, %20;"
54 "\n\tmov.u64 x3, %21;"
55 "\n\tmov.u64 x4, %22;"
56 "\n\tmov.u64 x5, %23;"
58 "\n\tmov.u64 y0, %24;"
59 "\n\tmov.u64 y1, %25;"
60 "\n\tmov.u64 y2, %26;"
61 "\n\tmov.u64 y3, %27;"
62 "\n\tmov.u64 y4, %28;"
63 "\n\tmov.u64 y5, %29;"
85 "\n\tadd.u64.cc u0, u0, v0;"
86 "\n\taddc.u64.cc u1, u1, v1;"
87 "\n\taddc.u64.cc u2, u2, v2;"
88 "\n\taddc.u64.cc u3, u3, v3;"
89 "\n\taddc.u64.cc u4, u4, v4;"
90 "\n\taddc.u64.cc u5, u5, v5;"
91 "\n\taddc.u64.cc u6, u6, w0;"
92 "\n\taddc.u64.cc u7, u7, w1;"
93 "\n\taddc.u64.cc u8, u8, w2;"
94 "\n\taddc.u64.cc u9, u9, w3;"
95 "\n\taddc.u64.cc ua, ua, w4;"
96 "\n\taddc.u64.cc ub, ub, w5;"
97 "\n\taddc.u32 uc, 0, 0;"
103 "\n\tsetp.ne.u32 nz, uc, 0;"
104 "\n@nz\tsub.u64.cc u6, u6, 0x89f6fffffffd0003U;"
105 "\n@nz\tsubc.u64.cc u7, u7, 0x140bfff43bf3fffdU;"
106 "\n@nz\tsubc.u64.cc u8, u8, 0xa0b767a8ac38a745U;"
107 "\n@nz\tsubc.u64.cc u9, u9, 0x8831a7ac8fada8baU;"
108 "\n@nz\tsubc.u64.cc ua, ua, 0xa3f8e5685da91392U;"
109 "\n@nz\tsubc.u64.cc ub, ub, 0xea09a13c057f1b6cU;"
110 "\n@nz\tsubc.u32 uc, uc, 0;"
114 "\n\tsetp.ne.u32 nz, uc, 0;"
115 "\n@nz\tsub.u64.cc u6, u6, 0x89f6fffffffd0003U;"
116 "\n@nz\tsubc.u64.cc u7, u7, 0x140bfff43bf3fffdU;"
117 "\n@nz\tsubc.u64.cc u8, u8, 0xa0b767a8ac38a745U;"
118 "\n@nz\tsubc.u64.cc u9, u9, 0x8831a7ac8fada8baU;"
119 "\n@nz\tsubc.u64.cc ua, ua, 0xa3f8e5685da91392U;"
120 "\n@nz\tsubc.u64.cc ub, ub, 0xea09a13c057f1b6cU;"
124 "\n\tmov.u64 %0, u0;"
125 "\n\tmov.u64 %1, u1;"
126 "\n\tmov.u64 %2, u2;"
127 "\n\tmov.u64 %3, u3;"
128 "\n\tmov.u64 %4, u4;"
129 "\n\tmov.u64 %5, u5;"
133 "=l"(z0),
"=l"(z1),
"=l"(z2),
"=l"(z3),
"=l"(z4),
"=l"(z5)
135 "l"(v0),
"l"(v1),
"l"(v2),
"l"(v3),
"l"(v4),
"l"(v5),
136 "l"(w0),
"l"(w1),
"l"(w2),
"l"(w3),
"l"(w4),
"l"(w5),
137 "l"(x0),
"l"(x1),
"l"(x2),
"l"(x3),
"l"(x4),
"l"(x5),
138 "l"(y0),
"l"(y1),
"l"(y2),
"l"(y3),
"l"(y4),
"l"(y5)
z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5;
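The conditional subtractions above fold the carry out of the 768-bit addition back into the twelve working words before the wide reduction runs. The pattern is easier to see on two limbs; the sketch below is a hypothetical helper (the name add_fold2 and the fold constant K are illustrative only), whereas the real code subtracts a six-word constant with a full borrow chain and repeats the test once more.

#include <cstdint>

__device__ void add_fold2(uint64_t z[2], const uint64_t a[2],
                          const uint64_t b[2], uint64_t K)
{
    asm("{"
        "\n\t.reg .u64 t0, t1;"
        "\n\t.reg .u32 c;"
        "\n\t.reg .pred nz;"
        "\n\tadd.cc.u64  t0, %2, %4;"  // low limbs, carry out
        "\n\taddc.cc.u64 t1, %3, %5;"  // high limbs, carry in and out
        "\n\taddc.u32    c, 0, 0;"     // capture the final carry bit
        "\n\tsetp.ne.u32 nz, c, 0;"    // did the sum overflow 128 bits?
        "\n@nz\tsub.u64  t1, t1, %6;"  // if so, fold it back via K
        "\n\tmov.u64 %0, t0;"
        "\n\tmov.u64 %1, t1;"
        "\n\t}"
        : "=l"(z[0]), "=l"(z[1])
        : "l"(a[0]), "l"(a[1]), "l"(b[0]), "l"(b[1]), "l"(K));
}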
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, stored as a 6-element little-endian array of 64-bit limbs.
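For instance, given the limb order implied by the carry chain above (index 0 is least significant), the residue 5 is stored as:

fp_t a = { 5, 0, 0, 0, 0, 0 };  // represents 5 + 0*2^64 + ... + 0*2^320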
__device__ void fp_mma(fp_t &z, const fp_t &v, const fp_t &w, const fp_t &x, const fp_t &y)
Fp multiply-multiply-add. Fast execution of z = (v*w + x*y) mod p. The double-wide products are added together so that only a single wide reduction is needed.
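A sketch of how the function might be called from device code; the kernel and buffer names below are illustrative, not part of the library:

__global__ void mma_kernel(fp_t *r, const fp_t *a, const fp_t *b,
                           const fp_t *c, const fp_t *d, size_t n)
{
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i < n)
        fp_mma(r[i], a[i], b[i], c[i], d[i]);  // r[i] = (a[i]*b[i] + c[i]*d[i]) mod p
}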
#define FP_MUL(Z, X, Y)
PTX macro for multiplication of two residues mod p. Reads X0..X5 and Y0..Y5. Writes the double-wide product to Z0..Zb.
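The macro body is not reproduced here, but the usual technique for such a limb product is a schoolbook multiply built from mul.lo/mul.hi pairs accumulated with carry-propagating mad instructions. A 2x2-limb sketch of that technique (the helper mul2x2 is hypothetical, not the macro itself):

#include <cstdint>

__device__ void mul2x2(uint64_t z[4], const uint64_t x[2], const uint64_t y[2])
{
    asm("{"
        "\n\t.reg .u64 z0, z1, z2, z3;"
        "\n\tmul.lo.u64     z0, %4, %6;"      // x0*y0, low half
        "\n\tmul.hi.u64     z1, %4, %6;"      // x0*y0, high half
        "\n\tmad.lo.cc.u64  z1, %4, %7, z1;"  // + x0*y1 low, carry out
        "\n\tmadc.hi.u64    z2, %4, %7, 0;"   // + x0*y1 high + carry
        "\n\tmad.lo.cc.u64  z1, %5, %6, z1;"  // + x1*y0 low, carry out
        "\n\tmadc.hi.cc.u64 z2, %5, %6, z2;"  // + x1*y0 high + carry
        "\n\taddc.u64       z3, 0, 0;"        // carry into the top word
        "\n\tmad.lo.cc.u64  z2, %5, %7, z2;"  // + x1*y1 low, carry out
        "\n\tmadc.hi.u64    z3, %5, %7, z3;"  // + x1*y1 high + carry
        "\n\tmov.u64 %0, z0;"
        "\n\tmov.u64 %1, z1;"
        "\n\tmov.u64 %2, z2;"
        "\n\tmov.u64 %3, z3;"
        "\n\t}"
        : "=l"(z[0]), "=l"(z[1]), "=l"(z[2]), "=l"(z[3])
        : "l"(x[0]), "l"(x[1]), "l"(y[0]), "l"(y[1]));
}

A full 6x6 multiply follows the same shape, producing the twelve words that FP_REDUCE12 consumes.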
#define FP_REDUCE12(Z)
Wide reduction of a 12-word (768-bit) value modulo p.
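One common structure for such a reduction (whether FP_REDUCE12 follows it exactly is an assumption, not something the listing shows) splits the value as H*2^384 + L and folds the high half back with a precomputed constant c = 2^384 mod p, finishing with a conditional subtraction. The same idea in miniature, for a single-word modulus p > 2^63:

#include <stdint.h>

/* Reduce hi*2^64 + lo modulo p, assuming p > 2^63. Hypothetical helper that
 * models the folding idea only, not the 12-word PTX code. */
uint64_t fold_reduce(uint64_t hi, uint64_t lo, uint64_t p)
{
    uint64_t c = (0 - p) % p;                  /* c = 2^64 mod p */
    unsigned __int128 t = (unsigned __int128)hi * c + lo;
    while (t >> 64)                            /* fold until one word remains */
        t = (unsigned __int128)(uint64_t)(t >> 64) * c + (uint64_t)t;
    uint64_t r = (uint64_t)t;
    return r >= p ? r - p : r;                 /* p > 2^63, so r < 2p here */
}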