FK20 CUDA
g1p_add.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include <stdio.h>
6 
7 #include "fp.cuh"
8 #include "g1.cuh"
9 
10 #include "fp_add.cuh"
11 #include "fp_mul.cuh"
12 #include "fp_sub.cuh"
13 #include "fp_x3.cuh"
14 #include "fp_x12.cuh"
15 #include "fp_reduce12.cuh"
16 
17 
29 __device__ void g1p_add(g1p_t &p, const g1p_t &q) {
30 
31 #ifndef NDEBUG
32  if (!g1p_isPoint(p) || !(g1p_isPoint(q))) {
33  //printf("ERROR in g1p_add(): Invalid point(s)\n");
34  //g1p_print("p: ", p);
35  //g1p_print("q: ", q);
36 
37  // return invalid point as result
38  fp_zero(p.x);
39  fp_zero(p.y);
40  fp_zero(p.z);
41 
42  return;
43  }
44 #endif
45 
46 #if 1
47  fp_t
48  X1, Y1, Z1,
49  X2, Y2, Z2,
50  t0, t1, t2, t3;
51 
52  fp_cpy(X1, p.x);
53  fp_cpy(Y1, p.y);
54  fp_cpy(Z1, p.z);
55 
56  fp_cpy(X2, q.x);
57  fp_cpy(Y2, q.y);
58  fp_cpy(Z2, q.z);
59 
60  // Adapted from eprint 2015-1060, algorithm 7.
61  // Modified to remove one temp value and avoid overwriting inputs.
62  // 12 mul, 0 square, 11 add, 5 sub, 2 x12, 1 x3.
63 
64  fp_add(t0, X1, Y1); // t3
65  fp_add(t1, Y1, Z1); // t8
66  fp_add(t2, Z1, X1); // td
67 
68  fp_mul(X1, X1, X2); // t0
69  fp_mul(Y1, Y1, Y2); // t1
70  fp_mul(Z1, Z1, Z2); // t2
71 
72  fp_add(t3, X2, Y2); // t4
73  fp_add(Y2, Y2, Z2); // t9
74  fp_add(Z2, Z2, X2); // te
75 
76  fp_mul(X2, t3, t0); // t5
77  fp_mul(Y2, Y2, t1); // ta
78  fp_mul(Z2, Z2, t2); // tf
79 
80  fp_x3(t0, X1); // ti
81  fp_add(t1, Y1, Z1); // tb
82  fp_add(t2, Z1, X1); // tg
83  fp_x12(t3, Z1); // tk
84 
85  fp_add(X1, X1, Y1); // t6
86  fp_add(Z1, Y1, t3); // tl
87  fp_sub(Y1, Y1, t3); // tm
88 
89  fp_sub(X1, X2, X1); // t7
90  fp_mul(X2, X1, t0); // ts
91 
92  fp_mul(X1, X1, Y1); // tp
93  fp_mul(Y1, Y1, Z1); // tr
94 
95  fp_sub(Y2, Y2, t1); // tc
96  fp_mul(Z1, Z1, Y2); // tt
97  fp_sub(Z2, Z2, t2); // th
98 
99  fp_x12(Z2, Z2); // tn
100  fp_mul(Y2, Y2, Z2); // to
101  fp_mul(Z2, Z2, t0); // tq
102 
103  fp_sub(X1, X1, Y2); // X3
104  fp_add(Y1, Y1, Z2); // Y3
105  fp_add(Z1, Z1, X2); // Z3
106 
107  fp_cpy(p.x, X1);
108  fp_cpy(p.y, Y1);
109  fp_cpy(p.z, Z1);
110 #else
111  uint64_t
112  x0 = p.x[0], y0 = p.y[0], z0 = p.z[0],
113  x1 = p.x[1], y1 = p.y[1], z1 = p.z[1],
114  x2 = p.x[2], y2 = p.y[2], z2 = p.z[2],
115  x3 = p.x[3], y3 = p.y[3], z3 = p.z[3],
116  x4 = p.x[4], y4 = p.y[4], z4 = p.z[4],
117  x5 = p.x[5], y5 = p.y[5], z5 = p.z[5],
118 
119  u0 = q.x[0], v0 = q.y[0], w0 = q.z[0],
120  u1 = q.x[1], v1 = q.y[1], w1 = q.z[1],
121  u2 = q.x[2], v2 = q.y[2], w2 = q.z[2],
122  u3 = q.x[3], v3 = q.y[3], w3 = q.z[3],
123  u4 = q.x[4], v4 = q.y[4], w4 = q.z[4],
124  u5 = q.x[5], v5 = q.y[5], w5 = q.z[5];
125 
126  asm volatile (
127  "\n\t{"
128  "\n\t.reg .u64 X1<10>, X1a, X1b;"
129  "\n\t.reg .u64 X2<10>, X2a, X2b;"
130  "\n\t.reg .u64 Y1<10>, Y1a, Y1b;"
131  "\n\t.reg .u64 Y2<10>, Y2a, Y2b;"
132  "\n\t.reg .u64 Z1<10>, Z1a, Z1b;"
133  "\n\t.reg .u64 Z2<10>, Z2a, Z2b;"
134  "\n\t.reg .u64 t0<6>, t1<6>, t2<6>, t3<6>;"
135 
136  "\n\t.reg .u64 t<6>;"
137  "\n\t.reg .u32 z6;"
138  "\n\t.reg .pred ne, gt;"
139 
140  "\n\tmov.u64 X10, %0;"
141  "\n\tmov.u64 X11, %1;"
142  "\n\tmov.u64 X12, %2;"
143  "\n\tmov.u64 X13, %3;"
144  "\n\tmov.u64 X14, %4;"
145  "\n\tmov.u64 X15, %5;"
146 
147  "\n\tmov.u64 Y10, %6;"
148  "\n\tmov.u64 Y11, %7;"
149  "\n\tmov.u64 Y12, %8;"
150  "\n\tmov.u64 Y13, %9;"
151  "\n\tmov.u64 Y14, %10;"
152  "\n\tmov.u64 Y15, %11;"
153 
154  "\n\tmov.u64 Z10, %12;"
155  "\n\tmov.u64 Z11, %13;"
156  "\n\tmov.u64 Z12, %14;"
157  "\n\tmov.u64 Z13, %15;"
158  "\n\tmov.u64 Z14, %16;"
159  "\n\tmov.u64 Z15, %17;"
160 
161  "\n\tmov.u64 X20, %18;"
162  "\n\tmov.u64 X21, %19;"
163  "\n\tmov.u64 X22, %20;"
164  "\n\tmov.u64 X23, %21;"
165  "\n\tmov.u64 X24, %22;"
166  "\n\tmov.u64 X25, %23;"
167 
168  "\n\tmov.u64 Y20, %24;"
169  "\n\tmov.u64 Y21, %25;"
170  "\n\tmov.u64 Y22, %26;"
171  "\n\tmov.u64 Y23, %27;"
172  "\n\tmov.u64 Y24, %28;"
173  "\n\tmov.u64 Y25, %29;"
174 
175  "\n\tmov.u64 Z20, %30;"
176  "\n\tmov.u64 Z21, %31;"
177  "\n\tmov.u64 Z22, %32;"
178  "\n\tmov.u64 Z23, %33;"
179  "\n\tmov.u64 Z24, %34;"
180  "\n\tmov.u64 Z25, %35;"
181 
182 FP_ADD(t0, X1, Y1) // t3
183 FP_ADD(t1, Y1, Z1) // t8
184 FP_ADD(t2, Z1, X1) // td
185 
186 FP_MUL(X1, X1, X2) // t0
187 FP_REDUCE12(X1)
188 FP_MUL(Y1, Y1, Y2) // t1
189 FP_REDUCE12(Y1)
190 FP_MUL(Z1, Z1, Z2) // t2
191 FP_REDUCE12(Z1)
192 
193 FP_ADD(t3, X2, Y2) // t4
194 FP_ADD(Y2, Y2, Z2) // t9
195 FP_ADD(Z2, Z2, X2) // te
196 
197 FP_MUL(X2, t3, t0) // t5
198 FP_REDUCE12(X2)
199 FP_MUL(Y2, Y2, t1) // ta
200 FP_REDUCE12(Y2)
201 FP_MUL(Z2, Z2, t2) // tf
202 FP_REDUCE12(Z2)
203 
204 FP_X3(t0, X1) // ti
205 FP_ADD(t1, Y1, Z1) // tb
206 FP_ADD(t2, Z1, X1) // tg
207 FP_X12(t3, Z1) // tk
208 
209 FP_ADD(X1, X1, Y1) // t6
210 FP_ADD(Z1, Y1, t3) // tl
211 FP_SUB(Y1, Y1, t3) // tm
212 
213 FP_SUB(X1, X2, X1) // t7
214 FP_MUL(X2, X1, t0) // ts
215 FP_REDUCE12(X2)
216 
217 FP_MUL(X1, X1, Y1) // tp
218 FP_REDUCE12(X1)
219 FP_MUL(Y1, Y1, Z1) // tr
220 FP_REDUCE12(Y1)
221 
222 FP_SUB(Y2, Y2, t1) // tc
223 FP_MUL(Z1, Z1, Y2) // tt
224 FP_REDUCE12(Z1)
225 FP_SUB(Z2, Z2, t2) // th
226 
227 FP_X12(Z2, Z2) // tn
228 FP_MUL(Y2, Y2, Z2) // to
229 FP_REDUCE12(Y2)
230 FP_MUL(Z2, Z2, t0) // tq
231 FP_REDUCE12(Z2)
232 
233 FP_SUB(X1, X1, Y2) // X3
234 FP_ADD(Y1, Y1, Z2) // Y3
235 FP_ADD(Z1, Z1, X2) // Z3
236 
237  "\n\tmov.u64 %0, X10;"
238  "\n\tmov.u64 %1, X11;"
239  "\n\tmov.u64 %2, X12;"
240  "\n\tmov.u64 %3, X13;"
241  "\n\tmov.u64 %4, X14;"
242  "\n\tmov.u64 %5, X15;"
243 
244  "\n\tmov.u64 %6, Y10;"
245  "\n\tmov.u64 %7, Y11;"
246  "\n\tmov.u64 %8, Y12;"
247  "\n\tmov.u64 %9, Y13;"
248  "\n\tmov.u64 %10, Y14;"
249  "\n\tmov.u64 %11, Y15;"
250 
251  "\n\tmov.u64 %12, Z10;"
252  "\n\tmov.u64 %13, Z11;"
253  "\n\tmov.u64 %14, Z12;"
254  "\n\tmov.u64 %15, Z13;"
255  "\n\tmov.u64 %16, Z14;"
256  "\n\tmov.u64 %17, Z15;"
257 
258  "\n\t}"
259  :
260  "+l"(x0), "+l"(x1), "+l"(x2), "+l"(x3), "+l"(x4), "+l"(x5),
261  "+l"(y0), "+l"(y1), "+l"(y2), "+l"(y3), "+l"(y4), "+l"(y5),
262  "+l"(z0), "+l"(z1), "+l"(z2), "+l"(z3), "+l"(z4), "+l"(z5)
263  :
264  "l"(u0), "l"(u1), "l"(u2), "l"(u3), "l"(u4), "l"(u5),
265  "l"(v0), "l"(v1), "l"(v2), "l"(v3), "l"(v4), "l"(v5),
266  "l"(w0), "l"(w1), "l"(w2), "l"(w3), "l"(w4), "l"(w5)
267  );
268 
269  p.x[0] = x0, p.x[1] = x1, p.x[2] = x2, p.x[3] = x3, p.x[4] = x4, p.x[5] = x5;
270  p.y[0] = y0, p.y[1] = y1, p.y[2] = y2, p.y[3] = y3, p.y[4] = y4, p.y[5] = y5;
271  p.z[0] = z0, p.z[1] = z1, p.z[2] = z2, p.z[3] = z3, p.z[4] = z4, p.z[5] = z5;
272 #endif
273 }
274 
275 // vim: ts=4 et sw=4 si
__device__ __host__ void fp_zero(fp_t &z)
Sets z to zero.
Definition: fp.cu:15
__device__ void fp_add(fp_t &z, const fp_t &x, const fp_t &y)
Computes the sum of two residues x and y modulo p and stores it in z. Device only function.
Definition: fp_add.cu:17
__device__ void fp_x12(fp_t &z, const fp_t &x)
Multiplies the residue mod p x by 12 and stores the result into z.
Definition: fp_x12.cu:15
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, and stored as a 6-element li...
Definition: fp.cuh:14
__device__ void fp_mul(fp_t &z, const fp_t &x, const fp_t &y)
Multiplies two Fp residues x and y, stores in z.
Definition: fp_mul.cu:17
__device__ __host__ void fp_cpy(fp_t &z, const fp_t &x)
Copy from x into z.
Definition: fp_cpy.cu:14
__device__ void fp_x3(fp_t &z, const fp_t &x)
Multiplies x by 3 and stores the result into z.
Definition: fp_x3.cu:15
__device__ void fp_sub(fp_t &z, const fp_t &x, const fp_t &y)
Calculates the difference of two residues modulo p and stores it into z.
Definition: fp_sub.cu:16
#define FP_ADD(Z, X, Y)
PTX macro for addition of two residues modulo p. Z←X+Y.
Definition: fp_add.cuh:11
#define FP_MUL(Z, X, Y)
PTX macro for multiplication of two residues mod p Reads X0..X5 and Y0..Y5. Writes Z0....
Definition: fp_mul.cuh:8
#define FP_REDUCE12(Z)
Wide reduction over 12 words.
Definition: fp_reduce12.cuh:12
#define FP_SUB(Z, X, Y)
PTX macro for calculating de difference of two residues modulo p, Z = X-Y.
Definition: fp_sub.cuh:12
#define FP_X12(Z, X)
PTX macro for multiplication by 12. Stores in Z.
Definition: fp_x12.cuh:10
#define FP_X3(Z, X)
PTX macro for multiplication by 3. Stores in Z.
Definition: fp_x3.cuh:11
__device__ bool g1p_isPoint(const g1p_t &p)
Check if the value stored in p is a valid point on the G1 curve.
Definition: g1p_ispoint.cu:34
__device__ void g1p_add(g1p_t &p, const g1p_t &q)
Computes the sum of two points q into p, using projective coordinates. and stores in p.
Definition: g1p_add.cu:29
G1 point in projective coordinates.
Definition: g1.cuh:27
fp_t z
Definition: g1.cuh:28
fp_t x
Definition: g1.cuh:28
fp_t y
Definition: g1.cuh:28