FK20 CUDA
g1p_dbl.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include "g1.cuh"
6 #include "fp.cuh"
7 #include "fp_x2.cuh"
8 #include "fp_x3.cuh"
9 #include "fp_x8.cuh"
10 #include "fp_x12.cuh"
11 #include "fp_add.cuh"
12 #include "fp_sub.cuh"
13 #include "fp_sqr.cuh"
14 #include "fp_mul.cuh"
15 #include "fp_reduce12.cuh"
16 
23 __device__ void g1p_dbl(g1p_t &p) {
24 
25  if (!g1p_isPoint(p)) {
26  g1p_print("ERROR in g1p_dbl(): Invalid point ", p);
27 
28  // return invalid point as result
29  fp_zero(p.x);
30  fp_zero(p.y);
31  fp_zero(p.z);
32 
33  return;
34  }
35 
36 #if 0
37  uint64_t
38  x0 = p.x[0], x1 = p.x[1], x2 = p.x[2], x3 = p.x[3], x4 = p.x[4], x5 = p.x[5],
39  y0 = p.y[0], y1 = p.y[1], y2 = p.y[2], y3 = p.y[3], y4 = p.y[4], y5 = p.y[5],
40  z0 = p.z[0], z1 = p.z[1], z2 = p.z[2], z3 = p.z[3], z4 = p.z[4], z5 = p.z[5];
41 
42  asm volatile (
43  "\n\t{"
44  "\n\t.reg .u64 v<6>, w<6>, x<6>, y<6>, z<6>;"
45  "\n\t.reg .u32 z6;"
46  "\n\t.reg .u64 u<10>, ua, ub;"
47  "\n\t.reg .u64 q<8>;"
48  "\n\t.reg .u64 r<7>;"
49  "\n\t.reg .u64 t<6>;"
50  "\n\t.reg .pred nz, gt;"
51 
52  "\n\tmov.u64 x0, %0;"
53  "\n\tmov.u64 x1, %1;"
54  "\n\tmov.u64 x2, %2;"
55  "\n\tmov.u64 x3, %3;"
56  "\n\tmov.u64 x4, %4;"
57  "\n\tmov.u64 x5, %5;"
58 
59  "\n\tmov.u64 y0, %6;"
60  "\n\tmov.u64 y1, %7;"
61  "\n\tmov.u64 y2, %8;"
62  "\n\tmov.u64 y3, %9;"
63  "\n\tmov.u64 y4, %10;"
64  "\n\tmov.u64 y5, %11;"
65 
66  "\n\tmov.u64 z0, %12;"
67  "\n\tmov.u64 z1, %13;"
68  "\n\tmov.u64 z2, %14;"
69  "\n\tmov.u64 z3, %15;"
70  "\n\tmov.u64 z4, %16;"
71  "\n\tmov.u64 z5, %17;"
72 
73  FP_MUL(u, x, y)
74  FP_REDUCE12(u)
75 
76  "\n\tmov.u64 x0, u0;"
77  "\n\tmov.u64 x1, u1;"
78  "\n\tmov.u64 x2, u2;"
79  "\n\tmov.u64 x3, u3;"
80  "\n\tmov.u64 x4, u4;"
81  "\n\tmov.u64 x5, u5;"
82 
83  FP_SQR(u, z)
84  FP_REDUCE12(u)
85 
86  "\n\tmov.u64 v0, u0;"
87  "\n\tmov.u64 v1, u1;"
88  "\n\tmov.u64 v2, u2;"
89  "\n\tmov.u64 v3, u3;"
90  "\n\tmov.u64 v4, u4;"
91  "\n\tmov.u64 v5, u5;"
92 
93  FP_X12(v, v)
94 
95  FP_MUL(u, z, y)
96  FP_REDUCE12(u)
97 
98  "\n\tmov.u64 z0, u0;"
99  "\n\tmov.u64 z1, u1;"
100  "\n\tmov.u64 z2, u2;"
101  "\n\tmov.u64 z3, u3;"
102  "\n\tmov.u64 z4, u4;"
103  "\n\tmov.u64 z5, u5;"
104 
105  FP_SQR(u, y)
106  FP_REDUCE12(u)
107 
108  "\n\tmov.u64 y0, u0;"
109  "\n\tmov.u64 y1, u1;"
110  "\n\tmov.u64 y2, u2;"
111  "\n\tmov.u64 y3, u3;"
112  "\n\tmov.u64 y4, u4;"
113  "\n\tmov.u64 y5, u5;"
114 
115  FP_X3 (w, v)
116 
117  FP_SUB(w, y, w)
118 
119  FP_MUL(u, x, w)
120  FP_REDUCE12(u)
121 
122  "\n\tmov.u64 x0, u0;"
123  "\n\tmov.u64 x1, u1;"
124  "\n\tmov.u64 x2, u2;"
125  "\n\tmov.u64 x3, u3;"
126  "\n\tmov.u64 x4, u4;"
127  "\n\tmov.u64 x5, u5;"
128 
129  FP_ADD(y, y, v)
130 
131  FP_MUL(u, w, y)
132  FP_REDUCE12(u)
133 
134  "\n\tmov.u64 w0, u0;"
135  "\n\tmov.u64 w1, u1;"
136  "\n\tmov.u64 w2, u2;"
137  "\n\tmov.u64 w3, u3;"
138  "\n\tmov.u64 w4, u4;"
139  "\n\tmov.u64 w5, u5;"
140 
141  FP_SUB(y, y, v)
142 
143  FP_X8 (y, y)
144 
145  FP_X2 (x, x)
146 
147  FP_MUL(u, z, y)
148  FP_REDUCE12(u)
149 
150  "\n\tmov.u64 z0, u0;"
151  "\n\tmov.u64 z1, u1;"
152  "\n\tmov.u64 z2, u2;"
153  "\n\tmov.u64 z3, u3;"
154  "\n\tmov.u64 z4, u4;"
155  "\n\tmov.u64 z5, u5;"
156 
157  FP_MUL(u, y, v)
158  FP_REDUCE12(u)
159 
160  "\n\tmov.u64 y0, u0;"
161  "\n\tmov.u64 y1, u1;"
162  "\n\tmov.u64 y2, u2;"
163  "\n\tmov.u64 y3, u3;"
164  "\n\tmov.u64 y4, u4;"
165  "\n\tmov.u64 y5, u5;"
166 
167  FP_ADD(y, y, w)
168 
169  "\n\tmov.u64 %0, x0;"
170  "\n\tmov.u64 %1, x1;"
171  "\n\tmov.u64 %2, x2;"
172  "\n\tmov.u64 %3, x3;"
173  "\n\tmov.u64 %4, x4;"
174  "\n\tmov.u64 %5, x5;"
175 
176  "\n\tmov.u64 %6, y0;"
177  "\n\tmov.u64 %7, y1;"
178  "\n\tmov.u64 %8, y2;"
179  "\n\tmov.u64 %9, y3;"
180  "\n\tmov.u64 %10, y4;"
181  "\n\tmov.u64 %11, y5;"
182 
183  "\n\tmov.u64 %12, z0;"
184  "\n\tmov.u64 %13, z1;"
185  "\n\tmov.u64 %14, z2;"
186  "\n\tmov.u64 %15, z3;"
187  "\n\tmov.u64 %16, z4;"
188  "\n\tmov.u64 %17, z5;"
189 
190  "\n\t}"
191  :
192  "+l"(x0), "+l"(x1), "+l"(x2), "+l"(x3), "+l"(x4), "+l"(x5),
193  "+l"(y0), "+l"(y1), "+l"(y2), "+l"(y3), "+l"(y4), "+l"(y5),
194  "+l"(z0), "+l"(z1), "+l"(z2), "+l"(z3), "+l"(z4), "+l"(z5)
195  );
196 
197  p.x[0] = x0; p.x[1] = x1; p.x[2] = x2; p.x[3] = x3; p.x[4] = x4; p.x[5] = x5;
198  p.y[0] = y0; p.y[1] = y1; p.y[2] = y2; p.y[3] = y3; p.y[4] = y4; p.y[5] = y5;
199  p.z[0] = z0; p.z[1] = z1; p.z[2] = z2; p.z[3] = z3; p.z[4] = z4; p.z[5] = z5;
200 #else
201  fp_t x, y, z, v, w;
202 
203  fp_cpy(x, p.x);
204  fp_cpy(y, p.y);
205  fp_cpy(z, p.z);
206 
207  fp_mul(x, x, y);
208  fp_sqr(v, z);
209  fp_x12(v, v);
210  fp_mul(z, z, y);
211  fp_sqr(y, y);
212  fp_x3(w, v);
213  fp_sub(w, y, w);
214  fp_mul(x, x, w);
215  fp_add(y, y, v);
216  fp_mul(w, w, y);
217  fp_sub(y, y, v);
218  fp_x8(y, y);
219  fp_x2(x, x);
220  fp_mul(z, z, y);
221  fp_mul(y, y, v);
222  fp_add(y, y, w);
223 
224  fp_cpy(p.x, x);
225  fp_cpy(p.y, y);
226  fp_cpy(p.z, z);
227 #endif
228 }
229 
230 // vim: ts=4 et sw=4 si
__device__ __host__ void fp_zero(fp_t &z)
Sets z to zero.
Definition: fp.cu:15
__device__ void fp_add(fp_t &z, const fp_t &x, const fp_t &y)
Computes the sum of two residues x and y modulo p and stores it in z. Device only function.
Definition: fp_add.cu:17
__device__ void fp_x8(fp_t &z, const fp_t &x)
Multiplies x by 8 and stores the result into z.
Definition: fp_x8.cu:15
__device__ void fp_x2(fp_t &z, const fp_t &x)
Multiplies x by 2 and stores the result into z.
Definition: fp_x2.cu:15
__device__ void fp_x12(fp_t &z, const fp_t &x)
Multiplies the residue mod p x by 12 and stores the result into z.
Definition: fp_x12.cu:15
__device__ void fp_sqr(fp_t &z, const fp_t &x)
Computes the square of the residue x modulo p and stores it in z.
Definition: fp_sqr.cu:16
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, and stored as a 6-element li...
Definition: fp.cuh:14
__device__ void fp_mul(fp_t &z, const fp_t &x, const fp_t &y)
Multiplies two Fp residues x and y, stores in z.
Definition: fp_mul.cu:17
__device__ __host__ void fp_cpy(fp_t &z, const fp_t &x)
Copy from x into z.
Definition: fp_cpy.cu:14
__device__ void fp_x3(fp_t &z, const fp_t &x)
Multiplies x by 3 and stores the result into z.
Definition: fp_x3.cu:15
__device__ void fp_sub(fp_t &z, const fp_t &x, const fp_t &y)
Calculates the difference of two residues modulo p and stores it into z.
Definition: fp_sub.cu:16
#define FP_ADD(Z, X, Y)
PTX macro for addition of two residues modulo p. Z←X+Y.
Definition: fp_add.cuh:11
#define FP_MUL(Z, X, Y)
PTX macro for multiplication of two residues mod p Reads X0..X5 and Y0..Y5. Writes Z0....
Definition: fp_mul.cuh:8
#define FP_REDUCE12(Z)
Wide reduction over 12 words.
Definition: fp_reduce12.cuh:12
#define FP_SQR(Z, X)
PTX macro for computing the square of the residue x modulo p. Stores in z.
Definition: fp_sqr.cuh:12
#define FP_SUB(Z, X, Y)
PTX macro for calculating de difference of two residues modulo p, Z = X-Y.
Definition: fp_sub.cuh:12
#define FP_X12(Z, X)
PTX macro for multiplication by 12. Stores in Z.
Definition: fp_x12.cuh:10
#define FP_X2(Z, X)
PTX macro for multiplication by 2. Stores in Z.
Definition: fp_x2.cuh:11
#define FP_X3(Z, X)
PTX macro for multiplication by 3. Stores in Z.
Definition: fp_x3.cuh:11
#define FP_X8(Z, X)
PTX macro for multiplication by 8. Stores in Z.
Definition: fp_x8.cuh:11
__device__ bool g1p_isPoint(const g1p_t &p)
Check if the value stored in p is a valid point on the G1 curve.
Definition: g1p_ispoint.cu:34
__device__ __host__ void g1p_print(const char *s, const g1p_t &p)
Print a standard representation of p, preceded by the user-set string s.
Definition: g1p.cu:80
__device__ void g1p_dbl(g1p_t &p)
G1 point doubling, with write back: p=2*p.
Definition: g1p_dbl.cu:23
G1 point in projective coordinates.
Definition: g1.cuh:27
fp_t z
Definition: g1.cuh:28
fp_t x
Definition: g1.cuh:28
fp_t y
Definition: g1.cuh:28