FK20 CUDA
fp_reduce6.cu
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #include "fp.cuh"
6 
14 __device__ void fp_reduce6(fp_t &z) {
15  uint64_t
16  z0 = z[0],
17  z1 = z[1],
18  z2 = z[2],
19  z3 = z[3],
20  z4 = z[4],
21  z5 = z[5];
22 
23  asm volatile (
24  "\n\t{"
25  "\n\t.reg .u64 t<7>, q5;"
26  "\n\t.reg .pred nz;"
27 
28  // q1 = x/2^320; q2 = q1 * mu; q3 = q2 / 2^448
29 
30  "\n\tmul.hi.u64 q5, %5, 9;" // mu6 == 9
31 
32  // r2 = q3 * m mod 2^448
33 
34  "\n\tmul.lo.u64 t0, q5, 0xB9FEFFFFFFFFAAABU;" // p0
35  "\n\tmul.hi.u64 t1, q5, 0xB9FEFFFFFFFFAAABU;"
36 
37  "\n\tmul.lo.u64 t2, q5, 0x6730D2A0F6B0F624U;" // p2
38  "\n\tmul.hi.u64 t3, q5, 0x6730D2A0F6B0F624U;"
39 
40  "\n\tmul.lo.u64 t4, q5, 0x4B1BA7B6434BACD7U;" // p4
41  "\n\tmul.hi.u64 t5, q5, 0x4B1BA7B6434BACD7U;"
42 
43 
44  "\n\tmad.lo.u64.cc t1, q5, 0x1EABFFFEB153FFFFU, t1;" // p1
45  "\n\tmadc.hi.u64.cc t2, q5, 0x1EABFFFEB153FFFFU, t2;"
46 
47  "\n\tmadc.lo.u64.cc t3, q5, 0x64774B84F38512BFU, t3;" // p3
48  "\n\tmadc.hi.u64.cc t4, q5, 0x64774B84F38512BFU, t4;"
49 
50  "\n\tmadc.lo.u64.cc t5, q5, 0x1A0111EA397FE69AU, t5;" // p5
51  "\n\tmadc.hi.u64.cc t6, q5, 0x1A0111EA397FE69AU, 0;"
52 
53  // r = r1 - r2 = z - r2
54 
55  // Note: x < 2^384
56  // => q3 <= x/m
57  // => q3*m <= x
58  // => r2 <= x
59  // => r >= 0
60 
61  "\n\tsub.u64.cc %0, %0, t0;"
62  "\n\tsubc.u64.cc %1, %1, t1;"
63  "\n\tsubc.u64.cc %2, %2, t2;"
64  "\n\tsubc.u64.cc %3, %3, t3;"
65  "\n\tsubc.u64.cc %4, %4, t4;"
66  "\n\tsubc.u64 %5, %5, t5;"
67 
68  "\n\tsub.u64.cc t0, %0, 0xB9FEFFFFFFFFAAABU;"
69  "\n\tsubc.u64.cc t1, %1, 0x1EABFFFEB153FFFFU;"
70  "\n\tsubc.u64.cc t2, %2, 0x6730D2A0F6B0F624U;"
71  "\n\tsubc.u64.cc t3, %3, 0x64774B84F38512BFU;"
72  "\n\tsubc.u64.cc t4, %4, 0x4B1BA7B6434BACD7U;"
73  "\n\tsubc.u64.cc t5, %5, 0x1A0111EA397FE69AU;"
74  "\n\tsubc.u64 t6, 0, 0;"
75  "\n\tsetp.ne.u64 nz, t6, 0;"
76 
77  "\n@!nz\tmov.u64 %0, t0;"
78  "\n@!nz\tmov.u64 %1, t1;"
79  "\n@!nz\tmov.u64 %2, t2;"
80  "\n@!nz\tmov.u64 %3, t3;"
81  "\n@!nz\tmov.u64 %4, t4;"
82  "\n@!nz\tmov.u64 %5, t5;"
83 
84  "\n\tsub.u64.cc t0, %0, 0xB9FEFFFFFFFFAAABU;"
85  "\n\tsubc.u64.cc t1, %1, 0x1EABFFFEB153FFFFU;"
86  "\n\tsubc.u64.cc t2, %2, 0x6730D2A0F6B0F624U;"
87  "\n\tsubc.u64.cc t3, %3, 0x64774B84F38512BFU;"
88  "\n\tsubc.u64.cc t4, %4, 0x4B1BA7B6434BACD7U;"
89  "\n\tsubc.u64.cc t5, %5, 0x1A0111EA397FE69AU;"
90  "\n\tsubc.u64 t6, 0, 0;"
91  "\n\tsetp.ne.u64 nz, t6, 0;"
92 
93  "\n@!nz\tmov.u64 %0, t0;"
94  "\n@!nz\tmov.u64 %1, t1;"
95  "\n@!nz\tmov.u64 %2, t2;"
96  "\n@!nz\tmov.u64 %3, t3;"
97  "\n@!nz\tmov.u64 %4, t4;"
98  "\n@!nz\tmov.u64 %5, t5;"
99 
100  "\n\t}"
101  :
102  "+l"(z0), "+l"(z1), "+l"(z2), "+l"(z3), "+l"(z4), "+l"(z5)
103  );
104 
105  z[0] = z0, z[1] = z1, z[2] = z2, z[3] = z3, z[4] = z4, z[5] = z5;
106 }
107 
108 // vim: ts=4 et sw=4 si
uint64_t fp_t[6]
Residue modulo p. Any 384-bit representative of each residue is allowed, and stored as a 6-element li...
Definition: fp.cuh:14
__device__ void fp_reduce6(fp_t &z)
Narrow reduction of a residue modulo p, reducing to the canonical representation.
Definition: fp_reduce6.cu:14