FK20 CUDA
fp_reduce12.cuh
Go to the documentation of this file.
1 // bls12_381: Arithmetic for BLS12-381
2 // Copyright 2022-2023 Dag Arne Osvik
3 // Copyright 2022-2023 Luan Cardoso dos Santos
4 
5 #ifndef FP_REDUCE12
6 
/**
 * @brief PTX fragment: Barrett-style reduction of the 12-word value Z0..Zb into the 6 words Z0..Z5.
 *
 * The macro expands to a run of adjacent string literals, one PTX instruction
 * each, so it is presumably meant to be pasted into the instruction string of
 * an inline asm() statement (confirm against the call sites).
 *
 * The q1/q2/q3 and r1/r2 names in the section comments follow the usual
 * description of Barrett reduction. The three steps are:
 *  1. Multiply the high words of Z by the 7-word constant mu0..mu6 and keep
 *     only the top words of the product in q0..q7.
 *  2. Form r0..r6, the low 7 words of (q1..q7) * (m0..m5). Partial products
 *     that would land above r6 are never computed.
 *  3. Subtract r0..r5 from Z0..Z5 with a borrow chain. The final borrow is
 *     discarded.
 *
 * The constants m5..m0 spell 0x1A0111EA...FFFFAAAB, which matches the
 * BLS12-381 base field modulus named in the file header. mu is presumably the
 * matching Barrett constant (about 2^768 / m) - TODO confirm how it was
 * generated.
 *
 * @param Z Register-name prefix. It is stringified and followed by one hex
 *          digit 0..b; Z0 is the least significant word. Z4..Zb are read for
 *          the quotient estimate, Z0..Z5 are read and overwritten with the
 *          result, and Z6..Zb are left unmodified.
 *
 * Scratch: q0..q7 and r0..r6 are used as 64-bit registers and the carry flag
 * is overwritten. None of these registers is declared here, so the
 * surrounding PTX has to declare them.
 *
 * NOTE(review): there is no conditional subtraction of m at the end, so the
 * result is presumably congruent to the input mod m but not necessarily fully
 * reduced - confirm the range callers expect.
 */
12 #define FP_REDUCE12(Z) \
13 \
14  /* Step 1: q2 = q1 * mu; q3 = q2 / 2^448 (the estimate reads Z4..Zb) */ \
15  /* q1..q7 receive q3; q0 sits one word below and only feeds carries into q1 (it is not read in steps 2 and 3) */ \
16  /* mu0: lowest-weight word of mu, so only Za and Zb reach the kept words */ \
17 \
18  "\n\tmul.hi.u64 q0, 0x13E207F56591BA2EU, "#Z"a;" \
19 \
20  "\n\tmad.lo.u64.cc q0, 0x13E207F56591BA2EU, "#Z"b, q0;" \
21  "\n\tmadc.hi.u64 q1, 0x13E207F56591BA2EU, "#Z"b, 0;" \
22 \
23  /* mu1 */ \
24  /* from here on each mu word uses two carry chains: the first initialises the next q word, the second ends in addc to fold its carry into that word */ \
25  "\n\tmad.hi.u64.cc q0, 0x997167A058F1C07BU, "#Z"9, q0;" \
26  "\n\tmadc.lo.u64.cc q1, 0x997167A058F1C07BU, "#Z"b, q1;" \
27  "\n\tmadc.hi.u64 q2, 0x997167A058F1C07BU, "#Z"b, 0;" \
28 \
29  "\n\tmad.lo.u64.cc q0, 0x997167A058F1C07BU, "#Z"a, q0;" \
30  "\n\tmadc.hi.u64.cc q1, 0x997167A058F1C07BU, "#Z"a, q1;" \
31  "\n\taddc.u64 q2, q2, 0;" \
32 \
33  /* mu2 */ \
34 \
35  "\n\tmad.lo.u64.cc q0, 0xDF4771E0286779D3U, "#Z"9, q0;" \
36  "\n\tmadc.hi.u64.cc q1, 0xDF4771E0286779D3U, "#Z"9, q1;" \
37  "\n\tmadc.lo.u64.cc q2, 0xDF4771E0286779D3U, "#Z"b, q2;" \
38  "\n\tmadc.hi.u64 q3, 0xDF4771E0286779D3U, "#Z"b, 0;" \
39 \
40  "\n\tmad.hi.u64.cc q0, 0xDF4771E0286779D3U, "#Z"8, q0;" \
41  "\n\tmadc.lo.u64.cc q1, 0xDF4771E0286779D3U, "#Z"a, q1;" \
42  "\n\tmadc.hi.u64.cc q2, 0xDF4771E0286779D3U, "#Z"a, q2;" \
43  "\n\taddc.u64 q3, q3, 0;" \
44 \
45  /* mu3 */ \
46 \
47  "\n\tmad.hi.u64.cc q0, 0x1B82741FF6A0A94BU, "#Z"7, q0;" \
48  "\n\tmadc.lo.u64.cc q1, 0x1B82741FF6A0A94BU, "#Z"9, q1;" \
49  "\n\tmadc.hi.u64.cc q2, 0x1B82741FF6A0A94BU, "#Z"9, q2;" \
50  "\n\tmadc.lo.u64.cc q3, 0x1B82741FF6A0A94BU, "#Z"b, q3;" \
51  "\n\tmadc.hi.u64 q4, 0x1B82741FF6A0A94BU, "#Z"b, 0;" \
52 \
53  "\n\tmad.lo.u64.cc q0, 0x1B82741FF6A0A94BU, "#Z"8, q0;" \
54  "\n\tmadc.hi.u64.cc q1, 0x1B82741FF6A0A94BU, "#Z"8, q1;" \
55  "\n\tmadc.lo.u64.cc q2, 0x1B82741FF6A0A94BU, "#Z"a, q2;" \
56  "\n\tmadc.hi.u64.cc q3, 0x1B82741FF6A0A94BU, "#Z"a, q3;" \
57  "\n\taddc.u64 q4, q4, 0;" \
58 \
59  /* mu4 */ \
60 \
61  "\n\tmad.lo.u64.cc q0, 0x28101B0CC7A6BA29U, "#Z"7, q0;" \
62  "\n\tmadc.hi.u64.cc q1, 0x28101B0CC7A6BA29U, "#Z"7, q1;" \
63  "\n\tmadc.lo.u64.cc q2, 0x28101B0CC7A6BA29U, "#Z"9, q2;" \
64  "\n\tmadc.hi.u64.cc q3, 0x28101B0CC7A6BA29U, "#Z"9, q3;" \
65  "\n\tmadc.lo.u64.cc q4, 0x28101B0CC7A6BA29U, "#Z"b, q4;" \
66  "\n\tmadc.hi.u64 q5, 0x28101B0CC7A6BA29U, "#Z"b, 0;" \
67 \
68  "\n\tmad.hi.u64.cc q0, 0x28101B0CC7A6BA29U, "#Z"6, q0;" \
69  "\n\tmadc.lo.u64.cc q1, 0x28101B0CC7A6BA29U, "#Z"8, q1;" \
70  "\n\tmadc.hi.u64.cc q2, 0x28101B0CC7A6BA29U, "#Z"8, q2;" \
71  "\n\tmadc.lo.u64.cc q3, 0x28101B0CC7A6BA29U, "#Z"a, q3;" \
72  "\n\tmadc.hi.u64.cc q4, 0x28101B0CC7A6BA29U, "#Z"a, q4;" \
73  "\n\taddc.u64 q5, q5, 0;" \
74 \
75  /* mu5 */ \
76 \
77  "\n\tmad.hi.u64.cc q0, 0xD835D2F3CC9E45CEU, "#Z"5, q0;" \
78  "\n\tmadc.lo.u64.cc q1, 0xD835D2F3CC9E45CEU, "#Z"7, q1;" \
79  "\n\tmadc.hi.u64.cc q2, 0xD835D2F3CC9E45CEU, "#Z"7, q2;" \
80  "\n\tmadc.lo.u64.cc q3, 0xD835D2F3CC9E45CEU, "#Z"9, q3;" \
81  "\n\tmadc.hi.u64.cc q4, 0xD835D2F3CC9E45CEU, "#Z"9, q4;" \
82  "\n\tmadc.lo.u64.cc q5, 0xD835D2F3CC9E45CEU, "#Z"b, q5;" \
83  "\n\tmadc.hi.u64 q6, 0xD835D2F3CC9E45CEU, "#Z"b, 0;" \
84 \
85  "\n\tmad.lo.u64.cc q0, 0xD835D2F3CC9E45CEU, "#Z"6, q0;" \
86  "\n\tmadc.hi.u64.cc q1, 0xD835D2F3CC9E45CEU, "#Z"6, q1;" \
87  "\n\tmadc.lo.u64.cc q2, 0xD835D2F3CC9E45CEU, "#Z"8, q2;" \
88  "\n\tmadc.hi.u64.cc q3, 0xD835D2F3CC9E45CEU, "#Z"8, q3;" \
89  "\n\tmadc.lo.u64.cc q4, 0xD835D2F3CC9E45CEU, "#Z"a, q4;" \
90  "\n\tmadc.hi.u64.cc q5, 0xD835D2F3CC9E45CEU, "#Z"a, q5;" \
91  "\n\taddc.u64 q6, q6, 0;" \
92 \
93  /* mu6: highest-weight word of mu; this is the only section that reads Z4 */ \
94 \
95  "\n\tmad.lo.u64.cc q0, 0x0000000000000009U, "#Z"5, q0;" \
96  "\n\tmadc.hi.u64.cc q1, 0x0000000000000009U, "#Z"5, q1;" \
97  "\n\tmadc.lo.u64.cc q2, 0x0000000000000009U, "#Z"7, q2;" \
98  "\n\tmadc.hi.u64.cc q3, 0x0000000000000009U, "#Z"7, q3;" \
99  "\n\tmadc.lo.u64.cc q4, 0x0000000000000009U, "#Z"9, q4;" \
100  "\n\tmadc.hi.u64.cc q5, 0x0000000000000009U, "#Z"9, q5;" \
101  "\n\tmadc.lo.u64.cc q6, 0x0000000000000009U, "#Z"b, q6;" \
102  "\n\tmadc.hi.u64 q7, 0x0000000000000009U, "#Z"b, 0;" \
103 \
104  "\n\tmad.hi.u64.cc q0, 0x0000000000000009U, "#Z"4, q0;" \
105  "\n\tmadc.lo.u64.cc q1, 0x0000000000000009U, "#Z"6, q1;" \
106  "\n\tmadc.hi.u64.cc q2, 0x0000000000000009U, "#Z"6, q2;" \
107  "\n\tmadc.lo.u64.cc q3, 0x0000000000000009U, "#Z"8, q3;" \
108  "\n\tmadc.hi.u64.cc q4, 0x0000000000000009U, "#Z"8, q4;" \
109  "\n\tmadc.lo.u64.cc q5, 0x0000000000000009U, "#Z"a, q5;" \
110  "\n\tmadc.hi.u64.cc q6, 0x0000000000000009U, "#Z"a, q6;" \
111  "\n\taddc.u64 q7, q7, 0;" \
112 \
113  /* Step 2: r2 = q3 * m mod 2^448 */ \
114  /* Z0..Z5 are still untouched here and hold the low words of the input (r1) */ \
115  /* q1..q7 contain q3 */ \
116  /* produces r2 in r0..r6; the last instruction of each chain targets r6 and has no .cc, so anything above r6 is dropped */ \
117 \
118  /* m5: highest-weight word of m, so only q1 and q2 contribute below 2^448 */ \
119 \
120  "\n\tmul.lo.u64 r5, 0x1A0111EA397FE69AU, q1 ;" \
121  "\n\tmul.hi.u64 r6, 0x1A0111EA397FE69AU, q1 ;" \
122  "\n\tmad.lo.u64 r6, 0x1A0111EA397FE69AU, q2, r6;" \
123 \
124  /* m4: mul.lo seeds the next lower word (r4); later sections do the same for r3..r0 */ \
125 \
126  "\n\tmul.lo.u64 r4, 0x4B1BA7B6434BACD7U, q1 ;" \
127  "\n\tmad.hi.u64.cc r5, 0x4B1BA7B6434BACD7U, q1, r5;" \
128  "\n\tmadc.lo.u64 r6, 0x4B1BA7B6434BACD7U, q3, r6;" \
129 \
130  "\n\tmad.lo.u64.cc r5, 0x4B1BA7B6434BACD7U, q2, r5;" \
131  "\n\tmadc.hi.u64 r6, 0x4B1BA7B6434BACD7U, q2, r6;" \
132 \
133  /* m3 */ \
134 \
135  "\n\tmul.lo.u64 r3, 0x64774B84F38512BFU, q1 ;" \
136  "\n\tmad.hi.u64.cc r4, 0x64774B84F38512BFU, q1, r4;" \
137  "\n\tmadc.lo.u64.cc r5, 0x64774B84F38512BFU, q3, r5;" \
138  "\n\tmadc.hi.u64 r6, 0x64774B84F38512BFU, q3, r6;" \
139 \
140  "\n\tmad.lo.u64.cc r4, 0x64774B84F38512BFU, q2, r4;" \
141  "\n\tmadc.hi.u64.cc r5, 0x64774B84F38512BFU, q2, r5;" \
142  "\n\tmadc.lo.u64 r6, 0x64774B84F38512BFU, q4, r6;" \
143 \
144  /* m2 */ \
145 \
146  "\n\tmul.lo.u64 r2, 0x6730D2A0F6B0F624U, q1 ;" \
147  "\n\tmad.hi.u64.cc r3, 0x6730D2A0F6B0F624U, q1, r3;" \
148  "\n\tmadc.lo.u64.cc r4, 0x6730D2A0F6B0F624U, q3, r4;" \
149  "\n\tmadc.hi.u64.cc r5, 0x6730D2A0F6B0F624U, q3, r5;" \
150  "\n\tmadc.lo.u64 r6, 0x6730D2A0F6B0F624U, q5, r6;" \
151 \
152  "\n\tmad.lo.u64.cc r3, 0x6730D2A0F6B0F624U, q2, r3;" \
153  "\n\tmadc.hi.u64.cc r4, 0x6730D2A0F6B0F624U, q2, r4;" \
154  "\n\tmadc.lo.u64.cc r5, 0x6730D2A0F6B0F624U, q4, r5;" \
155  "\n\tmadc.hi.u64 r6, 0x6730D2A0F6B0F624U, q4, r6;" \
156 \
157  /* m1 */ \
158 \
159  "\n\tmul.lo.u64 r1, 0x1EABFFFEB153FFFFU, q1 ;" \
160  "\n\tmad.hi.u64.cc r2, 0x1EABFFFEB153FFFFU, q1, r2;" \
161  "\n\tmadc.lo.u64.cc r3, 0x1EABFFFEB153FFFFU, q3, r3;" \
162  "\n\tmadc.hi.u64.cc r4, 0x1EABFFFEB153FFFFU, q3, r4;" \
163  "\n\tmadc.lo.u64.cc r5, 0x1EABFFFEB153FFFFU, q5, r5;" \
164  "\n\tmadc.hi.u64 r6, 0x1EABFFFEB153FFFFU, q5, r6;" \
165 \
166  "\n\tmad.lo.u64.cc r2, 0x1EABFFFEB153FFFFU, q2, r2;" \
167  "\n\tmadc.hi.u64.cc r3, 0x1EABFFFEB153FFFFU, q2, r3;" \
168  "\n\tmadc.lo.u64.cc r4, 0x1EABFFFEB153FFFFU, q4, r4;" \
169  "\n\tmadc.hi.u64.cc r5, 0x1EABFFFEB153FFFFU, q4, r5;" \
170  "\n\tmadc.lo.u64 r6, 0x1EABFFFEB153FFFFU, q6, r6;" \
171 \
172  /* m0: lowest-weight word of m; the only section that reads q7 */ \
173 \
174  "\n\tmul.lo.u64 r0, 0xB9FEFFFFFFFFAAABU, q1 ;" \
175  "\n\tmad.hi.u64.cc r1, 0xB9FEFFFFFFFFAAABU, q1, r1;" \
176  "\n\tmadc.lo.u64.cc r2, 0xB9FEFFFFFFFFAAABU, q3, r2;" \
177  "\n\tmadc.hi.u64.cc r3, 0xB9FEFFFFFFFFAAABU, q3, r3;" \
178  "\n\tmadc.lo.u64.cc r4, 0xB9FEFFFFFFFFAAABU, q5, r4;" \
179  "\n\tmadc.hi.u64.cc r5, 0xB9FEFFFFFFFFAAABU, q5, r5;" \
180  "\n\tmadc.lo.u64 r6, 0xB9FEFFFFFFFFAAABU, q7, r6;" \
181 \
182  "\n\tmad.lo.u64.cc r1, 0xB9FEFFFFFFFFAAABU, q2, r1;" \
183  "\n\tmadc.hi.u64.cc r2, 0xB9FEFFFFFFFFAAABU, q2, r2;" \
184  "\n\tmadc.lo.u64.cc r3, 0xB9FEFFFFFFFFAAABU, q4, r3;" \
185  "\n\tmadc.hi.u64.cc r4, 0xB9FEFFFFFFFFAAABU, q4, r4;" \
186  "\n\tmadc.lo.u64.cc r5, 0xB9FEFFFFFFFFAAABU, q6, r5;" \
187  "\n\tmadc.hi.u64 r6, 0xB9FEFFFFFFFFAAABU, q6, r6;" \
188 \
189  /* Step 3: r = r1 - r2 */ \
190  /* r1 is in Z0..Z5 */ \
191  /* r2 is in r0..r5 (r6 was accumulated above but is not used by this subtraction) */ \
192 \
193  /* z = r1 - r2, written back to Z0..Z5; the last subc has no .cc, so the final borrow is dropped */ \
194 \
195  "\n\tsub.u64.cc "#Z"0, "#Z"0, r0;" \
196  "\n\tsubc.u64.cc "#Z"1, "#Z"1, r1;" \
197  "\n\tsubc.u64.cc "#Z"2, "#Z"2, r2;" \
198  "\n\tsubc.u64.cc "#Z"3, "#Z"3, r3;" \
199  "\n\tsubc.u64.cc "#Z"4, "#Z"4, r4;" \
200  "\n\tsubc.u64 "#Z"5, "#Z"5, r5;"
201 
202 #endif
203 // vim: ts=4 et sw=4 si