-
Notifications
You must be signed in to change notification settings - Fork 815
/
Copy pathx25519.s
367 lines (320 loc) · 9.99 KB
/
x25519.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/* Copyright lowRISC contributors. */
/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
/* SPDX-License-Identifier: Apache-2.0 */
/**
* This library contains code for X25519, for use in elliptic-curve Diffie
* Hellman key exchange.
*
* Helpful references:
* RFC: https://datatracker.ietf.org/doc/html/rfc7748
* Academic paper: https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf
*/
/**
* Top-level X25519 function, as defined in RFC 7748.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w8: enc(k), encoded scalar (256-bit random number)
* @param[in] w9: enc(u), encoded Montgomery u-coordinate (256 bits)
* @param[out] w22: result, X25519(k, u) as an encoded u-coordinate
*
* clobbered registers: w2 to w24
* clobbered flag groups: FG0
*/
.globl X25519
X25519:
/* Prepare all-zero register. */
bn.xor w31, w31, w31
/* Prepare constant for field operations.
w19 <= 19 */
bn.addi w19, w31, 19
/* Load modulus from DMEM.
MOD <= dmem[modulus25519] = 2^255 - 19 = p */
li x2, 2
la x3, modulus25519
bn.lid x2, 0(x3)
bn.wsrw 0x0, w2
/* Decode scalar. From RFC 7748, section 5:
"For X25519, in order to decode 32 random bytes as an integer scalar, set
the three least significant bits of the first byte and the most significant bit
of the last to zero, set the second most significant bit of the last byte to 1
and, finally, decode as little-endian." */
/* w8 <= (w8 >> 3) = enc(k) >> 3 */
bn.rshi w8, w31, w8 >> 3
/* w8 <= w8 << 5 = ((enc(k) >> 3) << 5) mod 2^256 */
bn.rshi w8, w8, w31 >> 251
/* w8 <= 2^254 + (w8 >> 2) = ((enc(k) >> 3) << 3) mod 2^254 + 2^254 = k*/
bn.addi w7, w31, 1
bn.rshi w8, w7, w8 >> 2
/* Decode point u-coordinate. As specified in RFC 7748, section 5, we must
zero the most significant bit and accept non-canonical values (i.e. cases
where p <= u) as if they had been reduced modulo p. */
/* w7 <= ~w31 = 2^256 - 1 */
bn.not w7, w31
/* w7 <= w7 >> 1 = 2^255 - 1 */
bn.rshi w7, w31, w7 >> 1
/* w9 <= w9 & w7 = enc(u) mod 2^255 */
bn.and w9, w9, w7
/* w9 <= w9 mod p <= (enc(u) mod 2^255) mod p = u */
bn.addm w9, w9, w31
/* Note that, in contrast to Ed25519, the given point does not have to be
validated; see the original Curve25519 paper for discussion (search
for "small-subgroup attacks" and "free key validation").
https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf */
/* Perform scalar multiplication.
w22 <= scalar_mult(k, u) = X25519(k, u) */
jal x1, scalar_mult
/* Since the scalar multiplication routine has already reduced its result
modulo p and OTBN is little-endian, encoding the result is a no-op. */
ret
/**
* Elliptic-curve scalar multiplication for Curve25519.
*
* Returns B = kA.
*
* Multiplies a point on the curve (A) with a scalar (k). Uses the Montgomery
* ladder algorithm as in RFC 7748. For the Montgomery ladder, we only need to
* consider the u-coordinate of curve points.
*
* See RFC 7748, section 5:
* https://datatracker.ietf.org/doc/html/rfc7748#section-5
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w8: k, 255-bit scalar value
* @param[in] w9: u, Montgomery u-coordinate of point A (k < p)
* @param[in] w19: constant, w19 = 19
* @param[in] w31, all-zero
* @param[in] MOD: p, modulus = 2^255 - 19
* @param[out] w22: result, Montgomery u-coordinate of point kA
*
* clobbered registers: w2 to w7, w10 to w18, w20 to w24
* clobbered flag groups: FG0
*/
scalar_mult:
/* Load the constant a24:
w24 <= dmem[a24] = 121665 */
li x2, 24
la x3, a24
bn.lid x2, 0(x3)
/* Set initial values. From RFC 7748:
x_2 = 1
z_2 = 0
x_3 = u
z_3 = 1
swap = 0
*/
/* x_2 = w10 <= 1 */
bn.addi w10, w31, 1
/* z_2 = w12 <= 0 */
bn.mov w12, w31
/* x_3 = w11 <= u */
bn.mov w11, w9
/* z_3 = w13 <= 1 */
bn.addi w13, w31, 1
/* swap = w15 <= 0 */
bn.mov w15, w31
/* Initialize w4 for loop.
w14 <= (k << 1) mod 2^256 */
bn.rshi w14, w8, w31 >> 255
/* Main loop. From RFC 7748 (with `ladderstep` factored into a separate step):
For t = bits-1 down to 0:
k_t = (k >> t) & 1
swap ^= k_t
(x_2, x_3) = cswap(swap, x_2, x_3)
(z_2, z_3) = cswap(swap, z_2, z_3)
swap = k_t
ladderstep(u, x_2, x_3, z_2, z_3)
Loop invariants:
w14 = (k << (255-t)) mod 2^256
w9 = u
Loop temporary registers: w6, w7
*/
loopi 255, 12
/* w7 <= w14 >> 255 = (k << (255-t)) mod 2^256 >> 255 = k[t] = k_t */
bn.rshi w7, w31, w14 >> 255
/* w14 <= w14 << 1 = (k << (255-(t-1)) mod 2^256 */
bn.rshi w14, w14, w31 >> 255
/* swap = w15 = w15 ^ w7 = swap ^ k_t */
bn.xor w15, w15, w7
/* Note: The xor above set FG0.Z if and only if swap ^ k_t is zero. */
/* Conditionally swap x_2 and x_3. */
/* w6 <= if swap =? 0 then x_2 else x_3 */
bn.sel w6, w10, w11, FG0.Z
/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
bn.sel w11, w11, w10, FG0.Z
/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
bn.mov w10, w6
/* Conditionally swap z_2 and z_3. */
/* w6 <= if swap =? 0 then z_2 else z_3 */
bn.sel w6, w12, w13, FG0.Z
/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
bn.sel w13, w13, w12, FG0.Z
/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
bn.mov w12, w6
/* swap = w15 <= w7 = k_t */
bn.mov w15, w7
jal x1, ladderstep
nop
/* Loop done; final steps. From RFC 7748:
(x_2, x_3) = cswap(swap, x_2, x_3)
(z_2, z_3) = cswap(swap, z_2, z_3)
Return x_2 * (z_2^(p - 2))
*/
/* Set the FG0.Z flag if swap (w15) is 0. */
bn.add w15, w15, w31
/* Conditionally swap x_2 and x_3. */
/* w6 <= if swap =? 0 then x_2 else x_3 */
bn.sel w6, w10, w11, FG0.Z
/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
bn.sel w11, w11, w10, FG0.Z
/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
bn.mov w10, w6
/* Conditionally swap z_2 and z_3. */
/* w6 <= if swap =? 0 then z_2 else z_3 */
bn.sel w6, w12, w13, FG0.Z
/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
bn.sel w13, w13, w12, FG0.Z
/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
bn.mov w12, w6
/* w22 <= fe_inv(w12) = z_2^(p-2) */
bn.mov w16, w12
jal x1, fe_inv
/* w22 <= w22 * w10 = (z_2^(p-2)) * x_2 */
bn.mov w23, w10
jal x1, fe_mul
ret
/**
* Performs the core arithmetic step of the Montgomery ladder.
*
* This subroutine is intended to be used as part of a Montgomery ladder
* implementation. It performs the core arithmetic in the main loop, i.e. the
* following steps (from RFC 7748, section 5):
* A = x_2 + z_2
* AA = A^2
* B = x_2 - z_2
* BB = B^2
* E = AA - BB
* C = x_3 + z_3
* D = x_3 - z_3
* DA = D * A
* CB = C * B
* x_3 = (DA + CB)^2
* z_3 = x_1 * (DA - CB)^2
* x_2 = AA * BB
* z_2 = E * (AA + a24 * E)
*
* All arithmetic (+, -, *, ^) above is in the finite field GF(2^255-19).
*
* TODO: investigate alternative laddersteps, such as the one in
* curve25519-donna, which may be faster.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w9: x_1, Montgomery u-coordinate of point A (k < p)
* @param[in] w19: constant, w19 = 19
* @param[in] w24: constant a24 = 121665
* @param[in] w31: all-zero
* @param[in] MOD: p, modulus = 2^255 - 19
* @param[in,out] w10: x_2
* @param[in,out] w11: x_3
* @param[in,out] w12: z_2
* @param[in,out] w13: z_3
*
* clobbered registers: w2 to w5, w10 to w13, w17, w18, w20 to w23
* clobbered flag groups: FG0
*/
ladderstep:
/* First, compute the new x_2 and z_2:
A = x_2 + z_2
AA = A^2
B = x_2 - z_2
BB = B^2
E = AA - BB
x_2 = AA * BB
z_2 = E * (AA + a24 * E) */
/* w2 <= w10 + w12 = x_2 + z_2 = A */
bn.addm w2, w10, w12
/* w3 <= w2^2 = A^2 = AA */
bn.mov w22, w2
jal x1, fe_square
bn.mov w3, w22
/* w4 <= w10 - w12 = x_2 - z_2 = B */
bn.subm w4, w10, w12
/* w22 <= w4^2 = B^2 = BB */
bn.mov w22, w4
jal x1, fe_square
/* w5 <= w3 - w22 = AA - BB = E */
bn.subm w5, w3, w22
/* x_2 = w10 <= w22 * w3 = BB * AA */
bn.mov w23, w3
jal x1, fe_mul
bn.mov w10, w22
/* w23 <= w5 = E */
bn.mov w23, w5
/* TODO: create faster, specialized fe_mul for a24 */
/* w22 <= w24 * w23 = a24 * E */
bn.mov w22, w24
jal x1, fe_mul
/* w22 <= w3 + w22 = AA + (a24 * E) */
bn.add w22, w3, w22
/* z_2 = w12 <= w22 * w23 = (AA + a24 * E) * E */
jal x1, fe_mul
bn.mov w12, w22
/* Now, compute the new x_3 and z_3, using A and B from the computation
above:
C = x_3 + z_3
D = x_3 - z_3
DA = D * A
CB = C * B
x_3 = (DA + CB)^2
z_3 = x_1 * (DA - CB)^2 */
/* w22 <= w11 + w13 = x_3 + z_3 = C */
bn.addm w22, w11, w13
/* w4 <= w22 * w4 = C * B = CB */
bn.mov w23, w4
jal x1, fe_mul
bn.mov w4, w22
/* w22 <= w11 - w13 = x_3 - z_3 = D */
bn.subm w22, w11, w13
/* w2 <= w22 * w2 = D * A = DA */
bn.mov w23, w2
jal x1, fe_mul
bn.mov w2, w22
/* w22 <= w2 + w4 <= DA + CB */
bn.addm w22, w2, w4
/* x_3 = w11 <= w22^2 = (DA + CB)^2 */
jal x1, fe_square
bn.mov w11, w22
/* w22 <= w2 - w4 <= DA - CB */
bn.subm w22, w2, w4
/* w22 <= w22^2 = (DA - CB)^2 */
jal x1, fe_square
/* z_3 = w13 <= w22 * w9 = (DA - CB)^2 * x_1 */
bn.mov w23, w9
jal x1, fe_mul
bn.mov w13, w22
ret
.data
/* Modulus p = 2^255 - 19. */
.balign 32
modulus25519:
.word 0xffffffed
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0x7fffffff
/* Constant (a - 2) / 4 for Curve25519 = 121665 (see RFC 7748, section 5). */
.balign 32
a24:
.word 0x0001db41
.zero 28