summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDirk Engling <erdgeist@erdgeist.org>2013-07-15 14:41:01 +0200
committerDirk Engling <erdgeist@erdgeist.org>2013-07-15 14:41:01 +0200
commitce15c0e9a6a3801774a7f76a3036cdf6abd18466 (patch)
tree46893c2804cbf85d7c7a6a1bf43107f83c694bf3
Stable version
-rw-r--r--kara.c301
-rw-r--r--powm.c516
2 files changed, 817 insertions, 0 deletions
diff --git a/kara.c b/kara.c
new file mode 100644
index 0000000..2898e63
--- /dev/null
+++ b/kara.c
@@ -0,0 +1,301 @@
1#include <stdio.h>
2#include <stdint.h>
3#include <string.h>
4
5typedef uint8_t leg_t;
6typedef uint16_t dleg_t;
7
8enum { g_legs = 256 };
9
10#define dump_int(A,B,C) dump_int2(A,B,C)
11static void dump_int2( uint8_t const * p, int l, int nl )
12{
13 while( l-- )
14 printf( "%02X", *(p++) );
15 if( nl ) putchar( 10 );
16}
17
18//#define printf(...)
19//#define putchar(A)
20
21static int mp_cmp( leg_t const *a, leg_t const *b, int legs )
22{
23 int leg;
24 for( leg=0; leg<legs; ++leg )
25 {
26 if( a[leg] < b[leg] ) return -1;
27 if( a[leg] > b[leg] ) return 1;
28 }
29 return 0;
30}
31
32static void add( leg_t *oper, leg_t const *oper2, int legs )
33{
34 dleg_t acc = 0;
35 while( legs-- > 0 )
36 {
37 acc += oper[legs] + oper2[legs];
38 oper[legs] = (leg_t)acc;
39 acc >>= 8*sizeof(leg_t);
40 }
41 while( acc )
42 {
43 acc += oper[legs];
44 oper[legs--] = (leg_t)acc;
45 acc >>= 8*sizeof(leg_t);
46 }
47}
48
49static void adm( leg_t *oper, leg_t const *oper2, int legs, int flegs )
50{
51 dleg_t acc = 0;
52 while( legs-- > 0 )
53 {
54 acc += oper[legs] + oper2[legs];
55 oper[legs] = (leg_t)acc;
56 acc >>= 8*sizeof(leg_t);
57 }
58 while( acc && flegs-- )
59 {
60 acc += oper[legs];
61 oper[legs--] = (leg_t)acc;
62 acc >>= 8*sizeof(leg_t);
63 }
64}
65
66static void sub( leg_t *oper, leg_t const *oper2, int legs )
67{
68 int borrow = 0, borrow_temp;
69 while( legs-- )
70 {
71 leg_t temp = oper[legs] - oper2[legs];
72 borrow_temp = temp > oper[legs];
73 oper[legs] = temp - borrow;
74 borrow = borrow_temp | ( oper[legs] > temp );
75 }
76 while( borrow )
77 {
78 leg_t temp = oper[legs] - borrow;
79 borrow = temp > oper[legs];
80 oper[legs--] = temp;
81 }
82printf( "sub: %d\n", legs );
83}
84
85/* result is guaranteed to be initialized with enough legs prepended to take the carry */
86static void mp_mul_uint_add( leg_t *result, leg_t const *oper, leg_t fac, int legs )
87{
88 dleg_t acc = 0;
89 while( legs-- )
90 {
91 acc += (dleg_t)result[legs] + (dleg_t)oper[legs] * (dleg_t)fac;
92 result[legs] = (leg_t)acc;
93 acc >>= 8*sizeof(leg_t);
94 }
95 while( acc )
96 {
97 acc += result[legs];
98 result[legs--] = (leg_t)acc;
99 acc >>= 8*sizeof(leg_t);
100 }
101}
102
103static void mul( leg_t * result, leg_t const *oper, leg_t const *oper2, int legs )
104{
105 int leg = legs;
106 while( leg-- )
107 mp_mul_uint_add( result + 1 + leg, oper, oper2[leg], legs );
108}
109
110static int sub_mod( leg_t * result, leg_t const *a, leg_t const * b, int legs )
111{
112 int borrow = 0, borrow_temp;
113 while( legs-- )
114 {
115 leg_t temp = a[legs] - b[legs];
116 borrow_temp = temp > a[legs];
117 result[legs] = temp - borrow;
118 borrow = borrow_temp | (result[legs] > temp );
119 }
120 return borrow;
121}
122
123static void negate( leg_t * p, int legs )
124{
125 while( legs && !p[legs-1] ) p[legs--]^=(leg_t)-1;
126 if( !legs ) return;
127 p[legs-1]--;
128 while( legs--) p[legs]^=(leg_t)-1;
129}
130
131void kara2( leg_t* p, leg_t *a, leg_t *b, int len )
132{
133printf( "L%d karaloop: a*b ", len ); dump_int( a, len, 0 ); putchar('*'); dump_int( b, len, 1 );
134 memset( p, 0, 2 * len * sizeof( leg_t ));
135 if( len == 2 )
136 {
137 mul( p, a, b, len );
138 printf( "L%d p:", len ); dump_int( p, len*2, 1 );
139 }
140 else
141 {
142 leg_t scratch[len];
143 int sign = 0, n = len / 2;
144
145 if( mp_cmp( a, a + n, n ) > 0 )
146 sub_mod( scratch, a, a + n, n );
147 else
148 {
149 sub_mod( scratch, a + n, a, n );
150 sign = 1;
151 }
152
153 if( mp_cmp( b, b + n, n ) > 0 )
154 sub_mod( scratch + n, b, b + n, n );
155 else
156 {
157 sub_mod( scratch + n, b + n, b, n );
158 sign ^= 1;
159 }
160
161printf( "L%d (x0-x1)*(y1-y0)=", len ); dump_int( scratch, n, 0 ); putchar('*'); dump_int( scratch + n, n, 0 ); printf( "(sign = %d)\n", sign );
162
163 kara2( p + n, scratch, scratch + n, n );
164 if( !sign )
165 negate( p, len + n );
166
167printf( "L%d psub:", len ); dump_int( p, len*2, 1 );
168
169 kara2( scratch, a + n, b + n, n );
170 adm( p + len, scratch, len, len );
171 adm( p + n, scratch, len, n );
172
173 kara2( scratch, a, b, n );
174 adm( p + n, scratch, len, n );
175 adm( p, scratch, len, 0 );
176printf( "L%d p:", len ); dump_int( p, len*2, 1 );
177 }
178}
179
180void kara( uint8_t* p, uint8_t *a, uint8_t *b, int len )
181{
182 int i;
183memset( p, 0, 2 * len * sizeof(leg_t ) );
184 if( len <= 3 )
185 {
186printf( "L%d karaend: a*b ", len ); dump_int( a, len, 0 ); putchar('*'); dump_int( b, len, 1 );
187 mul( p, a, b, len );
188printf( "L%d result:\n", len ); dump_int( p, len*2, 1 );
189 }
190 else {
191 int n = len / 2;
192 int l = ( len + 1 ) / 2;
193 int o = l - n;
194 int sh = l + 1;
195 uint8_t scratch[2*sh];
196 uint8_t *midpoint = p + len * 2 - n;
197
198printf( "L%d karaloop: a*b ", len ); dump_int( a, len, 0 ); putchar('*'); dump_int( b, len, 1 );
199printf( "Values n: %d l: %d o: %d sh: %d\n", n, l, o, sh );
200
201 memset( scratch, 0, sizeof(scratch));
202 memcpy( scratch + 1 , a, l );
203 memcpy( scratch + 1 + sh , b, l );
204
205 /* P3 = (a1+a2)*(b1+b2) */
206 add( scratch + 1 + o, a + l, n );
207 add( scratch + sh + 1 + o, b + l, n );
208
209 printf( "L%d a1+a2: \n", len ); dump_int( scratch, sh, 1 );
210 printf( "L%d b1+b2: \n", len ); dump_int( scratch+sh, sh, 1 );
211
212 kara( midpoint - 2 * sh, scratch, scratch + sh, sh );
213
214 printf( "L%d (a1+a2)*(b1+b2)<<n: \n", len ); dump_int( p, len*2, 1 );
215
216 /* P2 = a2*b2 */
217 kara( scratch, a + l, b + l, n );
218
219 printf( "L%d a2*b2: \n", len ); dump_int( scratch, n*2, 1 );
220
221 /* P = ( P3 - P2 ) * 2^n + P2 ... and later: + P1 * 2^(2*n) - P1 * 2^n*/
222 add( midpoint - n, scratch, n * 2 );
223
224 printf( "L%d p + p2: \n", len ); dump_int( p, len*2, 1 );
225
226 sub( midpoint - n * 2, scratch, n * 2 );
227
228 printf( "L%d p - p2<<n: \n", len ); dump_int( p, len*2, 1 );
229
230 /* now + P1 * 2^(2*n) - P1 * 2^n*/
231 kara( scratch, a, b, l );
232
233 printf( "L%d a1*b1: \n", len ); dump_int( scratch, l*2, 1 );
234
235 sub( midpoint - l * 2, scratch, l * 2 );
236
237 printf( "L%d p - p1<<n: \n", len ); dump_int( p, len*2, 1 );
238
239 add( midpoint - n - l * 2, scratch, l * 2 );
240
241 printf( "L%d p + p1<<n^2: \n", len ); dump_int( p, len*2, 1 );
242 }
243}
244
245#undef putchar
246int main() {
247 uint8_t a[] = { 0x99,0xf3,0x0a,0x3d,0x8a,0xdd,0x2d,0x1f };
248 uint8_t b[] = { 0xf1,0x48,0x71,0x4b,0xf7,0xb1,0x9a,0x30 };
249
250// uint8_t a[g_legs], b[g_legs];
251// uint8_t p[2*g_legs];
252
253/*
254static const leg_t b[] = {
255 0x3D, 0xA9, 0x76, 0x59, 0xE2, 0x80, 0xDB, 0x0B, 0xE6, 0x5B, 0xCC, 0x3A, 0xB7, 0x8F, 0xDA, 0xA9, 0xB7, 0xB7, 0x68, 0xC8, 0x99, 0x31, 0xD7, 0x8D, 0xF8, 0xB1, 0x17, 0x25, 0x33, 0x9E, 0xBC, 0x93,
256 0xAA, 0x7F, 0xBD, 0x95, 0x62, 0x05, 0x9F, 0x1F, 0x4C, 0x2D, 0xE6, 0x7D, 0xAD, 0x47, 0x52, 0x7E, 0x52, 0x6A, 0x65, 0x3A, 0x7A, 0x67, 0x4B, 0xD5, 0x54, 0x01, 0xEA, 0x4F, 0x3E, 0xD7, 0x3A, 0x2F,
257 0x70, 0xB5, 0x6F, 0x52, 0x7F, 0x6F, 0x60, 0x4F, 0xF3, 0xE5, 0x6C, 0xC2, 0xBD, 0x9F, 0x04, 0x8C, 0xFE, 0xA8, 0x0D, 0x9A, 0x6E, 0xC9, 0xFC, 0xD3, 0x3A, 0xD3, 0x6F, 0xD8, 0x22, 0xC3, 0x9F, 0x34,
258 0x91, 0xF3, 0x0C, 0x52, 0xF7, 0x98, 0xDA, 0x6A, 0x18, 0xC3, 0xDC, 0xE2, 0x01, 0x88, 0xD8, 0x4C, 0xCB, 0x22, 0x51, 0x76, 0x25, 0x9E, 0x08, 0x0F, 0xA8, 0x9D, 0x1D, 0xCD, 0x9D, 0x38, 0x1C, 0xC5,
259 0xBE, 0xAC, 0xD4, 0x6F, 0x3C, 0xDD, 0x11, 0x96, 0x91, 0xA4, 0xF5, 0x57, 0x29, 0x29, 0xB9, 0x0C, 0x67, 0xDE, 0x8F, 0xA0, 0x23, 0x86, 0x47, 0x14, 0xC2, 0x8A, 0x61, 0xD4, 0x74, 0x11, 0x40, 0x2D,
260 0x6C, 0x09, 0x06, 0x0D, 0x41, 0x05, 0x86, 0x39, 0xE4, 0xFC, 0xCF, 0x1D, 0x63, 0x8F, 0x45, 0x66, 0x9E, 0x10, 0xFD, 0xE2, 0x8E, 0x54, 0x80, 0x6B, 0xF1, 0xD2, 0x7D, 0x0B, 0x5C, 0x7D, 0xC9, 0xC2,
261 0xB6, 0x16, 0xD6, 0xFA, 0x8B, 0xE2, 0xC9, 0x1D, 0xE8, 0x10, 0x54, 0x64, 0xE9, 0xF8, 0x0A, 0x5F, 0x34, 0x72, 0x08, 0x69, 0x90, 0xDA, 0xCF, 0x1A, 0x7E, 0x2C, 0x75, 0xA5, 0x8E, 0x25, 0xF1, 0x42,
262 0x8F, 0xB4, 0x83, 0x2E, 0xF8, 0x27, 0xDE, 0x84, 0xCA, 0x06, 0xDA, 0x91, 0xC2, 0xB3, 0xE7, 0xE2, 0x4F, 0x02, 0x41, 0x93, 0x78, 0x7A, 0x82, 0x78, 0x8B, 0xD7, 0x05, 0x62, 0xDA, 0x60, 0xE3, 0x92,
263 0x46, 0xBD, 0xB7, 0x33, 0x6E, 0x84, 0x52, 0xD9, 0x1D, 0x7D, 0x37, 0xA2, 0x3F, 0xB8, 0xCF, 0x61, 0xB1, 0x8A, 0x9E, 0xF1, 0x50, 0xC8, 0x95, 0x3A, 0xC9, 0x39, 0x19, 0xD1, 0x2A, 0x4B, 0x1A, 0x67,
264 0xFD, 0xC6, 0x5A, 0x26, 0x9B, 0x51, 0xC1, 0xEF, 0x09, 0x95, 0x48, 0x43, 0x20, 0xE7, 0x39, 0xF4, 0x6C, 0x79, 0x51, 0xA5, 0x23, 0xCE, 0xF7, 0x85, 0xE6, 0x0C, 0x6E, 0xFD, 0xFF, 0xB7, 0xA9, 0xA9,
265 0x5F, 0x66, 0x61, 0x46, 0xCB, 0x44, 0x1F, 0x59, 0xAE, 0x01, 0xE0, 0xF3, 0x63, 0xA9, 0x31, 0x5D, 0x04, 0xBA, 0x04, 0x4A, 0xEB, 0x4E, 0xEF, 0xD4, 0x85, 0x63, 0x21, 0x5F, 0x72, 0xC8, 0xD9, 0x89,
266 0x62, 0xD2, 0x18, 0x77, 0x12, 0x96, 0xEF, 0x6A, 0x20, 0xBD, 0x72, 0xB9, 0xB2, 0x1E, 0x6B, 0x3D, 0x53, 0xC4, 0x4F, 0xAB, 0x73, 0x48, 0x10, 0xF7, 0xD2, 0x03, 0xA9, 0xE0, 0xD7, 0xCE, 0x25, 0xD0,
267 0x2E, 0x52, 0x98, 0x9E, 0xCC, 0xF8, 0x5F, 0x34, 0x91, 0x2A, 0x04, 0x91, 0x3E, 0x9E, 0xBD, 0x87, 0x82, 0x67, 0x53, 0x7D, 0x4A, 0x61, 0x2A, 0x18, 0x51, 0xE7, 0x5D, 0x99, 0x98, 0xF0, 0x01, 0xDB,
268 0xC9, 0xC7, 0x7F, 0x0C, 0x35, 0x2D, 0x40, 0x8C, 0xA7, 0x96, 0xD1, 0x82, 0x04, 0xA6, 0x36, 0xF7, 0xE4, 0x40, 0x40, 0x92, 0x1C, 0x1E, 0x46, 0x7C, 0x52, 0x4E, 0x7C, 0x7A, 0x7E, 0xD3, 0x6C, 0x41,
269 0x2A, 0x43, 0x4C, 0xEB, 0x23, 0x0B, 0x2D, 0xFE, 0x35, 0x49, 0xC5, 0x77, 0x7A, 0x17, 0xFB, 0x04, 0xB8, 0x50, 0xDE, 0x95, 0xD9, 0x7A, 0xC4, 0x0A, 0x55, 0xEA, 0x6F, 0x75, 0x41, 0xC4, 0xF8, 0x2B,
270 0x37, 0xBF, 0x90, 0xFE, 0x52, 0x07, 0x4F, 0x19, 0xFA, 0x8F, 0x75, 0xF0, 0x06, 0x7E, 0x82, 0xB1, 0x8A, 0x1A, 0xC0, 0x24, 0xB3, 0x0E, 0x9B, 0x12, 0xC1, 0x4A, 0xB0, 0xDD, 0xCC, 0x03, 0xAA, 0x20
271};
272 uint8_t a[sizeof(b)];
273*/
274
275 uint8_t p[2*sizeof(b)];
276 int i;
277
278 srandomdev();
279
280// while( 1 )
281 {
282// memset( a, 0, sizeof(a) );
283// a[sizeof(b)-1]=2;
284/*
285 memset( b, 0, sizeof(b) );
286 a[g_legs-1] = random() & 0xff;
287 for( i=g_legs/2; i<sizeof(a); ++i )
288 {
289 b[i] = random() & 0xff;
290 }
291*/
292 kara2( p, a, b, sizeof(b));
293
294 dump_int2( a, sizeof(a), 0 );
295 putchar( '*' );
296 dump_int2( b, sizeof(b), 0 );
297 putchar( '-' );
298 dump_int2( p, sizeof(p), 1 );
299 }
300 return 0;
301}
diff --git a/powm.c b/powm.c
new file mode 100644
index 0000000..e847e54
--- /dev/null
+++ b/powm.c
@@ -0,0 +1,516 @@
1#include <stdint.h>
2#include <string.h>
3#include <stdio.h>
4
5/*
6This code implements base ^ exp mod p with base being a small integer,
7g given in g_prime and the r number of bits (4096) being used to pre-calculate
8g_r_square = r^2 mod p in order to speed up montgomery reduction
9
10Because our primes conveniently end in 0xf..f for word sizes up to 64, the
11multiplicative inverse is ("pre-calculated") g_r_inverse = 1.
12
13Todo: check if little endian internal representation is more efficient, also
14write an import/export function for the exponent/result from big endian uint8_t
15representation which is the default transport representation.
16
17Currently, with gcc -O3, we measure at around 2,75x the time libgmp needs for
18it's powm which is not so bad, considering we don't use any assembler optimization.
19
20Hot spot function is mp_mul_uint_add.
21
22When we're done measuring, we should set PREVENT_TIMING_ATTACKS so that powm()
23does not leak information about the number of bits set, but on average doubling the
24time needed. Discuss ;)
25*/
26
27#if 0
28typedef uint32_t leg_t;
29typedef uint64_t dleg_t;
30#else
31typedef uint64_t leg_t;
32typedef unsigned int uint128_t __attribute__((mode(TI)));
33typedef uint128_t dleg_t;
34#endif
35
36//#define WITH_MEASURE_GMP
37//#define WITH_PREVENT_TIMING_ATTACKS
38#define WITH_ROUNDS 128
39#define KARATSUBA_THRESHOLD 16
40
41#ifdef WITH_MEASURE_GMP
42#include <gmp.h>
43#endif
44
45#ifdef DEBUG_FUNCTIONS
46static void dump_int( int level, char * prefix, leg_t const * p, int l, int nl )
47{
48 printf( "L%d %s: ", level, prefix );
49 while( l-- )
50 printf( "%08llX", *(p++) );
51 if( nl )
52 putchar( 10 );
53}
54#endif
55
56/* Test values for smaller legs sizes */
57#if 0
58static leg_t g_prime[] = { 4293918689 };
59static leg_t g_r_square[] = { 333456065 };
60static leg_t g_r_inverse = 4192173023;
61#endif
62
63#if 0
64static leg_t g_prime[] = { 0xFFFFFFFF, 0xFFFFFFA1 };
65/* g_prime == 18446744073709551521 */
66/* g_r == 18446744073709551616 */
67/* g_r^2 == 340282366920938463463374607431768211456 */
68static leg_t g_r_square[] = { 0, 9025 };
69/* g_prime^-1 mod g_r = 12815632724892951649 */
70static leg_t g_r_inverse = 3571604383;
71#endif
72
73#if 0
74static leg_t g_prime[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffff61 };
75/* g_prime == 340282366920938463463374607431768211297 */
76/* g_r == 340282366920938463463374607431768211456 */
77/* g_r^2 == 115792089237316195423570985008687907853269984665640564039457584007913129639936 */
78static leg_t g_r_square[] = { 0, 0, 0, 25281 };
79/* g_prime^-1 mod g_r = 104866892950477891256008526818595234977 */
80static leg_t g_r_inverse = 783358815;
81#endif
82
83#if 0
84static leg_t g_prime[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x61 };
85/* g_prime == 340282366920938463463374607431768211297 */
86/* g_r == 340282366920938463463374607431768211456 */
87/* g_r^2 == 115792089237316195423570985008687907853269984665640564039457584007913129639936 */
88static leg_t g_r_square[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x62, 0xc1 };
89/* g_prime^-1 mod g_r = 104866892950477891256008526818595234977 */
90static leg_t g_r_inverse = 0x5f;
91#endif
92
93#if 0
94/* Standard prime for leg_t defined as uint32_t */
95static const leg_t g_prime[] = {
96 0xFFFFFFFF, 0xFFFFFFFF, 0xC90FDAA2, 0x2168C234, 0xC4C6628B, 0x80DC1CD1, 0x29024E08, 0x8A67CC74,
97 0x020BBEA6, 0x3B139B22, 0x514A0879, 0x8E3404DD, 0xEF9519B3, 0xCD3A431B, 0x302B0A6D, 0xF25F1437,
98 0x4FE1356D, 0x6D51C245, 0xE485B576, 0x625E7EC6, 0xF44C42E9, 0xA637ED6B, 0x0BFF5CB6, 0xF406B7ED,
99 0xEE386BFB, 0x5A899FA5, 0xAE9F2411, 0x7C4B1FE6, 0x49286651, 0xECE45B3D, 0xC2007CB8, 0xA163BF05,
100 0x98DA4836, 0x1C55D39A, 0x69163FA8, 0xFD24CF5F, 0x83655D23, 0xDCA3AD96, 0x1C62F356, 0x208552BB,
101 0x9ED52907, 0x7096966D, 0x670C354E, 0x4ABC9804, 0xF1746C08, 0xCA18217C, 0x32905E46, 0x2E36CE3B,
102 0xE39E772C, 0x180E8603, 0x9B2783A2, 0xEC07A28F, 0xB5C55DF0, 0x6F4C52C9, 0xDE2BCBF6, 0x95581718,
103 0x3995497C, 0xEA956AE5, 0x15D22618, 0x98FA0510, 0x15728E5A, 0x8AAAC42D, 0xAD33170D, 0x04507A33,
104 0xA85521AB, 0xDF1CBA64, 0xECFB8504, 0x58DBEF0A, 0x8AEA7157, 0x5D060C7D, 0xB3970F85, 0xA6E1E4C7,
105 0xABF5AE8C, 0xDB0933D7, 0x1E8C94E0, 0x4A25619D, 0xCEE3D226, 0x1AD2EE6B, 0xF12FFA06, 0xD98A0864,
106 0xD8760273, 0x3EC86A64, 0x521F2B18, 0x177B200C, 0xBBE11757, 0x7A615D6C, 0x770988C0, 0xBAD946E2,
107 0x08E24FA0, 0x74E5AB31, 0x43DB5BFC, 0xE0FD108E, 0x4B82D120, 0xA9210801, 0x1A723C12, 0xA787E6D7,
108 0x88719A10, 0xBDBA5B26, 0x99C32718, 0x6AF4E23C, 0x1A946834, 0xB6150BDA, 0x2583E9CA, 0x2AD44CE8,
109 0xDBBBC2DB, 0x04DE8EF9, 0x2E8EFC14, 0x1FBECAA6, 0x287C5947, 0x4E6BC05D, 0x99B2964F, 0xA090C3A2,
110 0x233BA186, 0x515BE7ED, 0x1F612970, 0xCEE2D7AF, 0xB81BDD76, 0x2170481C, 0xD0069127, 0xD5B05AA9,
111 0x93B4EA98, 0x8D8FDDC1, 0x86FFB7DC, 0x90A6C08F, 0x4DF435C9, 0x34063199, 0xFFFFFFFF, 0xFFFFFFFF
112};
113
114static const leg_t g_r_square[] = {
115 0x3DA97659, 0xE280DB0B, 0xE65BCC3A, 0xB78FDAA9, 0xB7B768C8, 0x9931D78D, 0xF8B11725, 0x339EBC93,
116 0xAA7FBD95, 0x62059F1F, 0x4C2DE67D, 0xAD47527E, 0x526A653A, 0x7A674BD5, 0x5401EA4F, 0x3ED73A2F,
117 0x70B56F52, 0x7F6F604F, 0xF3E56CC2, 0xBD9F048C, 0xFEA80D9A, 0x6EC9FCD3, 0x3AD36FD8, 0x22C39F34,
118 0x91F30C52, 0xF798DA6A, 0x18C3DCE2, 0x0188D84C, 0xCB225176, 0x259E080F, 0xA89D1DCD, 0x9D381CC5,
119 0xBEACD46F, 0x3CDD1196, 0x91A4F557, 0x2929B90C, 0x67DE8FA0, 0x23864714, 0xC28A61D4, 0x7411402D,
120 0x6C09060D, 0x41058639, 0xE4FCCF1D, 0x638F4566, 0x9E10FDE2, 0x8E54806B, 0xF1D27D0B, 0x5C7DC9C2,
121 0xB616D6FA, 0x8BE2C91D, 0xE8105464, 0xE9F80A5F, 0x34720869, 0x90DACF1A, 0x7E2C75A5, 0x8E25F142,
122 0x8FB4832E, 0xF827DE84, 0xCA06DA91, 0xC2B3E7E2, 0x4F024193, 0x787A8278, 0x8BD70562, 0xDA60E392,
123 0x46BDB733, 0x6E8452D9, 0x1D7D37A2, 0x3FB8CF61, 0xB18A9EF1, 0x50C8953A, 0xC93919D1, 0x2A4B1A67,
124 0xFDC65A26, 0x9B51C1EF, 0x09954843, 0x20E739F4, 0x6C7951A5, 0x23CEF785, 0xE60C6EFD, 0xFFB7A9A9,
125 0x5F666146, 0xCB441F59, 0xAE01E0F3, 0x63A9315D, 0x04BA044A, 0xEB4EEFD4, 0x8563215F, 0x72C8D989,
126 0x62D21877, 0x1296EF6A, 0x20BD72B9, 0xB21E6B3D, 0x53C44FAB, 0x734810F7, 0xD203A9E0, 0xD7CE25D0,
127 0x2E52989E, 0xCCF85F34, 0x912A0491, 0x3E9EBD87, 0x8267537D, 0x4A612A18, 0x51E75D99, 0x98F001DB,
128 0xC9C77F0C, 0x352D408C, 0xA796D182, 0x04A636F7, 0xE4404092, 0x1C1E467C, 0x524E7C7A, 0x7ED36C41,
129 0x2A434CEB, 0x230B2DFE, 0x3549C577, 0x7A17FB04, 0xB850DE95, 0xD97AC40A, 0x55EA6F75, 0x41C4F82B,
130 0x37BF90FE, 0x52074F19, 0xFA8F75F0, 0x067E82B1, 0x8A1AC024, 0xB30E9B12, 0xC14AB0DD, 0xCC03AA20
131};
132
133static const leg_t g_r_inverse = 1;
134*/
135#endif
136
137#if 1
138/* Standard prime for leg_t defined as uint64_t */
139static const leg_t g_prime[] = {
140 0xFFFFFFFFFFFFFFFF, 0xC90FDAA22168C234, 0xC4C6628B80DC1CD1, 0x29024E088A67CC74,
141 0x020BBEA63B139B22, 0x514A08798E3404DD, 0xEF9519B3CD3A431B, 0x302B0A6DF25F1437,
142 0x4FE1356D6D51C245, 0xE485B576625E7EC6, 0xF44C42E9A637ED6B, 0x0BFF5CB6F406B7ED,
143 0xEE386BFB5A899FA5, 0xAE9F24117C4B1FE6, 0x49286651ECE45B3D, 0xC2007CB8A163BF05,
144 0x98DA48361C55D39A, 0x69163FA8FD24CF5F, 0x83655D23DCA3AD96, 0x1C62F356208552BB,
145 0x9ED529077096966D, 0x670C354E4ABC9804, 0xF1746C08CA18217C, 0x32905E462E36CE3B,
146 0xE39E772C180E8603, 0x9B2783A2EC07A28F, 0xB5C55DF06F4C52C9, 0xDE2BCBF695581718,
147 0x3995497CEA956AE5, 0x15D2261898FA0510, 0x15728E5A8AAAC42D, 0xAD33170D04507A33,
148 0xA85521ABDF1CBA64, 0xECFB850458DBEF0A, 0x8AEA71575D060C7D, 0xB3970F85A6E1E4C7,
149 0xABF5AE8CDB0933D7, 0x1E8C94E04A25619D, 0xCEE3D2261AD2EE6B, 0xF12FFA06D98A0864,
150 0xD87602733EC86A64, 0x521F2B18177B200C, 0xBBE117577A615D6C, 0x770988C0BAD946E2,
151 0x08E24FA074E5AB31, 0x43DB5BFCE0FD108E, 0x4B82D120A9210801, 0x1A723C12A787E6D7,
152 0x88719A10BDBA5B26, 0x99C327186AF4E23C, 0x1A946834B6150BDA, 0x2583E9CA2AD44CE8,
153 0xDBBBC2DB04DE8EF9, 0x2E8EFC141FBECAA6, 0x287C59474E6BC05D, 0x99B2964FA090C3A2,
154 0x233BA186515BE7ED, 0x1F612970CEE2D7AF, 0xB81BDD762170481C, 0xD0069127D5B05AA9,
155 0x93B4EA988D8FDDC1, 0x86FFB7DC90A6C08F, 0x4DF435C934063199, 0xFFFFFFFFFFFFFFFF
156};
157
158static const leg_t g_r_square[] = {
159 0x3DA97659E280DB0B, 0xE65BCC3AB78FDAA9, 0xB7B768C89931D78D, 0xF8B11725339EBC93,
160 0xAA7FBD9562059F1F, 0x4C2DE67DAD47527E, 0x526A653A7A674BD5, 0x5401EA4F3ED73A2F,
161 0x70B56F527F6F604F, 0xF3E56CC2BD9F048C, 0xFEA80D9A6EC9FCD3, 0x3AD36FD822C39F34,
162 0x91F30C52F798DA6A, 0x18C3DCE20188D84C, 0xCB225176259E080F, 0xA89D1DCD9D381CC5,
163 0xBEACD46F3CDD1196, 0x91A4F5572929B90C, 0x67DE8FA023864714, 0xC28A61D47411402D,
164 0x6C09060D41058639, 0xE4FCCF1D638F4566, 0x9E10FDE28E54806B, 0xF1D27D0B5C7DC9C2,
165 0xB616D6FA8BE2C91D, 0xE8105464E9F80A5F, 0x3472086990DACF1A, 0x7E2C75A58E25F142,
166 0x8FB4832EF827DE84, 0xCA06DA91C2B3E7E2, 0x4F024193787A8278, 0x8BD70562DA60E392,
167 0x46BDB7336E8452D9, 0x1D7D37A23FB8CF61, 0xB18A9EF150C8953A, 0xC93919D12A4B1A67,
168 0xFDC65A269B51C1EF, 0x0995484320E739F4, 0x6C7951A523CEF785, 0xE60C6EFDFFB7A9A9,
169 0x5F666146CB441F59, 0xAE01E0F363A9315D, 0x04BA044AEB4EEFD4, 0x8563215F72C8D989,
170 0x62D218771296EF6A, 0x20BD72B9B21E6B3D, 0x53C44FAB734810F7, 0xD203A9E0D7CE25D0,
171 0x2E52989ECCF85F34, 0x912A04913E9EBD87, 0x8267537D4A612A18, 0x51E75D9998F001DB,
172 0xC9C77F0C352D408C, 0xA796D18204A636F7, 0xE44040921C1E467C, 0x524E7C7A7ED36C41,
173 0x2A434CEB230B2DFE, 0x3549C5777A17FB04, 0xB850DE95D97AC40A, 0x55EA6F7541C4F82B,
174 0x37BF90FE52074F19, 0xFA8F75F0067E82B1, 0x8A1AC024B30E9B12, 0xC14AB0DDCC03AA20
175};
176
177static const leg_t g_r_inverse = 1;
178#endif
179
180/* Returns 0 if a and b are equal, -1 if a < b, 1 if a > b */
181static int mp_cmp( leg_t const *a, leg_t const *b, int const legs )
182{
183 int leg;
184 for( leg=0; leg<legs; ++leg )
185 {
186 if( a[leg] < b[leg] ) return -1;
187 if( a[leg] > b[leg] ) return 1;
188 }
189 return 0;
190}
191
192/* Subtract b from a, store in a. Expects enough words prepended
193 to borrow from */
194static void mp_sub( leg_t *a, leg_t const *b, int legs )
195{
196 int borrow = 0, borrow_temp;
197 while( legs-- )
198 {
199 leg_t temp = a[legs] - b[legs];
200 borrow_temp = temp > a[legs];
201 a[legs] = temp - borrow;
202 borrow = borrow_temp | ( a[legs] > temp );
203 }
204 while( borrow )
205 {
206 leg_t temp = a[legs] - borrow;
207 borrow = temp > a[legs];
208 a[legs--] = temp;
209 }
210}
211
212/* Add b to a, store in a. Operates on legs + flegs words, with
213 flegs the amount of legs to propagate the carry to */
214static void mp_adm( leg_t *a, leg_t const *b, int legs, int flegs )
215{
216 dleg_t acc = 0;
217 while( legs-- > 0 )
218 {
219 acc += (dleg_t)a[legs] + (dleg_t)b[legs];
220 a[legs] = (leg_t)acc;
221 acc >>= 8*sizeof(leg_t);
222 }
223 while( acc && flegs-- )
224 {
225 acc += (dleg_t)a[legs];
226 a[legs--] = (leg_t)acc;
227 acc >>= 8*sizeof(leg_t);
228 }
229}
230
231/* Subtract b from a and store in result. Expects nothing to borrow.*/
232static void mp_sub_mod( leg_t * result, leg_t const *a, leg_t const * b, int legs )
233{
234 int borrow = 0, borrow_temp;
235 while( legs-- )
236 {
237 leg_t temp = a[legs] - b[legs];
238 borrow_temp = temp > a[legs];
239 result[legs] = temp - borrow;
240 borrow = borrow_temp | (result[legs] > temp );
241 }
242}
243
244/* Fast negate */
245static void mp_negate( leg_t * p, int legs )
246{
247 int legss = legs;
248 while( legs--) p[legs]^=(leg_t)-1;
249 while( legss-- && !++p[legss] );
250}
251
252/* Multiplies a with fac, adds to result.
253 result is guaranteed to be initialized with enough legs prepended to take the carry */
254static void mp_mul_uint_add( leg_t *result, leg_t const *a, leg_t fac, int legs )
255{
256 dleg_t acc = 0;
257 leg_t *r = result+legs-1;
258 while( legs-- )
259 {
260 acc += (dleg_t)*r + (dleg_t)a[legs] * (dleg_t)fac;
261 *(r--) = (leg_t)acc;
262 acc >>= 8*sizeof(leg_t);
263 }
264 while( acc )
265 {
266 acc += (dleg_t)*r;
267 *(r--) = (leg_t)acc;
268 acc >>= 8*sizeof(leg_t);
269 }
270}
271
272/* Multiplies a and b, adds to result, base case version. */
273static void mp_mul_oper_add( leg_t * result, leg_t const *a, leg_t const *b, int legs )
274{
275 int leg = legs;
276 while( leg-- )
277 mp_mul_uint_add( result + 1 + leg, a, b[leg], legs );
278}
279
280/* Optimized mp_mul_oper_add for a == b, i.e. squaring */
281static void mp_sqr( leg_t *result, leg_t const * a, int legs )
282{
283 while( legs-- ) {
284 leg_t *offs = result+2*legs+1;
285 leg_t fac = a[legs];
286 int leg = legs;
287 dleg_t acc = (dleg_t)*offs + (dleg_t)fac * (dleg_t)fac;
288
289 *(offs--) = (leg_t)acc;
290 acc >>= 8*sizeof(leg_t);
291
292 while( leg-- )
293 {
294 dleg_t subresult = (dleg_t)fac * (dleg_t)a[leg];
295 int carry = !!( subresult >> (16*sizeof(leg_t)-1));
296
297 acc += 2 * subresult + (dleg_t)*offs;
298 *(offs--) = (leg_t)acc;
299
300 acc >>= 8*sizeof(leg_t);
301 acc += (dleg_t)carry << 8*sizeof(leg_t);
302 }
303
304 while( acc )
305 {
306 acc += (dleg_t)*offs;
307 *(offs--) = (leg_t)acc;
308 acc >>= 8*sizeof(leg_t);
309 }
310 }
311}
312
313/* Optimized karatsuba (toom2.2) for a == b, i.e. squaring */
314static void mp_mul_kara_square( leg_t* p, leg_t const *a, int len, leg_t *scratch )
315{
316 memset( p, 0, 2 * len * sizeof( leg_t ));
317 if( len <= KARATSUBA_THRESHOLD )
318 mp_sqr( p, a, len );
319 else
320 {
321 int n = len / 2;
322
323 if( mp_cmp( a, a + n, n ) > 0 )
324 mp_sub_mod( scratch, a, a + n, n );
325 else
326 mp_sub_mod( scratch, a + n, a, n );
327
328 mp_mul_kara_square( p + n, scratch, n, scratch + len );
329 mp_negate( p, len + n );
330
331 mp_mul_kara_square( scratch, a + n, n, scratch + len );
332 mp_adm( p + len, scratch, len, len );
333 mp_adm( p + n, scratch, len, n );
334
335 mp_mul_kara_square( scratch, a, n, scratch + len );
336 mp_adm( p + n, scratch, len, n );
337 mp_adm( p, scratch, len, 0 );
338 }
339}
340
341/* karatsuba (toom2.2), generic */
342static void mp_mul_kara( leg_t* p, leg_t const *a, leg_t const *b, int len, leg_t *scratch )
343{
344 memset( p, 0, 2 * len * sizeof( leg_t ));
345 if( len <= KARATSUBA_THRESHOLD )
346 mp_mul_oper_add( p, a, b, len );
347 else
348 {
349 int sign = 0, n = len / 2;
350
351 if( mp_cmp( a, a + n, n ) > 0 )
352 mp_sub_mod( scratch, a, a + n, n );
353 else
354 {
355 mp_sub_mod( scratch, a + n, a, n );
356 sign = 1;
357 }
358
359 if( mp_cmp( b, b + n, n ) > 0 )
360 mp_sub_mod( scratch + n, b, b + n, n );
361 else
362 {
363 mp_sub_mod( scratch + n, b + n, b, n );
364 sign ^= 1;
365 }
366
367 mp_mul_kara( p + n, scratch, scratch + n, n, scratch + len );
368 if( !sign )
369 mp_negate( p, len + n );
370
371 mp_mul_kara( scratch, a + n, b + n, n, scratch + len );
372 mp_adm( p + len, scratch, len, len );
373 mp_adm( p + n, scratch, len, n );
374
375 mp_mul_kara( scratch, a, b, n, scratch + len );
376 mp_adm( p + n, scratch, len, n );
377 mp_adm( p, scratch, len, 0 );
378 }
379}
380
381/* Multiply a and b, store in a, work in montgomery domain
382 to achieve multiply, a needs to be reduced by k * g_prime
383 until all lower legs are 0, allowing exact division by 2^r
384
385 if !do_mul, we convert from montgomery domain back to
386 g_prime domain and thus only multiply by 1 before reducing
387 */
388static void redc( leg_t *a, leg_t const *b, int legs, int do_mul )
389{
390 leg_t scratch[ 2 * ( legs * legs / KARATSUBA_THRESHOLD ) ];
391 leg_t temp[1 + 2 * legs];
392 leg_t leg = legs;
393
394 /* Not necessary for transforming back */
395 if( do_mul )
396 {
397 temp[0] = 0;
398 if( a == b )
399 mp_mul_kara_square( temp + 1, a, legs, scratch );
400 else
401 mp_mul_kara( temp + 1, a, b, legs, scratch );
402 }
403 else
404 {
405 memset( temp, 0, ( 1 + legs ) * sizeof(leg_t));
406 memcpy( temp + 1 + legs, a, legs * sizeof(leg_t));
407 }
408
409 /* m = p * ( m * R_1 ) % R */
410 while( leg-- )
411 {
412 leg_t k = temp[1+legs+leg] * 1; // g_r_inverse;
413 mp_mul_uint_add( temp + 1 + leg + 1, g_prime, k, legs );
414 }
415
416 /* the lower legs of temp are now zero */
417 /* if necessary, reduce temp to fit in legs */
418 if( temp[0] || mp_cmp( temp + 1, g_prime, legs ) > 0 )
419 mp_sub( temp + 1, g_prime, legs );
420
421 memcpy( a, temp + 1, legs * sizeof(leg_t) );
422}
423
424/* calculate base ^ exponent modulo g_prime */
425static void powm( leg_t * result, leg_t const *exponent, leg_t base, int legs )
426{
427 leg_t acc[legs];
428#ifdef WITH_PREVENT_TIMING_ATTACKS
429 leg_t dummy[legs];
430#endif
431 int first = 0, bit = legs * 8 * sizeof(leg_t);
432
433 memset( acc, 0, sizeof(acc) );
434 acc[legs-1] = base;
435
436 /* Transform base into montgomery domain */
437 redc( acc, g_r_square, legs, 1 );
438
439 /* mul in temp and if bit set in exponent, multiply into accumulator */
440 while( bit-- )
441 {
442 int this_bit = sizeof(leg_t) * 8 - 1 - ( bit % ( sizeof(leg_t) * 8 ) );
443 if( ( exponent[ bit / ( sizeof(leg_t) * 8 ) ] >> this_bit ) & 1 ) {
444 if( first++ )
445 redc( result, acc, legs, 1 );
446 else
447 memcpy( result, acc, sizeof(leg_t) * legs );
448 }
449#ifdef WITH_PREVENT_TIMING_ATTACKS
450 else
451 redc( dummy, acc, legs, 1 );
452#endif
453 if( bit )
454 redc( acc, acc, legs, 1 );
455 }
456
457 /* Transform result in acc back into mod p domain */
458 if( first )
459 redc( result, 0, legs, 0 );
460 else /* base ^ 0 mod p = 1 */
461 {
462 memset( result, 0, legs * sizeof(leg_t) );
463 result[legs-1] = 1;
464 }
465}
466
467int main()
468{
469 int i, legs = 64;
470 leg_t input[ /*legs*/ ] = {
471 0x1de7eae6c0d6a0b0, 0x9ac2961fe160f837, 0xe5d85f8fb8fced5c, 0xb563a9cf79c6eaec,
472 0xa6d78571af8d3688, 0x118727b5eb17dfbd, 0x6060e2cd96000219, 0x52292d56ea2960a9,
473 0x72d720399b4ad2d4, 0xd07dc909e7070b0e, 0xcb650843a43a4a00, 0x950e895d7777d182,
474 0x3ca892560247ca32, 0xac5ecd1fe4997513, 0xb420bd7e538ef88b, 0x2bca50f311573101,
475 0x31052a6fc332563d, 0xd151e2770baec00b, 0x094d72c301b8f25d, 0x70f0c055dde43121,
476 0x1f61b1ea0e5aafb4, 0x0e84ab94e744b8ba, 0x87e0db1397d9f2fd, 0xa13b801d1ac631d5,
477 0xb5444312b3b43541, 0xa3612061c92319ce, 0x0e2aab629710919b, 0xd7ac4896d4e08b35,
478 0x1785bbfb88755c99, 0x70cc7ff6ea8f1b51, 0x5479eaeb9b64ba87, 0xa77b229138f2d5d1,
479 0x942fad7b2cc3dcc5, 0x196bcb0cc0e353f8, 0xd8aec44d430026aa, 0xe9228ae741da24cd,
480 0x518781609b447bf8, 0x353eb64e067f86cc, 0x4c3b1333a07fee24, 0x82b4e75bf0e60d06,
481 0x2d44f438b889673c, 0xf8df973a9842e7b2, 0x4ab8b5e0adaad6a0, 0xd2111f6ce69c2bc6,
482 0xfcc2a7f6d4b0d4f7, 0xa86946df6abbea0f, 0xc0dc8c759af7492b, 0xe0c022a74c073360,
483 0x74fc0b5dcdfaa679, 0xd6b85c29e8e8a528, 0x547fd48d3b471c7c, 0x051a88159adc842b,
484 0xe407bde8519bc534, 0x21da1ef76099ed94, 0x9ae478fd69254dc9, 0x6b984d2e7b9cc244,
485 0x87e90f5b6397727e, 0xfb4e8dadb3111df4, 0xfa8718685a5b8359, 0x5ec1d62932d6937a,
486 0x56eee1ba0dc1936c, 0xb08ad95f8040183f, 0xe0cc506399225574, 0x281b04b30bb62984
487 };
488
489 leg_t res[legs]; /*
490= { 569D056BC5485C6F3C026935CFC1FC36CFC5A6B770B442323750EE08A32C63F64743A4902E32A41F7C5A9097509EB9AAA5C5B425D9BDAEDC3B98EE3A9F99AE6FB8723EBE01154937ECC1E4AE38CBC12B767FD17264A4AAEB310EA5B0E72D12FE3969532C5A0718B45F251BFB2A126EFB87F819BDCD434EAEE3B9108C387F4DFFEFA0E9532A0532192034DB0EF5C5BB2A77BCA5B4A08DFFD22F6B1D4F17F5123935CAB4F40EC7E77F4FEEEC6A8CD20CDB2D37DD1BB44544C50B01E98F8B3F5E1ABF91817D40B7E526D6ED80CDE416C6CD32218D3ECCA753D15BBB007661754E44A937C5A68957C3C2D3F39A220930567E9F720B3AAE9D34FEBA6E5A047C131F32271A3BC1E85BC1BC210F49B09F3938226399991BC90299B4E0E684B2DE6BDA684693F40CF728FD24A4B1D1A2B8F4CD3316A02DBC44069A40C1AC64F862091A888DD284AB180918B379A3BD173EDDEA5E08DF4BE51B441AEA7EDA4E439CF57914F68E0FB3E756AB7286616C3736CA72DD40A88FC37B598BF561300AFD1F80F7D44E53C4C1D85FF215250A0C56222A177299C61C2E654B040176F4A8D44BE2612B36E066E7AA6E58D84D4C4CB9A5952E1668CDF04EE55C38ABA59614CB7F43D12921551DD82062AFC3F340B33F43C643BC9D5DB6D6BAB2C94F100D139443B20BCB80384C3231261C5ABFFF3D971CB878116F4EBB7D4D24A2B6F5869 }
491*/
492
493#ifdef WITH_MEASURE_GMP
494 mpz_t m, base, exp, result;
495
496 mpz_init( m ); mpz_init( base ); mpz_init( exp ); mpz_init( result );
497 mpz_import( m, sizeof(g_prime) / sizeof(uint64_t), 1, sizeof(uint64_t), 0, 0, g_prime );
498 mpz_set_ui( base, 2 );
499 mpz_import( exp, sizeof(input) / sizeof(uint64_t), 1, sizeof(uint64_t), 0, 0, input );
500
501 for( i=0; i< WITH_ROUNDS; ++i )
502 mpz_powm( result, base, exp, m );
503
504 gmp_printf("%ZX\n", result);
505#else
506
507 for( i=0; i<WITH_ROUNDS; ++i )
508 powm( res, input, 2, legs );
509
510 for( i=0; i<legs; ++i )
511 printf( "%08llX", res[i] );
512 putchar(10); putchar(10);
513#endif
514
515 return 0;
516}