From aae5320226e9f0fcc764471337326ab690683df7 Mon Sep 17 00:00:00 2001
From: Dirk Engling
Date: Thu, 18 Jul 2013 21:31:26 +0200
Subject: Enable unrolling to speed up base case mul

---
Reviewer notes (this region, between the three-dash line and the diff, is
not part of the commit message and is not applied as a change):

 * mp_mul_uint_add() adds a[i] * fac plus the running carry into result[i]
   for each of the `legs` legs, then keeps adding the remaining carry into
   the following legs of result until the carry is zero.
 * The new loop runs while legs >= 8 and handles 8 legs per pass. Each accN
   holds the unshifted double-width sum of step N; the following step uses
   its upper half (>> 8*sizeof(leg_t)) as carry-in. acc8 is left unshifted
   at the end of a pass so the next pass can consume it the same way, which
   is why acc8 is shifted exactly once after the loop, before the scalar
   tail loop continues with it as the carry.
 * Defining WITHOUT_UNROLLING compiles only the scalar loop.
 * With WITH_TESTS defined, run_tests() is declared and
   KARATSUBA_THRESHOLD is 4; otherwise the threshold stays 16.
 * NOTE(review): line breaks and indentation were restored from a copy in
   which they had been collapsed; hunk line counts match the headers, but
   exact whitespace should be confirmed against the repository. The operands
   of the #include lines and the author address appear to have been lost in
   that copy and are left as found.

 powm.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/powm.c b/powm.c
index b8599df..5c281a6 100644
--- a/powm.c
+++ b/powm.c
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 
 /*
    This code implements base ^ exp mod p with base being a small integer,
@@ -25,7 +26,6 @@
    does not leak information about the number of bits set, but on average
    doubling time needed. Discuss ;)
 */
-static int run_tests( );
 #if 0
 typedef uint32_t leg_t;
 typedef uint64_t dleg_t;
@@ -37,9 +37,16 @@ typedef uint128_t dleg_t;
 
 //#define WITH_MEASURE_GMP
 //#define WITH_PREVENT_TIMING_ATTACKS
+//#define WITH_TESTS
+//#define WITHOUT_UNROLLING
 #define WITH_ROUNDS 128
+
+#ifdef WITH_TESTS
+static int run_tests( );
+#define KARATSUBA_THRESHOLD 4
+#else
 #define KARATSUBA_THRESHOLD 16
-//#define WITH_TESTS
+#endif
 
 #ifdef WITH_MEASURE_GMP
 #include
@@ -261,18 +268,52 @@ static void mp_negate( leg_t * p, int legs )
    result is guaranteed to be initialized with enough legs prepended to take the carry */
 static void mp_mul_uint_add( leg_t *result, leg_t const *a, leg_t fac, int legs )
 {
-  dleg_t acc = 0;
+  dleg_t acc8 = 0;
+#ifndef WITHOUT_UNROLLING
+  dleg_t acc1 = 0, acc2 = 0, acc3 = 0, acc4 = 0;
+  dleg_t acc5 = 0, acc6 = 0, acc7 = 0;
+
+  while( legs >= 8 )
+  {
+    acc1 = ( acc8 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc1;
+
+    acc2 = ( acc1 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc2;
+
+    acc3 = ( acc2 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc3;
+
+    acc4 = ( acc3 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc4;
+
+    acc5 = ( acc4 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc5;
+
+    acc6 = ( acc5 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc6;
+
+    acc7 = ( acc6 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc7;
+
+    acc8 = ( acc7 >> 8*sizeof(leg_t) ) + (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc8;
+
+    legs -= 8;
+  }
+  acc8 >>= 8*sizeof(leg_t);
+#endif
   while( legs-- )
   {
-    acc += (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
-    *(result++) = (leg_t)acc;
-    acc >>= 8*sizeof(leg_t);
+    acc8 += (dleg_t)*result + (dleg_t)*(a++) * (dleg_t)fac;
+    *(result++) = (leg_t)acc8;
+    acc8 >>= 8*sizeof(leg_t);
   }
-  while( acc )
+  while( acc8 )
   {
-    acc += (dleg_t)*result;
-    *(result++) = (leg_t)acc;
-    acc >>= 8*sizeof(leg_t);
+    acc8 += (dleg_t)*result;
+    *(result++) = (leg_t)acc8;
+    acc8 >>= 8*sizeof(leg_t);
   }
 }
 
-- 
cgit v1.2.3