Run util/openssl-format-source on the Curve448 code

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
This commit is contained in:
Matt Caswell 2017-12-04 11:38:58 +00:00
parent 1308e022e1
commit 205fd63881
29 changed files with 4140 additions and 3030 deletions

View File

@ -11,20 +11,21 @@
*/ */
#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__ #ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__ # define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 32 # define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a)
{
/* let's hope the compiler isn't clever enough to optimize this. */ /* let's hope the compiler isn't clever enough to optimize this. */
return (((uint64_t)a)-1)>>32; return (((uint64_t)a) - 1) >> 32;
} }
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b) { uint64_t widemul(uint32_t a, uint32_t b)
{
return ((uint64_t)a) * b; return ((uint64_t)a) * b;
} }
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */

View File

@ -14,54 +14,51 @@
#if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \ #if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \
|| defined(DECAF_FORCE_UNROLL) || defined(DECAF_FORCE_UNROLL)
#define REPEAT8(_x) _x _x _x _x _x _x _x _x # define REPEAT8(_x) _x _x _x _x _x _x _x _x
#define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0) # define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
#else #else
#define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) # define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
#endif #endif
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint32_t *a = as->limb, *b = bs->limb; const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb; uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2 = 0; uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
uint32_t mask = (1<<28) - 1; uint32_t mask = (1 << 28) - 1;
uint32_t aa[8], bb[8]; uint32_t aa[8], bb[8];
int i,j; int i, j;
for (i=0; i<8; i++) { for (i = 0; i < 8; i++) {
aa[i] = a[i] + a[i+8]; aa[i] = a[i] + a[i + 8];
bb[i] = b[i] + b[i+8]; bb[i] = b[i] + b[i + 8];
} }
FOR_LIMB(j,0,8,{ FOR_LIMB(j, 0, 8, {
accum2 = 0; accum2 = 0;
FOR_LIMB(i, 0, j + 1, {
FOR_LIMB (i,0,j+1,{ accum2 += widemul(a[j - i], b[i]);
accum2 += widemul(a[j-i],b[i]); accum1 += widemul(aa[j - i], bb[i]);
accum1 += widemul(aa[j-i],bb[i]); accum0 += widemul(a[8 + j - i], b[8 + i]);
accum0 += widemul(a[8+j-i], b[8+i]); }
}); ); accum1 -= accum2; accum0 += accum2;
accum1 -= accum2;
accum0 += accum2;
accum2 = 0; accum2 = 0;
FOR_LIMB(i, j + 1, 8, {
FOR_LIMB (i,j+1,8,{ accum0 -=
accum0 -= widemul(a[8+j-i], b[i]); widemul(a[8 + j - i], b[i]);
accum2 += widemul(aa[8+j-i], bb[i]); accum2 +=
accum1 += widemul(a[16+j-i], b[8+i]); widemul(aa[8 + j - i],
}); bb[i]);
accum1 += widemul(a[16 + j - i], b[8 + i]);
}
);
accum1 += accum2; accum1 += accum2;
accum0 += accum2; accum0 += accum2;
c[j] = ((uint32_t)(accum0)) & mask; c[j] = ((uint32_t)(accum0)) & mask;
c[j+8] = ((uint32_t)(accum1)) & mask; c[j + 8] = ((uint32_t)(accum1)) & mask;
accum0 >>= 28; accum1 >>= 28;
accum0 >>= 28;
accum1 >>= 28;
}); });
accum0 += accum1; accum0 += accum1;
@ -76,21 +73,20 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += ((uint32_t)(accum1)); c[1] += ((uint32_t)(accum1));
} }
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint32_t *a = as->limb; const uint32_t *a = as->limb;
uint32_t *c = cs->limb; uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum8 = 0; uint64_t accum0 = 0, accum8 = 0;
uint32_t mask = (1<<28)-1; uint32_t mask = (1 << 28) - 1;
int i; int i;
assert(b<1<<28); assert(b < 1 << 28);
FOR_LIMB(i,0,8,{
accum0 += widemul(b, a[i]);
accum8 += widemul(b, a[i+8]);
FOR_LIMB(i, 0, 8, {
accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28;
}); });
accum0 += accum8 + c[8]; accum0 += accum8 + c[8];
@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
c[1] += accum8 >> 28; c[1] += accum8 >> 28;
} }
void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as)
gf_mul(cs,as,as); /* Performs better with a dedicated square */ {
gf_mul(cs, as, as); /* Performs better with a dedicated square */
} }

View File

@ -16,40 +16,43 @@
#define LIMB_PLACE_VALUE(i) 28 #define LIMB_PLACE_VALUE(i) 28
void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b)
{
unsigned int i; unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] + b->limb[i]; out->limb[i] = a->limb[i] + b->limb[i];
} }
} }
void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b)
{
unsigned int i; unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] - b->limb[i]; out->limb[i] = a->limb[i] - b->limb[i];
} }
} }
void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt)
{
unsigned int i; unsigned int i;
uint32_t co1 = ((1<<28)-1)*amt, co2 = co1-amt; uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;
for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) { for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) {
a->limb[i] += (i==sizeof(*a)/sizeof(a->limb[0])/2) ? co2 : co1; a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1;
} }
} }
void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a)
uint32_t mask = (1<<28) - 1; {
uint32_t mask = (1 << 28) - 1;
uint32_t tmp = a->limb[15] >> 28; uint32_t tmp = a->limb[15] >> 28;
unsigned int i; unsigned int i;
a->limb[8] += tmp; a->limb[8] += tmp;
for (i=15; i>0; i--) { for (i = 15; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
} }
a->limb[0] = (a->limb[0] & mask) + tmp; a->limb[0] = (a->limb[0] & mask) + tmp;
} }

View File

@ -11,22 +11,26 @@
*/ */
#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__ #ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
#define __ARCH_ARM_32_ARCH_INTRINSICS_H__ # define __ARCH_ARM_32_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 32 # define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a)
{
uint32_t ret; uint32_t ret;
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret; return ret;
} }
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b) { uint64_t widemul(uint32_t a, uint32_t b)
/* Could be UMULL, but it's hard to express to CC that the registers must be different */ {
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b; return ((uint64_t)a) * b;
} }
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */

View File

@ -12,100 +12,89 @@
#include "f_field.h" #include "f_field.h"
static inline void __attribute__((gnu_inline,always_inline)) static inline void __attribute__ ((gnu_inline, always_inline))
smlal ( smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
uint64_t *acc, {
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__ #ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc)>>32; uint32_t lo = *acc, hi = (*acc) >> 32;
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
: [lo]"+&r"(lo), [hi]"+&r"(hi) [hi] "+&r"(hi)
: [a]"r"(a), [b]"r"(b)); :[a] "r"(a),[b] "r"(b));
*acc = lo + (((uint64_t)hi)<<32); *acc = lo + (((uint64_t)hi) << 32);
#else #else
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
#endif #endif
} }
static inline void __attribute__((gnu_inline,always_inline)) static inline void __attribute__ ((gnu_inline, always_inline))
smlal2 ( smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
uint64_t *acc, {
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__ #ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc)>>32; uint32_t lo = *acc, hi = (*acc) >> 32;
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
: [lo]"+&r"(lo), [hi]"+&r"(hi) [hi] "+&r"(hi)
: [a]"r"(a), [b]"r"(2*b)); :[a] "r"(a),[b] "r"(2 * b));
*acc = lo + (((uint64_t)hi)<<32); *acc = lo + (((uint64_t)hi) << 32);
#else #else
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
#endif #endif
} }
static inline void __attribute__((gnu_inline,always_inline)) static inline void __attribute__ ((gnu_inline, always_inline))
smull ( smull(uint64_t *acc, const uint32_t a, const uint32_t b)
uint64_t *acc, {
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__ #ifdef __ARMEL__
uint32_t lo, hi; uint32_t lo, hi;
__asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
: [lo]"=&r"(lo), [hi]"=&r"(hi) [hi] "=&r"(hi)
: [a]"r"(a), [b]"r"(b)); :[a] "r"(a),[b] "r"(b));
*acc = lo + (((uint64_t)hi)<<32); *acc = lo + (((uint64_t)hi) << 32);
#else #else
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
#endif #endif
} }
static inline void __attribute__((gnu_inline,always_inline)) static inline void __attribute__ ((gnu_inline, always_inline))
smull2 ( smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
uint64_t *acc, {
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__ #ifdef __ARMEL__
uint32_t lo, hi; uint32_t lo, hi;
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
: [lo]"=&r"(lo), [hi]"=&r"(hi) : [lo] "=&r"(lo),[hi] "=&r"(hi)
: [a]"r"(a), [b]"r"(2*b)); : [a] "r"(a),[b] "r"(2 * b));
*acc = lo + (((uint64_t)hi)<<32); *acc = lo + (((uint64_t)hi) << 32);
#else #else
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
#endif #endif
} }
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint32_t *a = as->limb, *b = bs->limb; const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb; uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1; uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
uint32_t mask = (1<<28) - 1; uint32_t mask = (1 << 28) - 1;
uint32_t aa[8], bm[8]; uint32_t aa[8], bm[8];
int i; int i;
for (i=0; i<8; i++) { for (i = 0; i < 8; i++) {
aa[i] = a[i] + a[i+8]; aa[i] = a[i] + a[i + 8];
bm[i] = b[i] - b[i+8]; bm[i] = b[i] - b[i + 8];
} }
uint32_t ax,bx; uint32_t ax, bx;
{ {
/* t^3 terms */ /* t^3 terms */
smull(&accum1, ax = aa[1], bx = b[15]); smull(&accum1, ax = aa[1], bx = b[15]);
@ -284,7 +273,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum0, ax, bx = b[8]); smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[5], bx); smlal(&accum2, ax = aa[5], bx);
smlal(&accum0, ax = a[13], bx = b[7]); smlal(&accum0, ax = a[13], bx = b[7]);
smlal(&accum2, ax = a[14], bx); smlal(&accum2, ax = a[14], bx);
smlal(&accum0, ax, bx = b[6]); smlal(&accum0, ax, bx = b[6]);
@ -307,7 +295,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = b[0]); smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[13], bx); smlal(&accum3, ax = a[13], bx);
smlal(&accum1, ax = a[5], bx = bm[7]); smlal(&accum1, ax = a[5], bx = bm[7]);
smlal(&accum3, ax = a[6], bx); smlal(&accum3, ax = a[6], bx);
smlal(&accum1, ax, bx = bm[6]); smlal(&accum1, ax, bx = bm[6]);
@ -365,7 +352,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum0, ax, bx = b[8]); smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[7], bx); smlal(&accum2, ax = aa[7], bx);
smlal(&accum0, ax = a[15], bx = b[7]); smlal(&accum0, ax = a[15], bx = b[7]);
/* t terms */ /* t terms */
@ -388,7 +374,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = b[0]); smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[15], bx); smlal(&accum3, ax = a[15], bx);
smlal(&accum1, ax = a[7], bx = bm[7]); smlal(&accum1, ax = a[7], bx = bm[7]);
/* 1 terms */ /* 1 terms */
@ -435,21 +420,22 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += ((uint32_t)(accum1)); c[1] += ((uint32_t)(accum1));
} }
void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint32_t *a = as->limb; const uint32_t *a = as->limb;
uint32_t *c = cs->limb; uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp; uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
uint32_t mask = (1<<28) - 1; uint32_t mask = (1 << 28) - 1;
uint32_t bm[8]; uint32_t bm[8];
int i; int i;
for (i=0; i<8; i++) { for (i = 0; i < 8; i++) {
bm[i] = a[i] - a[i+8]; bm[i] = a[i] - a[i + 8];
} }
uint32_t ax,bx; uint32_t ax, bx;
{ {
/* t^3 terms */ /* t^3 terms */
smull2(&accum1, ax = a[9], bx = a[15]); smull2(&accum1, ax = a[9], bx = a[15]);
@ -498,8 +484,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = bm[0], bx = bm[1]); smlal2(&accum2, ax = bm[0], bx = bm[1]);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum3;
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; accum3 = tmp - accum2;
accum2 = tmp;
tmp = -accum1;
accum1 = tmp - accum0;
accum0 = tmp;
accum2 += accum0 >> 28; accum2 += accum0 >> 28;
accum3 += accum1 >> 28; accum3 += accum1 >> 28;
@ -560,9 +550,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = bm[1], bx); smlal2(&accum2, ax = bm[1], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
tmp = -accum3;
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; accum3 = tmp - accum2;
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; accum2 = tmp;
tmp = -accum1;
accum1 = tmp - accum0;
accum0 = tmp;
accum0 += accumC0; accum0 += accumC0;
accum1 += accumC1; accum1 += accumC1;
@ -596,7 +589,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = a[10], bx); smlal2(&accum2, ax = a[10], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
smlal2(&accum0, ax = a[5], bx = a[7]); smlal2(&accum0, ax = a[5], bx = a[7]);
smlal2(&accum2, ax = a[6], bx); smlal2(&accum2, ax = a[6], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
@ -630,9 +622,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = bm[2], bx); smlal2(&accum2, ax = bm[2], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
tmp = -accum3;
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; accum3 = tmp - accum2;
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; accum2 = tmp;
tmp = -accum1;
accum1 = tmp - accum0;
accum0 = tmp;
accum0 += accumC0; accum0 += accumC0;
accum1 += accumC1; accum1 += accumC1;
@ -664,7 +659,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = a[11], bx); smlal2(&accum2, ax = a[11], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
smlal(&accum0, ax = a[7], bx = a[7]); smlal(&accum0, ax = a[7], bx = a[7]);
/* t terms */ /* t terms */
@ -699,9 +693,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum2, ax = bm[3], bx); smlal2(&accum2, ax = bm[3], bx);
smlal(&accum0, ax, ax); smlal(&accum0, ax, ax);
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum3;
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; accum3 = tmp - accum2;
accum2 = tmp;
tmp = -accum1;
accum1 = tmp - accum0;
accum0 = tmp;
accum0 += accumC0; accum0 += accumC0;
accum1 += accumC1; accum1 += accumC1;
@ -729,12 +726,9 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
c[1] += ((uint32_t)(accum1)); c[1] += ((uint32_t)(accum1));
} }
void gf_mulw_unsigned ( void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
gf_s *__restrict__ cs, {
const gf as, uint32_t mask = (1ull << 28) - 1;
uint32_t b
) {
uint32_t mask = (1ull<<28)-1;
assert(b <= mask); assert(b <= mask);
const uint32_t *a = as->limb; const uint32_t *a = as->limb;
@ -745,75 +739,99 @@ void gf_mulw_unsigned (
int i; int i;
uint32_t c0, c8, n0, n8; uint32_t c0, c8, n0, n8;
c0 = a[0]; c8 = a[8]; c0 = a[0];
c8 = a[8];
accum0 = widemul(b, c0); accum0 = widemul(b, c0);
accum8 = widemul(b, c8); accum8 = widemul(b, c8);
c[0] = accum0 & mask; accum0 >>= 28; c[0] = accum0 & mask;
c[8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[8] = accum8 & mask;
accum8 >>= 28;
i=1; i = 1;
{ {
n0 = a[i]; n8 = a[i+8]; n0 = a[i];
n8 = a[i + 8];
smlal(&accum0, b, n0); smlal(&accum0, b, n0);
smlal(&accum8, b, n8); smlal(&accum8, b, n8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
c0 = a[i]; c8 = a[i+8]; c0 = a[i];
c8 = a[i + 8];
smlal(&accum0, b, c0); smlal(&accum0, b, c0);
smlal(&accum8, b, c8); smlal(&accum8, b, c8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
n0 = a[i]; n8 = a[i+8]; n0 = a[i];
n8 = a[i + 8];
smlal(&accum0, b, n0); smlal(&accum0, b, n0);
smlal(&accum8, b, n8); smlal(&accum8, b, n8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
c0 = a[i]; c8 = a[i+8]; c0 = a[i];
c8 = a[i + 8];
smlal(&accum0, b, c0); smlal(&accum0, b, c0);
smlal(&accum8, b, c8); smlal(&accum8, b, c8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
n0 = a[i]; n8 = a[i+8]; n0 = a[i];
n8 = a[i + 8];
smlal(&accum0, b, n0); smlal(&accum0, b, n0);
smlal(&accum8, b, n8); smlal(&accum8, b, n8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
c0 = a[i]; c8 = a[i+8]; c0 = a[i];
c8 = a[i + 8];
smlal(&accum0, b, c0); smlal(&accum0, b, c0);
smlal(&accum8, b, c8); smlal(&accum8, b, c8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }
{ {
n0 = a[i]; n8 = a[i+8]; n0 = a[i];
n8 = a[i + 8];
smlal(&accum0, b, n0); smlal(&accum0, b, n0);
smlal(&accum8, b, n8); smlal(&accum8, b, n8);
c[i] = accum0 & mask; accum0 >>= 28; c[i] = accum0 & mask;
c[i+8] = accum8 & mask; accum8 >>= 28; accum0 >>= 28;
c[i + 8] = accum8 & mask;
accum8 >>= 28;
i++; i++;
} }

View File

@ -17,45 +17,49 @@
#define LIMB_PLACE_VALUE(i) 28 #define LIMB_PLACE_VALUE(i) 28
void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
} }
/* /*
for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] + b->limb[i]; * out->limb[i] = a->limb[i] + b->limb[i]; }
}
*/ */
} }
void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
} }
/* /*
for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] - b->limb[i]; * out->limb[i] = a->limb[i] - b->limb[i]; }
}
*/ */
} }
void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt)
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; {
uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t *aa = (uint32x4_t*) a; uint32x4_t lo = { co1, co1, co1, co1 }, hi = {
co2, co1, co1, co1};
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo; aa[0] += lo;
aa[1] += lo; aa[1] += lo;
aa[2] += hi; aa[2] += hi;
aa[3] += lo; aa[3] += lo;
} }
void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a)
uint64_t mask = (1ull<<28) - 1; {
uint64_t mask = (1ull << 28) - 1;
uint64_t tmp = a->limb[15] >> 28; uint64_t tmp = a->limb[15] >> 28;
a->limb[8] += tmp; a->limb[8] += tmp;
for (unsigned int i=15; i>0; i--) { for (unsigned int i = 15; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
} }
a->limb[0] = (a->limb[0] & mask) + tmp; a->limb[0] = (a->limb[0] & mask) + tmp;
} }

View File

@ -11,22 +11,26 @@
*/ */
#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__ #ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
#define __ARCH_NEON_ARCH_INTRINSICS_H__ # define __ARCH_NEON_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 32 # define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a)
{
uint32_t ret; uint32_t ret;
__asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret; return ret;
} }
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b) { uint64_t widemul(uint32_t a, uint32_t b)
/* Could be UMULL, but it's hard to express to CC that the registers must be different */ {
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b; return ((uint64_t)a) * b;
} }
#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */ #endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */

File diff suppressed because it is too large Load Diff

View File

@ -15,7 +15,7 @@
#define USE_NEON_PERM 1 #define USE_NEON_PERM 1
#define LIMBHI(x) ((x##ull)>>28) #define LIMBHI(x) ((x##ull)>>28)
#define LIMBLO(x) ((x##ull)&((1ull<<28)-1)) #define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
# define FIELD_LITERAL(a,b,c,d,e,f,g,h) \ #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
{{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \ {{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \
LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \ LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \ LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \
@ -23,42 +23,47 @@
#define LIMB_PLACE_VALUE(i) 28 #define LIMB_PLACE_VALUE(i) 28
void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
} }
} }
void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
} }
/* /*
unsigned int i; * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { * out->limb[i] = a->limb[i] - b->limb[i]; }
out->limb[i] = a->limb[i] - b->limb[i];
}
*/ */
} }
void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt)
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; {
uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1}; uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t *aa = (uint32x4_t*) a; uint32x4_t lo = { co1, co2, co1, co1 }, hi = {
co1, co1, co1, co1};
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo; aa[0] += lo;
aa[1] += hi; aa[1] += hi;
aa[2] += hi; aa[2] += hi;
aa[3] += hi; aa[3] += hi;
} }
void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a)
{
uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1}, uint32x2_t *aa = (uint32x2_t *) a, vmask = {
tmp = vshr_n_u32(aa[7],28); (1ull << 28) - 1, (1ull << 28) - 1}, vm2 = {
0, -1}, tmp = vshr_n_u32(aa[7], 28);
for (unsigned int i=7; i>=1; i--) { for (unsigned int i = 7; i >= 1; i--) {
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28); aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
} }
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2); aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
} }

View File

@ -11,20 +11,21 @@
*/ */
#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__ #ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
#define __ARCH_REF64_ARCH_INTRINSICS_H__ # define __ARCH_REF64_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 64 # define ARCH_WORD_BITS 64
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
uint64_t word_is_zero(uint64_t a) { uint64_t word_is_zero(uint64_t a)
{
/* let's hope the compiler isn't clever enough to optimize this. */ /* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t)a)-1)>>64; return (((__uint128_t) a) - 1) >> 64;
} }
static __inline__ __attribute((always_inline,unused)) static __inline__ __attribute((always_inline, unused))
__uint128_t widemul(uint64_t a, uint64_t b) { __uint128_t widemul(uint64_t a, uint64_t b)
return ((__uint128_t)a) * b; {
return ((__uint128_t) a) * b;
} }
#endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */ #endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */

View File

@ -11,48 +11,49 @@
*/ */
#include "f_field.h" #include "f_field.h"
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb; const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2; __uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4], bb[4], bbb[4]; uint64_t aa[4], bb[4], bbb[4];
unsigned int i; unsigned int i;
for (i=0; i<4; i++) { for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i+4]; aa[i] = a[i] + a[i + 4];
bb[i] = b[i] + b[i+4]; bb[i] = b[i] + b[i + 4];
bbb[i] = bb[i] + b[i+4]; bbb[i] = bb[i] + b[i + 4];
} }
int I_HATE_UNROLLED_LOOPS = 0; int I_HATE_UNROLLED_LOOPS = 0;
if (I_HATE_UNROLLED_LOOPS) { if (I_HATE_UNROLLED_LOOPS) {
/* The compiler probably won't unroll this, /*
* so it's like 80% slower. * The compiler probably won't unroll this, so it's like 80% slower.
*/ */
for (i=0; i<4; i++) { for (i = 0; i < 4; i++) {
accum2 = 0; accum2 = 0;
unsigned int j; unsigned int j;
for (j=0; j<=i; j++) { for (j = 0; j <= i; j++) {
accum2 += widemul(a[j], b[i-j]); accum2 += widemul(a[j], b[i - j]);
accum1 += widemul(aa[j], bb[i-j]); accum1 += widemul(aa[j], bb[i - j]);
accum0 += widemul(a[j+4], b[i-j+4]); accum0 += widemul(a[j + 4], b[i - j + 4]);
} }
for (; j<4; j++) { for (; j < 4; j++) {
accum2 += widemul(a[j], b[i-j+8]); accum2 += widemul(a[j], b[i - j + 8]);
accum1 += widemul(aa[j], bbb[i-j+4]); accum1 += widemul(aa[j], bbb[i - j + 4]);
accum0 += widemul(a[j+4], bb[i-j+4]); accum0 += widemul(a[j + 4], bb[i - j + 4]);
} }
accum1 -= accum2; accum1 -= accum2;
accum0 += accum2; accum0 += accum2;
c[i] = ((uint64_t)(accum0)) & mask; c[i] = ((uint64_t)(accum0)) & mask;
c[i+4] = ((uint64_t)(accum1)) & mask; c[i + 4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
@ -172,19 +173,22 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += ((uint64_t)(accum1)); c[1] += ((uint64_t)(accum1));
} }
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb; const uint64_t *a = as->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum4 = 0; __uint128_t accum0 = 0, accum4 = 0;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
int i; int i;
for (i=0; i<4; i++) { for (i = 0; i < 4; i++) {
accum0 += widemul(b, a[i]); accum0 += widemul(b, a[i]);
accum4 += widemul(b, a[i+4]); accum4 += widemul(b, a[i + 4]);
c[i] = accum0 & mask; accum0 >>= 56; c[i] = accum0 & mask;
c[i+4] = accum4 & mask; accum4 >>= 56; accum0 >>= 56;
c[i + 4] = accum4 & mask;
accum4 >>= 56;
} }
accum0 += accum4 + c[4]; accum0 += accum4 + c[4];
@ -196,24 +200,25 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
c[1] += accum4 >> 56; c[1] += accum4 >> 56;
} }
void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb; const uint64_t *a = as->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2; __uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4]; uint64_t aa[4];
/* For some reason clang doesn't vectorize this without prompting? */ /* For some reason clang doesn't vectorize this without prompting? */
unsigned int i; unsigned int i;
for (i=0; i<4; i++) { for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i+4]; aa[i] = a[i] + a[i + 4];
} }
accum2 = widemul(a[0],a[3]); accum2 = widemul(a[0], a[3]);
accum0 = widemul(aa[0],aa[3]); accum0 = widemul(aa[0], aa[3]);
accum1 = widemul(a[4],a[7]); accum1 = widemul(a[4], a[7]);
accum2 += widemul(a[1], a[2]); accum2 += widemul(a[1], a[2]);
accum0 += widemul(aa[1], aa[2]); accum0 += widemul(aa[1], aa[2]);
@ -222,21 +227,21 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 -= accum2; accum0 -= accum2;
accum1 += accum2; accum1 += accum2;
c[3] = ((uint64_t)(accum1))<<1 & mask; c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0))<<1 & mask; c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55; accum0 >>= 55;
accum1 >>= 55; accum1 >>= 55;
accum0 += widemul(2*aa[1],aa[3]); accum0 += widemul(2 * aa[1], aa[3]);
accum1 += widemul(2*a[5], a[7]); accum1 += widemul(2 * a[5], a[7]);
accum0 += widemul(aa[2], aa[2]); accum0 += widemul(aa[2], aa[2]);
accum1 += accum0; accum1 += accum0;
accum0 -= widemul(2*a[1], a[3]); accum0 -= widemul(2 * a[1], a[3]);
accum1 += widemul(a[6], a[6]); accum1 += widemul(a[6], a[6]);
accum2 = widemul(a[0],a[0]); accum2 = widemul(a[0], a[0]);
accum1 -= accum2; accum1 -= accum2;
accum0 += accum2; accum0 += accum2;
@ -250,16 +255,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul(2*aa[2],aa[3]); accum2 = widemul(2 * aa[2], aa[3]);
accum0 -= widemul(2*a[2], a[3]); accum0 -= widemul(2 * a[2], a[3]);
accum1 += widemul(2*a[6], a[7]); accum1 += widemul(2 * a[6], a[7]);
accum1 += accum2; accum1 += accum2;
accum0 += accum2; accum0 += accum2;
accum2 = widemul(2*a[0],a[1]); accum2 = widemul(2 * a[0], a[1]);
accum1 += widemul(2*aa[0], aa[1]); accum1 += widemul(2 * aa[0], aa[1]);
accum0 += widemul(2*a[4], a[5]); accum0 += widemul(2 * a[4], a[5]);
accum1 -= accum2; accum1 -= accum2;
accum0 += accum2; accum0 += accum2;
@ -270,16 +275,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul(aa[3],aa[3]); accum2 = widemul(aa[3], aa[3]);
accum0 -= widemul(a[3], a[3]); accum0 -= widemul(a[3], a[3]);
accum1 += widemul(a[7], a[7]); accum1 += widemul(a[7], a[7]);
accum1 += accum2; accum1 += accum2;
accum0 += accum2; accum0 += accum2;
accum2 = widemul(2*a[0],a[2]); accum2 = widemul(2 * a[0], a[2]);
accum1 += widemul(2*aa[0], aa[2]); accum1 += widemul(2 * aa[0], aa[2]);
accum0 += widemul(2*a[4], a[6]); accum0 += widemul(2 * a[4], a[6]);
accum2 += widemul(a[1], a[1]); accum2 += widemul(a[1], a[1]);
accum1 += widemul(aa[1], aa[1]); accum1 += widemul(aa[1], aa[1]);
@ -306,4 +311,3 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1)); c[0] += ((uint64_t)(accum1));
} }

View File

@ -15,32 +15,36 @@
#define LIMB_PLACE_VALUE(i) 56 #define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<8; i++) { {
for (unsigned int i = 0; i < 8; i++) {
out->limb[i] = a->limb[i] + b->limb[i]; out->limb[i] = a->limb[i] + b->limb[i];
} }
gf_weak_reduce(out); gf_weak_reduce(out);
} }
void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b)
uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2; {
for (unsigned int i=0; i<8; i++) { uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
out->limb[i] = a->limb[i] - b->limb[i] + ((i==4) ? co2 : co1); for (unsigned int i = 0; i < 8; i++) {
out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
} }
gf_weak_reduce(out); gf_weak_reduce(out);
} }
void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt)
(void) a; {
(void) amt; (void)a;
(void)amt;
} }
void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a)
uint64_t mask = (1ull<<56) - 1; {
uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56; uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp; a->limb[4] += tmp;
for (unsigned int i=7; i>0; i--) { for (unsigned int i = 7; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56); a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
} }
a->limb[0] = (a->limb[0] & mask) + tmp; a->limb[0] = (a->limb[0] & mask) + tmp;
} }

View File

@ -10,303 +10,292 @@
* Originally written by Mike Hamburg * Originally written by Mike Hamburg
*/ */
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__ #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
#define __ARCH_X86_64_ARCH_INTRINSICS_H__ # define __ARCH_X86_64_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 64 # define ARCH_WORD_BITS 64
#include <openssl/e_os2.h> # include <openssl/e_os2.h>
/* FUTURE: autogenerate */ /* FUTURE: autogenerate */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
uint64_t c,d; {
#ifndef __BMI2__ uint64_t c, d;
# ifndef __BMI2__
__asm__ volatile __asm__ volatile
("movq %[a], %%rax;" ("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
"mulq %[b];" :[b] "m"(*b),[a] "m"(*a)
: [c]"=&a"(c), [d]"=d"(d) :"cc");
: [b]"m"(*b), [a]"m"(*a) # else
: "cc");
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx;" ("movq %[a], %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
"mulx %[b], %[c], %[d];" :[b] "m"(*b),[a] "m"(*a)
: [c]"=r"(c), [d]"=r"(d) :"rdx");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rdx"); return (((__uint128_t) (d)) << 64) | c;
#endif
return (((__uint128_t)(d))<<64) | c;
} }
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
uint64_t c,d; {
#ifndef __BMI2__ uint64_t c, d;
# ifndef __BMI2__
__asm__ volatile __asm__ volatile
("movq %[a], %%rax;" ("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
"mulq %[b];" :[b] "m"(*b),[a] "r"(a)
: [c]"=&a"(c), [d]"=d"(d) :"cc");
: [b]"m"(*b), [a]"r"(a) # else
: "cc");
#else
__asm__ volatile __asm__ volatile
("mulx %[b], %[c], %[d];" ("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
: [c]"=r"(c), [d]"=r"(d) :[b] "m"(*b),[a] "d"(a));
: [b]"m"(*b), [a]"d"(a)); # endif
#endif return (((__uint128_t) (d)) << 64) | c;
return (((__uint128_t)(d))<<64) | c;
} }
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) { static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
uint64_t c,d; {
#ifndef __BMI2__ uint64_t c, d;
# ifndef __BMI2__
__asm__ volatile __asm__ volatile
("mulq %[b];" ("mulq %[b];":[c] "=a"(c),[d] "=d"(d)
: [c]"=a"(c), [d]"=d"(d) :[b] "r"(b), "a"(a)
: [b]"r"(b), "a"(a) :"cc");
: "cc"); # else
#else
__asm__ volatile __asm__ volatile
("mulx %[b], %[c], %[d];" ("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
: [c]"=r"(c), [d]"=r"(d) :[b] "r"(b),[a] "d"(a));
: [b]"r"(b), [a]"d"(a)); # endif
#endif return (((__uint128_t) (d)) << 64) | c;
return (((__uint128_t)(d))<<64) | c;
} }
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
uint64_t c,d; {
#ifndef __BMI2__ uint64_t c, d;
# ifndef __BMI2__
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"addq %%rax, %%rax; " "addq %%rax, %%rax; " "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
"mulq %[b];" :[b] "m"(*b),[a] "m"(*a)
: [c]"=&a"(c), [d]"=d"(d) :"cc");
: [b]"m"(*b), [a]"m"(*a) # else
: "cc");
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx;" ("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;" "leaq (,%%rdx,2), %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
"mulx %[b], %[c], %[d];" :[b] "m"(*b),[a] "m"(*a)
: [c]"=r"(c), [d]"=r"(d) :"rdx");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rdx"); return (((__uint128_t) (d)) << 64) | c;
#endif
return (((__uint128_t)(d))<<64) | c;
} }
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { static __inline__ void mac(__uint128_t * acc, const uint64_t *a,
uint64_t lo = *acc, hi = *acc>>64; const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__ # ifdef __BMI2__
uint64_t c,d; uint64_t c, d;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; " "addq %[c], %[lo]; "
"adcq %[d], %[hi]; " "adcq %[d], %[hi]; ":[c] "=&r"(c),[d] "=&r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "m"(*a)
: [b]"m"(*b), [a]"m"(*a) :"rdx", "cc");
: "rdx", "cc"); # else
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"mulq %[b]; " "mulq %[b]; "
"addq %%rax, %[lo]; " "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
"adcq %%rdx, %[hi]; " :[b] "m"(*b),[a] "m"(*a)
: [lo]"+r"(lo), [hi]"+r"(hi) :"rax", "rdx", "cc");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo; *acc = (((__uint128_t) (hi)) << 64) | lo;
} }
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { static __inline__ void macac(__uint128_t * acc, __uint128_t * acc2,
uint64_t lo = *acc, hi = *acc>>64; const uint64_t *a, const uint64_t *b)
uint64_t lo2 = *acc2, hi2 = *acc2>>64; {
uint64_t lo = *acc, hi = *acc >> 64;
uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
#ifdef __BMI2__ # ifdef __BMI2__
uint64_t c,d; uint64_t c, d;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; " "addq %[c], %[lo]; "
"adcq %[d], %[hi]; " "adcq %[d], %[hi]; "
"addq %[c], %[lo2]; " "addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; " "adcq %[d], %[hi2]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi),
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) [lo2] "+r"(lo2),[hi2] "+r"(hi2)
: [b]"m"(*b), [a]"m"(*a) :[b] "m"(*b),[a] "m"(*a)
: "rdx", "cc"); :"rdx", "cc");
#else # else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"mulq %[b]; " "mulq %[b]; "
"addq %%rax, %[lo]; " "addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; " "adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; " "addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; " "adcq %%rdx, %[hi2]; ":[lo] "+r"(lo),[hi] "+r"(hi),[lo2] "+r"(lo2),
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) [hi2] "+r"(hi2)
: [b]"m"(*b), [a]"m"(*a) :[b] "m"(*b),[a] "m"(*a)
: "rax", "rdx", "cc"); :"rax", "rdx", "cc");
#endif # endif
*acc = (((__uint128_t)(hi))<<64) | lo; *acc = (((__uint128_t) (hi)) << 64) | lo;
*acc2 = (((__uint128_t)(hi2))<<64) | lo2; *acc2 = (((__uint128_t) (hi2)) << 64) | lo2;
} }
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { static __inline__ void mac_rm(__uint128_t * acc, uint64_t a, const uint64_t *b)
uint64_t lo = *acc, hi = *acc>>64; {
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__ # ifdef __BMI2__
uint64_t c,d; uint64_t c, d;
__asm__ volatile __asm__ volatile
("mulx %[b], %[c], %[d]; " ("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; " "addq %[c], %[lo]; "
"adcq %[d], %[hi]; " "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "d"(a)
: [b]"m"(*b), [a]"d"(a) :"cc");
: "cc"); # else
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"mulq %[b]; " "mulq %[b]; "
"addq %%rax, %[lo]; " "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
"adcq %%rdx, %[hi]; " :[b] "m"(*b),[a] "r"(a)
: [lo]"+r"(lo), [hi]"+r"(hi) :"rax", "rdx", "cc");
: [b]"m"(*b), [a]"r"(a) # endif
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo; *acc = (((__uint128_t) (hi)) << 64) | lo;
} }
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { static __inline__ void mac_rr(__uint128_t * acc, uint64_t a, const uint64_t b)
uint64_t lo = *acc, hi = *acc>>64; {
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__ # ifdef __BMI2__
uint64_t c,d; uint64_t c, d;
__asm__ volatile __asm__ volatile
("mulx %[b], %[c], %[d]; " ("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; " "addq %[c], %[lo]; "
"adcq %[d], %[hi]; " "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "r"(b),[a] "d"(a)
: [b]"r"(b), [a]"d"(a) :"cc");
: "cc"); # else
#else
__asm__ volatile __asm__ volatile
("mulq %[b]; " ("mulq %[b]; "
"addq %%rax, %[lo]; " "addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi), "+a"(a)
: [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a) :[b] "r"(b)
: [b]"r"(b) :"rdx", "cc");
: "rdx", "cc"); # endif
#endif
*acc = (((__uint128_t)(hi))<<64) | lo; *acc = (((__uint128_t) (hi)) << 64) | lo;
} }
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { static __inline__ void mac2(__uint128_t * acc, const uint64_t *a,
uint64_t lo = *acc, hi = *acc>>64; const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__ # ifdef __BMI2__
uint64_t c,d; uint64_t c, d;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; " "addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; " "addq %[c], %[lo]; "
"adcq %[d], %[hi]; " "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "m"(*a)
: [b]"m"(*b), [a]"m"(*a) :"rdx", "cc");
: "rdx", "cc"); # else
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"addq %%rax, %%rax; " "addq %%rax, %%rax; "
"mulq %[b]; " "mulq %[b]; "
"addq %%rax, %[lo]; " "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
"adcq %%rdx, %[hi]; " :[b] "m"(*b),[a] "m"(*a)
: [lo]"+r"(lo), [hi]"+r"(hi) :"rax", "rdx", "cc");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo; *acc = (((__uint128_t) (hi)) << 64) | lo;
} }
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { static __inline__ void msb(__uint128_t * acc, const uint64_t *a,
uint64_t lo = *acc, hi = *acc>>64; const uint64_t *b)
#ifdef __BMI2__ {
uint64_t c,d; uint64_t lo = *acc, hi = *acc >> 64;
# ifdef __BMI2__
uint64_t c, d;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; " "subq %[c], %[lo]; "
"sbbq %[d], %[hi]; " "sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "m"(*a)
: [b]"m"(*b), [a]"m"(*a) :"rdx", "cc");
: "rdx", "cc"); # else
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"mulq %[b]; " "mulq %[b]; "
"subq %%rax, %[lo]; " "subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
"sbbq %%rdx, %[hi]; " :[b] "m"(*b),[a] "m"(*a)
: [lo]"+r"(lo), [hi]"+r"(hi) :"rax", "rdx", "cc");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rax", "rdx", "cc"); *acc = (((__uint128_t) (hi)) << 64) | lo;
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
} }
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { static __inline__ void msb2(__uint128_t * acc, const uint64_t *a,
uint64_t lo = *acc, hi = *acc>>64; const uint64_t *b)
#ifdef __BMI2__ {
uint64_t c,d; uint64_t lo = *acc, hi = *acc >> 64;
# ifdef __BMI2__
uint64_t c, d;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; " "addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; " "subq %[c], %[lo]; "
"sbbq %[d], %[hi]; " "sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "m"(*a)
: [b]"m"(*b), [a]"m"(*a) :"rdx", "cc");
: "rdx", "cc"); # else
#else
__asm__ volatile __asm__ volatile
("movq %[a], %%rax; " ("movq %[a], %%rax; "
"addq %%rax, %%rax; " "addq %%rax, %%rax; "
"mulq %[b]; " "mulq %[b]; "
"subq %%rax, %[lo]; " "subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
"sbbq %%rdx, %[hi]; " :[b] "m"(*b),[a] "m"(*a)
: [lo]"+r"(lo), [hi]"+r"(hi) :"rax", "rdx", "cc");
: [b]"m"(*b), [a]"m"(*a) # endif
: "rax", "rdx", "cc"); *acc = (((__uint128_t) (hi)) << 64) | lo;
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
} }
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { static __inline__ void mrs(__uint128_t * acc, const uint64_t *a,
uint64_t c,d, lo = *acc, hi = *acc>>64; const uint64_t *b)
{
uint64_t c, d, lo = *acc, hi = *acc >> 64;
__asm__ volatile __asm__ volatile
("movq %[a], %%rdx; " ("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " "mulx %[b], %[c], %[d]; "
"subq %[lo], %[c]; " "subq %[lo], %[c]; "
"sbbq %[hi], %[d]; " "sbbq %[hi], %[d]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) :[b] "m"(*b),[a] "m"(*a)
: [b]"m"(*b), [a]"m"(*a) :"rdx", "cc");
: "rdx", "cc"); *acc = (((__uint128_t) (d)) << 64) | c;
*acc = (((__uint128_t)(d))<<64) | c;
} }
static __inline__ uint64_t word_is_zero(uint64_t x) { static __inline__ uint64_t word_is_zero(uint64_t x)
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); {
__asm__ volatile ("neg %0; sbb %0, %0;":"+r" (x));
return ~x; return ~x;
} }
static inline uint64_t shrld(__uint128_t x, int n) { static inline uint64_t shrld(__uint128_t x, int n)
return x>>n; {
return x >> n;
} }
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */ #endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */

View File

@ -12,32 +12,34 @@
#include "f_field.h" #include "f_field.h"
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb; const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2; __uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED; uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */ /* For some reason clang doesn't vectorize this without prompting? */
unsigned int i; unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) { for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i]; ((uint64xn_t *) aa)[i] =
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i]; ((uint64xn_t *) bb)[i] =
((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
((uint64xn_t *) bbb)[i] =
((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
} }
/* /*
for (int i=0; i<4; i++) { * for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
aa[i] = a[i] + a[i+4]; * }
bb[i] = b[i] + b[i+4];
}
*/ */
accum2 = widemul(&a[0],&b[3]); accum2 = widemul(&a[0], &b[3]);
accum0 = widemul(&aa[0],&bb[3]); accum0 = widemul(&aa[0], &bb[3]);
accum1 = widemul(&a[4],&b[7]); accum1 = widemul(&a[4], &b[7]);
mac(&accum2, &a[1], &b[2]); mac(&accum2, &a[1], &b[2]);
mac(&accum0, &aa[1], &bb[2]); mac(&accum0, &aa[1], &bb[2]);
@ -60,14 +62,14 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
mac(&accum0, &aa[1],&bb[3]); mac(&accum0, &aa[1], &bb[3]);
mac(&accum1, &a[5], &b[7]); mac(&accum1, &a[5], &b[7]);
mac(&accum0, &aa[2], &bb[2]); mac(&accum0, &aa[2], &bb[2]);
mac(&accum1, &a[6], &b[6]); mac(&accum1, &a[6], &b[6]);
mac(&accum0, &aa[3], &bb[1]); mac(&accum0, &aa[3], &bb[1]);
accum1 += accum0; accum1 += accum0;
accum2 = widemul(&a[0],&b[0]); accum2 = widemul(&a[0], &b[0]);
accum1 -= accum2; accum1 -= accum2;
accum0 += accum2; accum0 += accum2;
@ -84,7 +86,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul(&a[2],&b[7]); accum2 = widemul(&a[2], &b[7]);
mac(&accum0, &a[6], &bb[3]); mac(&accum0, &a[6], &bb[3]);
mac(&accum1, &aa[2], &bbb[3]); mac(&accum1, &aa[2], &bbb[3]);
@ -92,7 +94,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
mac(&accum0, &a[7], &bb[2]); mac(&accum0, &a[7], &bb[2]);
mac(&accum1, &aa[3], &bbb[2]); mac(&accum1, &aa[3], &bbb[2]);
mac(&accum2, &a[0],&b[1]); mac(&accum2, &a[0], &b[1]);
mac(&accum1, &aa[0], &bb[1]); mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]); mac(&accum0, &a[4], &b[5]);
@ -109,11 +111,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul(&a[3],&b[7]); accum2 = widemul(&a[3], &b[7]);
mac(&accum0, &a[7], &bb[3]); mac(&accum0, &a[7], &bb[3]);
mac(&accum1, &aa[3], &bbb[3]); mac(&accum1, &aa[3], &bbb[3]);
mac(&accum2, &a[0],&b[2]); mac(&accum2, &a[0], &b[2]);
mac(&accum1, &aa[0], &bb[2]); mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]); mac(&accum0, &a[4], &b[6]);
@ -147,36 +149,45 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[0] += ((uint64_t)(accum1)); c[0] += ((uint64_t)(accum1));
} }
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb; const uint64_t *a = as->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0, accum4; __uint128_t accum0, accum4;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
accum0 = widemul_rm(b, &a[0]); accum0 = widemul_rm(b, &a[0]);
accum4 = widemul_rm(b, &a[4]); accum4 = widemul_rm(b, &a[4]);
c[0] = accum0 & mask; accum0 >>= 56; c[0] = accum0 & mask;
c[4] = accum4 & mask; accum4 >>= 56; accum0 >>= 56;
c[4] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[1]); mac_rm(&accum0, b, &a[1]);
mac_rm(&accum4, b, &a[5]); mac_rm(&accum4, b, &a[5]);
c[1] = accum0 & mask; accum0 >>= 56; c[1] = accum0 & mask;
c[5] = accum4 & mask; accum4 >>= 56; accum0 >>= 56;
c[5] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[2]); mac_rm(&accum0, b, &a[2]);
mac_rm(&accum4, b, &a[6]); mac_rm(&accum4, b, &a[6]);
c[2] = accum0 & mask; accum0 >>= 56; c[2] = accum0 & mask;
c[6] = accum4 & mask; accum4 >>= 56; accum0 >>= 56;
c[6] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[3]); mac_rm(&accum0, b, &a[3]);
mac_rm(&accum4, b, &a[7]); mac_rm(&accum4, b, &a[7]);
c[3] = accum0 & mask; accum0 >>= 56; c[3] = accum0 & mask;
c[7] = accum4 & mask; accum4 >>= 56; accum0 >>= 56;
c[7] = accum4 & mask;
accum4 >>= 56;
accum0 += accum4 + c[4]; accum0 += accum4 + c[4];
c[4] = accum0 & mask; c[4] = accum0 & mask;
@ -187,24 +198,26 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
c[1] += accum4 >> 56; c[1] += accum4 >> 56;
} }
void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb; const uint64_t *a = as->limb;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2; __uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED; uint64_t aa[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */ /* For some reason clang doesn't vectorize this without prompting? */
unsigned int i; unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) { for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i]; ((uint64xn_t *) aa)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
} }
accum2 = widemul(&a[0],&a[3]); accum2 = widemul(&a[0], &a[3]);
accum0 = widemul(&aa[0],&aa[3]); accum0 = widemul(&aa[0], &aa[3]);
accum1 = widemul(&a[4],&a[7]); accum1 = widemul(&a[4], &a[7]);
mac(&accum2, &a[1], &a[2]); mac(&accum2, &a[1], &a[2]);
mac(&accum0, &aa[1], &aa[2]); mac(&accum0, &aa[1], &aa[2]);
@ -213,13 +226,13 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 -= accum2; accum0 -= accum2;
accum1 += accum2; accum1 += accum2;
c[3] = ((uint64_t)(accum1))<<1 & mask; c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0))<<1 & mask; c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55; accum0 >>= 55;
accum1 >>= 55; accum1 >>= 55;
mac2(&accum0, &aa[1],&aa[3]); mac2(&accum0, &aa[1], &aa[3]);
mac2(&accum1, &a[5], &a[7]); mac2(&accum1, &a[5], &a[7]);
mac(&accum0, &aa[2], &aa[2]); mac(&accum0, &aa[2], &aa[2]);
accum1 += accum0; accum1 += accum0;
@ -227,7 +240,7 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
msb2(&accum0, &a[1], &a[3]); msb2(&accum0, &a[1], &a[3]);
mac(&accum1, &a[6], &a[6]); mac(&accum1, &a[6], &a[6]);
accum2 = widemul(&a[0],&a[0]); accum2 = widemul(&a[0], &a[0]);
accum1 -= accum2; accum1 -= accum2;
accum0 += accum2; accum0 += accum2;
@ -241,14 +254,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul2(&aa[2],&aa[3]); accum2 = widemul2(&aa[2], &aa[3]);
msb2(&accum0, &a[2], &a[3]); msb2(&accum0, &a[2], &a[3]);
mac2(&accum1, &a[6], &a[7]); mac2(&accum1, &a[6], &a[7]);
accum1 += accum2; accum1 += accum2;
accum0 += accum2; accum0 += accum2;
accum2 = widemul2(&a[0],&a[1]); accum2 = widemul2(&a[0], &a[1]);
mac2(&accum1, &aa[0], &aa[1]); mac2(&accum1, &aa[0], &aa[1]);
mac2(&accum0, &a[4], &a[5]); mac2(&accum0, &a[4], &a[5]);
@ -261,14 +274,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
accum0 >>= 56; accum0 >>= 56;
accum1 >>= 56; accum1 >>= 56;
accum2 = widemul(&aa[3],&aa[3]); accum2 = widemul(&aa[3], &aa[3]);
msb(&accum0, &a[3], &a[3]); msb(&accum0, &a[3], &a[3]);
mac(&accum1, &a[7], &a[7]); mac(&accum1, &a[7], &a[7]);
accum1 += accum2; accum1 += accum2;
accum0 += accum2; accum0 += accum2;
accum2 = widemul2(&a[0],&a[2]); accum2 = widemul2(&a[0], &a[2]);
mac2(&accum1, &aa[0], &aa[2]); mac2(&accum1, &aa[0], &aa[2]);
mac2(&accum0, &a[4], &a[6]); mac2(&accum0, &a[4], &a[6]);

View File

@ -14,60 +14,63 @@
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
#define LIMB_PLACE_VALUE(i) 56 #define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) { {
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
} }
/* /*
unsigned int i; * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { * out->limb[i] = a->limb[i] + b->limb[i]; }
out->limb[i] = a->limb[i] + b->limb[i];
}
*/ */
} }
void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b)
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) { {
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i]; for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
} }
/* /*
unsigned int i; * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { * out->limb[i] = a->limb[i] - b->limb[i]; }
out->limb[i] = a->limb[i] - b->limb[i];
}
*/ */
} }
void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt)
uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt; {
uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
#if __AVX2__ #if __AVX2__
uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; uint64x4_t lo = { co1, co1, co1, co1 }, hi = {
uint64x4_t *aa = (uint64x4_t*) a; co2, co1, co1, co1};
uint64x4_t *aa = (uint64x4_t *) a;
aa[0] += lo; aa[0] += lo;
aa[1] += hi; aa[1] += hi;
#elif __SSE2__ #elif __SSE2__
uint64x2_t lo = {co1,co1}, hi = {co2,co1}; uint64x2_t lo = { co1, co1 }, hi = {
uint64x2_t *aa = (uint64x2_t*) a; co2, co1};
uint64x2_t *aa = (uint64x2_t *) a;
aa[0] += lo; aa[0] += lo;
aa[1] += lo; aa[1] += lo;
aa[2] += hi; aa[2] += hi;
aa[3] += lo; aa[3] += lo;
#else #else
for (unsigned int i=0; i<sizeof(*a)/sizeof(uint64_t); i++) { for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
a->limb[i] += (i==4) ? co2 : co1; a->limb[i] += (i == 4) ? co2 : co1;
} }
#endif #endif
} }
void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a)
{
/* PERF: use pshufb/palignr if anyone cares about speed of this */ /* PERF: use pshufb/palignr if anyone cares about speed of this */
uint64_t mask = (1ull<<56) - 1; uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56; uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp; a->limb[4] += tmp;
for (unsigned int i=7; i>0; i--) { for (unsigned int i = 7; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56); a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
} }
a->limb[0] = (a->limb[0] & mask) + tmp; a->limb[0] = (a->limb[0] & mask) + tmp;
} }

View File

@ -11,10 +11,10 @@
*/ */
#ifndef __CONSTANT_TIME_H__ #ifndef __CONSTANT_TIME_H__
#define __CONSTANT_TIME_H__ 1 # define __CONSTANT_TIME_H__ 1
#include "word.h" # include "word.h"
#include <string.h> # include <string.h>
/* /*
* Constant-time operations on hopefully-compile-time-sized memory * Constant-time operations on hopefully-compile-time-sized memory
@ -36,20 +36,19 @@
* Instead, we're putting our trust in the loop unroller and unswitcher. * Instead, we're putting our trust in the loop unroller and unswitcher.
*/ */
/** /**
* Unaligned big (vector?) register. * Unaligned big (vector?) register.
*/ */
typedef struct { typedef struct {
big_register_t unaligned; big_register_t unaligned;
} __attribute__((packed)) unaligned_br_t; } __attribute__ ((packed)) unaligned_br_t;
/** /**
* Unaligned word register, for architectures where that matters. * Unaligned word register, for architectures where that matters.
*/ */
typedef struct { typedef struct {
word_t unaligned; word_t unaligned;
} __attribute__((packed)) unaligned_word_t; } __attribute__ ((packed)) unaligned_word_t;
/** /**
* @brief Constant-time conditional swap. * @brief Constant-time conditional swap.
@ -60,62 +59,58 @@ typedef struct {
* as their sizes, if the CPU cares about that sort of thing. * as their sizes, if the CPU cares about that sort of thing.
*/ */
static __inline__ void static __inline__ void
__attribute__((unused,always_inline)) __attribute__ ((unused, always_inline))
constant_time_cond_swap ( constant_time_cond_swap(void *__restrict__ a_,
void *__restrict__ a_, void *__restrict__ b_, word_t elem_bytes, mask_t doswap)
void *__restrict__ b_, {
word_t elem_bytes,
mask_t doswap
) {
word_t k; word_t k;
unsigned char *a = (unsigned char *)a_; unsigned char *a = (unsigned char *)a_;
unsigned char *b = (unsigned char *)b_; unsigned char *b = (unsigned char *)b_;
big_register_t br_mask = br_set_to_mask(doswap); big_register_t br_mask = br_set_to_mask(doswap);
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { for (k = 0; k <= elem_bytes - sizeof(big_register_t);
k += sizeof(big_register_t)) {
if (elem_bytes % sizeof(big_register_t)) { if (elem_bytes % sizeof(big_register_t)) {
/* unaligned */ /* unaligned */
big_register_t xor = big_register_t xor =
((unaligned_br_t*)(&a[k]))->unaligned ((unaligned_br_t *) (&a[k]))->unaligned
^ ((unaligned_br_t*)(&b[k]))->unaligned; ^ ((unaligned_br_t *) (&b[k]))->unaligned;
xor &= br_mask; xor &= br_mask;
((unaligned_br_t*)(&a[k]))->unaligned ^= xor; ((unaligned_br_t *) (&a[k]))->unaligned ^= xor;
((unaligned_br_t*)(&b[k]))->unaligned ^= xor; ((unaligned_br_t *) (&b[k]))->unaligned ^= xor;
} else { } else {
/* aligned */ /* aligned */
big_register_t xor = big_register_t xor = *((big_register_t *) (&a[k]))
*((big_register_t*)(&a[k])) ^ *((big_register_t *) (&b[k]));
^ *((big_register_t*)(&b[k]));
xor &= br_mask; xor &= br_mask;
*((big_register_t*)(&a[k])) ^= xor; *((big_register_t *) (&a[k])) ^= xor;
*((big_register_t*)(&b[k])) ^= xor; *((big_register_t *) (&b[k])) ^= xor;
} }
} }
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (elem_bytes % sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) {
/* unaligned */ /* unaligned */
word_t xor = word_t xor =
((unaligned_word_t*)(&a[k]))->unaligned ((unaligned_word_t *) (&a[k]))->unaligned
^ ((unaligned_word_t*)(&b[k]))->unaligned; ^ ((unaligned_word_t *) (&b[k]))->unaligned;
xor &= doswap; xor &= doswap;
((unaligned_word_t*)(&a[k]))->unaligned ^= xor; ((unaligned_word_t *) (&a[k]))->unaligned ^= xor;
((unaligned_word_t*)(&b[k]))->unaligned ^= xor; ((unaligned_word_t *) (&b[k]))->unaligned ^= xor;
} else { } else {
/* aligned */ /* aligned */
word_t xor = word_t xor = *((word_t *) (&a[k]))
*((word_t*)(&a[k])) ^ *((word_t *) (&b[k]));
^ *((word_t*)(&b[k]));
xor &= doswap; xor &= doswap;
*((word_t*)(&a[k])) ^= xor; *((word_t *) (&a[k])) ^= xor;
*((word_t*)(&b[k])) ^= xor; *((word_t *) (&b[k])) ^= xor;
} }
} }
} }
if (elem_bytes % sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) {
for (; k<elem_bytes; k+=1) { for (; k < elem_bytes; k += 1) {
unsigned char xor = a[k] ^ b[k]; unsigned char xor = a[k] ^ b[k];
xor &= doswap; xor &= doswap;
a[k] ^= xor; a[k] ^= xor;
@ -133,53 +128,60 @@ constant_time_cond_swap (
* The table and output must not alias. * The table and output must not alias.
*/ */
static __inline__ void static __inline__ void
__attribute__((unused,always_inline)) __attribute__ ((unused, always_inline))
constant_time_lookup ( constant_time_lookup(void *__restrict__ out_,
void *__restrict__ out_,
const void *table_, const void *table_,
word_t elem_bytes, word_t elem_bytes, word_t n_table, word_t idx)
word_t n_table, {
word_t idx
) {
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
/* Can't do pointer arithmetic on void* */ /* Can't do pointer arithmetic on void* */
unsigned char *out = (unsigned char *)out_; unsigned char *out = (unsigned char *)out_;
const unsigned char *table = (const unsigned char *)table_; const unsigned char *table = (const unsigned char *)table_;
word_t j,k; word_t j, k;
memset(out, 0, elem_bytes); memset(out, 0, elem_bytes);
for (j=0; j<n_table; j++, big_i-=big_one) { for (j = 0; j < n_table; j++, big_i -= big_one) {
big_register_t br_mask = br_is_zero(big_i); big_register_t br_mask = br_is_zero(big_i);
word_t mask; word_t mask;
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { for (k = 0; k <= elem_bytes - sizeof(big_register_t);
k += sizeof(big_register_t)) {
if (elem_bytes % sizeof(big_register_t)) { if (elem_bytes % sizeof(big_register_t)) {
/* unaligned */ /* unaligned */
((unaligned_br_t *)(out+k))->unaligned ((unaligned_br_t *) (out + k))->unaligned
|= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned; |=
br_mask &
((const unaligned_br_t
*)(&table[k + j * elem_bytes]))->unaligned;
} else { } else {
/* aligned */ /* aligned */
*(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]); *(big_register_t *) (out + k) |=
br_mask & *(const big_register_t
*)(&table[k + j * elem_bytes]);
} }
} }
mask = word_is_zero(idx^j); mask = word_is_zero(idx ^ j);
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (elem_bytes % sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) {
/* input unaligned, output aligned */ /* input unaligned, output aligned */
*(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned; *(word_t *) (out + k) |=
mask &
((const unaligned_word_t
*)(&table[k + j * elem_bytes]))->unaligned;
} else { } else {
/* aligned */ /* aligned */
*(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]); *(word_t *) (out + k) |=
mask & *(const word_t *)(&table[k + j * elem_bytes]);
} }
} }
} }
if (elem_bytes % sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) {
for (; k<elem_bytes; k+=1) { for (; k < elem_bytes; k += 1) {
out[k] |= mask & table[k+j*elem_bytes]; out[k] |= mask & table[k + j * elem_bytes];
} }
} }
} }
@ -195,15 +197,12 @@ constant_time_lookup (
* input, it must be equal and not partially overlap. * input, it must be equal and not partially overlap.
*/ */
static __inline__ void static __inline__ void
__attribute__((unused,always_inline)) __attribute__ ((unused, always_inline))
constant_time_select ( constant_time_select(void *a_,
void *a_,
const void *bFalse_, const void *bFalse_,
const void *bTrue_, const void *bTrue_,
word_t elem_bytes, word_t elem_bytes, mask_t mask, size_t alignment_bytes)
mask_t mask, {
size_t alignment_bytes
) {
unsigned char *a = (unsigned char *)a_; unsigned char *a = (unsigned char *)a_;
const unsigned char *bTrue = (const unsigned char *)bTrue_; const unsigned char *bTrue = (const unsigned char *)bTrue_;
const unsigned char *bFalse = (const unsigned char *)bFalse_; const unsigned char *bFalse = (const unsigned char *)bFalse_;
@ -212,39 +211,41 @@ constant_time_select (
alignment_bytes |= elem_bytes; alignment_bytes |= elem_bytes;
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { for (k = 0; k <= elem_bytes - sizeof(big_register_t);
k += sizeof(big_register_t)) {
if (alignment_bytes % sizeof(big_register_t)) { if (alignment_bytes % sizeof(big_register_t)) {
/* unaligned */ /* unaligned */
((unaligned_br_t*)(&a[k]))->unaligned = ((unaligned_br_t *) (&a[k]))->unaligned =
( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned) (br_mask & ((const unaligned_br_t *)(&bTrue[k]))->unaligned)
| (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned); | (~br_mask &
((const unaligned_br_t *)(&bFalse[k]))->unaligned);
} else { } else {
/* aligned */ /* aligned */
*(big_register_t *)(a+k) = *(big_register_t *) (a + k) =
( br_mask & *(const big_register_t*)(&bTrue [k])) (br_mask & *(const big_register_t *)(&bTrue[k]))
| (~br_mask & *(const big_register_t*)(&bFalse[k])); | (~br_mask & *(const big_register_t *)(&bFalse[k]));
} }
} }
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (alignment_bytes % sizeof(word_t)) { if (alignment_bytes % sizeof(word_t)) {
/* unaligned */ /* unaligned */
((unaligned_word_t*)(&a[k]))->unaligned = ((unaligned_word_t *) (&a[k]))->unaligned =
( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned) (mask & ((const unaligned_word_t *)(&bTrue[k]))->unaligned)
| (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned); | (~mask &
((const unaligned_word_t *)(&bFalse[k]))->unaligned);
} else { } else {
/* aligned */ /* aligned */
*(word_t *)(a+k) = *(word_t *) (a + k) = (mask & *(const word_t *)(&bTrue[k]))
( mask & *(const word_t*)(&bTrue [k])) | (~mask & *(const word_t *)(&bFalse[k]));
| (~mask & *(const word_t*)(&bFalse[k]));
} }
} }
} }
if (elem_bytes % sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) {
for (; k<elem_bytes; k+=1) { for (; k < elem_bytes; k += 1) {
a[k] = ( mask & bTrue[k]) | (~mask & bFalse[k]); a[k] = (mask & bTrue[k]) | (~mask & bFalse[k]);
} }
} }
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -76,20 +76,27 @@ const uint8_t out_u3[3][56] = {
0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2, 0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2,
0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08, 0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08,
0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b, 0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b,
0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13 0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13}, {
}, { 0xaa, 0x3b, 0x47, 0x49,
0xaa, 0x3b, 0x47, 0x49, 0xd5, 0x5b, 0x9d, 0xaf, 0x1e, 0x5b, 0x00, 0x28, 0xd5, 0x5b, 0x9d, 0xaf,
0x88, 0x26, 0xc4, 0x67, 0x27, 0x4c, 0xe3, 0xeb, 0xbd, 0xd5, 0xc1, 0x7b, 0x1e, 0x5b, 0x00, 0x28,
0x97, 0x5e, 0x09, 0xd4, 0xaf, 0x6c, 0x67, 0xcf, 0x10, 0xd0, 0x87, 0x20, 0x88, 0x26, 0xc4, 0x67,
0x2d, 0xb8, 0x82, 0x86, 0xe2, 0xb7, 0x9f, 0xce, 0xea, 0x3e, 0xc3, 0x53, 0x27, 0x4c, 0xe3, 0xeb,
0xef, 0x54, 0xfa, 0xa2, 0x6e, 0x21, 0x9f, 0x38 0xbd, 0xd5, 0xc1, 0x7b,
}, { 0x97, 0x5e, 0x09, 0xd4,
0xaf, 0x6c, 0x67, 0xcf,
0x10, 0xd0, 0x87, 0x20,
0x2d, 0xb8, 0x82, 0x86,
0xe2, 0xb7, 0x9f, 0xce,
0xea, 0x3e, 0xc3, 0x53,
0xef, 0x54, 0xfa, 0xa2,
0x6e, 0x21, 0x9f, 0x38},
{
0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20, 0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20,
0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67, 0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67,
0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6, 0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6,
0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6, 0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6,
0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37 0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37}
}
}; };
/* Test vectors from RFC8032 for Ed448 */ /* Test vectors from RFC8032 for Ed448 */
@ -590,7 +597,6 @@ static const uint8_t *dohash(EVP_MD_CTX *hashctx, const uint8_t *msg,
return hashout; return hashout;
} }
static int test_eddsa(void) static int test_eddsa(void)
{ {
uint8_t outsig[114]; uint8_t outsig[114];
@ -614,7 +620,8 @@ static int test_eddsa(void)
goto err; goto err;
} }
ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3, sizeof(context3)); ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3,
sizeof(context3));
if (memcmp(sig3, outsig, sizeof(sig3)) != 0) { if (memcmp(sig3, outsig, sizeof(sig3)) != 0) {
printf("Calculated sig and expected sig differ (3)\n"); printf("Calculated sig and expected sig differ (3)\n");
goto err; goto err;
@ -683,7 +690,7 @@ int main(int argc, char *argv[])
int j = -1; int j = -1;
if (argc != 1 && (argc != 2 || strcmp(argv[1], "-f") != 0)) { if (argc != 1 && (argc != 2 || strcmp(argv[1], "-f") != 0)) {
printf ("Usage: curve448_test [-f]\n"); printf("Usage: curve448_test [-f]\n");
return 1; return 1;
} }
@ -725,7 +732,8 @@ int main(int argc, char *argv[])
if (i == 1 || i == 1000 || i == 1000000) { if (i == 1 || i == 1000 || i == 1000000) {
j++; j++;
if (memcmp(out, out_u3[j], sizeof(out)) != 0) { if (memcmp(out, out_u3[j], sizeof(out)) != 0) {
printf("Calculated output and expected output differ (3, %ud)\n", printf
("Calculated output and expected output differ (3, %ud)\n",
i); i);
return 1; return 1;
} }

View File

@ -11,47 +11,47 @@
*/ */
#ifndef __DECAF_COMMON_H__ #ifndef __DECAF_COMMON_H__
#define __DECAF_COMMON_H__ 1 # define __DECAF_COMMON_H__ 1
#include <openssl/e_os2.h> # include <openssl/e_os2.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/* Internal word types. /*
* * Internal word types. Somewhat tricky. This could be decided separately per
* Somewhat tricky. This could be decided separately per platform. However, * platform. However, the structs do need to be all the same size and
* the structs do need to be all the same size and alignment on a given * alignment on a given platform to support dynamic linking, since even if you
* platform to support dynamic linking, since even if you header was built * header was built with eg arch_neon, you might end up linking a library built
* with eg arch_neon, you might end up linking a library built with arch_arm32. * with arch_arm32.
*/ */
#ifndef DECAF_WORD_BITS # ifndef DECAF_WORD_BITS
#if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30)) # if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30))
#define DECAF_WORD_BITS 64 /**< The number of bits in a word */ # define DECAF_WORD_BITS 64 /**< The number of bits in a word */
#else # else
#define DECAF_WORD_BITS 32 /**< The number of bits in a word */ # define DECAF_WORD_BITS 32 /**< The number of bits in a word */
#endif # endif
#endif # endif
#if DECAF_WORD_BITS == 64 # if DECAF_WORD_BITS == 64
typedef uint64_t decaf_word_t; /**< Word size for internal computations */ typedef uint64_t decaf_word_t; /**< Word size for internal computations */
typedef int64_t decaf_sword_t; /**< Signed word size for internal computations */ typedef int64_t decaf_sword_t; /**< Signed word size for internal computations */
typedef uint64_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */ typedef uint64_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
typedef __uint128_t decaf_dword_t; /**< Double-word size for internal computations */ typedef __uint128_t decaf_dword_t; /**< Double-word size for internal computations */
typedef __int128_t decaf_dsword_t; /**< Signed double-word size for internal computations */ typedef __int128_t decaf_dsword_t; /**< Signed double-word size for internal computations */
#elif DECAF_WORD_BITS == 32 /**< The number of bits in a word */ # elif DECAF_WORD_BITS == 32 /**< The number of bits in a word */
typedef uint32_t decaf_word_t; /**< Word size for internal computations */ typedef uint32_t decaf_word_t; /**< Word size for internal computations */
typedef int32_t decaf_sword_t; /**< Signed word size for internal computations */ typedef int32_t decaf_sword_t; /**< Signed word size for internal computations */
typedef uint32_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */ typedef uint32_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
typedef uint64_t decaf_dword_t; /**< Double-word size for internal computations */ typedef uint64_t decaf_dword_t; /**< Double-word size for internal computations */
typedef int64_t decaf_dsword_t; /**< Signed double-word size for internal computations */ typedef int64_t decaf_dsword_t; /**< Signed double-word size for internal computations */
#else # else
#error "Only supporting DECAF_WORD_BITS = 32 or 64 for now" # error "Only supporting DECAF_WORD_BITS = 32 or 64 for now"
#endif # endif
/** DECAF_TRUE = -1 so that DECAF_TRUE & x = x */ /** DECAF_TRUE = -1 so that DECAF_TRUE & x = x */
static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t)1; static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t) 1;
/** DECAF_FALSE = 0 so that DECAF_FALSE & x = 0 */ /** DECAF_FALSE = 0 so that DECAF_FALSE & x = 0 */
static const decaf_bool_t DECAF_FALSE = 0; static const decaf_bool_t DECAF_FALSE = 0;
@ -62,18 +62,17 @@ typedef enum {
DECAF_FAILURE = 0 /**< The operation failed. */ DECAF_FAILURE = 0 /**< The operation failed. */
} decaf_error_t; } decaf_error_t;
/** Return success if x is true */ /** Return success if x is true */
static ossl_inline decaf_error_t static ossl_inline decaf_error_t decaf_succeed_if(decaf_bool_t x)
decaf_succeed_if(decaf_bool_t x) { {
return (decaf_error_t)x; return (decaf_error_t) x;
} }
/** Return DECAF_TRUE iff x == DECAF_SUCCESS */ /** Return DECAF_TRUE iff x == DECAF_SUCCESS */
static ossl_inline decaf_bool_t static ossl_inline decaf_bool_t decaf_successful(decaf_error_t e)
decaf_successful(decaf_error_t e) { {
decaf_dword_t w = ((decaf_word_t)e) ^ ((decaf_word_t)DECAF_SUCCESS); decaf_dword_t w = ((decaf_word_t) e) ^ ((decaf_word_t) DECAF_SUCCESS);
return (w-1)>>DECAF_WORD_BITS; return (w - 1) >> DECAF_WORD_BITS;
} }
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -11,31 +11,31 @@
*/ */
#ifndef __DECAF_ED448_H__ #ifndef __DECAF_ED448_H__
#define __DECAF_ED448_H__ 1 # define __DECAF_ED448_H__ 1
#include "point_448.h" # include "point_448.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** Number of bytes in an EdDSA public key. */ /** Number of bytes in an EdDSA public key. */
#define DECAF_EDDSA_448_PUBLIC_BYTES 57 # define DECAF_EDDSA_448_PUBLIC_BYTES 57
/** Number of bytes in an EdDSA private key. */ /** Number of bytes in an EdDSA private key. */
#define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES # define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES
/** Number of bytes in an EdDSA private key. */ /** Number of bytes in an EdDSA private key. */
#define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES) # define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES)
/** Does EdDSA support non-contextual signatures? */ /** Does EdDSA support non-contextual signatures? */
#define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0 # define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0
/** EdDSA encoding ratio. */ /** EdDSA encoding ratio. */
#define DECAF_448_EDDSA_ENCODE_RATIO 4 # define DECAF_448_EDDSA_ENCODE_RATIO 4
/** EdDSA decoding ratio. */ /** EdDSA decoding ratio. */
#define DECAF_448_EDDSA_DECODE_RATIO (4 / 4) # define DECAF_448_EDDSA_DECODE_RATIO (4 / 4)
/** /**
* @brief EdDSA key generation. This function uses a different (non-Decaf) * @brief EdDSA key generation. This function uses a different (non-Decaf)
@ -44,10 +44,13 @@ extern "C" {
* @param [out] pubkey The public key. * @param [out] pubkey The public key.
* @param [in] privkey The private key. * @param [in] privkey The private key.
*/ */
decaf_error_t decaf_ed448_derive_public_key ( decaf_error_t decaf_ed448_derive_public_key(uint8_t
uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], pubkey
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES] [DECAF_EDDSA_448_PUBLIC_BYTES],
); const uint8_t
privkey
[DECAF_EDDSA_448_PRIVATE_BYTES]
);
/** /**
* @brief EdDSA signing. * @brief EdDSA signing.
@ -66,16 +69,16 @@ decaf_error_t decaf_ed448_derive_public_key (
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives * safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
* you no seat belt. * you no seat belt.
*/ */
decaf_error_t decaf_ed448_sign ( decaf_error_t decaf_ed448_sign(uint8_t
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES], const uint8_t
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
const uint8_t *message, const uint8_t
size_t message_len, pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
uint8_t prehashed, const uint8_t *message, size_t message_len,
const uint8_t *context, uint8_t prehashed, const uint8_t *context,
size_t context_len size_t context_len)
) __attribute__((nonnull(1,2,3))); __attribute__ ((nonnull(1, 2, 3)));
/** /**
* @brief EdDSA signing with prehash. * @brief EdDSA signing with prehash.
@ -92,14 +95,17 @@ decaf_error_t decaf_ed448_sign (
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives * safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
* you no seat belt. * you no seat belt.
*/ */
decaf_error_t decaf_ed448_sign_prehash ( decaf_error_t decaf_ed448_sign_prehash(uint8_t
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES], [DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], const uint8_t
privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
const uint8_t
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
const uint8_t hash[64], const uint8_t hash[64],
const uint8_t *context, const uint8_t *context,
size_t context_len size_t context_len)
) __attribute__((nonnull(1,2,3,4))); __attribute__ ((nonnull(1, 2, 3, 4)));
/** /**
* @brief EdDSA signature verification. * @brief EdDSA signature verification.
@ -119,15 +125,14 @@ decaf_error_t decaf_ed448_sign_prehash (
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives * safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
* you no seat belt. * you no seat belt.
*/ */
decaf_error_t decaf_ed448_verify ( decaf_error_t decaf_ed448_verify(const uint8_t
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], const uint8_t
const uint8_t *message, pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
size_t message_len, const uint8_t *message, size_t message_len,
uint8_t prehashed, uint8_t prehashed, const uint8_t *context,
const uint8_t *context, uint8_t context_len)
uint8_t context_len __attribute__ ((nonnull(1, 2)));
) __attribute__((nonnull(1,2)));
/** /**
* @brief EdDSA signature verification. * @brief EdDSA signature verification.
@ -145,13 +150,15 @@ decaf_error_t decaf_ed448_verify (
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives * safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
* you no seat belt. * you no seat belt.
*/ */
decaf_error_t decaf_ed448_verify_prehash ( decaf_error_t decaf_ed448_verify_prehash(const uint8_t
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], [DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
const uint8_t hash[64], const uint8_t hash[64],
const uint8_t *context, const uint8_t *context,
uint8_t context_len uint8_t context_len)
) __attribute__((nonnull(1,2))); __attribute__ ((nonnull(1, 2)));
/** /**
* @brief EdDSA point encoding. Used internally, exposed externally. * @brief EdDSA point encoding. Used internally, exposed externally.
@ -177,10 +184,11 @@ decaf_error_t decaf_ed448_verify_prehash (
* @param [out] enc The encoded point. * @param [out] enc The encoded point.
* @param [in] p The point. * @param [in] p The point.
*/ */
void curve448_point_mul_by_ratio_and_encode_like_eddsa ( void curve448_point_mul_by_ratio_and_encode_like_eddsa(uint8_t
uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES], enc
const curve448_point_t p [DECAF_EDDSA_448_PUBLIC_BYTES],
); const curve448_point_t
p);
/** /**
* @brief EdDSA point decoding. Multiplies by DECAF_448_EDDSA_DECODE_RATIO, * @brief EdDSA point decoding. Multiplies by DECAF_448_EDDSA_DECODE_RATIO,
@ -191,10 +199,12 @@ void curve448_point_mul_by_ratio_and_encode_like_eddsa (
* @param [out] enc The encoded point. * @param [out] enc The encoded point.
* @param [in] p The point. * @param [in] p The point.
*/ */
decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio ( decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio(curve448_point_t
curve448_point_t p, p,
const uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES] const uint8_t
); enc
[DECAF_EDDSA_448_PUBLIC_BYTES]
);
/** /**
* @brief EdDSA to ECDH public key conversion * @brief EdDSA to ECDH public key conversion
@ -207,10 +217,10 @@ decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio (
* @param[out] x The ECDH public key as in RFC7748(point on Montgomery curve) * @param[out] x The ECDH public key as in RFC7748(point on Montgomery curve)
* @param[in] ed The EdDSA public key(point on Edwards curve) * @param[in] ed The EdDSA public key(point on Edwards curve)
*/ */
void decaf_ed448_convert_public_key_to_x448 ( void decaf_ed448_convert_public_key_to_x448(uint8_t x[DECAF_X448_PUBLIC_BYTES],
uint8_t x[DECAF_X448_PUBLIC_BYTES], const uint8_t
const uint8_t ed[DECAF_EDDSA_448_PUBLIC_BYTES] ed[DECAF_EDDSA_448_PUBLIC_BYTES]
); );
/** /**
* @brief EdDSA to ECDH private key conversion * @brief EdDSA to ECDH private key conversion
@ -220,10 +230,13 @@ void decaf_ed448_convert_public_key_to_x448 (
* @param[out] x The ECDH private key as in RFC7748 * @param[out] x The ECDH private key as in RFC7748
* @param[in] ed The EdDSA private key * @param[in] ed The EdDSA private key
*/ */
decaf_error_t decaf_ed448_convert_private_key_to_x448 ( decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
uint8_t x[DECAF_X448_PRIVATE_BYTES], x
const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES] [DECAF_X448_PRIVATE_BYTES],
); const uint8_t
ed
[DECAF_EDDSA_448_PRIVATE_BYTES]
);
#ifdef __cplusplus #ifdef __cplusplus
} /* extern "C" */ } /* extern "C" */

View File

@ -27,12 +27,12 @@
#if NO_CONTEXT #if NO_CONTEXT
const uint8_t NO_CONTEXT_POINTS_HERE = 0; const uint8_t NO_CONTEXT_POINTS_HERE = 0;
const uint8_t * const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE; const uint8_t *const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE;
#endif #endif
/* EDDSA_BASE_POINT_RATIO = 1 or 2 /*
* Because EdDSA25519 is not on E_d but on the isogenous E_sigma_d, * EDDSA_BASE_POINT_RATIO = 1 or 2 Because EdDSA25519 is not on E_d but on the
* its base point is twice ours. * isogenous E_sigma_d, its base point is twice ours.
*/ */
#define EDDSA_BASE_POINT_RATIO (1+EDDSA_USE_SIGMA_ISOGENY) /* TODO: remove */ #define EDDSA_BASE_POINT_RATIO (1+EDDSA_USE_SIGMA_ISOGENY) /* TODO: remove */
@ -55,11 +55,10 @@ static decaf_error_t oneshot_hash(uint8_t *out, size_t outlen,
return DECAF_SUCCESS; return DECAF_SUCCESS;
} }
static void clamp(uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]
static void clamp ( )
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES] {
) { uint8_t hibit = (1 << 0) >> 1;
uint8_t hibit = (1<<0)>>1;
/* Blarg */ /* Blarg */
secret_scalar_ser[0] &= -COFACTOR; secret_scalar_ser[0] &= -COFACTOR;
@ -67,18 +66,17 @@ static void clamp (
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] = 0; secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] = 0;
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 2] |= 0x80; secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 2] |= 0x80;
} else { } else {
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit-1; secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit - 1;
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] |= hibit; secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] |= hibit;
} }
} }
static decaf_error_t hash_init_with_dom( static decaf_error_t hash_init_with_dom(EVP_MD_CTX *hashctx,
EVP_MD_CTX *hashctx,
uint8_t prehashed, uint8_t prehashed,
uint8_t for_prehash, uint8_t for_prehash,
const uint8_t *context, const uint8_t *context,
size_t context_len size_t context_len)
) { {
const char *dom_s = "SigEd448"; const char *dom_s = "SigEd448";
uint8_t dom[2]; uint8_t dom[2];
@ -108,24 +106,29 @@ static decaf_error_t hash_init_with_dom(
} }
/* In this file because it uses the hash */ /* In this file because it uses the hash */
decaf_error_t decaf_ed448_convert_private_key_to_x448 ( decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
uint8_t x[DECAF_X448_PRIVATE_BYTES], x
const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES] [DECAF_X448_PRIVATE_BYTES],
) { const uint8_t
ed
[DECAF_EDDSA_448_PRIVATE_BYTES]
)
{
/* pass the private key through oneshot_hash function */ /* pass the private key through oneshot_hash function */
/* and keep the first DECAF_X448_PRIVATE_BYTES bytes */ /* and keep the first DECAF_X448_PRIVATE_BYTES bytes */
return oneshot_hash( return oneshot_hash(x,
x,
DECAF_X448_PRIVATE_BYTES, DECAF_X448_PRIVATE_BYTES,
ed, ed, DECAF_EDDSA_448_PRIVATE_BYTES);
DECAF_EDDSA_448_PRIVATE_BYTES
);
} }
decaf_error_t decaf_ed448_derive_public_key ( decaf_error_t decaf_ed448_derive_public_key(uint8_t
uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], pubkey
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES] [DECAF_EDDSA_448_PUBLIC_BYTES],
) { const uint8_t
privkey
[DECAF_EDDSA_448_PRIVATE_BYTES]
)
{
/* only this much used for keygen */ /* only this much used for keygen */
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
curve448_scalar_t secret_scalar; curve448_scalar_t secret_scalar;
@ -138,19 +141,22 @@ decaf_error_t decaf_ed448_derive_public_key (
} }
clamp(secret_scalar_ser); clamp(secret_scalar_ser);
curve448_scalar_decode_long(secret_scalar, secret_scalar_ser, sizeof(secret_scalar_ser)); curve448_scalar_decode_long(secret_scalar, secret_scalar_ser,
sizeof(secret_scalar_ser));
/* Since we are going to mul_by_cofactor during encoding, divide by it here. /*
* However, the EdDSA base point is not the same as the decaf base point if * Since we are going to mul_by_cofactor during encoding, divide by it
* the sigma isogeny is in use: the EdDSA base point is on Etwist_d/(1-d) and * here. However, the EdDSA base point is not the same as the decaf base
* the decaf base point is on Etwist_d, and when converted it effectively * point if the sigma isogeny is in use: the EdDSA base point is on
* picks up a factor of 2 from the isogenies. So we might start at 2 instead of 1. * Etwist_d/(1-d) and the decaf base point is on Etwist_d, and when
* converted it effectively picks up a factor of 2 from the isogenies. So
* we might start at 2 instead of 1.
*/ */
for (c=1; c<DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) { for (c = 1; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
curve448_scalar_halve(secret_scalar,secret_scalar); curve448_scalar_halve(secret_scalar, secret_scalar);
} }
curve448_precomputed_scalarmul(p,curve448_precomputed_base,secret_scalar); curve448_precomputed_scalarmul(p, curve448_precomputed_base, secret_scalar);
curve448_point_mul_by_ratio_and_encode_like_eddsa(pubkey, p); curve448_point_mul_by_ratio_and_encode_like_eddsa(pubkey, p);
@ -162,21 +168,21 @@ decaf_error_t decaf_ed448_derive_public_key (
return DECAF_SUCCESS; return DECAF_SUCCESS;
} }
decaf_error_t decaf_ed448_sign ( decaf_error_t decaf_ed448_sign(uint8_t
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES], const uint8_t
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
const uint8_t *message, const uint8_t
size_t message_len, pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
uint8_t prehashed, const uint8_t *message, size_t message_len,
const uint8_t *context, uint8_t prehashed, const uint8_t *context,
size_t context_len size_t context_len)
) { {
curve448_scalar_t secret_scalar; curve448_scalar_t secret_scalar;
EVP_MD_CTX *hashctx = EVP_MD_CTX_new(); EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
decaf_error_t ret = DECAF_FAILURE; decaf_error_t ret = DECAF_FAILURE;
curve448_scalar_t nonce_scalar; curve448_scalar_t nonce_scalar;
uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = {0}; uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = { 0 };
unsigned int c; unsigned int c;
curve448_scalar_t challenge_scalar; curve448_scalar_t challenge_scalar;
@ -188,18 +194,18 @@ decaf_error_t decaf_ed448_sign (
struct { struct {
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
uint8_t seed[DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t seed[DECAF_EDDSA_448_PRIVATE_BYTES];
} __attribute__((packed)) expanded; } __attribute__ ((packed)) expanded;
if (!oneshot_hash((uint8_t *)&expanded, sizeof(expanded), privkey, if (!oneshot_hash((uint8_t *)&expanded, sizeof(expanded), privkey,
DECAF_EDDSA_448_PRIVATE_BYTES)) DECAF_EDDSA_448_PRIVATE_BYTES))
goto err; goto err;
clamp(expanded.secret_scalar_ser); clamp(expanded.secret_scalar_ser);
curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser, sizeof(expanded.secret_scalar_ser)); curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser,
sizeof(expanded.secret_scalar_ser));
/* Hash to create the nonce */ /* Hash to create the nonce */
if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len) if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
|| !EVP_DigestUpdate(hashctx, expanded.seed, || !EVP_DigestUpdate(hashctx, expanded.seed, sizeof(expanded.seed))
sizeof(expanded.seed))
|| !EVP_DigestUpdate(hashctx, message, message_len)) { || !EVP_DigestUpdate(hashctx, message, message_len)) {
OPENSSL_cleanse(&expanded, sizeof(expanded)); OPENSSL_cleanse(&expanded, sizeof(expanded));
goto err; goto err;
@ -209,7 +215,7 @@ decaf_error_t decaf_ed448_sign (
/* Decode the nonce */ /* Decode the nonce */
{ {
uint8_t nonce[2*DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t nonce[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
if (!EVP_DigestFinalXOF(hashctx, nonce, sizeof(nonce))) if (!EVP_DigestFinalXOF(hashctx, nonce, sizeof(nonce)))
goto err; goto err;
@ -222,39 +228,41 @@ decaf_error_t decaf_ed448_sign (
curve448_scalar_t nonce_scalar_2; curve448_scalar_t nonce_scalar_2;
curve448_point_t p; curve448_point_t p;
curve448_scalar_halve(nonce_scalar_2,nonce_scalar); curve448_scalar_halve(nonce_scalar_2, nonce_scalar);
for (c = 2; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) { for (c = 2; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
curve448_scalar_halve(nonce_scalar_2,nonce_scalar_2); curve448_scalar_halve(nonce_scalar_2, nonce_scalar_2);
} }
curve448_precomputed_scalarmul(p,curve448_precomputed_base,nonce_scalar_2); curve448_precomputed_scalarmul(p, curve448_precomputed_base,
nonce_scalar_2);
curve448_point_mul_by_ratio_and_encode_like_eddsa(nonce_point, p); curve448_point_mul_by_ratio_and_encode_like_eddsa(nonce_point, p);
curve448_point_destroy(p); curve448_point_destroy(p);
curve448_scalar_destroy(nonce_scalar_2); curve448_scalar_destroy(nonce_scalar_2);
} }
{ {
uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
/* Compute the challenge */ /* Compute the challenge */
if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len) if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
|| !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point)) || !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point))
|| !EVP_DigestUpdate(hashctx, pubkey, || !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
DECAF_EDDSA_448_PUBLIC_BYTES)
|| !EVP_DigestUpdate(hashctx, message, message_len) || !EVP_DigestUpdate(hashctx, message, message_len)
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge)))
goto err; goto err;
curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge)); curve448_scalar_decode_long(challenge_scalar, challenge,
OPENSSL_cleanse(challenge,sizeof(challenge)); sizeof(challenge));
OPENSSL_cleanse(challenge, sizeof(challenge));
} }
curve448_scalar_mul(challenge_scalar,challenge_scalar,secret_scalar); curve448_scalar_mul(challenge_scalar, challenge_scalar, secret_scalar);
curve448_scalar_add(challenge_scalar,challenge_scalar,nonce_scalar); curve448_scalar_add(challenge_scalar, challenge_scalar, nonce_scalar);
OPENSSL_cleanse(signature,DECAF_EDDSA_448_SIGNATURE_BYTES); OPENSSL_cleanse(signature, DECAF_EDDSA_448_SIGNATURE_BYTES);
memcpy(signature,nonce_point,sizeof(nonce_point)); memcpy(signature, nonce_point, sizeof(nonce_point));
curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],challenge_scalar); curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],
challenge_scalar);
curve448_scalar_destroy(secret_scalar); curve448_scalar_destroy(secret_scalar);
curve448_scalar_destroy(nonce_scalar); curve448_scalar_destroy(nonce_scalar);
@ -266,52 +274,59 @@ decaf_error_t decaf_ed448_sign (
return ret; return ret;
} }
decaf_error_t decaf_ed448_sign_prehash(uint8_t
decaf_error_t decaf_ed448_sign_prehash ( signature
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], [DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES], const uint8_t
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
const uint8_t
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
const uint8_t hash[64], const uint8_t hash[64],
const uint8_t *context, const uint8_t *context,
size_t context_len size_t context_len)
) { {
return decaf_ed448_sign(signature,privkey,pubkey,hash,64,1,context, return decaf_ed448_sign(signature, privkey, pubkey, hash, 64, 1, context,
context_len); context_len);
/*OPENSSL_cleanse(hash,sizeof(hash));*/ /*
* OPENSSL_cleanse(hash,sizeof(hash));
*/
} }
decaf_error_t decaf_ed448_verify ( decaf_error_t decaf_ed448_verify(const uint8_t
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], const uint8_t
const uint8_t *message, pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
size_t message_len, const uint8_t *message, size_t message_len,
uint8_t prehashed, uint8_t prehashed, const uint8_t *context,
const uint8_t *context, uint8_t context_len)
uint8_t context_len {
) {
curve448_point_t pk_point, r_point; curve448_point_t pk_point, r_point;
decaf_error_t error = curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point,pubkey); decaf_error_t error =
curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point, pubkey);
curve448_scalar_t challenge_scalar; curve448_scalar_t challenge_scalar;
curve448_scalar_t response_scalar; curve448_scalar_t response_scalar;
unsigned int c; unsigned int c;
if (DECAF_SUCCESS != error) { return error; } if (DECAF_SUCCESS != error) {
return error;
}
error = curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point,signature); error =
if (DECAF_SUCCESS != error) { return error; } curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point, signature);
if (DECAF_SUCCESS != error) {
return error;
}
{ {
/* Compute the challenge */ /* Compute the challenge */
EVP_MD_CTX *hashctx = EVP_MD_CTX_new(); EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES]; uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
if (hashctx == NULL if (hashctx == NULL
|| !hash_init_with_dom(hashctx, prehashed, 0, context, || !hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
context_len)
|| !EVP_DigestUpdate(hashctx, signature, || !EVP_DigestUpdate(hashctx, signature,
DECAF_EDDSA_448_PUBLIC_BYTES) DECAF_EDDSA_448_PUBLIC_BYTES)
|| !EVP_DigestUpdate(hashctx, pubkey, || !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
DECAF_EDDSA_448_PUBLIC_BYTES)
|| !EVP_DigestUpdate(hashctx, message, message_len) || !EVP_DigestUpdate(hashctx, message, message_len)
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) { || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) {
EVP_MD_CTX_free(hashctx); EVP_MD_CTX_free(hashctx);
@ -319,43 +334,42 @@ decaf_error_t decaf_ed448_verify (
} }
EVP_MD_CTX_free(hashctx); EVP_MD_CTX_free(hashctx);
curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge)); curve448_scalar_decode_long(challenge_scalar, challenge,
OPENSSL_cleanse(challenge,sizeof(challenge)); sizeof(challenge));
OPENSSL_cleanse(challenge, sizeof(challenge));
} }
curve448_scalar_sub(challenge_scalar, curve448_scalar_zero, challenge_scalar); curve448_scalar_sub(challenge_scalar, curve448_scalar_zero,
challenge_scalar);
curve448_scalar_decode_long( curve448_scalar_decode_long(response_scalar,
response_scalar,
&signature[DECAF_EDDSA_448_PUBLIC_BYTES], &signature[DECAF_EDDSA_448_PUBLIC_BYTES],
DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PRIVATE_BYTES);
);
for (c=1; c<DECAF_448_EDDSA_DECODE_RATIO; c<<=1) { for (c = 1; c < DECAF_448_EDDSA_DECODE_RATIO; c <<= 1) {
curve448_scalar_add(response_scalar,response_scalar,response_scalar); curve448_scalar_add(response_scalar, response_scalar, response_scalar);
} }
/* pk_point = -c(x(P)) + (cx + k)G = kG */ /* pk_point = -c(x(P)) + (cx + k)G = kG */
curve448_base_double_scalarmul_non_secret( curve448_base_double_scalarmul_non_secret(pk_point,
pk_point,
response_scalar, response_scalar,
pk_point, pk_point, challenge_scalar);
challenge_scalar return decaf_succeed_if(curve448_point_eq(pk_point, r_point));
);
return decaf_succeed_if(curve448_point_eq(pk_point,r_point));
} }
decaf_error_t decaf_ed448_verify_prehash(const uint8_t
decaf_error_t decaf_ed448_verify_prehash ( signature
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES], [DECAF_EDDSA_448_SIGNATURE_BYTES],
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES], const uint8_t
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
const uint8_t hash[64], const uint8_t hash[64],
const uint8_t *context, const uint8_t *context,
uint8_t context_len uint8_t context_len)
) { {
decaf_error_t ret; decaf_error_t ret;
ret = decaf_ed448_verify(signature,pubkey,hash,64,1,context,context_len); ret =
decaf_ed448_verify(signature, pubkey, hash, 64, 1, context,
context_len);
return ret; return ret;
} }
@ -370,7 +384,6 @@ int ED448_sign(uint8_t *out_sig, const uint8_t *message, size_t message_len,
== DECAF_SUCCESS; == DECAF_SUCCESS;
} }
int ED448_verify(const uint8_t *message, size_t message_len, int ED448_verify(const uint8_t *message, size_t message_len,
const uint8_t signature[114], const uint8_t public_key[57], const uint8_t signature[114], const uint8_t public_key[57],
const uint8_t *context, size_t context_len) const uint8_t *context, size_t context_len)

View File

@ -12,37 +12,35 @@
#include "field.h" #include "field.h"
mask_t gf_isr ( mask_t gf_isr(gf a, const gf x)
gf a, {
const gf x
) {
gf L0, L1, L2; gf L0, L1, L2;
gf_sqr (L1, x ); gf_sqr(L1, x);
gf_mul (L2, x, L1 ); gf_mul(L2, x, L1);
gf_sqr (L1, L2 ); gf_sqr(L1, L2);
gf_mul (L2, x, L1 ); gf_mul(L2, x, L1);
gf_sqrn (L1, L2, 3 ); gf_sqrn(L1, L2, 3);
gf_mul (L0, L2, L1 ); gf_mul(L0, L2, L1);
gf_sqrn (L1, L0, 3 ); gf_sqrn(L1, L0, 3);
gf_mul (L0, L2, L1 ); gf_mul(L0, L2, L1);
gf_sqrn (L2, L0, 9 ); gf_sqrn(L2, L0, 9);
gf_mul (L1, L0, L2 ); gf_mul(L1, L0, L2);
gf_sqr (L0, L1 ); gf_sqr(L0, L1);
gf_mul (L2, x, L0 ); gf_mul(L2, x, L0);
gf_sqrn (L0, L2, 18 ); gf_sqrn(L0, L2, 18);
gf_mul (L2, L1, L0 ); gf_mul(L2, L1, L0);
gf_sqrn (L0, L2, 37 ); gf_sqrn(L0, L2, 37);
gf_mul (L1, L2, L0 ); gf_mul(L1, L2, L0);
gf_sqrn (L0, L1, 37 ); gf_sqrn(L0, L1, 37);
gf_mul (L1, L2, L0 ); gf_mul(L1, L2, L0);
gf_sqrn (L0, L1, 111 ); gf_sqrn(L0, L1, 111);
gf_mul (L2, L1, L0 ); gf_mul(L2, L1, L0);
gf_sqr (L0, L2 ); gf_sqr(L0, L2);
gf_mul (L1, x, L0 ); gf_mul(L1, x, L0);
gf_sqrn (L0, L1, 223 ); gf_sqrn(L0, L1, 223);
gf_mul (L1, L2, L0 ); gf_mul(L1, L2, L0);
gf_sqr (L2, L1); gf_sqr(L2, L1);
gf_mul (L0, L2, x); gf_mul(L0, L2, x);
gf_copy(a,L1); gf_copy(a, L1);
return gf_eq(L0,ONE); return gf_eq(L0, ONE);
} }

View File

@ -11,91 +11,97 @@
*/ */
#ifndef __P448_F_FIELD_H__ #ifndef __P448_F_FIELD_H__
#define __P448_F_FIELD_H__ 1 # define __P448_F_FIELD_H__ 1
#include "constant_time.h" # include "constant_time.h"
#include <string.h> # include <string.h>
#include <assert.h> # include <assert.h>
#include "word.h" # include "word.h"
#define __DECAF_448_GF_DEFINED__ 1 # define __DECAF_448_GF_DEFINED__ 1
#define NLIMBS (64/sizeof(word_t)) # define NLIMBS (64/sizeof(word_t))
#define X_SER_BYTES 56 # define X_SER_BYTES 56
#define SER_BYTES 56 # define SER_BYTES 56
typedef struct gf_448_s { typedef struct gf_448_s {
word_t limb[NLIMBS]; word_t limb[NLIMBS];
} __attribute__((aligned(32))) gf_448_s, gf_448_t[1]; } __attribute__ ((aligned(32))) gf_448_s, gf_448_t[1];
#define GF_LIT_LIMB_BITS 56 # define GF_LIT_LIMB_BITS 56
#define GF_BITS 448 # define GF_BITS 448
#define ZERO gf_448_ZERO # define ZERO gf_448_ZERO
#define ONE gf_448_ONE # define ONE gf_448_ONE
#define MODULUS gf_448_MODULUS # define MODULUS gf_448_MODULUS
#define gf gf_448_t # define gf gf_448_t
#define gf_s gf_448_s # define gf_s gf_448_s
#define gf_eq gf_448_eq # define gf_eq gf_448_eq
#define gf_hibit gf_448_hibit # define gf_hibit gf_448_hibit
#define gf_lobit gf_448_lobit # define gf_lobit gf_448_lobit
#define gf_copy gf_448_copy # define gf_copy gf_448_copy
#define gf_add gf_448_add # define gf_add gf_448_add
#define gf_sub gf_448_sub # define gf_sub gf_448_sub
#define gf_add_RAW gf_448_add_RAW # define gf_add_RAW gf_448_add_RAW
#define gf_sub_RAW gf_448_sub_RAW # define gf_sub_RAW gf_448_sub_RAW
#define gf_bias gf_448_bias # define gf_bias gf_448_bias
#define gf_weak_reduce gf_448_weak_reduce # define gf_weak_reduce gf_448_weak_reduce
#define gf_strong_reduce gf_448_strong_reduce # define gf_strong_reduce gf_448_strong_reduce
#define gf_mul gf_448_mul # define gf_mul gf_448_mul
#define gf_sqr gf_448_sqr # define gf_sqr gf_448_sqr
#define gf_mulw_unsigned gf_448_mulw_unsigned # define gf_mulw_unsigned gf_448_mulw_unsigned
#define gf_isr gf_448_isr # define gf_isr gf_448_isr
#define gf_serialize gf_448_serialize # define gf_serialize gf_448_serialize
#define gf_deserialize gf_448_deserialize # define gf_deserialize gf_448_deserialize
/* RFC 7748 support */ /* RFC 7748 support */
#define X_PUBLIC_BYTES X_SER_BYTES # define X_PUBLIC_BYTES X_SER_BYTES
#define X_PRIVATE_BYTES X_PUBLIC_BYTES # define X_PRIVATE_BYTES X_PUBLIC_BYTES
#define X_PRIVATE_BITS 448 # define X_PRIVATE_BITS 448
#define INLINE_UNUSED __inline__ __attribute__((unused,always_inline)) # define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/* Defined below in f_impl.h */ /* Defined below in f_impl.h */
static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; } static INLINE_UNUSED void gf_copy(gf out, const gf a)
static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b); {
static INLINE_UNUSED void gf_sub_RAW (gf out, const gf a, const gf b); *out = *a;
static INLINE_UNUSED void gf_bias (gf inout, int amount); }
static INLINE_UNUSED void gf_weak_reduce (gf inout);
void gf_strong_reduce (gf inout); static INLINE_UNUSED void gf_add_RAW(gf out, const gf a, const gf b);
void gf_add (gf out, const gf a, const gf b); static INLINE_UNUSED void gf_sub_RAW(gf out, const gf a, const gf b);
void gf_sub (gf out, const gf a, const gf b); static INLINE_UNUSED void gf_bias(gf inout, int amount);
void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); static INLINE_UNUSED void gf_weak_reduce(gf inout);
void gf_mulw_unsigned (gf_s *__restrict__ out, const gf a, uint32_t b);
void gf_sqr (gf_s *__restrict__ out, const gf a); void gf_strong_reduce(gf inout);
void gf_add(gf out, const gf a, const gf b);
void gf_sub(gf out, const gf a, const gf b);
void gf_mul(gf_s * __restrict__ out, const gf a, const gf b);
void gf_mulw_unsigned(gf_s * __restrict__ out, const gf a, uint32_t b);
void gf_sqr(gf_s * __restrict__ out, const gf a);
mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0. Return true if successful */ mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0. Return true if successful */
mask_t gf_eq (const gf x, const gf y); mask_t gf_eq(const gf x, const gf y);
mask_t gf_lobit (const gf x); mask_t gf_lobit(const gf x);
mask_t gf_hibit (const gf x); mask_t gf_hibit(const gf x);
void gf_serialize (uint8_t *serial, const gf x,int with_highbit); void gf_serialize(uint8_t *serial, const gf x, int with_highbit);
mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES],int with_hibit,uint8_t hi_nmask); mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
uint8_t hi_nmask);
#ifdef __cplusplus #ifdef __cplusplus
} /* extern "C" */ } /* extern "C" */
#endif #endif
#include "f_impl.h" /* Bring in the inline implementations */ # include "f_impl.h" /* Bring in the inline implementations */
#ifndef LIMBPERM # ifndef LIMBPERM
#define LIMBPERM(i) (i) # define LIMBPERM(i) (i)
#endif # endif
#define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1) # define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
static const gf ZERO = {{{0}}}, ONE = {{{1}}}; static const gf ZERO = { {{0}} }, ONE = { { {
1}}};
#endif /* __P448_F_FIELD_H__ */ #endif /* __P448_F_FIELD_H__ */

View File

@ -11,24 +11,29 @@
*/ */
#include "field.h" #include "field.h"
static const gf MODULUS = {FIELD_LITERAL( static const gf MODULUS =
0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff { FIELD_LITERAL(0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff,
)}; 0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff,
0xffffffffffffff, 0xffffffffffffff)
};
/** Serialize to wire format. */ /** Serialize to wire format. */
void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) { void gf_serialize(uint8_t serial[SER_BYTES], const gf x, int with_hibit)
unsigned int j=0, fill=0; {
unsigned int j = 0, fill = 0;
dword_t buffer = 0; dword_t buffer = 0;
unsigned int i; unsigned int i;
gf red; gf red;
gf_copy(red, x); gf_copy(red, x);
gf_strong_reduce(red); gf_strong_reduce(red);
if (!with_hibit) { assert(gf_hibit(red) == 0); } if (!with_hibit) {
assert(gf_hibit(red) == 0);
}
UNROLL for (i=0; i<(with_hibit ? X_SER_BYTES : SER_BYTES); i++) { UNROLL for (i = 0; i < (with_hibit ? X_SER_BYTES : SER_BYTES); i++) {
if (fill < 8 && j < NLIMBS) { if (fill < 8 && j < NLIMBS) {
buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill; buffer |= ((dword_t) red->limb[LIMBPERM(j)]) << fill;
fill += LIMB_PLACE_VALUE(LIMBPERM(j)); fill += LIMB_PLACE_VALUE(LIMBPERM(j));
j++; j++;
} }
@ -39,49 +44,58 @@ void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) {
} }
/** Return high bit of x = low bit of 2x mod p */ /** Return high bit of x = low bit of 2x mod p */
mask_t gf_hibit(const gf x) { mask_t gf_hibit(const gf x)
{
gf y; gf y;
gf_add(y,x,x); gf_add(y, x, x);
gf_strong_reduce(y); gf_strong_reduce(y);
return -(y->limb[0]&1); return -(y->limb[0] & 1);
} }
/** Return high bit of x = low bit of 2x mod p */ /** Return high bit of x = low bit of 2x mod p */
mask_t gf_lobit(const gf x) { mask_t gf_lobit(const gf x)
{
gf y; gf y;
gf_copy(y,x); gf_copy(y, x);
gf_strong_reduce(y); gf_strong_reduce(y);
return -(y->limb[0]&1); return -(y->limb[0] & 1);
} }
/** Deserialize from wire format; return -1 on success and 0 on failure. */ /** Deserialize from wire format; return -1 on success and 0 on failure. */
mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES], int with_hibit, uint8_t hi_nmask) { mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
unsigned int j=0, fill=0; uint8_t hi_nmask)
{
unsigned int j = 0, fill = 0;
dword_t buffer = 0; dword_t buffer = 0;
dsword_t scarry = 0; dsword_t scarry = 0;
const unsigned nbytes = with_hibit ? X_SER_BYTES : SER_BYTES; const unsigned nbytes = with_hibit ? X_SER_BYTES : SER_BYTES;
unsigned int i; unsigned int i;
mask_t succ; mask_t succ;
UNROLL for (i=0; i<NLIMBS; i++) { UNROLL for (i = 0; i < NLIMBS; i++) {
UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) { UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) {
uint8_t sj = serial[j]; uint8_t sj = serial[j];
if (j==nbytes-1) sj &= ~hi_nmask; if (j == nbytes - 1)
buffer |= ((dword_t)sj) << fill; sj &= ~hi_nmask;
buffer |= ((dword_t) sj) << fill;
fill += 8; fill += 8;
j++; j++;
} }
x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer; x->limb[LIMBPERM(i)] =
(i < NLIMBS - 1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
fill -= LIMB_PLACE_VALUE(LIMBPERM(i)); fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i)); buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t)); scarry =
(scarry + x->limb[LIMBPERM(i)] -
MODULUS->limb[LIMBPERM(i)]) >> (8 * sizeof(word_t));
} }
succ = with_hibit ? -(mask_t)1 : ~gf_hibit(x); succ = with_hibit ? -(mask_t) 1 : ~gf_hibit(x);
return succ & word_is_zero(buffer) & ~word_is_zero(scarry); return succ & word_is_zero(buffer) & ~word_is_zero(scarry);
} }
/** Reduce to canonical form. */ /** Reduce to canonical form. */
void gf_strong_reduce (gf a) { void gf_strong_reduce(gf a)
{
dsword_t scarry; dsword_t scarry;
word_t scarry_0; word_t scarry_0;
dword_t carry = 0; dword_t carry = 0;
@ -94,23 +108,26 @@ void gf_strong_reduce (gf a) {
/* compute total_value - p. No need to reduce mod p. */ /* compute total_value - p. No need to reduce mod p. */
scarry = 0; scarry = 0;
for (i=0; i<NLIMBS; i++) { for (i = 0; i < NLIMBS; i++) {
scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]; scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)];
a->limb[LIMBPERM(i)] = scarry & LIMB_MASK(LIMBPERM(i)); a->limb[LIMBPERM(i)] = scarry & LIMB_MASK(LIMBPERM(i));
scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
} }
/* uncommon case: it was >= p, so now scarry = 0 and this = x /*
* common case: it was < p, so now scarry = -1 and this = x - p + 2^255 * uncommon case: it was >= p, so now scarry = 0 and this = x common case:
* so let's add back in p. will carry back off the top for 2^255. * it was < p, so now scarry = -1 and this = x - p + 2^255 so let's add
* back in p. will carry back off the top for 2^255.
*/ */
assert(word_is_zero(scarry) | word_is_zero(scarry+1)); assert(word_is_zero(scarry) | word_is_zero(scarry + 1));
scarry_0 = scarry; scarry_0 = scarry;
/* add it back */ /* add it back */
for (i=0; i<NLIMBS; i++) { for (i = 0; i < NLIMBS; i++) {
carry = carry + a->limb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]); carry =
carry + a->limb[LIMBPERM(i)] +
(scarry_0 & MODULUS->limb[LIMBPERM(i)]);
a->limb[LIMBPERM(i)] = carry & LIMB_MASK(LIMBPERM(i)); a->limb[LIMBPERM(i)] = carry & LIMB_MASK(LIMBPERM(i));
carry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); carry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
} }
@ -119,28 +136,31 @@ void gf_strong_reduce (gf a) {
} }
/** Subtract two gf elements d=a-b */ /** Subtract two gf elements d=a-b */
void gf_sub (gf d, const gf a, const gf b) { void gf_sub(gf d, const gf a, const gf b)
gf_sub_RAW ( d, a, b ); {
gf_bias( d, 2 ); gf_sub_RAW(d, a, b);
gf_weak_reduce ( d ); gf_bias(d, 2);
gf_weak_reduce(d);
} }
/** Add two field elements d = a+b */ /** Add two field elements d = a+b */
void gf_add (gf d, const gf a, const gf b) { void gf_add(gf d, const gf a, const gf b)
gf_add_RAW ( d, a, b ); {
gf_weak_reduce ( d ); gf_add_RAW(d, a, b);
gf_weak_reduce(d);
} }
/** Compare a==b */ /** Compare a==b */
mask_t gf_eq(const gf a, const gf b) { mask_t gf_eq(const gf a, const gf b)
{
gf c; gf c;
mask_t ret=0; mask_t ret = 0;
unsigned int i; unsigned int i;
gf_sub(c,a,b); gf_sub(c, a, b);
gf_strong_reduce(c); gf_strong_reduce(c);
for (i=0; i<NLIMBS; i++) { for (i = 0; i < NLIMBS; i++) {
ret |= c->limb[LIMBPERM(i)]; ret |= c->limb[LIMBPERM(i)];
} }

View File

@ -11,85 +11,90 @@
*/ */
#ifndef __GF_H__ #ifndef __GF_H__
#define __GF_H__ # define __GF_H__
#include "constant_time.h" # include "constant_time.h"
#include "f_field.h" # include "f_field.h"
#include <string.h> # include <string.h>
/** Square x, n times. */ /** Square x, n times. */
static ossl_inline void gf_sqrn ( static ossl_inline void gf_sqrn(gf_s * __restrict__ y, const gf x, int n)
gf_s *__restrict__ y, {
const gf x,
int n
) {
gf tmp; gf tmp;
assert(n>0); assert(n > 0);
if (n&1) { if (n & 1) {
gf_sqr(y,x); gf_sqr(y, x);
n--; n--;
} else { } else {
gf_sqr(tmp,x); gf_sqr(tmp, x);
gf_sqr(y,tmp); gf_sqr(y, tmp);
n-=2; n -= 2;
} }
for (; n; n-=2) { for (; n; n -= 2) {
gf_sqr(tmp,y); gf_sqr(tmp, y);
gf_sqr(y,tmp); gf_sqr(y, tmp);
} }
} }
#define gf_add_nr gf_add_RAW # define gf_add_nr gf_add_RAW
/** Subtract mod p. Bias by 2 and don't reduce */ /** Subtract mod p. Bias by 2 and don't reduce */
static ossl_inline void gf_sub_nr ( gf c, const gf a, const gf b ) { static ossl_inline void gf_sub_nr(gf c, const gf a, const gf b)
gf_sub_RAW(c,a,b); {
gf_sub_RAW(c, a, b);
gf_bias(c, 2); gf_bias(c, 2);
if (GF_HEADROOM < 3) gf_weak_reduce(c); if (GF_HEADROOM < 3)
gf_weak_reduce(c);
} }
/** Subtract mod p. Bias by amt but don't reduce. */ /** Subtract mod p. Bias by amt but don't reduce. */
static ossl_inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) { static ossl_inline void gf_subx_nr(gf c, const gf a, const gf b, int amt)
gf_sub_RAW(c,a,b); {
gf_sub_RAW(c, a, b);
gf_bias(c, amt); gf_bias(c, amt);
if (GF_HEADROOM < amt+1) gf_weak_reduce(c); if (GF_HEADROOM < amt + 1)
gf_weak_reduce(c);
} }
/** Mul by signed int. Not constant-time WRT the sign of that int. */ /** Mul by signed int. Not constant-time WRT the sign of that int. */
static ossl_inline void gf_mulw(gf c, const gf a, int32_t w) { static ossl_inline void gf_mulw(gf c, const gf a, int32_t w)
if (w>0) { {
if (w > 0) {
gf_mulw_unsigned(c, a, w); gf_mulw_unsigned(c, a, w);
} else { } else {
gf_mulw_unsigned(c, a, -w); gf_mulw_unsigned(c, a, -w);
gf_sub(c,ZERO,c); gf_sub(c, ZERO, c);
} }
} }
/** Constant time, x = is_z ? z : y */ /** Constant time, x = is_z ? z : y */
static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z) { static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z)
constant_time_select(x,y,z,sizeof(gf),is_z,0); {
constant_time_select(x, y, z, sizeof(gf), is_z, 0);
} }
/** Constant time, if (neg) x=-x; */ /** Constant time, if (neg) x=-x; */
static ossl_inline void gf_cond_neg(gf x, mask_t neg) { static ossl_inline void gf_cond_neg(gf x, mask_t neg)
{
gf y; gf y;
gf_sub(y,ZERO,x); gf_sub(y, ZERO, x);
gf_cond_sel(x,x,y,neg); gf_cond_sel(x, x, y, neg);
} }
/** Constant time, if (swap) (x,y) = (y,x); */ /** Constant time, if (swap) (x,y) = (y,x); */
static ossl_inline void static ossl_inline void gf_cond_swap(gf x, gf_s * __restrict__ y, mask_t swap)
gf_cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { {
constant_time_cond_swap(x,y,sizeof(gf_s),swap); constant_time_cond_swap(x, y, sizeof(gf_s), swap);
} }
static ossl_inline void gf_mul_qnr(gf_s *__restrict__ out, const gf x) { static ossl_inline void gf_mul_qnr(gf_s * __restrict__ out, const gf x)
gf_sub(out,ZERO,x); {
gf_sub(out, ZERO, x);
} }
static ossl_inline void gf_div_qnr(gf_s *__restrict__ out, const gf x) { static ossl_inline void gf_div_qnr(gf_s * __restrict__ out, const gf x)
gf_sub(out,ZERO,x); {
gf_sub(out, ZERO, x);
} }
#endif /* __GF_H__ */ #endif /* __GF_H__ */

View File

@ -11,52 +11,52 @@
*/ */
#ifndef __DECAF_POINT_448_H__ #ifndef __DECAF_POINT_448_H__
#define __DECAF_POINT_448_H__ 1 # define __DECAF_POINT_448_H__ 1
#include "curve448utils.h" # include "curve448utils.h"
#include "field.h" # include "field.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** @cond internal */ /** @cond internal */
#define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1) # define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1)
/** @endcond */ /** @endcond */
/** The number of bits in a scalar */ /** The number of bits in a scalar */
#define DECAF_448_SCALAR_BITS 446 # define DECAF_448_SCALAR_BITS 446
/** Number of bytes in a serialized point. */ /** Number of bytes in a serialized point. */
#define DECAF_448_SER_BYTES 56 # define DECAF_448_SER_BYTES 56
/** Number of bytes in an elligated point. For now set the same as SER_BYTES /** Number of bytes in an elligated point. For now set the same as SER_BYTES
* but could be different for other curves. * but could be different for other curves.
*/ */
#define DECAF_448_HASH_BYTES 56 # define DECAF_448_HASH_BYTES 56
/** Number of bytes in a serialized scalar. */ /** Number of bytes in a serialized scalar. */
#define DECAF_448_SCALAR_BYTES 56 # define DECAF_448_SCALAR_BYTES 56
/** Number of bits in the "which" field of an elligator inverse */ /** Number of bits in the "which" field of an elligator inverse */
#define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3 # define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3
/** The cofactor the curve would have, if we hadn't removed it */ /** The cofactor the curve would have, if we hadn't removed it */
#define DECAF_448_REMOVED_COFACTOR 4 # define DECAF_448_REMOVED_COFACTOR 4
/** X448 encoding ratio. */ /** X448 encoding ratio. */
#define DECAF_X448_ENCODE_RATIO 2 # define DECAF_X448_ENCODE_RATIO 2
/** Number of bytes in an x448 public key */ /** Number of bytes in an x448 public key */
#define DECAF_X448_PUBLIC_BYTES 56 # define DECAF_X448_PUBLIC_BYTES 56
/** Number of bytes in an x448 private key */ /** Number of bytes in an x448 private key */
#define DECAF_X448_PRIVATE_BYTES 56 # define DECAF_X448_PRIVATE_BYTES 56
/** Twisted Edwards extended homogeneous coordinates */ /** Twisted Edwards extended homogeneous coordinates */
typedef struct curve448_point_s { typedef struct curve448_point_s {
/** @cond internal */ /** @cond internal */
gf_448_t x,y,z,t; gf_448_t x, y, z, t;
/** @endcond */ /** @endcond */
} curve448_point_t[1]; } curve448_point_t[1];
@ -98,10 +98,10 @@ extern const struct curve448_precomputed_s *curve448_precomputed_base;
* @retval DECAF_FAILURE The scalar was greater than the modulus, * @retval DECAF_FAILURE The scalar was greater than the modulus,
* and has been reduced modulo that modulus. * and has been reduced modulo that modulus.
*/ */
__owur decaf_error_t curve448_scalar_decode ( __owur decaf_error_t curve448_scalar_decode(curve448_scalar_t out,
curve448_scalar_t out, const unsigned char
const unsigned char ser[DECAF_448_SCALAR_BYTES] ser[DECAF_448_SCALAR_BYTES]
); );
/** /**
* @brief Read a scalar from wire format or from bytes. Reduces mod * @brief Read a scalar from wire format or from bytes. Reduces mod
@ -111,11 +111,8 @@ __owur decaf_error_t curve448_scalar_decode (
* @param [in] ser_len Length of serialized form. * @param [in] ser_len Length of serialized form.
* @param [out] out Deserialized form. * @param [out] out Deserialized form.
*/ */
void curve448_scalar_decode_long ( void curve448_scalar_decode_long(curve448_scalar_t out,
curve448_scalar_t out, const unsigned char *ser, size_t ser_len);
const unsigned char *ser,
size_t ser_len
);
/** /**
* @brief Serialize a scalar to wire format. * @brief Serialize a scalar to wire format.
@ -123,10 +120,8 @@ void curve448_scalar_decode_long (
* @param [out] ser Serialized form of a scalar. * @param [out] ser Serialized form of a scalar.
* @param [in] s Deserialized scalar. * @param [in] s Deserialized scalar.
*/ */
void curve448_scalar_encode ( void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
unsigned char ser[DECAF_448_SCALAR_BYTES], const curve448_scalar_t s);
const curve448_scalar_t s
);
/** /**
* @brief Add two scalars. The scalars may use the same memory. * @brief Add two scalars. The scalars may use the same memory.
@ -134,11 +129,8 @@ void curve448_scalar_encode (
* @param [in] b Another scalar. * @param [in] b Another scalar.
* @param [out] out a+b. * @param [out] out a+b.
*/ */
void curve448_scalar_add ( void curve448_scalar_add(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b);
const curve448_scalar_t a,
const curve448_scalar_t b
);
/** /**
* @brief Subtract two scalars. The scalars may use the same memory. * @brief Subtract two scalars. The scalars may use the same memory.
@ -146,11 +138,8 @@ void curve448_scalar_add (
* @param [in] b Another scalar. * @param [in] b Another scalar.
* @param [out] out a-b. * @param [out] out a-b.
*/ */
void curve448_scalar_sub ( void curve448_scalar_sub(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b);
const curve448_scalar_t a,
const curve448_scalar_t b
);
/** /**
* @brief Multiply two scalars. The scalars may use the same memory. * @brief Multiply two scalars. The scalars may use the same memory.
@ -158,21 +147,15 @@ void curve448_scalar_sub (
* @param [in] b Another scalar. * @param [in] b Another scalar.
* @param [out] out a*b. * @param [out] out a*b.
*/ */
void curve448_scalar_mul ( void curve448_scalar_mul(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b);
const curve448_scalar_t a,
const curve448_scalar_t b
);
/** /**
* @brief Halve a scalar. The scalars may use the same memory. * @brief Halve a scalar. The scalars may use the same memory.
* @param [in] a A scalar. * @param [in] a A scalar.
* @param [out] out a/2. * @param [out] out a/2.
*/ */
void curve448_scalar_halve ( void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a);
curve448_scalar_t out,
const curve448_scalar_t a
);
/** /**
* @brief Copy a scalar. The scalars may use the same memory, in which * @brief Copy a scalar. The scalars may use the same memory, in which
@ -180,10 +163,9 @@ void curve448_scalar_halve (
* @param [in] a A scalar. * @param [in] a A scalar.
* @param [out] out Will become a copy of a. * @param [out] out Will become a copy of a.
*/ */
static ossl_inline void curve448_scalar_copy ( static ossl_inline void curve448_scalar_copy(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a)
const curve448_scalar_t a {
) {
*out = *a; *out = *a;
} }
@ -194,11 +176,10 @@ static ossl_inline void curve448_scalar_copy (
* @param [out] a A copy of the point. * @param [out] a A copy of the point.
* @param [in] b Any point. * @param [in] b Any point.
*/ */
static ossl_inline void curve448_point_copy ( static ossl_inline void curve448_point_copy(curve448_point_t a,
curve448_point_t a, const curve448_point_t b)
const curve448_point_t b {
) { *a = *b;
*a=*b;
} }
/** /**
@ -210,10 +191,8 @@ static ossl_inline void curve448_point_copy (
* @retval DECAF_TRUE The points are equal. * @retval DECAF_TRUE The points are equal.
* @retval DECAF_FALSE The points are not equal. * @retval DECAF_FALSE The points are not equal.
*/ */
__owur decaf_bool_t curve448_point_eq ( __owur decaf_bool_t curve448_point_eq(const curve448_point_t a,
const curve448_point_t a, const curve448_point_t b);
const curve448_point_t b
);
/** /**
* @brief Double a point. Equivalent to * @brief Double a point. Equivalent to
@ -222,10 +201,7 @@ __owur decaf_bool_t curve448_point_eq (
* @param [out] two_a The sum a+a. * @param [out] two_a The sum a+a.
* @param [in] a A point. * @param [in] a A point.
*/ */
void curve448_point_double ( void curve448_point_double(curve448_point_t two_a, const curve448_point_t a);
curve448_point_t two_a,
const curve448_point_t a
);
/** /**
* @brief RFC 7748 Diffie-Hellman scalarmul. This function uses a different * @brief RFC 7748 Diffie-Hellman scalarmul. This function uses a different
@ -239,11 +215,10 @@ void curve448_point_double (
* @retval DECAF_FAILURE The scalarmul didn't succeed, because the base * @retval DECAF_FAILURE The scalarmul didn't succeed, because the base
* point is in a small subgroup. * point is in a small subgroup.
*/ */
__owur decaf_error_t decaf_x448 ( __owur decaf_error_t decaf_x448(uint8_t out[DECAF_X448_PUBLIC_BYTES],
uint8_t out[DECAF_X448_PUBLIC_BYTES],
const uint8_t base[DECAF_X448_PUBLIC_BYTES], const uint8_t base[DECAF_X448_PUBLIC_BYTES],
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES] const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
); );
/** /**
* @brief Multiply a point by DECAF_X448_ENCODE_RATIO, * @brief Multiply a point by DECAF_X448_ENCODE_RATIO,
@ -265,10 +240,10 @@ __owur decaf_error_t decaf_x448 (
* @param [out] out The scaled and encoded point. * @param [out] out The scaled and encoded point.
* @param [in] p The point to be scaled and encoded. * @param [in] p The point to be scaled and encoded.
*/ */
void curve448_point_mul_by_ratio_and_encode_like_x448 ( void curve448_point_mul_by_ratio_and_encode_like_x448(uint8_t
uint8_t out[DECAF_X448_PUBLIC_BYTES], out
const curve448_point_t p [DECAF_X448_PUBLIC_BYTES],
); const curve448_point_t p);
/** The base point for X448 Diffie-Hellman */ /** The base point for X448 Diffie-Hellman */
extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES]; extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
@ -283,11 +258,9 @@ extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
* @param [out] scaled The scaled point base*scalar * @param [out] scaled The scaled point base*scalar
* @param [in] scalar The scalar to multiply by. * @param [in] scalar The scalar to multiply by.
*/ */
void decaf_x448_derive_public_key ( void decaf_x448_derive_public_key(uint8_t out[DECAF_X448_PUBLIC_BYTES],
uint8_t out[DECAF_X448_PUBLIC_BYTES],
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES] const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
); );
/** /**
* @brief Multiply a precomputed base point by a scalar: * @brief Multiply a precomputed base point by a scalar:
@ -300,12 +273,9 @@ void decaf_x448_derive_public_key (
* @param [in] base The point to be scaled. * @param [in] base The point to be scaled.
* @param [in] scalar The scalar to multiply by. * @param [in] scalar The scalar to multiply by.
*/ */
void curve448_precomputed_scalarmul ( void curve448_precomputed_scalarmul(curve448_point_t scaled,
curve448_point_t scaled, const curve448_precomputed_s * base,
const curve448_precomputed_s *base, const curve448_scalar_t scalar);
const curve448_scalar_t scalar
);
/** /**
* @brief Multiply two base points by two scalars: * @brief Multiply two base points by two scalars:
@ -322,12 +292,10 @@ void curve448_precomputed_scalarmul (
* @warning: This function takes variable time, and may leak the scalars * @warning: This function takes variable time, and may leak the scalars
* used. It is designed for signature verification. * used. It is designed for signature verification.
*/ */
void curve448_base_double_scalarmul_non_secret ( void curve448_base_double_scalarmul_non_secret(curve448_point_t combo,
curve448_point_t combo,
const curve448_scalar_t scalar1, const curve448_scalar_t scalar1,
const curve448_point_t base2, const curve448_point_t base2,
const curve448_scalar_t scalar2 const curve448_scalar_t scalar2);
);
/** /**
* @brief Test that a point is valid, for debugging purposes. * @brief Test that a point is valid, for debugging purposes.
@ -336,23 +304,17 @@ void curve448_base_double_scalarmul_non_secret (
* @retval DECAF_TRUE The point is valid. * @retval DECAF_TRUE The point is valid.
* @retval DECAF_FALSE The point is invalid. * @retval DECAF_FALSE The point is invalid.
*/ */
__owur decaf_bool_t curve448_point_valid ( __owur decaf_bool_t curve448_point_valid(const curve448_point_t to_test);
const curve448_point_t to_test
);
/** /**
* @brief Overwrite scalar with zeros. * @brief Overwrite scalar with zeros.
*/ */
void curve448_scalar_destroy ( void curve448_scalar_destroy(curve448_scalar_t scalar);
curve448_scalar_t scalar
);
/** /**
* @brief Overwrite point with zeros. * @brief Overwrite point with zeros.
*/ */
void curve448_point_destroy ( void curve448_point_destroy(curve448_point_t point);
curve448_point_t point
);
#ifdef __cplusplus #ifdef __cplusplus
} /* extern "C" */ } /* extern "C" */

View File

@ -15,63 +15,72 @@
#include "constant_time.h" #include "constant_time.h"
#include "point_448.h" #include "point_448.h"
static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x3bd440fae918bc5; static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t) 0x3bd440fae918bc5;
static const curve448_scalar_t sc_p = {{{ static const curve448_scalar_t sc_p = { {{
SC_LIMB(0x2378c292ab5844f3), SC_LIMB(0x216cc2728dc58f55), SC_LIMB(0xc44edb49aed63690), SC_LIMB(0xffffffff7cca23e9), SC_LIMB(0xffffffffffffffff), SC_LIMB(0xffffffffffffffff), SC_LIMB(0x3fffffffffffffff) SC_LIMB(0x2378c292ab5844f3),
}}}, sc_r2 = {{{ SC_LIMB(0x216cc2728dc58f55),
SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9), SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838), SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af), SC_LIMB(0x3402a939f823b729) SC_LIMB(0xc44edb49aed63690),
SC_LIMB(0xffffffff7cca23e9),
SC_LIMB(0xffffffffffffffff),
SC_LIMB(0xffffffffffffffff),
SC_LIMB(0x3fffffffffffffff)
}}
}, sc_r2 = { { {
SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9),
SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838),
SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af),
SC_LIMB(0x3402a939f823b729)
}}}; }}};
/* End of template stuff */ /* End of template stuff */
#define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */ #define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
const curve448_scalar_t curve448_scalar_one = {{{1}}}, curve448_scalar_zero = {{{0}}}; const curve448_scalar_t curve448_scalar_one = { {{1}} }, curve448_scalar_zero = { { {
0}}};
/** {extra,accum} - sub +? p /** {extra,accum} - sub +? p
* Must have extra <= 1 * Must have extra <= 1
*/ */
static void sc_subx( static void sc_subx(curve448_scalar_t out,
curve448_scalar_t out,
const decaf_word_t accum[DECAF_448_SCALAR_LIMBS], const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
const curve448_scalar_t sub, const curve448_scalar_t sub,
const curve448_scalar_t p, const curve448_scalar_t p, decaf_word_t extra)
decaf_word_t extra {
) {
decaf_dsword_t chain = 0; decaf_dsword_t chain = 0;
unsigned int i; unsigned int i;
decaf_word_t borrow; decaf_word_t borrow;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
chain = (chain + accum[i]) - sub->limb[i]; chain = (chain + accum[i]) - sub->limb[i];
out->limb[i] = chain; out->limb[i] = chain;
chain >>= WBITS; chain >>= WBITS;
} }
borrow = chain+extra; /* = 0 or -1 */ borrow = chain + extra; /* = 0 or -1 */
chain = 0; chain = 0;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
chain = (chain + out->limb[i]) + (p->limb[i] & borrow); chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
out->limb[i] = chain; out->limb[i] = chain;
chain >>= WBITS; chain >>= WBITS;
} }
} }
static void sc_montmul ( static void sc_montmul(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b)
const curve448_scalar_t a, {
const curve448_scalar_t b unsigned int i, j;
) { decaf_word_t accum[DECAF_448_SCALAR_LIMBS + 1] = { 0 };
unsigned int i,j;
decaf_word_t accum[DECAF_448_SCALAR_LIMBS+1] = {0};
decaf_word_t hi_carry = 0; decaf_word_t hi_carry = 0;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
decaf_word_t mand = a->limb[i]; decaf_word_t mand = a->limb[i];
const decaf_word_t *mier = b->limb; const decaf_word_t *mier = b->limb;
decaf_dword_t chain = 0; decaf_dword_t chain = 0;
for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) { for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
chain += ((decaf_dword_t)mand)*mier[j] + accum[j]; chain += ((decaf_dword_t) mand) * mier[j] + accum[j];
accum[j] = chain; accum[j] = chain;
chain >>= WBITS; chain >>= WBITS;
} }
@ -80,45 +89,40 @@ static void sc_montmul (
mand = accum[0] * MONTGOMERY_FACTOR; mand = accum[0] * MONTGOMERY_FACTOR;
chain = 0; chain = 0;
mier = sc_p->limb; mier = sc_p->limb;
for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) { for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
chain += (decaf_dword_t)mand*mier[j] + accum[j]; chain += (decaf_dword_t) mand *mier[j] + accum[j];
if (j) accum[j-1] = chain; if (j)
accum[j - 1] = chain;
chain >>= WBITS; chain >>= WBITS;
} }
chain += accum[j]; chain += accum[j];
chain += hi_carry; chain += hi_carry;
accum[j-1] = chain; accum[j - 1] = chain;
hi_carry = chain >> WBITS; hi_carry = chain >> WBITS;
} }
sc_subx(out, accum, sc_p, sc_p, hi_carry); sc_subx(out, accum, sc_p, sc_p, hi_carry);
} }
void curve448_scalar_mul ( void curve448_scalar_mul(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b)
const curve448_scalar_t a, {
const curve448_scalar_t b sc_montmul(out, a, b);
) { sc_montmul(out, out, sc_r2);
sc_montmul(out,a,b);
sc_montmul(out,out,sc_r2);
} }
void curve448_scalar_sub ( void curve448_scalar_sub(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b)
const curve448_scalar_t a, {
const curve448_scalar_t b
) {
sc_subx(out, a->limb, b, sc_p, 0); sc_subx(out, a->limb, b, sc_p, 0);
} }
void curve448_scalar_add ( void curve448_scalar_add(curve448_scalar_t out,
curve448_scalar_t out, const curve448_scalar_t a, const curve448_scalar_t b)
const curve448_scalar_t a, {
const curve448_scalar_t b
) {
decaf_dword_t chain = 0; decaf_dword_t chain = 0;
unsigned int i; unsigned int i;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
chain = (chain + a->limb[i]) + b->limb[i]; chain = (chain + a->limb[i]) + b->limb[i];
out->limb[i] = chain; out->limb[i] = chain;
chain >>= WBITS; chain >>= WBITS;
@ -126,50 +130,47 @@ void curve448_scalar_add (
sc_subx(out, out->limb, sc_p, sc_p, chain); sc_subx(out, out->limb, sc_p, sc_p, chain);
} }
static ossl_inline void scalar_decode_short ( static ossl_inline void scalar_decode_short(curve448_scalar_t s,
curve448_scalar_t s,
const unsigned char *ser, const unsigned char *ser,
unsigned int nbytes unsigned int nbytes)
) { {
unsigned int i,j,k=0; unsigned int i, j, k = 0;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
decaf_word_t out = 0; decaf_word_t out = 0;
for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) { for (j = 0; j < sizeof(decaf_word_t) && k < nbytes; j++, k++) {
out |= ((decaf_word_t)ser[k])<<(8*j); out |= ((decaf_word_t) ser[k]) << (8 * j);
} }
s->limb[i] = out; s->limb[i] = out;
} }
} }
decaf_error_t curve448_scalar_decode( decaf_error_t curve448_scalar_decode(curve448_scalar_t s,
curve448_scalar_t s, const unsigned char
const unsigned char ser[DECAF_448_SCALAR_BYTES] ser[DECAF_448_SCALAR_BYTES]
) { )
{
unsigned int i; unsigned int i;
decaf_dsword_t accum = 0; decaf_dsword_t accum = 0;
scalar_decode_short(s, ser, DECAF_448_SCALAR_BYTES); scalar_decode_short(s, ser, DECAF_448_SCALAR_BYTES);
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS; accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
} }
/* Here accum == 0 or -1 */ /* Here accum == 0 or -1 */
curve448_scalar_mul(s,s,curve448_scalar_one); /* ham-handed reduce */ curve448_scalar_mul(s, s, curve448_scalar_one); /* ham-handed reduce */
return decaf_succeed_if(~word_is_zero(accum)); return decaf_succeed_if(~word_is_zero(accum));
} }
void curve448_scalar_destroy ( void curve448_scalar_destroy(curve448_scalar_t scalar)
curve448_scalar_t scalar {
) {
OPENSSL_cleanse(scalar, sizeof(curve448_scalar_t)); OPENSSL_cleanse(scalar, sizeof(curve448_scalar_t));
} }
void curve448_scalar_decode_long( void curve448_scalar_decode_long(curve448_scalar_t s,
curve448_scalar_t s, const unsigned char *ser, size_t ser_len)
const unsigned char *ser, {
size_t ser_len
) {
size_t i; size_t i;
curve448_scalar_t t1, t2; curve448_scalar_t t1, t2;
@ -178,23 +179,24 @@ void curve448_scalar_decode_long(
return; return;
} }
i = ser_len - (ser_len%DECAF_448_SCALAR_BYTES); i = ser_len - (ser_len % DECAF_448_SCALAR_BYTES);
if (i==ser_len) i -= DECAF_448_SCALAR_BYTES; if (i == ser_len)
i -= DECAF_448_SCALAR_BYTES;
scalar_decode_short(t1, &ser[i], ser_len-i); scalar_decode_short(t1, &ser[i], ser_len - i);
if (ser_len == sizeof(curve448_scalar_t)) { if (ser_len == sizeof(curve448_scalar_t)) {
assert(i==0); assert(i == 0);
/* ham-handed reduce */ /* ham-handed reduce */
curve448_scalar_mul(s,t1,curve448_scalar_one); curve448_scalar_mul(s, t1, curve448_scalar_one);
curve448_scalar_destroy(t1); curve448_scalar_destroy(t1);
return; return;
} }
while (i) { while (i) {
i -= DECAF_448_SCALAR_BYTES; i -= DECAF_448_SCALAR_BYTES;
sc_montmul(t1,t1,sc_r2); sc_montmul(t1, t1, sc_r2);
ignore_result( curve448_scalar_decode(t2, ser+i) ); ignore_result(curve448_scalar_decode(t2, ser + i));
curve448_scalar_add(t1, t1, t2); curve448_scalar_add(t1, t1, t2);
} }
@ -203,33 +205,29 @@ void curve448_scalar_decode_long(
curve448_scalar_destroy(t2); curve448_scalar_destroy(t2);
} }
void curve448_scalar_encode( void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
unsigned char ser[DECAF_448_SCALAR_BYTES], const curve448_scalar_t s)
const curve448_scalar_t s {
) { unsigned int i, j, k = 0;
unsigned int i,j,k=0; for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (j = 0; j < sizeof(decaf_word_t); j++, k++) {
for (j=0; j<sizeof(decaf_word_t); j++,k++) { ser[k] = s->limb[i] >> (8 * j);
ser[k] = s->limb[i] >> (8*j);
} }
} }
} }
void curve448_scalar_halve ( void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a)
curve448_scalar_t out, {
const curve448_scalar_t a
) {
decaf_word_t mask = -(a->limb[0] & 1); decaf_word_t mask = -(a->limb[0] & 1);
decaf_dword_t chain = 0; decaf_dword_t chain = 0;
unsigned int i; unsigned int i;
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask); chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask);
out->limb[i] = chain; out->limb[i] = chain;
chain >>= DECAF_WORD_BITS; chain >>= DECAF_WORD_BITS;
} }
for (i=0; i<DECAF_448_SCALAR_LIMBS-1; i++) { for (i = 0; i < DECAF_448_SCALAR_LIMBS - 1; i++) {
out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1); out->limb[i] = out->limb[i] >> 1 | out->limb[i + 1] << (WBITS - 1);
} }
out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1); out->limb[i] = out->limb[i] >> 1 | chain << (WBITS - 1);
} }

View File

@ -11,207 +11,211 @@
*/ */
#ifndef __WORD_H__ #ifndef __WORD_H__
#define __WORD_H__ # define __WORD_H__
#include <string.h> # include <string.h>
#include <assert.h> # include <assert.h>
#include <openssl/e_os2.h> # include <openssl/e_os2.h>
#include "arch_intrinsics.h" # include "arch_intrinsics.h"
#include "curve448utils.h" # include "curve448utils.h"
#ifndef _BSD_SOURCE # ifndef _BSD_SOURCE
#define _BSD_SOURCE 1 # define _BSD_SOURCE 1
#endif # endif
#ifndef _DEFAULT_SOURCE # ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE 1 # define _DEFAULT_SOURCE 1
#endif # endif
#include <stdlib.h> # include <stdlib.h>
#if defined(__ARM_NEON__) # if defined(__ARM_NEON__)
#include <arm_neon.h> # include <arm_neon.h>
#elif defined(__SSE2__) # elif defined(__SSE2__)
#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4) # if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
#include <immintrin.h> # include <immintrin.h>
#else # else
#include <emmintrin.h> # include <emmintrin.h>
#endif # endif
#endif # endif
#if (ARCH_WORD_BITS == 64) # if (ARCH_WORD_BITS == 64)
typedef uint64_t word_t, mask_t; typedef uint64_t word_t, mask_t;
typedef __uint128_t dword_t; typedef __uint128_t dword_t;
typedef int32_t hsword_t; typedef int32_t hsword_t;
typedef int64_t sword_t; typedef int64_t sword_t;
typedef __int128_t dsword_t; typedef __int128_t dsword_t;
#elif (ARCH_WORD_BITS == 32) # elif (ARCH_WORD_BITS == 32)
typedef uint32_t word_t, mask_t; typedef uint32_t word_t, mask_t;
typedef uint64_t dword_t; typedef uint64_t dword_t;
typedef int16_t hsword_t; typedef int16_t hsword_t;
typedef int32_t sword_t; typedef int32_t sword_t;
typedef int64_t dsword_t; typedef int64_t dsword_t;
#else # else
#error "For now, libdecaf only supports 32- and 64-bit architectures." # error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif # endif
/* Scalar limbs are keyed off of the API word size instead of the arch word size. */ /*
#if DECAF_WORD_BITS == 64 * Scalar limbs are keyed off of the API word size instead of the arch word
#define SC_LIMB(x) (x) * size.
#elif DECAF_WORD_BITS == 32 */
#define SC_LIMB(x) ((uint32_t)x),(x>>32) # if DECAF_WORD_BITS == 64
#else # define SC_LIMB(x) (x)
#error "For now, libdecaf only supports 32- and 64-bit architectures." # elif DECAF_WORD_BITS == 32
#endif # define SC_LIMB(x) ((uint32_t)x),(x>>32)
# else
# error "For now, libdecaf only supports 32- and 64-bit architectures."
# endif
#ifdef __ARM_NEON__ # ifdef __ARM_NEON__
typedef uint32x4_t vecmask_t; typedef uint32x4_t vecmask_t;
#elif defined(__clang__) # elif defined(__clang__)
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
typedef word_t vecmask_t __attribute__((ext_vector_type(4))); typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
#else /* GCC, hopefully? */ # else /* GCC, hopefully? */
typedef uint64_t uint64x2_t __attribute__((vector_size(16))); typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
typedef int64_t int64x2_t __attribute__((vector_size(16))); typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
typedef uint64_t uint64x4_t __attribute__((vector_size(32))); typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
typedef int64_t int64x4_t __attribute__((vector_size(32))); typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16))); typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16))); typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
typedef uint32_t uint32x2_t __attribute__((vector_size(8))); typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
typedef int32_t int32x2_t __attribute__((vector_size(8))); typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
typedef uint32_t uint32x8_t __attribute__((vector_size(32))); typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
typedef int32_t int32x8_t __attribute__((vector_size(32))); typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
typedef word_t vecmask_t __attribute__((vector_size(32))); typedef word_t vecmask_t __attribute__ ((vector_size(32)));
#endif # endif
#if defined(__AVX2__) # if defined(__AVX2__)
#define VECTOR_ALIGNED __attribute__((aligned(32))) # define VECTOR_ALIGNED __attribute__((aligned(32)))
typedef uint32x8_t big_register_t; typedef uint32x8_t big_register_t;
typedef uint64x4_t uint64xn_t; typedef uint64x4_t uint64xn_t;
typedef uint32x8_t uint32xn_t; typedef uint32x8_t uint32xn_t;
static ossl_inline big_register_t static ossl_inline big_register_t br_set_to_mask(mask_t x)
br_set_to_mask(mask_t x) { {
uint32_t y = (uint32_t)x; uint32_t y = (uint32_t)x;
big_register_t ret = {y,y,y,y,y,y,y,y}; big_register_t ret = { y, y, y, y, y, y, y, y };
return ret; return ret;
} }
#elif defined(__SSE2__) # elif defined(__SSE2__)
#define VECTOR_ALIGNED __attribute__((aligned(16))) # define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t; typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t; typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t; typedef uint32x4_t uint32xn_t;
static ossl_inline big_register_t static ossl_inline big_register_t br_set_to_mask(mask_t x)
br_set_to_mask(mask_t x) { {
uint32_t y = x; uint32_t y = x;
big_register_t ret = {y,y,y,y}; big_register_t ret = { y, y, y, y };
return ret; return ret;
} }
#elif defined(__ARM_NEON__) # elif defined(__ARM_NEON__)
#define VECTOR_ALIGNED __attribute__((aligned(16))) # define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t; typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t; typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t; typedef uint32x4_t uint32xn_t;
static ossl_inline big_register_t static ossl_inline big_register_t br_set_to_mask(mask_t x)
br_set_to_mask(mask_t x) { {
return vdupq_n_u32(x); return vdupq_n_u32(x);
} }
#elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \ # elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
|| defined(__aarch64__) || defined(__aarch64__)
#define VECTOR_ALIGNED __attribute__((aligned(8))) # define VECTOR_ALIGNED __attribute__((aligned(8)))
typedef uint64_t big_register_t, uint64xn_t; typedef uint64_t big_register_t, uint64xn_t;
typedef uint32_t uint32xn_t; typedef uint32_t uint32xn_t;
static ossl_inline big_register_t static ossl_inline big_register_t br_set_to_mask(mask_t x)
br_set_to_mask(mask_t x) { {
return (big_register_t)x; return (big_register_t) x;
} }
#else # else
#define VECTOR_ALIGNED __attribute__((aligned(4))) # define VECTOR_ALIGNED __attribute__((aligned(4)))
typedef uint64_t uint64xn_t; typedef uint64_t uint64xn_t;
typedef uint32_t uint32xn_t; typedef uint32_t uint32xn_t;
typedef uint32_t big_register_t; typedef uint32_t big_register_t;
static ossl_inline big_register_t static ossl_inline big_register_t br_set_to_mask(mask_t x)
br_set_to_mask(mask_t x) { {
return (big_register_t)x; return (big_register_t) x;
} }
#endif # endif
#if defined(__AVX2__) # if defined(__AVX2__)
static ossl_inline big_register_t static ossl_inline big_register_t br_is_zero(big_register_t x)
br_is_zero(big_register_t x) { {
return (big_register_t)(x == br_set_to_mask(0)); return (big_register_t) (x == br_set_to_mask(0));
} }
#elif defined(__SSE2__) # elif defined(__SSE2__)
static ossl_inline big_register_t static ossl_inline big_register_t br_is_zero(big_register_t x)
br_is_zero(big_register_t x) { {
return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128()); return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
//return (big_register_t)(x == br_set_to_mask(0)); // return (big_register_t)(x == br_set_to_mask(0));
} }
#elif defined(__ARM_NEON__) # elif defined(__ARM_NEON__)
static ossl_inline big_register_t static ossl_inline big_register_t br_is_zero(big_register_t x)
br_is_zero(big_register_t x) { {
return vceqq_u32(x,x^x); return vceqq_u32(x, x ^ x);
} }
#else # else
#define br_is_zero word_is_zero # define br_is_zero word_is_zero
#endif # endif
/* PERF: vectorize vs unroll */ /* PERF: vectorize vs unroll */
#ifdef __clang__ # ifdef __clang__
#if 100*__clang_major__ + __clang_minor__ > 305 # if 100*__clang_major__ + __clang_minor__ > 305
#define UNROLL _Pragma("clang loop unroll(full)") # define UNROLL _Pragma("clang loop unroll(full)")
#endif # endif
#endif # endif
#ifndef UNROLL # ifndef UNROLL
#define UNROLL # define UNROLL
#endif # endif
/* The plan on booleans: /*
* * The plan on booleans: The external interface uses decaf_bool_t, but this
* The external interface uses decaf_bool_t, but this might be a different * might be a different size than our particular arch's word_t (and thus
* size than our particular arch's word_t (and thus mask_t). Also, the caller * mask_t). Also, the caller isn't guaranteed to pass it as nonzero. So
* isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes * bool_to_mask converts word sizes and checks nonzero. On the flip side,
* and checks nonzero. * mask_t is always -1 or 0, but it might be a different size than
* * decaf_bool_t. On the third hand, we have success vs boolean types, but
* On the flip side, mask_t is always -1 or 0, but it might be a different size * that's handled in common.h: it converts between decaf_bool_t and
* than decaf_bool_t. * decaf_error_t.
*
* On the third hand, we have success vs boolean types, but that's handled in
* common.h: it converts between decaf_bool_t and decaf_error_t.
*/ */
static ossl_inline decaf_bool_t mask_to_bool (mask_t m) { static ossl_inline decaf_bool_t mask_to_bool(mask_t m)
return (decaf_sword_t)(sword_t)m; {
return (decaf_sword_t) (sword_t) m;
} }
static ossl_inline mask_t bool_to_mask (decaf_bool_t m) { static ossl_inline mask_t bool_to_mask(decaf_bool_t m)
{
/* On most arches this will be optimized to a simple cast. */ /* On most arches this will be optimized to a simple cast. */
mask_t ret = 0; mask_t ret = 0;
unsigned int i; unsigned int i;
unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t); unsigned int limit = sizeof(decaf_bool_t) / sizeof(mask_t);
if (limit < 1) limit = 1; if (limit < 1)
for (i=0; i<limit; i++) { limit = 1;
ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t))); for (i = 0; i < limit; i++) {
ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
} }
return ret; return ret;
} }
static ossl_inline void ignore_result ( decaf_bool_t boo ) { static ossl_inline void ignore_result(decaf_bool_t boo)
{
(void)boo; (void)boo;
} }