mirror of
https://github.com/QuasarApp/openssl.git
synced 2025-04-29 19:24:37 +00:00
crypto/poly1305: don't break carry chains.
RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte <levitte@openssl.org>
This commit is contained in:
parent
1400f013e1
commit
4b8736a22e
@ -10,10 +10,10 @@
|
||||
# IALU(*)/gcc-4.4 NEON
|
||||
#
|
||||
# ARM11xx(ARMv6) 7.78/+100% -
|
||||
# Cortex-A5 6.30/+130% 2.96
|
||||
# Cortex-A5 6.35/+130% 2.96
|
||||
# Cortex-A8 6.25/+115% 2.36
|
||||
# Cortex-A9 5.10/+95% 2.55
|
||||
# Cortex-A15 3.79/+85% 1.25(**)
|
||||
# Cortex-A15 3.85/+85% 1.25(**)
|
||||
# Snapdragon S4 5.70/+100% 1.48(**)
|
||||
#
|
||||
# (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
|
||||
@ -313,7 +313,8 @@ poly1305_blocks:
|
||||
adds $h0,$h0,r1
|
||||
adcs $h1,$h1,#0
|
||||
adcs $h2,$h2,#0
|
||||
adc $h3,$h3,#0
|
||||
adcs $h3,$h3,#0
|
||||
adc $h4,$h4,#0
|
||||
|
||||
cmp r0,lr @ done yet?
|
||||
bhi .Loop
|
||||
@ -735,9 +736,7 @@ poly1305_blocks_neon:
|
||||
.align 4
|
||||
.Leven:
|
||||
subs $len,$len,#64
|
||||
# ifdef __thumb2__
|
||||
it lo
|
||||
# endif
|
||||
movlo $in2,$zeros
|
||||
|
||||
vmov.i32 $H4,#1<<24 @ padbit, yes, always
|
||||
@ -745,9 +744,7 @@ poly1305_blocks_neon:
|
||||
add $inp,$inp,#64
|
||||
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
|
||||
add $in2,$in2,#64
|
||||
# ifdef __thumb2__
|
||||
itt hi
|
||||
# endif
|
||||
addhi $tbl1,$ctx,#(48+1*9*4)
|
||||
addhi $tbl0,$ctx,#(48+3*9*4)
|
||||
|
||||
@ -817,9 +814,7 @@ poly1305_blocks_neon:
|
||||
vmull.u32 $D4,$H4#hi,${R0}[1]
|
||||
subs $len,$len,#64
|
||||
vmlal.u32 $D0,$H4#hi,${S1}[1]
|
||||
# ifdef __thumb2__
|
||||
it lo
|
||||
# endif
|
||||
movlo $in2,$zeros
|
||||
vmlal.u32 $D3,$H2#hi,${R1}[1]
|
||||
vld1.32 ${S4}[1],[$tbl1,:32]
|
||||
@ -946,9 +941,7 @@ poly1305_blocks_neon:
|
||||
add $tbl1,$ctx,#(48+0*9*4)
|
||||
add $tbl0,$ctx,#(48+1*9*4)
|
||||
adds $len,$len,#32
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
movne $len,#0
|
||||
bne .Long_tail
|
||||
|
||||
@ -990,14 +983,10 @@ poly1305_blocks_neon:
|
||||
vmlal.u32 $D2,$H0#hi,$R2
|
||||
|
||||
vmlal.u32 $D3,$H0#hi,$R3
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
it ne
|
||||
addne $tbl1,$ctx,#(48+2*9*4)
|
||||
vmlal.u32 $D0,$H2#hi,$S3
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
it ne
|
||||
addne $tbl0,$ctx,#(48+3*9*4)
|
||||
vmlal.u32 $D4,$H1#hi,$R3
|
||||
vmlal.u32 $D1,$H3#hi,$S3
|
||||
@ -1138,7 +1127,8 @@ poly1305_emit_neon:
|
||||
adds $h0,$h0,$g0
|
||||
adcs $h1,$h1,#0
|
||||
adcs $h2,$h2,#0
|
||||
adc $h3,$h3,#0
|
||||
adcs $h3,$h3,#0
|
||||
adc $h4,$h4,#0
|
||||
|
||||
adds $g0,$h0,#5 @ compare to modulus
|
||||
adcs $g1,$h1,#0
|
||||
@ -1147,24 +1137,16 @@ poly1305_emit_neon:
|
||||
adc $g4,$h4,#0
|
||||
tst $g4,#4 @ did it carry/borrow?
|
||||
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
movne $h0,$g0
|
||||
ldr $g0,[$nonce,#0]
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
movne $h1,$g1
|
||||
ldr $g1,[$nonce,#4]
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
movne $h2,$g2
|
||||
ldr $g2,[$nonce,#8]
|
||||
# ifdef __thumb2__
|
||||
it ne
|
||||
# endif
|
||||
movne $h3,$g3
|
||||
ldr $g3,[$nonce,#12]
|
||||
|
||||
|
@ -16,10 +16,10 @@
|
||||
# IALU/gcc-4.9 NEON
|
||||
#
|
||||
# Apple A7 1.86/+5% 0.72
|
||||
# Cortex-A53 2.63/+58% 1.47
|
||||
# Cortex-A53 2.69/+58% 1.47
|
||||
# Cortex-A57 2.70/+7% 1.14
|
||||
# Denver 1.39/+50% 1.18(*)
|
||||
# X-Gene 2.00/+68% 2.19
|
||||
# Denver 1.64/+50% 1.18(*)
|
||||
# X-Gene 2.13/+68% 2.19
|
||||
#
|
||||
# (*) estimate based on resource availability is less than 1.0,
|
||||
# i.e. measured result is worse than expected, presumably binary
|
||||
@ -151,7 +151,8 @@ poly1305_blocks:
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adc $h1,$d1,xzr
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
cbnz $len,.Loop
|
||||
|
||||
@ -235,7 +236,8 @@ poly1305_mult:
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adc $h1,$d1,xzr
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
ret
|
||||
.size poly1305_mult,.-poly1305_mult
|
||||
@ -310,7 +312,8 @@ poly1305_blocks_neon:
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$h0,$t0
|
||||
adc $h1,$h1,xzr
|
||||
adcs $h1,$h1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
#ifdef __ARMEB__
|
||||
rev $d0,$d0
|
||||
@ -870,7 +873,8 @@ poly1305_emit_neon:
|
||||
add $d0,$d0,$h2,lsr#2
|
||||
and $h2,$h2,#3
|
||||
adds $h0,$h0,$d0
|
||||
adc $h1,$h1,xzr
|
||||
adcs $h1,$h1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
adds $d0,$h0,#5 // compare to modulus
|
||||
adcs $d1,$h1,xzr
|
||||
|
@ -11,7 +11,7 @@
|
||||
#
|
||||
# October 2015
|
||||
#
|
||||
# Performance is [incredible for a 32-bit processor] 1.76 cycles per
|
||||
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
|
||||
# processed byte. Comparison to compiler-generated code is problematic,
|
||||
# because results were observed to vary from 2.1 to 7.6 cpb depending
|
||||
# on compiler's ability to inline small functions. Compiler also
|
||||
@ -128,7 +128,7 @@ _poly1305_blocks:
|
||||
|| SWAP2 $D1,$D1
|
||||
|
||||
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
||||
|| ADD $D0,B24,B31 ; B-copy of h0+inp[0]
|
||||
|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|
||||
|| SWAP4 $D1,$D1
|
||||
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|
||||
|| MVK 3,$THREE
|
||||
@ -140,12 +140,12 @@ _poly1305_blocks:
|
||||
|
||||
loop?:
|
||||
MPY32U $H0,$R0,A17:A16
|
||||
|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|
||||
|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|
||||
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|
||||
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|
||||
|| SWAP2 $D3,$D3
|
||||
MPY32U $H0,$R2,A19:A18
|
||||
|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|
||||
|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|
||||
|| ADD $D0,$H1,A24 ; A-copy of B24
|
||||
|| SWAP4 $D3,$D3
|
||||
|| [A2] SUB A2,1,A2 ; decrement loop counter
|
||||
@ -227,8 +227,8 @@ loop?:
|
||||
|
||||
SHRU $H4,2,B16 ; last reduction step
|
||||
|| AND $H4,$THREE,$H4
|
||||
|| [A2] BNOP loop?
|
||||
ADDAW B16,B16,B16 ; 5*(h4>>2)
|
||||
|| [A2] BNOP loop?
|
||||
|
||||
ADDU B24,B16,B25:B24 ; B24 is h0
|
||||
|| [A2] SWAP2 $D2,$D2
|
||||
@ -236,8 +236,9 @@ loop?:
|
||||
|| [A2] SWAP4 $D2,$D2
|
||||
ADDU B28,B27,B29:B28 ; B28 is h2
|
||||
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
||||
|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0]
|
||||
ADD B30,B29,B30 ; B30 is h3
|
||||
|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|
||||
ADDU B30,B29,B31:B30 ; B30 is h3
|
||||
ADD B31,$H4,$H4
|
||||
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
|
||||
;;===== branch to loop? is taken here
|
||||
|
||||
|
@ -17,11 +17,10 @@
|
||||
# -m32 -m64
|
||||
#
|
||||
# Freescale e300 14.8/+80% -
|
||||
# PPC74x0 7.40/+60% -
|
||||
# PPC970 7.20/+114% 3.51/+205%
|
||||
# POWER6 3.96/+250% 2.02/+170%
|
||||
# POWER7 3.67/+260% 1.87/+100%
|
||||
# POWER8 - 2.13/+200%
|
||||
# PPC74x0 7.60/+60% -
|
||||
# PPC970 7.00/+114% 3.51/+205%
|
||||
# POWER7 3.75/+260% 1.93/+100%
|
||||
# POWER8 - 2.03/+200%
|
||||
#
|
||||
# Do we need floating-point implementation for PPC? Results presented
|
||||
# in poly1305_ieee754.c are tricky to compare to, because they are for
|
||||
@ -212,6 +211,7 @@ $code.=<<___;
|
||||
add $t0,$t0,$t1
|
||||
addc $h0,$d0,$t0
|
||||
addze $h1,$d1
|
||||
addze $h2,$h2
|
||||
|
||||
bdnz Loop
|
||||
|
||||
@ -518,6 +518,7 @@ $code.=<<___;
|
||||
addze $h1,$h1
|
||||
addze $h2,$h2
|
||||
addze $h3,$h3
|
||||
addze $h4,$h4
|
||||
|
||||
bdnz Loop
|
||||
|
||||
|
@ -15,8 +15,8 @@
|
||||
# and improvement coefficients relative to gcc-generated code.
|
||||
#
|
||||
# Freescale e300 9.78/+30%
|
||||
# PPC74x0 7.08/+50%
|
||||
# PPC970 6.24/+80%
|
||||
# PPC74x0 6.92/+50%
|
||||
# PPC970 6.03/+80%
|
||||
# POWER7 3.50/+30%
|
||||
# POWER8 3.75/+10%
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
#
|
||||
# June 2015
|
||||
#
|
||||
# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
|
||||
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
|
||||
# code. For older compiler improvement coefficient is >3x, because
|
||||
# then base 2^64 and base 2^32 implementations are compared.
|
||||
#
|
||||
@ -138,11 +138,12 @@ poly1305_blocks:
|
||||
ngr $h0,$h2
|
||||
srlg $t0,$h2,2
|
||||
algr $h0,$t0
|
||||
lghi $t1,3
|
||||
ngr $h2,$t1
|
||||
|
||||
algr $h0,$d0lo
|
||||
lghi $t1,3
|
||||
alcgr $h1,$d1hi # $d1hi is still zero
|
||||
ngr $h2,$t1
|
||||
alcgr $h2,$d1hi # $d1hi is still zero
|
||||
|
||||
brct$g $len,.Loop
|
||||
|
||||
|
@ -16,10 +16,10 @@
|
||||
#
|
||||
# IALU(*) FMA
|
||||
#
|
||||
# UltraSPARC III 11.9(**)
|
||||
# SPARC T3 7.85
|
||||
# SPARC T4 1.67(***) 6.55
|
||||
# SPARC64 X 5.54 3.64
|
||||
# UltraSPARC III 12.3(**)
|
||||
# SPARC T3 7.92
|
||||
# SPARC T4 1.70(***) 6.55
|
||||
# SPARC64 X 5.60 3.64
|
||||
#
|
||||
# (*) Comparison to compiler-generated code is really problematic,
|
||||
# because latter's performance varies too much depending on too
|
||||
@ -251,8 +251,9 @@ poly1305_blocks:
|
||||
addcc $t0,$d0,$h0
|
||||
addccc %g0,$h1,$h1
|
||||
addccc %g0,$h2,$h2
|
||||
addccc %g0,$h3,$h3
|
||||
brnz,pt $len,.Loop
|
||||
addc %g0,$h3,$h3
|
||||
addc %g0,$h4,$h4
|
||||
|
||||
st $h1,[$ctx+0] ! store hash value
|
||||
st $h0,[$ctx+4]
|
||||
@ -295,6 +296,7 @@ poly1305_blocks_vis3:
|
||||
neg $shr,$shl
|
||||
|
||||
srlx $R1,2,$S1
|
||||
b .Loop_vis3
|
||||
add $R1,$S1,$S1
|
||||
|
||||
.Loop_vis3:
|
||||
@ -342,8 +344,9 @@ poly1305_blocks_vis3:
|
||||
add $T1,$T0,$T0
|
||||
|
||||
addcc $T0,$D0,$H0
|
||||
addxccc %g0,$D1,$H1
|
||||
brnz,pt $len,.Loop_vis3
|
||||
addxc %g0,$D1,$H1
|
||||
addxc %g0,$H2,$H2
|
||||
|
||||
stx $H0,[$ctx+0] ! store hash value
|
||||
stx $H1,[$ctx+8]
|
||||
|
@ -299,6 +299,7 @@ if ($sse2) {
|
||||
&adc ("ebx",0);
|
||||
&adc ("ecx",0);
|
||||
&adc ("esi",0);
|
||||
&adc ("edi",0);
|
||||
|
||||
&cmp ("ebp",&wparam(2)); # done yet?
|
||||
&jne (&label("loop"));
|
||||
@ -1166,11 +1167,12 @@ my $addr = shift;
|
||||
&shr ("edi",2);
|
||||
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
|
||||
&mov ("edi",&wparam(1)); # output
|
||||
add ("eax","ebp");
|
||||
&add ("eax","ebp");
|
||||
&mov ("ebp",&wparam(2)); # key
|
||||
adc ("ebx",0);
|
||||
adc ("ecx",0);
|
||||
adc ("edx",0);
|
||||
&adc ("ebx",0);
|
||||
&adc ("ecx",0);
|
||||
&adc ("edx",0);
|
||||
&adc ("esi",0);
|
||||
|
||||
&movd ($D0,"eax"); # offload original hash value
|
||||
&add ("eax",5); # compare to modulus
|
||||
|
@ -15,16 +15,16 @@
|
||||
# measured with rdtsc at fixed clock frequency.
|
||||
#
|
||||
# IALU/gcc-4.8(*) AVX(**) AVX2
|
||||
# P4 4.90/+120% -
|
||||
# Core 2 2.39/+90% -
|
||||
# Westmere 1.86/+120% -
|
||||
# P4 4.46/+120% -
|
||||
# Core 2 2.41/+90% -
|
||||
# Westmere 1.88/+120% -
|
||||
# Sandy Bridge 1.39/+140% 1.10
|
||||
# Haswell 1.10/+175% 1.11 0.65
|
||||
# Skylake 1.12/+120% 0.96 0.51
|
||||
# Haswell 1.14/+175% 1.11 0.65
|
||||
# Skylake 1.13/+120% 0.96 0.51
|
||||
# Silvermont 2.83/+95% -
|
||||
# VIA Nano 1.82/+150% -
|
||||
# Sledgehammer 1.38/+160% -
|
||||
# Bulldozer 2.21/+130% 0.97
|
||||
# Bulldozer 2.30/+130% 0.97
|
||||
#
|
||||
# (*) improvement coefficients relative to clang are more modest and
|
||||
# are ~50% on most processors, in both cases we are comparing to
|
||||
@ -114,6 +114,7 @@ $code.=<<___;
|
||||
add $d3,%rax
|
||||
add %rax,$h0
|
||||
adc \$0,$h1
|
||||
adc \$0,$h2
|
||||
___
|
||||
}
|
||||
|
||||
@ -184,8 +185,8 @@ $code.=<<___;
|
||||
.align 32
|
||||
poly1305_blocks:
|
||||
.Lblocks:
|
||||
sub \$16,$len # too short?
|
||||
jc .Lno_data
|
||||
shr \$4,$len
|
||||
jz .Lno_data # too short
|
||||
|
||||
push %rbx
|
||||
push %rbp
|
||||
@ -220,8 +221,8 @@ ___
|
||||
&poly1305_iteration();
|
||||
$code.=<<___;
|
||||
mov $r1,%rax
|
||||
sub \$16,%r15 # len-=16
|
||||
jnc .Loop
|
||||
dec %r15 # len-=16
|
||||
jnz .Loop
|
||||
|
||||
mov $h0,0($ctx) # store hash value
|
||||
mov $h1,8($ctx)
|
||||
@ -521,6 +522,7 @@ poly1305_blocks_avx:
|
||||
add $d2,$d1 # =*5
|
||||
add $d1,$h0
|
||||
adc \$0,$h1
|
||||
adc \$0,$h2
|
||||
|
||||
mov $s1,$r1
|
||||
mov $s1,%rax
|
||||
@ -1315,6 +1317,7 @@ poly1305_emit_avx:
|
||||
add %rcx,%rax
|
||||
add %rax,%r8
|
||||
adc \$0,%r9
|
||||
adc \$0,%r10
|
||||
|
||||
mov %r8,%rax
|
||||
add \$5,%r8 # compare to modulus
|
||||
@ -1407,6 +1410,7 @@ poly1305_blocks_avx2:
|
||||
add $d2,$d1 # =*5
|
||||
add $d1,$h0
|
||||
adc \$0,$h1
|
||||
adc \$0,$h2
|
||||
|
||||
mov $s1,$r1
|
||||
mov $s1,%rax
|
||||
|
@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
|
||||
c = (h2 >> 2) + (h2 & ~3UL);
|
||||
h2 &= 3;
|
||||
h0 += c;
|
||||
h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */
|
||||
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
|
||||
h2 += CONSTANT_TIME_CARRY(h1,c);
|
||||
/*
|
||||
* Occasional overflows to 3rd bit of h2 are taken care of
|
||||
* "naturally". If after this point we end up at the top of
|
||||
* this loop, then the overflow bit will be accounted for
|
||||
* in next iteration. If we end up in poly1305_emit, then
|
||||
* comparison to modulus below will still count as "carry
|
||||
* into 131st bit", so that properly reduced value will be
|
||||
* picked in conditional move.
|
||||
*/
|
||||
|
||||
inp += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
|
||||
h1 = st->h[1];
|
||||
h2 = st->h[2];
|
||||
|
||||
/* compute h + -p */
|
||||
/* compare to modulus by computing h + -p */
|
||||
g0 = (u64)(t = (u128)h0 + 5);
|
||||
g1 = (u64)(t = (u128)h1 + (t >> 64));
|
||||
g2 = h2 + (u64)(t >> 64);
|
||||
|
||||
/* if there was carry into 130th bit, h1:h0 = g1:g0 */
|
||||
/* if there was carry into 131st bit, h1:h0 = g1:g0 */
|
||||
mask = 0 - (g2 >> 2);
|
||||
g0 &= mask;
|
||||
g1 &= mask;
|
||||
@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
|
||||
h0 += c;
|
||||
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
|
||||
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
|
||||
h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */
|
||||
h3 += (c = CONSTANT_TIME_CARRY(h2,c));
|
||||
h4 += CONSTANT_TIME_CARRY(h3,c);
|
||||
/*
|
||||
* Occasional overflows to 3rd bit of h4 are taken care of
|
||||
* "naturally". If after this point we end up at the top of
|
||||
* this loop, then the overflow bit will be accounted for
|
||||
* in next iteration. If we end up in poly1305_emit, then
|
||||
* comparison to modulus below will still count as "carry
|
||||
* into 131st bit", so that properly reduced value will be
|
||||
* picked in conditional move.
|
||||
*/
|
||||
|
||||
inp += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
|
||||
h3 = st->h[3];
|
||||
h4 = st->h[4];
|
||||
|
||||
/* compute h + -p */
|
||||
/* compare to modulus by computing h + -p */
|
||||
g0 = (u32)(t = (u64)h0 + 5);
|
||||
g1 = (u32)(t = (u64)h1 + (t >> 32));
|
||||
g2 = (u32)(t = (u64)h2 + (t >> 32));
|
||||
g3 = (u32)(t = (u64)h3 + (t >> 32));
|
||||
g4 = h4 + (u32)(t >> 32);
|
||||
|
||||
/* if there was carry into 130th bit, h3:h0 = g3:g0 */
|
||||
/* if there was carry into 131st bit, h3:h0 = g3:g0 */
|
||||
mask = 0 - (g4 >> 2);
|
||||
g0 &= mask;
|
||||
g1 &= mask;
|
||||
@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = {
|
||||
"99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
|
||||
"2637408fe13086ea73f971e3425e2820"
|
||||
},
|
||||
/*
|
||||
* test vectors from Hanno Böck
|
||||
*/
|
||||
{
|
||||
"cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc"
|
||||
"cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc"
|
||||
"ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc"
|
||||
"cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc"
|
||||
"ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc"
|
||||
"ccccfffffff50000000000000000000000000000000000000000000000000000"
|
||||
"00ffffffe7000000000000000000000000000000000000000000000000000000"
|
||||
"0000000000000000000000000000000000000000000000000000719205a8521d"
|
||||
"fc",
|
||||
"7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc",
|
||||
"8559b876eceed66eb37798c0457baff9"
|
||||
},
|
||||
{
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000"
|
||||
"00000000800264",
|
||||
"e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
"00bd1258978e205444c9aaaa82006fed"
|
||||
},
|
||||
{
|
||||
"02fc",
|
||||
"0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c",
|
||||
"06120c0c0c0c0c0c0c0c0c0c0c0c0c0c"
|
||||
},
|
||||
{
|
||||
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b"
|
||||
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||
"7b6e7b001300000000b300000000000000000000000000000000000000000000"
|
||||
"f20000000000000000000000000000000000002000efff000900000000000000"
|
||||
"0000000000100000000009000000640000000000000000000000001300000000"
|
||||
"b300000000000000000000000000000000000000000000f20000000000000000"
|
||||
"000000000000000000002000efff00090000000000000000007a000010000000"
|
||||
"000900000064000000000000000000000000000000000000000000000000fc",
|
||||
"00ff0000000000000000000000000000""00000000001e00000000000000007b7b",
|
||||
"33205bbf9e9f8f7212ab9e2ab9b7e4a5"
|
||||
},
|
||||
{
|
||||
"7777777777777777777777777777777777777777777777777777777777777777"
|
||||
"7777777777777777777777777777777777777777777777777777777777777777"
|
||||
"777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac"
|
||||
"ec0100acacac2caca2acacacacacacacacacacac64f2",
|
||||
"0000007f0000007f0100002000000000""0000cf77777777777777777777777777",
|
||||
"02ee7c8c546ddeb1a467e4c3981158b9"
|
||||
},
|
||||
/*
|
||||
* test vectors from Andrew Moon
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user