crypto/poly1305: don't break carry chains.

RT#4483

[poly1305-armv4.pl: remove redundant #ifdef __thumb2__]
[poly1305-ppc*.pl: presumably more accurate benchmark results]

Reviewed-by: Richard Levitte <levitte@openssl.org>
This commit is contained in:
Andy Polyakov 2016-03-29 10:02:45 +02:00
parent 1400f013e1
commit 4b8736a22e
10 changed files with 146 additions and 76 deletions

View File

@@ -10,10 +10,10 @@
# IALU(*)/gcc-4.4 NEON
#
# ARM11xx(ARMv6) 7.78/+100% -
# Cortex-A5 6.30/+130% 2.96
# Cortex-A5 6.35/+130% 2.96
# Cortex-A8 6.25/+115% 2.36
# Cortex-A9 5.10/+95% 2.55
# Cortex-A15 3.79/+85% 1.25(**)
# Cortex-A15 3.85/+85% 1.25(**)
# Snapdragon S4 5.70/+100% 1.48(**)
#
# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
@@ -313,7 +313,8 @@ poly1305_blocks:
adds $h0,$h0,r1
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
adcs $h3,$h3,#0
adc $h4,$h4,#0
cmp r0,lr @ done yet?
bhi .Loop
@@ -735,9 +736,7 @@ poly1305_blocks_neon:
.align 4
.Leven:
subs $len,$len,#64
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmov.i32 $H4,#1<<24 @ padbit, yes, always
@@ -745,9 +744,7 @@ poly1305_blocks_neon:
add $inp,$inp,#64
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
# ifdef __thumb2__
itt hi
# endif
addhi $tbl1,$ctx,#(48+1*9*4)
addhi $tbl0,$ctx,#(48+3*9*4)
@@ -817,9 +814,7 @@ poly1305_blocks_neon:
vmull.u32 $D4,$H4#hi,${R0}[1]
subs $len,$len,#64
vmlal.u32 $D0,$H4#hi,${S1}[1]
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmlal.u32 $D3,$H2#hi,${R1}[1]
vld1.32 ${S4}[1],[$tbl1,:32]
@@ -946,9 +941,7 @@ poly1305_blocks_neon:
add $tbl1,$ctx,#(48+0*9*4)
add $tbl0,$ctx,#(48+1*9*4)
adds $len,$len,#32
# ifdef __thumb2__
it ne
# endif
movne $len,#0
bne .Long_tail
@@ -990,14 +983,10 @@ poly1305_blocks_neon:
vmlal.u32 $D2,$H0#hi,$R2
vmlal.u32 $D3,$H0#hi,$R3
# ifdef __thumb2__
it ne
# endif
it ne
addne $tbl1,$ctx,#(48+2*9*4)
vmlal.u32 $D0,$H2#hi,$S3
# ifdef __thumb2__
it ne
# endif
it ne
addne $tbl0,$ctx,#(48+3*9*4)
vmlal.u32 $D4,$H1#hi,$R3
vmlal.u32 $D1,$H3#hi,$S3
@@ -1138,7 +1127,8 @@ poly1305_emit_neon:
adds $h0,$h0,$g0
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
adcs $h3,$h3,#0
adc $h4,$h4,#0
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
@@ -1147,24 +1137,16 @@ poly1305_emit_neon:
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
# ifdef __thumb2__
it ne
# endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
# ifdef __thumb2__
it ne
# endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
# ifdef __thumb2__
it ne
# endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
# ifdef __thumb2__
it ne
# endif
movne $h3,$g3
ldr $g3,[$nonce,#12]

View File

@@ -16,10 +16,10 @@
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
# Cortex-A53 2.63/+58% 1.47
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.39/+50% 1.18(*)
# X-Gene 2.00/+68% 2.19
# Denver 1.64/+50% 1.18(*)
# X-Gene 2.13/+68% 2.19
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
@@ -151,7 +151,8 @@ poly1305_blocks:
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adc $h1,$d1,xzr
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
cbnz $len,.Loop
@@ -235,7 +236,8 @@ poly1305_mult:
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adc $h1,$d1,xzr
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
ret
.size poly1305_mult,.-poly1305_mult
@@ -310,7 +312,8 @@ poly1305_blocks_neon:
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$h0,$t0
adc $h1,$h1,xzr
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
#ifdef __ARMEB__
rev $d0,$d0
@@ -870,7 +873,8 @@ poly1305_emit_neon:
add $d0,$d0,$h2,lsr#2
and $h2,$h2,#3
adds $h0,$h0,$d0
adc $h1,$h1,xzr
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr

View File

@@ -11,7 +11,7 @@
#
# October 2015
#
# Performance is [incredible for a 32-bit processor] 1.76 cycles per
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
# processed byte. Comparison to compiler-generated code is problematic,
# because results were observed to vary from 2.1 to 7.6 cpb depending
# on compiler's ability to inline small functions. Compiler also
@@ -128,7 +128,7 @@ _poly1305_blocks:
|| SWAP2 $D1,$D1
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|| ADD $D0,B24,B31 ; B-copy of h0+inp[0]
|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|| SWAP4 $D1,$D1
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|| MVK 3,$THREE
@@ -140,12 +140,12 @@ _poly1305_blocks:
loop?:
MPY32U $H0,$R0,A17:A16
|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|| SWAP2 $D3,$D3
MPY32U $H0,$R2,A19:A18
|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|| ADD $D0,$H1,A24 ; A-copy of B24
|| SWAP4 $D3,$D3
|| [A2] SUB A2,1,A2 ; decrement loop counter
@@ -227,8 +227,8 @@ loop?:
SHRU $H4,2,B16 ; last reduction step
|| AND $H4,$THREE,$H4
|| [A2] BNOP loop?
ADDAW B16,B16,B16 ; 5*(h4>>2)
|| [A2] BNOP loop?
ADDU B24,B16,B25:B24 ; B24 is h0
|| [A2] SWAP2 $D2,$D2
@@ -236,8 +236,9 @@ loop?:
|| [A2] SWAP4 $D2,$D2
ADDU B28,B27,B29:B28 ; B28 is h2
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0]
ADD B30,B29,B30 ; B30 is h3
|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
ADDU B30,B29,B31:B30 ; B30 is h3
ADD B31,$H4,$H4
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
;;===== branch to loop? is taken here

View File

@@ -17,11 +17,10 @@
# -m32 -m64
#
# Freescale e300 14.8/+80% -
# PPC74x0 7.40/+60% -
# PPC970 7.20/+114% 3.51/+205%
# POWER6 3.96/+250% 2.02/+170%
# POWER7 3.67/+260% 1.87/+100%
# POWER8 - 2.13/+200%
# PPC74x0 7.60/+60% -
# PPC970 7.00/+114% 3.51/+205%
# POWER7 3.75/+260% 1.93/+100%
# POWER8 - 2.03/+200%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
@@ -212,6 +211,7 @@ $code.=<<___;
add $t0,$t0,$t1
addc $h0,$d0,$t0
addze $h1,$d1
addze $h2,$h2
bdnz Loop
@@ -518,6 +518,7 @@ $code.=<<___;
addze $h1,$h1
addze $h2,$h2
addze $h3,$h3
addze $h4,$h4
bdnz Loop

View File

@@ -15,8 +15,8 @@
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300 9.78/+30%
# PPC74x0 7.08/+50%
# PPC970 6.24/+80%
# PPC74x0 6.92/+50%
# PPC970 6.03/+80%
# POWER7 3.50/+30%
# POWER8 3.75/+10%

View File

@@ -11,7 +11,7 @@
#
# June 2015
#
# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
@@ -138,11 +138,12 @@ poly1305_blocks:
ngr $h0,$h2
srlg $t0,$h2,2
algr $h0,$t0
lghi $t1,3
ngr $h2,$t1
algr $h0,$d0lo
lghi $t1,3
alcgr $h1,$d1hi # $d1hi is still zero
ngr $h2,$t1
alcgr $h2,$d1hi # $d1hi is still zero
brct$g $len,.Loop

View File

@@ -16,10 +16,10 @@
#
# IALU(*) FMA
#
# UltraSPARC III 11.9(**)
# SPARC T3 7.85
# SPARC T4 1.67(***) 6.55
# SPARC64 X 5.54 3.64
# UltraSPARC III 12.3(**)
# SPARC T3 7.92
# SPARC T4 1.70(***) 6.55
# SPARC64 X 5.60 3.64
#
# (*) Comparison to compiler-generated code is really problematic,
# because latter's performance varies too much depending on too
@@ -251,8 +251,9 @@ poly1305_blocks:
addcc $t0,$d0,$h0
addccc %g0,$h1,$h1
addccc %g0,$h2,$h2
addccc %g0,$h3,$h3
brnz,pt $len,.Loop
addc %g0,$h3,$h3
addc %g0,$h4,$h4
st $h1,[$ctx+0] ! store hash value
st $h0,[$ctx+4]
@@ -295,6 +296,7 @@ poly1305_blocks_vis3:
neg $shr,$shl
srlx $R1,2,$S1
b .Loop_vis3
add $R1,$S1,$S1
.Loop_vis3:
@@ -342,8 +344,9 @@ poly1305_blocks_vis3:
add $T1,$T0,$T0
addcc $T0,$D0,$H0
addxccc %g0,$D1,$H1
brnz,pt $len,.Loop_vis3
addxc %g0,$D1,$H1
addxc %g0,$H2,$H2
stx $H0,[$ctx+0] ! store hash value
stx $H1,[$ctx+8]

View File

@@ -299,6 +299,7 @@ if ($sse2) {
&adc ("ebx",0);
&adc ("ecx",0);
&adc ("esi",0);
&adc ("edi",0);
&cmp ("ebp",&wparam(2)); # done yet?
&jne (&label("loop"));
@@ -1166,11 +1167,12 @@ my $addr = shift;
&shr ("edi",2);
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
&mov ("edi",&wparam(1)); # output
add ("eax","ebp");
&add ("eax","ebp");
&mov ("ebp",&wparam(2)); # key
adc ("ebx",0);
adc ("ecx",0);
adc ("edx",0);
&adc ("ebx",0);
&adc ("ecx",0);
&adc ("edx",0);
&adc ("esi",0);
&movd ($D0,"eax"); # offload original hash value
&add ("eax",5); # compare to modulus

View File

@@ -15,16 +15,16 @@
# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
# P4 4.90/+120% -
# Core 2 2.39/+90% -
# Westmere 1.86/+120% -
# P4 4.46/+120% -
# Core 2 2.41/+90% -
# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
# Haswell 1.10/+175% 1.11 0.65
# Skylake 1.12/+120% 0.96 0.51
# Haswell 1.14/+175% 1.11 0.65
# Skylake 1.13/+120% 0.96 0.51
# Silvermont 2.83/+95% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
# Bulldozer 2.21/+130% 0.97
# Bulldozer 2.30/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
@@ -114,6 +114,7 @@ $code.=<<___;
add $d3,%rax
add %rax,$h0
adc \$0,$h1
adc \$0,$h2
___
}
@@ -184,8 +185,8 @@
.align 32
poly1305_blocks:
.Lblocks:
sub \$16,$len # too short?
jc .Lno_data
shr \$4,$len
jz .Lno_data # too short
push %rbx
push %rbp
@@ -220,8 +221,8 @@
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
sub \$16,%r15 # len-=16
jnc .Loop
dec %r15 # len-=16
jnz .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
@@ -521,6 +522,7 @@ poly1305_blocks_avx:
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
@@ -1315,6 +1317,7 @@ poly1305_emit_avx:
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
adc \$0,%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
@@ -1407,6 +1410,7 @@ poly1305_blocks_avx2:
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
adc \$0,$h2
mov $s1,$r1
mov $s1,%rax

View File

@@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
c = (h2 >> 2) + (h2 & ~3UL);
h2 &= 3;
h0 += c;
h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
h2 += CONSTANT_TIME_CARRY(h1,c);
/*
* Occasional overflows to 3rd bit of h2 are taken care of
* "naturally". If after this point we end up at the top of
* this loop, then the overflow bit will be accounted for
* in next iteration. If we end up in poly1305_emit, then
* comparison to modulus below will still count as "carry
* into 131st bit", so that properly reduced value will be
* picked in conditional move.
*/
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
@@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
h1 = st->h[1];
h2 = st->h[2];
/* compute h + -p */
/* compare to modulus by computing h + -p */
g0 = (u64)(t = (u128)h0 + 5);
g1 = (u64)(t = (u128)h1 + (t >> 64));
g2 = h2 + (u64)(t >> 64);
/* if there was carry into 130th bit, h1:h0 = g1:g0 */
/* if there was carry into 131st bit, h1:h0 = g1:g0 */
mask = 0 - (g2 >> 2);
g0 &= mask;
g1 &= mask;
@@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
h0 += c;
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */
h3 += (c = CONSTANT_TIME_CARRY(h2,c));
h4 += CONSTANT_TIME_CARRY(h3,c);
/*
* Occasional overflows to 3rd bit of h4 are taken care of
* "naturally". If after this point we end up at the top of
* this loop, then the overflow bit will be accounted for
* in next iteration. If we end up in poly1305_emit, then
* comparison to modulus below will still count as "carry
* into 131st bit", so that properly reduced value will be
* picked in conditional move.
*/
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
@@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
h3 = st->h[3];
h4 = st->h[4];
/* compute h + -p */
/* compare to modulus by computing h + -p */
g0 = (u32)(t = (u64)h0 + 5);
g1 = (u32)(t = (u64)h1 + (t >> 32));
g2 = (u32)(t = (u64)h2 + (t >> 32));
g3 = (u32)(t = (u64)h3 + (t >> 32));
g4 = h4 + (u32)(t >> 32);
/* if there was carry into 130th bit, h3:h0 = g3:g0 */
/* if there was carry into 131st bit, h3:h0 = g3:g0 */
mask = 0 - (g4 >> 2);
g0 &= mask;
g1 &= mask;
@@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = {
"99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
"2637408fe13086ea73f971e3425e2820"
},
/*
* test vectors from Hanno Böck
*/
{
"cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc"
"cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc"
"ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc"
"cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc"
"ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc"
"ccccfffffff50000000000000000000000000000000000000000000000000000"
"00ffffffe7000000000000000000000000000000000000000000000000000000"
"0000000000000000000000000000000000000000000000000000719205a8521d"
"fc",
"7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc",
"8559b876eceed66eb37798c0457baff9"
},
{
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000"
"00000000800264",
"e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"00bd1258978e205444c9aaaa82006fed"
},
{
"02fc",
"0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c",
"06120c0c0c0c0c0c0c0c0c0c0c0c0c0c"
},
{
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
"7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
"7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b"
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b"
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
"7b6e7b001300000000b300000000000000000000000000000000000000000000"
"f20000000000000000000000000000000000002000efff000900000000000000"
"0000000000100000000009000000640000000000000000000000001300000000"
"b300000000000000000000000000000000000000000000f20000000000000000"
"000000000000000000002000efff00090000000000000000007a000010000000"
"000900000064000000000000000000000000000000000000000000000000fc",
"00ff0000000000000000000000000000""00000000001e00000000000000007b7b",
"33205bbf9e9f8f7212ab9e2ab9b7e4a5"
},
{
"7777777777777777777777777777777777777777777777777777777777777777"
"7777777777777777777777777777777777777777777777777777777777777777"
"777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac"
"ec0100acacac2caca2acacacacacacacacacacac64f2",
"0000007f0000007f0100002000000000""0000cf77777777777777777777777777",
"02ee7c8c546ddeb1a467e4c3981158b9"
},
/*
* test vectors from Andrew Moon
*/