mirror of
https://github.com/QuasarApp/openssl.git
synced 2025-04-29 19:24:37 +00:00
crypto/poly1305: don't break carry chains.
RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte <levitte@openssl.org>
This commit is contained in:
parent
1400f013e1
commit
4b8736a22e
@ -10,10 +10,10 @@
|
|||||||
# IALU(*)/gcc-4.4 NEON
|
# IALU(*)/gcc-4.4 NEON
|
||||||
#
|
#
|
||||||
# ARM11xx(ARMv6) 7.78/+100% -
|
# ARM11xx(ARMv6) 7.78/+100% -
|
||||||
# Cortex-A5 6.30/+130% 2.96
|
# Cortex-A5 6.35/+130% 2.96
|
||||||
# Cortex-A8 6.25/+115% 2.36
|
# Cortex-A8 6.25/+115% 2.36
|
||||||
# Cortex-A9 5.10/+95% 2.55
|
# Cortex-A9 5.10/+95% 2.55
|
||||||
# Cortex-A15 3.79/+85% 1.25(**)
|
# Cortex-A15 3.85/+85% 1.25(**)
|
||||||
# Snapdragon S4 5.70/+100% 1.48(**)
|
# Snapdragon S4 5.70/+100% 1.48(**)
|
||||||
#
|
#
|
||||||
# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
|
# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
|
||||||
@ -313,7 +313,8 @@ poly1305_blocks:
|
|||||||
adds $h0,$h0,r1
|
adds $h0,$h0,r1
|
||||||
adcs $h1,$h1,#0
|
adcs $h1,$h1,#0
|
||||||
adcs $h2,$h2,#0
|
adcs $h2,$h2,#0
|
||||||
adc $h3,$h3,#0
|
adcs $h3,$h3,#0
|
||||||
|
adc $h4,$h4,#0
|
||||||
|
|
||||||
cmp r0,lr @ done yet?
|
cmp r0,lr @ done yet?
|
||||||
bhi .Loop
|
bhi .Loop
|
||||||
@ -735,9 +736,7 @@ poly1305_blocks_neon:
|
|||||||
.align 4
|
.align 4
|
||||||
.Leven:
|
.Leven:
|
||||||
subs $len,$len,#64
|
subs $len,$len,#64
|
||||||
# ifdef __thumb2__
|
|
||||||
it lo
|
it lo
|
||||||
# endif
|
|
||||||
movlo $in2,$zeros
|
movlo $in2,$zeros
|
||||||
|
|
||||||
vmov.i32 $H4,#1<<24 @ padbit, yes, always
|
vmov.i32 $H4,#1<<24 @ padbit, yes, always
|
||||||
@ -745,9 +744,7 @@ poly1305_blocks_neon:
|
|||||||
add $inp,$inp,#64
|
add $inp,$inp,#64
|
||||||
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
|
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
|
||||||
add $in2,$in2,#64
|
add $in2,$in2,#64
|
||||||
# ifdef __thumb2__
|
|
||||||
itt hi
|
itt hi
|
||||||
# endif
|
|
||||||
addhi $tbl1,$ctx,#(48+1*9*4)
|
addhi $tbl1,$ctx,#(48+1*9*4)
|
||||||
addhi $tbl0,$ctx,#(48+3*9*4)
|
addhi $tbl0,$ctx,#(48+3*9*4)
|
||||||
|
|
||||||
@ -817,9 +814,7 @@ poly1305_blocks_neon:
|
|||||||
vmull.u32 $D4,$H4#hi,${R0}[1]
|
vmull.u32 $D4,$H4#hi,${R0}[1]
|
||||||
subs $len,$len,#64
|
subs $len,$len,#64
|
||||||
vmlal.u32 $D0,$H4#hi,${S1}[1]
|
vmlal.u32 $D0,$H4#hi,${S1}[1]
|
||||||
# ifdef __thumb2__
|
|
||||||
it lo
|
it lo
|
||||||
# endif
|
|
||||||
movlo $in2,$zeros
|
movlo $in2,$zeros
|
||||||
vmlal.u32 $D3,$H2#hi,${R1}[1]
|
vmlal.u32 $D3,$H2#hi,${R1}[1]
|
||||||
vld1.32 ${S4}[1],[$tbl1,:32]
|
vld1.32 ${S4}[1],[$tbl1,:32]
|
||||||
@ -946,9 +941,7 @@ poly1305_blocks_neon:
|
|||||||
add $tbl1,$ctx,#(48+0*9*4)
|
add $tbl1,$ctx,#(48+0*9*4)
|
||||||
add $tbl0,$ctx,#(48+1*9*4)
|
add $tbl0,$ctx,#(48+1*9*4)
|
||||||
adds $len,$len,#32
|
adds $len,$len,#32
|
||||||
# ifdef __thumb2__
|
|
||||||
it ne
|
it ne
|
||||||
# endif
|
|
||||||
movne $len,#0
|
movne $len,#0
|
||||||
bne .Long_tail
|
bne .Long_tail
|
||||||
|
|
||||||
@ -990,14 +983,10 @@ poly1305_blocks_neon:
|
|||||||
vmlal.u32 $D2,$H0#hi,$R2
|
vmlal.u32 $D2,$H0#hi,$R2
|
||||||
|
|
||||||
vmlal.u32 $D3,$H0#hi,$R3
|
vmlal.u32 $D3,$H0#hi,$R3
|
||||||
# ifdef __thumb2__
|
it ne
|
||||||
it ne
|
|
||||||
# endif
|
|
||||||
addne $tbl1,$ctx,#(48+2*9*4)
|
addne $tbl1,$ctx,#(48+2*9*4)
|
||||||
vmlal.u32 $D0,$H2#hi,$S3
|
vmlal.u32 $D0,$H2#hi,$S3
|
||||||
# ifdef __thumb2__
|
it ne
|
||||||
it ne
|
|
||||||
# endif
|
|
||||||
addne $tbl0,$ctx,#(48+3*9*4)
|
addne $tbl0,$ctx,#(48+3*9*4)
|
||||||
vmlal.u32 $D4,$H1#hi,$R3
|
vmlal.u32 $D4,$H1#hi,$R3
|
||||||
vmlal.u32 $D1,$H3#hi,$S3
|
vmlal.u32 $D1,$H3#hi,$S3
|
||||||
@ -1138,7 +1127,8 @@ poly1305_emit_neon:
|
|||||||
adds $h0,$h0,$g0
|
adds $h0,$h0,$g0
|
||||||
adcs $h1,$h1,#0
|
adcs $h1,$h1,#0
|
||||||
adcs $h2,$h2,#0
|
adcs $h2,$h2,#0
|
||||||
adc $h3,$h3,#0
|
adcs $h3,$h3,#0
|
||||||
|
adc $h4,$h4,#0
|
||||||
|
|
||||||
adds $g0,$h0,#5 @ compare to modulus
|
adds $g0,$h0,#5 @ compare to modulus
|
||||||
adcs $g1,$h1,#0
|
adcs $g1,$h1,#0
|
||||||
@ -1147,24 +1137,16 @@ poly1305_emit_neon:
|
|||||||
adc $g4,$h4,#0
|
adc $g4,$h4,#0
|
||||||
tst $g4,#4 @ did it carry/borrow?
|
tst $g4,#4 @ did it carry/borrow?
|
||||||
|
|
||||||
# ifdef __thumb2__
|
|
||||||
it ne
|
it ne
|
||||||
# endif
|
|
||||||
movne $h0,$g0
|
movne $h0,$g0
|
||||||
ldr $g0,[$nonce,#0]
|
ldr $g0,[$nonce,#0]
|
||||||
# ifdef __thumb2__
|
|
||||||
it ne
|
it ne
|
||||||
# endif
|
|
||||||
movne $h1,$g1
|
movne $h1,$g1
|
||||||
ldr $g1,[$nonce,#4]
|
ldr $g1,[$nonce,#4]
|
||||||
# ifdef __thumb2__
|
|
||||||
it ne
|
it ne
|
||||||
# endif
|
|
||||||
movne $h2,$g2
|
movne $h2,$g2
|
||||||
ldr $g2,[$nonce,#8]
|
ldr $g2,[$nonce,#8]
|
||||||
# ifdef __thumb2__
|
|
||||||
it ne
|
it ne
|
||||||
# endif
|
|
||||||
movne $h3,$g3
|
movne $h3,$g3
|
||||||
ldr $g3,[$nonce,#12]
|
ldr $g3,[$nonce,#12]
|
||||||
|
|
||||||
|
@ -16,10 +16,10 @@
|
|||||||
# IALU/gcc-4.9 NEON
|
# IALU/gcc-4.9 NEON
|
||||||
#
|
#
|
||||||
# Apple A7 1.86/+5% 0.72
|
# Apple A7 1.86/+5% 0.72
|
||||||
# Cortex-A53 2.63/+58% 1.47
|
# Cortex-A53 2.69/+58% 1.47
|
||||||
# Cortex-A57 2.70/+7% 1.14
|
# Cortex-A57 2.70/+7% 1.14
|
||||||
# Denver 1.39/+50% 1.18(*)
|
# Denver 1.64/+50% 1.18(*)
|
||||||
# X-Gene 2.00/+68% 2.19
|
# X-Gene 2.13/+68% 2.19
|
||||||
#
|
#
|
||||||
# (*) estimate based on resources availability is less than 1.0,
|
# (*) estimate based on resources availability is less than 1.0,
|
||||||
# i.e. measured result is worse than expected, presumably binary
|
# i.e. measured result is worse than expected, presumably binary
|
||||||
@ -151,7 +151,8 @@ poly1305_blocks:
|
|||||||
and $h2,$d2,#3
|
and $h2,$d2,#3
|
||||||
add $t0,$t0,$d2,lsr#2
|
add $t0,$t0,$d2,lsr#2
|
||||||
adds $h0,$d0,$t0
|
adds $h0,$d0,$t0
|
||||||
adc $h1,$d1,xzr
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
cbnz $len,.Loop
|
cbnz $len,.Loop
|
||||||
|
|
||||||
@ -235,7 +236,8 @@ poly1305_mult:
|
|||||||
and $h2,$d2,#3
|
and $h2,$d2,#3
|
||||||
add $t0,$t0,$d2,lsr#2
|
add $t0,$t0,$d2,lsr#2
|
||||||
adds $h0,$d0,$t0
|
adds $h0,$d0,$t0
|
||||||
adc $h1,$d1,xzr
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
ret
|
ret
|
||||||
.size poly1305_mult,.-poly1305_mult
|
.size poly1305_mult,.-poly1305_mult
|
||||||
@ -310,7 +312,8 @@ poly1305_blocks_neon:
|
|||||||
and $h2,$d2,#3
|
and $h2,$d2,#3
|
||||||
add $t0,$t0,$d2,lsr#2
|
add $t0,$t0,$d2,lsr#2
|
||||||
adds $h0,$h0,$t0
|
adds $h0,$h0,$t0
|
||||||
adc $h1,$h1,xzr
|
adcs $h1,$h1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
#ifdef __ARMEB__
|
#ifdef __ARMEB__
|
||||||
rev $d0,$d0
|
rev $d0,$d0
|
||||||
@ -870,7 +873,8 @@ poly1305_emit_neon:
|
|||||||
add $d0,$d0,$h2,lsr#2
|
add $d0,$d0,$h2,lsr#2
|
||||||
and $h2,$h2,#3
|
and $h2,$h2,#3
|
||||||
adds $h0,$h0,$d0
|
adds $h0,$h0,$d0
|
||||||
adc $h1,$h1,xzr
|
adcs $h1,$h1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
adds $d0,$h0,#5 // compare to modulus
|
adds $d0,$h0,#5 // compare to modulus
|
||||||
adcs $d1,$h1,xzr
|
adcs $d1,$h1,xzr
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
#
|
#
|
||||||
# October 2015
|
# October 2015
|
||||||
#
|
#
|
||||||
# Performance is [incredible for a 32-bit processor] 1.76 cycles per
|
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
|
||||||
# processed byte. Comparison to compiler-generated code is problematic,
|
# processed byte. Comparison to compiler-generated code is problematic,
|
||||||
# because results were observed to vary from 2.1 to 7.6 cpb depending
|
# because results were observed to vary from 2.1 to 7.6 cpb depending
|
||||||
# on compiler's ability to inline small functions. Compiler also
|
# on compiler's ability to inline small functions. Compiler also
|
||||||
@ -128,7 +128,7 @@ _poly1305_blocks:
|
|||||||
|| SWAP2 $D1,$D1
|
|| SWAP2 $D1,$D1
|
||||||
|
|
||||||
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
||||||
|| ADD $D0,B24,B31 ; B-copy of h0+inp[0]
|
|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|
||||||
|| SWAP4 $D1,$D1
|
|| SWAP4 $D1,$D1
|
||||||
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|
||||||
|| MVK 3,$THREE
|
|| MVK 3,$THREE
|
||||||
@ -140,12 +140,12 @@ _poly1305_blocks:
|
|||||||
|
|
||||||
loop?:
|
loop?:
|
||||||
MPY32U $H0,$R0,A17:A16
|
MPY32U $H0,$R0,A17:A16
|
||||||
|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|
|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|
||||||
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|
||||||
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|
||||||
|| SWAP2 $D3,$D3
|
|| SWAP2 $D3,$D3
|
||||||
MPY32U $H0,$R2,A19:A18
|
MPY32U $H0,$R2,A19:A18
|
||||||
|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|
|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|
||||||
|| ADD $D0,$H1,A24 ; A-copy of B24
|
|| ADD $D0,$H1,A24 ; A-copy of B24
|
||||||
|| SWAP4 $D3,$D3
|
|| SWAP4 $D3,$D3
|
||||||
|| [A2] SUB A2,1,A2 ; decrement loop counter
|
|| [A2] SUB A2,1,A2 ; decrement loop counter
|
||||||
@ -227,8 +227,8 @@ loop?:
|
|||||||
|
|
||||||
SHRU $H4,2,B16 ; last reduction step
|
SHRU $H4,2,B16 ; last reduction step
|
||||||
|| AND $H4,$THREE,$H4
|
|| AND $H4,$THREE,$H4
|
||||||
|| [A2] BNOP loop?
|
|
||||||
ADDAW B16,B16,B16 ; 5*(h4>>2)
|
ADDAW B16,B16,B16 ; 5*(h4>>2)
|
||||||
|
|| [A2] BNOP loop?
|
||||||
|
|
||||||
ADDU B24,B16,B25:B24 ; B24 is h0
|
ADDU B24,B16,B25:B24 ; B24 is h0
|
||||||
|| [A2] SWAP2 $D2,$D2
|
|| [A2] SWAP2 $D2,$D2
|
||||||
@ -236,8 +236,9 @@ loop?:
|
|||||||
|| [A2] SWAP4 $D2,$D2
|
|| [A2] SWAP4 $D2,$D2
|
||||||
ADDU B28,B27,B29:B28 ; B28 is h2
|
ADDU B28,B27,B29:B28 ; B28 is h2
|
||||||
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|
||||||
|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0]
|
|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|
||||||
ADD B30,B29,B30 ; B30 is h3
|
ADDU B30,B29,B31:B30 ; B30 is h3
|
||||||
|
ADD B31,$H4,$H4
|
||||||
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
|
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
|
||||||
;;===== branch to loop? is taken here
|
;;===== branch to loop? is taken here
|
||||||
|
|
||||||
|
@ -17,11 +17,10 @@
|
|||||||
# -m32 -m64
|
# -m32 -m64
|
||||||
#
|
#
|
||||||
# Freescale e300 14.8/+80% -
|
# Freescale e300 14.8/+80% -
|
||||||
# PPC74x0 7.40/+60% -
|
# PPC74x0 7.60/+60% -
|
||||||
# PPC970 7.20/+114% 3.51/+205%
|
# PPC970 7.00/+114% 3.51/+205%
|
||||||
# POWER6 3.96/+250% 2.02/+170%
|
# POWER7 3.75/+260% 1.93/+100%
|
||||||
# POWER7 3.67/+260% 1.87/+100%
|
# POWER8 - 2.03/+200%
|
||||||
# POWER8 - 2.13/+200%
|
|
||||||
#
|
#
|
||||||
# Do we need floating-point implementation for PPC? Results presented
|
# Do we need floating-point implementation for PPC? Results presented
|
||||||
# in poly1305_ieee754.c are tricky to compare to, because they are for
|
# in poly1305_ieee754.c are tricky to compare to, because they are for
|
||||||
@ -212,6 +211,7 @@ $code.=<<___;
|
|||||||
add $t0,$t0,$t1
|
add $t0,$t0,$t1
|
||||||
addc $h0,$d0,$t0
|
addc $h0,$d0,$t0
|
||||||
addze $h1,$d1
|
addze $h1,$d1
|
||||||
|
addze $h2,$h2
|
||||||
|
|
||||||
bdnz Loop
|
bdnz Loop
|
||||||
|
|
||||||
@ -518,6 +518,7 @@ $code.=<<___;
|
|||||||
addze $h1,$h1
|
addze $h1,$h1
|
||||||
addze $h2,$h2
|
addze $h2,$h2
|
||||||
addze $h3,$h3
|
addze $h3,$h3
|
||||||
|
addze $h4,$h4
|
||||||
|
|
||||||
bdnz Loop
|
bdnz Loop
|
||||||
|
|
||||||
|
@ -15,8 +15,8 @@
|
|||||||
# and improvement coefficients relative to gcc-generated code.
|
# and improvement coefficients relative to gcc-generated code.
|
||||||
#
|
#
|
||||||
# Freescale e300 9.78/+30%
|
# Freescale e300 9.78/+30%
|
||||||
# PPC74x0 7.08/+50%
|
# PPC74x0 6.92/+50%
|
||||||
# PPC970 6.24/+80%
|
# PPC970 6.03/+80%
|
||||||
# POWER7 3.50/+30%
|
# POWER7 3.50/+30%
|
||||||
# POWER8 3.75/+10%
|
# POWER8 3.75/+10%
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
#
|
#
|
||||||
# June 2015
|
# June 2015
|
||||||
#
|
#
|
||||||
# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
|
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
|
||||||
# code. For older compiler improvement coefficient is >3x, because
|
# code. For older compiler improvement coefficient is >3x, because
|
||||||
# then base 2^64 and base 2^32 implementations are compared.
|
# then base 2^64 and base 2^32 implementations are compared.
|
||||||
#
|
#
|
||||||
@ -138,11 +138,12 @@ poly1305_blocks:
|
|||||||
ngr $h0,$h2
|
ngr $h0,$h2
|
||||||
srlg $t0,$h2,2
|
srlg $t0,$h2,2
|
||||||
algr $h0,$t0
|
algr $h0,$t0
|
||||||
|
lghi $t1,3
|
||||||
|
ngr $h2,$t1
|
||||||
|
|
||||||
algr $h0,$d0lo
|
algr $h0,$d0lo
|
||||||
lghi $t1,3
|
|
||||||
alcgr $h1,$d1hi # $d1hi is still zero
|
alcgr $h1,$d1hi # $d1hi is still zero
|
||||||
ngr $h2,$t1
|
alcgr $h2,$d1hi # $d1hi is still zero
|
||||||
|
|
||||||
brct$g $len,.Loop
|
brct$g $len,.Loop
|
||||||
|
|
||||||
|
@ -16,10 +16,10 @@
|
|||||||
#
|
#
|
||||||
# IALU(*) FMA
|
# IALU(*) FMA
|
||||||
#
|
#
|
||||||
# UltraSPARC III 11.9(**)
|
# UltraSPARC III 12.3(**)
|
||||||
# SPARC T3 7.85
|
# SPARC T3 7.92
|
||||||
# SPARC T4 1.67(***) 6.55
|
# SPARC T4 1.70(***) 6.55
|
||||||
# SPARC64 X 5.54 3.64
|
# SPARC64 X 5.60 3.64
|
||||||
#
|
#
|
||||||
# (*) Comparison to compiler-generated code is really problematic,
|
# (*) Comparison to compiler-generated code is really problematic,
|
||||||
# because latter's performance varies too much depending on too
|
# because latter's performance varies too much depending on too
|
||||||
@ -251,8 +251,9 @@ poly1305_blocks:
|
|||||||
addcc $t0,$d0,$h0
|
addcc $t0,$d0,$h0
|
||||||
addccc %g0,$h1,$h1
|
addccc %g0,$h1,$h1
|
||||||
addccc %g0,$h2,$h2
|
addccc %g0,$h2,$h2
|
||||||
|
addccc %g0,$h3,$h3
|
||||||
brnz,pt $len,.Loop
|
brnz,pt $len,.Loop
|
||||||
addc %g0,$h3,$h3
|
addc %g0,$h4,$h4
|
||||||
|
|
||||||
st $h1,[$ctx+0] ! store hash value
|
st $h1,[$ctx+0] ! store hash value
|
||||||
st $h0,[$ctx+4]
|
st $h0,[$ctx+4]
|
||||||
@ -295,6 +296,7 @@ poly1305_blocks_vis3:
|
|||||||
neg $shr,$shl
|
neg $shr,$shl
|
||||||
|
|
||||||
srlx $R1,2,$S1
|
srlx $R1,2,$S1
|
||||||
|
b .Loop_vis3
|
||||||
add $R1,$S1,$S1
|
add $R1,$S1,$S1
|
||||||
|
|
||||||
.Loop_vis3:
|
.Loop_vis3:
|
||||||
@ -342,8 +344,9 @@ poly1305_blocks_vis3:
|
|||||||
add $T1,$T0,$T0
|
add $T1,$T0,$T0
|
||||||
|
|
||||||
addcc $T0,$D0,$H0
|
addcc $T0,$D0,$H0
|
||||||
|
addxccc %g0,$D1,$H1
|
||||||
brnz,pt $len,.Loop_vis3
|
brnz,pt $len,.Loop_vis3
|
||||||
addxc %g0,$D1,$H1
|
addxc %g0,$H2,$H2
|
||||||
|
|
||||||
stx $H0,[$ctx+0] ! store hash value
|
stx $H0,[$ctx+0] ! store hash value
|
||||||
stx $H1,[$ctx+8]
|
stx $H1,[$ctx+8]
|
||||||
|
@ -299,6 +299,7 @@ if ($sse2) {
|
|||||||
&adc ("ebx",0);
|
&adc ("ebx",0);
|
||||||
&adc ("ecx",0);
|
&adc ("ecx",0);
|
||||||
&adc ("esi",0);
|
&adc ("esi",0);
|
||||||
|
&adc ("edi",0);
|
||||||
|
|
||||||
&cmp ("ebp",&wparam(2)); # done yet?
|
&cmp ("ebp",&wparam(2)); # done yet?
|
||||||
&jne (&label("loop"));
|
&jne (&label("loop"));
|
||||||
@ -1166,11 +1167,12 @@ my $addr = shift;
|
|||||||
&shr ("edi",2);
|
&shr ("edi",2);
|
||||||
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
|
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
|
||||||
&mov ("edi",&wparam(1)); # output
|
&mov ("edi",&wparam(1)); # output
|
||||||
add ("eax","ebp");
|
&add ("eax","ebp");
|
||||||
&mov ("ebp",&wparam(2)); # key
|
&mov ("ebp",&wparam(2)); # key
|
||||||
adc ("ebx",0);
|
&adc ("ebx",0);
|
||||||
adc ("ecx",0);
|
&adc ("ecx",0);
|
||||||
adc ("edx",0);
|
&adc ("edx",0);
|
||||||
|
&adc ("esi",0);
|
||||||
|
|
||||||
&movd ($D0,"eax"); # offload original hash value
|
&movd ($D0,"eax"); # offload original hash value
|
||||||
&add ("eax",5); # compare to modulus
|
&add ("eax",5); # compare to modulus
|
||||||
|
@ -15,16 +15,16 @@
|
|||||||
# measured with rdtsc at fixed clock frequency.
|
# measured with rdtsc at fixed clock frequency.
|
||||||
#
|
#
|
||||||
# IALU/gcc-4.8(*) AVX(**) AVX2
|
# IALU/gcc-4.8(*) AVX(**) AVX2
|
||||||
# P4 4.90/+120% -
|
# P4 4.46/+120% -
|
||||||
# Core 2 2.39/+90% -
|
# Core 2 2.41/+90% -
|
||||||
# Westmere 1.86/+120% -
|
# Westmere 1.88/+120% -
|
||||||
# Sandy Bridge 1.39/+140% 1.10
|
# Sandy Bridge 1.39/+140% 1.10
|
||||||
# Haswell 1.10/+175% 1.11 0.65
|
# Haswell 1.14/+175% 1.11 0.65
|
||||||
# Skylake 1.12/+120% 0.96 0.51
|
# Skylake 1.13/+120% 0.96 0.51
|
||||||
# Silvermont 2.83/+95% -
|
# Silvermont 2.83/+95% -
|
||||||
# VIA Nano 1.82/+150% -
|
# VIA Nano 1.82/+150% -
|
||||||
# Sledgehammer 1.38/+160% -
|
# Sledgehammer 1.38/+160% -
|
||||||
# Bulldozer 2.21/+130% 0.97
|
# Bulldozer 2.30/+130% 0.97
|
||||||
#
|
#
|
||||||
# (*) improvement coefficients relative to clang are more modest and
|
# (*) improvement coefficients relative to clang are more modest and
|
||||||
# are ~50% on most processors, in both cases we are comparing to
|
# are ~50% on most processors, in both cases we are comparing to
|
||||||
@ -114,6 +114,7 @@ $code.=<<___;
|
|||||||
add $d3,%rax
|
add $d3,%rax
|
||||||
add %rax,$h0
|
add %rax,$h0
|
||||||
adc \$0,$h1
|
adc \$0,$h1
|
||||||
|
adc \$0,$h2
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -184,8 +185,8 @@ $code.=<<___;
|
|||||||
.align 32
|
.align 32
|
||||||
poly1305_blocks:
|
poly1305_blocks:
|
||||||
.Lblocks:
|
.Lblocks:
|
||||||
sub \$16,$len # too short?
|
shr \$4,$len
|
||||||
jc .Lno_data
|
jz .Lno_data # too short
|
||||||
|
|
||||||
push %rbx
|
push %rbx
|
||||||
push %rbp
|
push %rbp
|
||||||
@ -220,8 +221,8 @@ ___
|
|||||||
&poly1305_iteration();
|
&poly1305_iteration();
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
mov $r1,%rax
|
mov $r1,%rax
|
||||||
sub \$16,%r15 # len-=16
|
dec %r15 # len-=16
|
||||||
jnc .Loop
|
jnz .Loop
|
||||||
|
|
||||||
mov $h0,0($ctx) # store hash value
|
mov $h0,0($ctx) # store hash value
|
||||||
mov $h1,8($ctx)
|
mov $h1,8($ctx)
|
||||||
@ -521,6 +522,7 @@ poly1305_blocks_avx:
|
|||||||
add $d2,$d1 # =*5
|
add $d2,$d1 # =*5
|
||||||
add $d1,$h0
|
add $d1,$h0
|
||||||
adc \$0,$h1
|
adc \$0,$h1
|
||||||
|
adc \$0,$h2
|
||||||
|
|
||||||
mov $s1,$r1
|
mov $s1,$r1
|
||||||
mov $s1,%rax
|
mov $s1,%rax
|
||||||
@ -1315,6 +1317,7 @@ poly1305_emit_avx:
|
|||||||
add %rcx,%rax
|
add %rcx,%rax
|
||||||
add %rax,%r8
|
add %rax,%r8
|
||||||
adc \$0,%r9
|
adc \$0,%r9
|
||||||
|
adc \$0,%r10
|
||||||
|
|
||||||
mov %r8,%rax
|
mov %r8,%rax
|
||||||
add \$5,%r8 # compare to modulus
|
add \$5,%r8 # compare to modulus
|
||||||
@ -1407,6 +1410,7 @@ poly1305_blocks_avx2:
|
|||||||
add $d2,$d1 # =*5
|
add $d2,$d1 # =*5
|
||||||
add $d1,$h0
|
add $d1,$h0
|
||||||
adc \$0,$h1
|
adc \$0,$h1
|
||||||
|
adc \$0,$h2
|
||||||
|
|
||||||
mov $s1,$r1
|
mov $s1,$r1
|
||||||
mov $s1,%rax
|
mov $s1,%rax
|
||||||
|
@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
|
|||||||
c = (h2 >> 2) + (h2 & ~3UL);
|
c = (h2 >> 2) + (h2 & ~3UL);
|
||||||
h2 &= 3;
|
h2 &= 3;
|
||||||
h0 += c;
|
h0 += c;
|
||||||
h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */
|
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
|
||||||
|
h2 += CONSTANT_TIME_CARRY(h1,c);
|
||||||
|
/*
|
||||||
|
* Occasional overflows to 3rd bit of h2 are taken care of
|
||||||
|
* "naturally". If after this point we end up at the top of
|
||||||
|
* this loop, then the overflow bit will be accounted for
|
||||||
|
* in next iteration. If we end up in poly1305_emit, then
|
||||||
|
* comparison to modulus below will still count as "carry
|
||||||
|
* into 131st bit", so that properly reduced value will be
|
||||||
|
* picked in conditional move.
|
||||||
|
*/
|
||||||
|
|
||||||
inp += POLY1305_BLOCK_SIZE;
|
inp += POLY1305_BLOCK_SIZE;
|
||||||
len -= POLY1305_BLOCK_SIZE;
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
|
|||||||
h1 = st->h[1];
|
h1 = st->h[1];
|
||||||
h2 = st->h[2];
|
h2 = st->h[2];
|
||||||
|
|
||||||
/* compute h + -p */
|
/* compare to modulus by computing h + -p */
|
||||||
g0 = (u64)(t = (u128)h0 + 5);
|
g0 = (u64)(t = (u128)h0 + 5);
|
||||||
g1 = (u64)(t = (u128)h1 + (t >> 64));
|
g1 = (u64)(t = (u128)h1 + (t >> 64));
|
||||||
g2 = h2 + (u64)(t >> 64);
|
g2 = h2 + (u64)(t >> 64);
|
||||||
|
|
||||||
/* if there was carry into 130th bit, h1:h0 = g1:g0 */
|
/* if there was carry into 131st bit, h1:h0 = g1:g0 */
|
||||||
mask = 0 - (g2 >> 2);
|
mask = 0 - (g2 >> 2);
|
||||||
g0 &= mask;
|
g0 &= mask;
|
||||||
g1 &= mask;
|
g1 &= mask;
|
||||||
@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
|
|||||||
h0 += c;
|
h0 += c;
|
||||||
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
|
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
|
||||||
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
|
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
|
||||||
h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */
|
h3 += (c = CONSTANT_TIME_CARRY(h2,c));
|
||||||
|
h4 += CONSTANT_TIME_CARRY(h3,c);
|
||||||
|
/*
|
||||||
|
* Occasional overflows to 3rd bit of h4 are taken care of
|
||||||
|
* "naturally". If after this point we end up at the top of
|
||||||
|
* this loop, then the overflow bit will be accounted for
|
||||||
|
* in next iteration. If we end up in poly1305_emit, then
|
||||||
|
* comparison to modulus below will still count as "carry
|
||||||
|
* into 131st bit", so that properly reduced value will be
|
||||||
|
* picked in conditional move.
|
||||||
|
*/
|
||||||
|
|
||||||
inp += POLY1305_BLOCK_SIZE;
|
inp += POLY1305_BLOCK_SIZE;
|
||||||
len -= POLY1305_BLOCK_SIZE;
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
|
|||||||
h3 = st->h[3];
|
h3 = st->h[3];
|
||||||
h4 = st->h[4];
|
h4 = st->h[4];
|
||||||
|
|
||||||
/* compute h + -p */
|
/* compare to modulus by computing h + -p */
|
||||||
g0 = (u32)(t = (u64)h0 + 5);
|
g0 = (u32)(t = (u64)h0 + 5);
|
||||||
g1 = (u32)(t = (u64)h1 + (t >> 32));
|
g1 = (u32)(t = (u64)h1 + (t >> 32));
|
||||||
g2 = (u32)(t = (u64)h2 + (t >> 32));
|
g2 = (u32)(t = (u64)h2 + (t >> 32));
|
||||||
g3 = (u32)(t = (u64)h3 + (t >> 32));
|
g3 = (u32)(t = (u64)h3 + (t >> 32));
|
||||||
g4 = h4 + (u32)(t >> 32);
|
g4 = h4 + (u32)(t >> 32);
|
||||||
|
|
||||||
/* if there was carry into 130th bit, h3:h0 = g3:g0 */
|
/* if there was carry into 131st bit, h3:h0 = g3:g0 */
|
||||||
mask = 0 - (g4 >> 2);
|
mask = 0 - (g4 >> 2);
|
||||||
g0 &= mask;
|
g0 &= mask;
|
||||||
g1 &= mask;
|
g1 &= mask;
|
||||||
@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = {
|
|||||||
"99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
|
"99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
|
||||||
"2637408fe13086ea73f971e3425e2820"
|
"2637408fe13086ea73f971e3425e2820"
|
||||||
},
|
},
|
||||||
|
/*
|
||||||
|
* test vectors from Hanno Böck
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
"cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc"
|
||||||
|
"cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc"
|
||||||
|
"ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc"
|
||||||
|
"cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc"
|
||||||
|
"ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc"
|
||||||
|
"ccccfffffff50000000000000000000000000000000000000000000000000000"
|
||||||
|
"00ffffffe7000000000000000000000000000000000000000000000000000000"
|
||||||
|
"0000000000000000000000000000000000000000000000000000719205a8521d"
|
||||||
|
"fc",
|
||||||
|
"7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc",
|
||||||
|
"8559b876eceed66eb37798c0457baff9"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000"
|
||||||
|
"00000000800264",
|
||||||
|
"e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||||
|
"00bd1258978e205444c9aaaa82006fed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"02fc",
|
||||||
|
"0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c",
|
||||||
|
"06120c0c0c0c0c0c0c0c0c0c0c0c0c0c"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
|
||||||
|
"7b6e7b001300000000b300000000000000000000000000000000000000000000"
|
||||||
|
"f20000000000000000000000000000000000002000efff000900000000000000"
|
||||||
|
"0000000000100000000009000000640000000000000000000000001300000000"
|
||||||
|
"b300000000000000000000000000000000000000000000f20000000000000000"
|
||||||
|
"000000000000000000002000efff00090000000000000000007a000010000000"
|
||||||
|
"000900000064000000000000000000000000000000000000000000000000fc",
|
||||||
|
"00ff0000000000000000000000000000""00000000001e00000000000000007b7b",
|
||||||
|
"33205bbf9e9f8f7212ab9e2ab9b7e4a5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"7777777777777777777777777777777777777777777777777777777777777777"
|
||||||
|
"7777777777777777777777777777777777777777777777777777777777777777"
|
||||||
|
"777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac"
|
||||||
|
"ec0100acacac2caca2acacacacacacacacacacac64f2",
|
||||||
|
"0000007f0000007f0100002000000000""0000cf77777777777777777777777777",
|
||||||
|
"02ee7c8c546ddeb1a467e4c3981158b9"
|
||||||
|
},
|
||||||
/*
|
/*
|
||||||
* test vectors from Andrew Moon
|
* test vectors from Andrew Moon
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user