bn/asm/sparcv9-mont.pl: fix squaring code path.

On contemporary processors this module is used only with odd input
lengths, i.e. it is not used in normal PKI cases. The problem was
"illuminated" by fuzzing tests.

Reviewed-by: Richard Levitte <levitte@openssl.org>
(cherry picked from commit 120a9e1a825bd0407639bedb1e8e15823cf7a545)
This commit is contained in:
Andy Polyakov 2017-03-22 10:51:25 +01:00
parent a9614a81c6
commit 6fd7aa95f0

View File

@ -300,7 +300,7 @@ ___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure. ######## code without following dedicated squaring procedure.
######## ########
$sbit="%i2"; # re-use $bp! $sbit="%o5";
$code.=<<___; $code.=<<___;
.align 32 .align 32
@ -413,7 +413,7 @@ $code.=<<___;
mulx $apj,$mul0,$acc0 mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1 mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0 add $acc0,$car0,$car0
add $tpj,$car1,$car1 add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j] ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0 and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j] ld [$np+$j],$npj ! np[j]
@ -422,7 +422,7 @@ $code.=<<___;
ld [$tp+8],$tpj ! tp[j] ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0 add $acc0,$acc0,$acc0
add $j,4,$j ! j++ add $j,4,$j ! j++
or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit srlx $acc0,32,$sbit
and $acc0,$mask,$acc0 and $acc0,$mask,$acc0
cmp $j,$num cmp $j,$num
@ -436,12 +436,12 @@ $code.=<<___;
mulx $apj,$mul0,$acc0 mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1 mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0 add $acc0,$car0,$car0
add $tpj,$car1,$car1 add $tpj,$sbit,$sbit
and $car0,$mask,$acc0 and $car0,$mask,$acc0
srlx $car0,32,$car0 srlx $car0,32,$car0
add $acc1,$car1,$car1 add $acc1,$car1,$car1
add $acc0,$acc0,$acc0 add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit srlx $acc0,32,$sbit
and $acc0,$mask,$acc0 and $acc0,$mask,$acc0
add $acc0,$car1,$car1 add $acc0,$car1,$car1
@ -449,7 +449,7 @@ $code.=<<___;
srlx $car1,32,$car1 srlx $car1,32,$car1
add $car0,$car0,$car0 add $car0,$car0,$car0
or $sbit,$car0,$car0 add $sbit,$car0,$car0
add $car0,$car1,$car1 add $car0,$car1,$car1
add $car2,$car1,$car1 add $car2,$car1,$car1
st $car1,[$tp+4] st $car1,[$tp+4]
@ -509,7 +509,7 @@ $code.=<<___;
.Lsqr_inner2: .Lsqr_inner2:
mulx $apj,$mul0,$acc0 mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1 mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1 add $tpj,$sbit,$sbit
add $acc0,$car0,$car0 add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j] ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0 and $car0,$mask,$acc0
@ -517,7 +517,7 @@ $code.=<<___;
srlx $car0,32,$car0 srlx $car0,32,$car0
add $acc0,$acc0,$acc0 add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j] ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0
add $j,4,$j ! j++ add $j,4,$j ! j++
srlx $acc0,32,$sbit srlx $acc0,32,$sbit
and $acc0,$mask,$acc0 and $acc0,$mask,$acc0
@ -532,12 +532,12 @@ $code.=<<___;
.Lsqr_no_inner2: .Lsqr_no_inner2:
mulx $apj,$mul0,$acc0 mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1 mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1 add $tpj,$sbit,$sbit
add $acc0,$car0,$car0 add $acc0,$car0,$car0
and $car0,$mask,$acc0 and $car0,$mask,$acc0
srlx $car0,32,$car0 srlx $car0,32,$car0
add $acc0,$acc0,$acc0 add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit srlx $acc0,32,$sbit
and $acc0,$mask,$acc0 and $acc0,$mask,$acc0
add $acc0,$car1,$car1 add $acc0,$car1,$car1
@ -546,7 +546,7 @@ $code.=<<___;
srlx $car1,32,$car1 srlx $car1,32,$car1
add $car0,$car0,$car0 add $car0,$car0,$car0
or $sbit,$car0,$car0 add $sbit,$car0,$car0
add $car0,$car1,$car1 add $car0,$car1,$car1
add $car2,$car1,$car1 add $car2,$car1,$car1
st $car1,[$tp+4] st $car1,[$tp+4]
@ -591,14 +591,17 @@ $code.=<<___;
!.Lsqr_last !.Lsqr_last
mulx $npj,$mul1,$acc1 mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1 add $tpj,$acc0,$acc0
srlx $acc0,32,$tmp0
and $acc0,$mask,$acc0
add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1 add $acc0,$car1,$car1
add $acc1,$car1,$car1 add $acc1,$car1,$car1
st $car1,[$tp] st $car1,[$tp]
srlx $car1,32,$car1 srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0 add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0 add $sbit,$car0,$car0
add $car0,$car1,$car1 add $car0,$car1,$car1
add $car2,$car1,$car1 add $car2,$car1,$car1
st $car1,[$tp+4] st $car1,[$tp+4]