chacha/asm/chacha-ppc.pl: optimize AltiVec/VMX code path.

The 32-bit vector rotate instruction has been defined from the very
beginning; that it was not used from the start must have been a
brain-slip...

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6363)

commit c869c3ada9, parent 95c81f8c88
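What the patch exploits is the standard rotate-left identity: rotating a
32-bit word left by n equals (x << n) | (x >> (32 - n)). The old code spelled
that identity out with three instructions (vsrw, vslw, vor) plus a spare
register per rotate; AltiVec's vrlw does it in one. A minimal scalar C model
of the two code paths (illustrative only, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Old path: rotate synthesized from shift right, shift left, or.
   Mirrors the removed vsrw/vslw/vor sequence, one 32-bit lane at a time. */
static uint32_t rotl_3op(uint32_t x, unsigned n) {
    uint32_t hi = x << n;          /* vslw */
    uint32_t lo = x >> (32 - n);   /* vsrw */
    return hi | lo;                /* vor  */
}

/* New path: a single rotate per lane, as vrlw does. */
static uint32_t rotl_1op(uint32_t x, unsigned n) {
    return (x << n) | (x >> (32 - n));
}

int main(void) {
    /* The two ChaCha rotate counts this patch hands to vrlw. */
    assert(rotl_3op(0xdeadbeef, 12) == rotl_1op(0xdeadbeef, 12));
    assert(rotl_3op(0xdeadbeef, 7)  == rotl_1op(0xdeadbeef, 7));
    return 0;
}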
@@ -23,11 +23,14 @@
 #			IALU/gcc-4.x	3xAltiVec+1xIALU
 #
 # Freescale e300	13.6/+115%	-
-# PPC74x0/G4e		6.81/+310%	4.66
-# PPC970/G5		9.29/+160%	4.60
-# POWER7		8.62/+61%	4.27
-# POWER8		8.70/+51%	3.96
-# POWER9		6.61/+29%	3.67
+# PPC74x0/G4e		6.81/+310%	3.72
+# PPC970/G5		9.29/+160%	?
+# POWER7		8.62/+61%	3.38
+# POWER8		8.70/+51%	3.36
+# POWER9		6.61/+29%	3.30(*)
+#
+# (*) this is trade-off result, it's possible to improve it, but
+#     then it would negatively affect all others;

 $flavour = shift;

@@ -392,19 +395,19 @@ Loop_tail:		# byte-by-byte loop
 ___

 {{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2) =
-    map("v$_",(0..14));
-my (@K)=map("v$_",(15..20));
-my ($FOUR,$sixteen,$twenty4,$twenty,$twelve,$twenty5,$seven) =
-    map("v$_",(21..27));
-my ($inpperm,$outperm,$outmask) = map("v$_",(28..30));
-my @D=("v31",$seven,$T0,$T1,$T2);
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2)
+				= map("v$_",(0..11));
+my @K = map("v$_",(12..17));
+my ($FOUR,$sixteen,$twenty4) = map("v$_",(18..20));
+my ($inpperm,$outperm,$outmask) = map("v$_",(21..23));
+my @D = map("v$_",(24..28));
+my ($twelve,$seven,$T0,$T1) = @D;

-my $FRAME=$LOCALS+64+13*16+18*$SIZE_T;	# 13*16 is for v20-v31 offload
+my $FRAME=$LOCALS+64+10*16+18*$SIZE_T;	# 10*16 is for v20-v28 offload

 sub VMXROUND {
 my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
+my ($a,$b,$c,$d)=@_;

 (
 	"&vadduwm ('$a','$a','$b')",
@@ -412,20 +415,16 @@ my ($a,$b,$c,$d,$t)=@_;
 	"&vperm	('$d','$d','$d','$sixteen')",

 	"&vadduwm ('$c','$c','$d')",
-	"&vxor	('$t','$b','$c')",
-	"&vsrw	('$b','$t','$twenty')",
-	"&vslw	('$t','$t','$twelve')",
-	"&vor	('$b','$b','$t')",
+	"&vxor	('$b','$b','$c')",
+	"&vrlw	('$b','$b','$twelve')",

 	"&vadduwm ('$a','$a','$b')",
 	"&vxor	('$d','$d','$a')",
 	"&vperm	('$d','$d','$d','$twenty4')",

 	"&vadduwm ('$c','$c','$d')",
-	"&vxor	('$t','$b','$c')",
-	"&vsrw	('$b','$t','$twenty5')",
-	"&vslw	('$t','$t','$seven')",
-	"&vor	('$b','$b','$t')",
+	"&vxor	('$b','$b','$c')",
+	"&vrlw	('$b','$b','$seven')",

 	"&vsldoi ('$c','$c','$c',8)",
 	"&vsldoi ('$b','$b','$b',$odd?4:12)",
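With the shift/or pairs gone, VMXROUND reads like the textbook ChaCha quarter
round: add, xor, then rotates by 16, 12, 8 and 7 in turn. The byte-aligned
rotates still go through vperm (the $sixteen constant, and $twenty4 for what
I read as the 8-bit left rotate, i.e. a 24-bit right rotate), while the new
vrlw handles 12 and 7. A scalar C sketch of one quarter round for orientation
(illustrative, not the perlasm itself):

#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t x, unsigned n) {
    return (x << n) | (x >> (32 - n));
}

/* One ChaCha quarter round, in the order VMXROUND emits it. Byte-aligned
 * rotates map to vperm in the vector code, the others to vrlw. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16); /* vadduwm, vxor, vperm */
    *c += *d; *b ^= *c; *b = rotl32(*b, 12); /* vadduwm, vxor, vrlw  */
    *a += *b; *d ^= *a; *d = rotl32(*d,  8); /* vadduwm, vxor, vperm */
    *c += *d; *b ^= *c; *b = rotl32(*b,  7); /* vadduwm, vxor, vrlw  */
}

int main(void) {
    /* The ChaCha sigma words ("expand 32-byte k"), cf. Lsigma below. */
    uint32_t a = 0x61707865, b = 0x3320646e, c = 0x79622d32, d = 0x6b206574;
    quarter_round(&a, &b, &c, &d);
    printf("%08x %08x %08x %08x\n", a, b, c, d);
    return 0;
}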
@@ -461,13 +460,7 @@ $code.=<<___;
 	stvx	v26,r10,$sp
 	addi	r10,r10,32
 	stvx	v27,r11,$sp
-	addi	r11,r11,32
 	stvx	v28,r10,$sp
-	addi	r10,r10,32
-	stvx	v29,r11,$sp
-	addi	r11,r11,32
-	stvx	v30,r10,$sp
-	stvx	v31,r11,$sp
 	stw	r12,`$FRAME-$SIZE_T*18-4`($sp)	# save vrsave
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
@@ -487,9 +480,9 @@ $code.=<<___;
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
-	li	r12,-1
+	li	r12,-8
 	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
-	mtspr	256,r12			# preserve all AltiVec registers
+	mtspr	256,r12			# preserve 29 AltiVec registers

 	bl	Lconsts			# returns pointer Lsigma in r12
 	li	@x[0],16
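The li r12,-1 → li r12,-8 change matches the smaller register footprint:
VRSAVE is a 32-bit mask whose most significant bit advertises v0 as live and
whose least significant bit advertises v31, so -8 (0xFFFFFFF8) flags v0-v28
and leaves v29-v31 free, exactly as the updated comment says. A quick C check
of that reading (my gloss, not from the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint32_t vrsave = (uint32_t)-8;   /* li r12,-8 -> 0xFFFFFFF8 */
    int live = 0;
    for (int v = 0; v < 32; v++)
        if (vrsave & (0x80000000u >> v))  /* MSB-first bit for register v */
            live++;
    assert(live == 29);               /* v0..v28 flagged, v29..v31 free */
    return 0;
}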
@@ -526,11 +519,6 @@ $code.=<<___;
 	lwz	@d[3],12($ctr)
 	vadduwm	@K[5],@K[4],@K[5]

-	vspltisw $twenty,-12		# synthesize constants
-	vspltisw $twelve,12
-	vspltisw $twenty5,-7
-	#vspltisw $seven,7		# synthesized in the loop
-
 	vxor	$T0,$T0,$T0		# 0x00..00
 	vspltisw $outmask,-1		# 0xff..ff
 	?lvsr	$inpperm,0,$inp		# prepare for unaligned load
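The deleted lines were only needed because vspltisw splats a sign-extended
5-bit immediate (-16..15) and cannot encode the shift counts 20 and 25; the
old code leaned on vsrw/vslw reading just the low 5 bits of each shift
amount, so -12 acted as 20 and -7 as 25. The rotate counts vrlw needs, 12
and 7, fit the immediate directly (they are now splatted inside the outer
loop, see a later hunk). A two-line C sanity check of the masking trick:

#include <assert.h>

int main(void) {
    /* vsrw/vslw only read the low 5 bits of each per-lane shift amount. */
    assert((-12 & 31) == 20);   /* vspltisw $twenty,-12  encoded 20 */
    assert(( -7 & 31) == 25);   /* vspltisw $twenty5,-7  encoded 25 */
    /* vrlw needs only 12 and 7, both directly encodable. */
    return 0;
}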
@@ -543,6 +531,7 @@ $code.=<<___;
 	be?vxor	$outperm,$outperm,$T1
 	be?vperm $inpperm,$inpperm,$inpperm,$T0

+	li	r0,10			# inner loop counter
 	b	Loop_outer_vmx

 .align	4
@@ -560,7 +549,6 @@ Loop_outer_vmx:
 	ori	@x[3],@x[3],0x6574
 	vmr	$B0,@K[1]

-	li	r0,10			# inner loop counter
 	lwz	@x[4],0($key)		# load key to GPR
 	vmr	$B1,@K[1]
 	lwz	@x[5],4($key)
@@ -586,15 +574,17 @@ Loop_outer_vmx:
 	mr	@t[1],@x[5]
 	mr	@t[2],@x[6]
 	mr	@t[3],@x[7]
+
+	vspltisw $twelve,12		# synthesize constants
 	vspltisw $seven,7

 	mtctr	r0
 	nop
 Loop_vmx:
 ___
-	my @thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,0);
-	my @thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,0);
-	my @thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,0);
+	my @thread0=&VMXROUND($A0,$B0,$C0,$D0,0);
+	my @thread1=&VMXROUND($A1,$B1,$C1,$D1,0);
+	my @thread2=&VMXROUND($A2,$B2,$C2,$D2,0);
 	my @thread3=&ROUND(0,4,8,12);

 	foreach (@thread0) {
@@ -602,10 +592,11 @@ ___
 		eval(shift(@thread1));	eval(shift(@thread3));
 		eval(shift(@thread2));	eval(shift(@thread3));
 	}
+	foreach (@thread3) { eval; }

-	@thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,1);
-	@thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,1);
-	@thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,1);
+	@thread0=&VMXROUND($A0,$B0,$C0,$D0,1);
+	@thread1=&VMXROUND($A1,$B1,$C1,$D1,1);
+	@thread2=&VMXROUND($A2,$B2,$C2,$D2,1);
 	@thread3=&ROUND(0,5,10,15);

 	foreach (@thread0) {
@@ -613,6 +604,7 @@ ___
 		eval(shift(@thread1));	eval(shift(@thread3));
 		eval(shift(@thread2));	eval(shift(@thread3));
 	}
+	foreach (@thread3) { eval; }
 $code.=<<___;
 	bdnz	Loop_vmx

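The new foreach (@thread3) { eval; } in the two hunks above looks like a
direct consequence of the shorter VMXROUND: each vector thread now returns
four fewer instruction strings (two per eliminated rotate), so the lockstep
interleave, which by the look of the loop body pulls an @thread3 entry for
every vector entry consumed, no longer empties the integer thread, and the
leftovers must be flushed explicitly. A toy C model of that bookkeeping,
with made-up instruction counts:

#include <stdio.h>

/* Counts are hypothetical; only the shape of the argument matters. The
 * vector threads got shorter, so the lockstep loop leaves integer-thread
 * entries behind and a drain pass is needed. */
int main(void)
{
    int evals_per_elem = 3;          /* @thread3 pulls per @thread0 element */
    int vec_old = 12, vec_new = 8;   /* VMXROUND lost 4 strings, 2 rotates  */
    int int_len = 36;                /* hypothetical integer ROUND length   */

    printf("old leftover: %d\n", int_len - evals_per_elem * vec_old);
    printf("new leftover: %d\n", int_len - evals_per_elem * vec_new);
    return 0;
}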
@@ -866,13 +858,7 @@ Ldone_vmx:
 	lvx	v26,r10,$sp
 	addi	r10,r10,32
 	lvx	v27,r11,$sp
-	addi	r11,r11,32
 	lvx	v28,r10,$sp
-	addi	r10,r10,32
-	lvx	v29,r11,$sp
-	addi	r11,r11,32
-	lvx	v30,r10,$sp
-	lvx	v31,r11,$sp
 	$POP	r0, `$FRAME+$LRSAVE`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
@@ -904,7 +890,7 @@ Ldone_vmx:
 Lconsts:
 	mflr	r0
 	bcl	20,31,\$+4
-	mflr	r12	#vvvvv "distance between . and _vpaes_consts
+	mflr	r12	#vvvvv "distance between . and Lsigma
 	addi	r12,r12,`64-8`
 	mtlr	r0
 	blr