Fix an overflow bug in rsaz_512_sqr

There is an overflow bug in the x86_64 Montgomery squaring procedure used in
exponentiation with 512-bit moduli. No EC algorithms are affected. Analysis
suggests that attacks against 2-prime RSA1024, 3-prime RSA1536, and DSA1024 as a
result of this defect would be very difficult to perform and are not believed
likely. Attacks against DH512 are considered just feasible. However, for an
attack the target would have to re-use the DH512 private key, which is not
recommended anyway. Also, applications directly using the low-level API
BN_mod_exp may be affected if they use BN_FLG_CONSTTIME.
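
For illustration only (not the OpenSSL code): the defect class here is a lost
carry when the doubled cross products of a squaring are folded together.
Doubling a 2k-bit cross product needs 2k+1 bits, so the bit shifted out at the
top must be kept in an explicit extra limb, which is what the corrected
assembly's "rcx:r8 = r8 << 1" style carry chains do. A minimal C sketch of
that idea, assuming a GCC/Clang-style unsigned __int128 and a made-up helper
name sqr2:

    #include <stdint.h>
    #include <stdio.h>

    /* Square a 2-limb number a = a1*2^64 + a0 into 4 limbs r[0..3].
     * a^2 = a0^2 + 2*a0*a1*2^64 + a1^2*2^128; the doubled cross product
     * 2*a0*a1 is up to 129 bits wide, so the bit shifted out of its top
     * limb must be carried into r[3] explicitly. */
    static void sqr2(uint64_t a0, uint64_t a1, uint64_t r[4])
    {
        unsigned __int128 lo    = (unsigned __int128)a0 * a0;
        unsigned __int128 hi    = (unsigned __int128)a1 * a1;
        unsigned __int128 cross = (unsigned __int128)a0 * a1;

        uint64_t c0 = (uint64_t)cross;
        uint64_t c1 = (uint64_t)(cross >> 64);
        uint64_t carry = c1 >> 63;          /* bit lost by the doubling */
        c1 = (c1 << 1) | (c0 >> 63);        /* double the cross product */
        c0 = c0 << 1;

        r[0] = (uint64_t)lo;
        unsigned __int128 t = (lo >> 64) + c0;
        r[1] = (uint64_t)t;
        t = (t >> 64) + c1 + (uint64_t)hi;
        r[2] = (uint64_t)t;
        r[3] = (uint64_t)(t >> 64) + (uint64_t)(hi >> 64) + carry; /* keep it */
    }

    int main(void)
    {
        uint64_t r[4];
        sqr2(~0ULL, ~0ULL, r);              /* (2^128-1)^2: worst-case carries */
        printf("%016llx %016llx %016llx %016llx\n",
               (unsigned long long)r[3], (unsigned long long)r[2],
               (unsigned long long)r[1], (unsigned long long)r[0]);
        return 0;
    }

In the sketch, dropping "carry" corrupts the result only for inputs whose
cross product has its top bit set, which is why this kind of defect is
data-dependent and hard to hit.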

CVE-2019-1551

Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/10576)
Author:    Andy Polyakov
Date:      2019-12-04 12:48:21 +01:00
Committer: Bernd Edlinger
Parent:    b39c0475a6
Commit:    f1c5eea8a8


@@ -140,7 +140,7 @@ rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
 	subq \$128+24, %rsp
 .Lsqr_body:
-	movq $mod, %rbp # common argument
+	movq $mod, %xmm1 # common off-load
 	movq ($inp), %rdx
 	movq 8($inp), %rax
 	movq $n0, 128(%rsp)
@@ -158,7 +158,8 @@ $code.=<<___;
 .Loop_sqr:
 	movl $times,128+8(%rsp)
 #first iteration
-	movq %rdx, %rbx
+	movq %rdx, %rbx # 0($inp)
+	mov %rax, %rbp # 8($inp)
 	mulq %rdx
 	movq %rax, %r8
 	movq 16($inp), %rax
@@ -197,31 +198,29 @@ $code.=<<___;
 	mulq %rbx
 	addq %rax, %r14
 	movq %rbx, %rax
-	movq %rdx, %r15
-	adcq \$0, %r15
-	addq %r8, %r8 #shlq \$1, %r8
-	movq %r9, %rcx
-	adcq %r9, %r9 #shld \$1, %r8, %r9
+	adcq \$0, %rdx
+	xorq %rcx,%rcx # rcx:r8 = r8 << 1
+	addq %r8, %r8
+	movq %rdx, %r15
+	adcq \$0, %rcx
 	mulq %rax
-	movq %rax, (%rsp)
-	addq %rdx, %r8
-	adcq \$0, %r9
-	movq %r8, 8(%rsp)
-	shrq \$63, %rcx
+	addq %r8, %rdx
+	adcq \$0, %rcx
+	movq %rax, (%rsp)
+	movq %rdx, 8(%rsp)
 #second iteration
-	movq 8($inp), %r8
 	movq 16($inp), %rax
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r10
 	movq 24($inp), %rax
 	movq %rdx, %rbx
 	adcq \$0, %rbx
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r11
 	movq 32($inp), %rax
 	adcq \$0, %rdx
@@ -229,7 +228,7 @@ $code.=<<___;
 	movq %rdx, %rbx
 	adcq \$0, %rbx
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r12
 	movq 40($inp), %rax
 	adcq \$0, %rdx
@@ -237,7 +236,7 @@ $code.=<<___;
 	movq %rdx, %rbx
 	adcq \$0, %rbx
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r13
 	movq 48($inp), %rax
 	adcq \$0, %rdx
@@ -245,7 +244,7 @@ $code.=<<___;
 	movq %rdx, %rbx
 	adcq \$0, %rbx
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r14
 	movq 56($inp), %rax
 	adcq \$0, %rdx
@@ -253,39 +252,39 @@ $code.=<<___;
 	movq %rdx, %rbx
 	adcq \$0, %rbx
-	mulq %r8
+	mulq %rbp
 	addq %rax, %r15
-	movq %r8, %rax
+	movq %rbp, %rax
 	adcq \$0, %rdx
 	addq %rbx, %r15
-	movq %rdx, %r8
-	movq %r10, %rdx
-	adcq \$0, %r8
-	add %rdx, %rdx
-	lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
-	movq %r11, %rbx
-	adcq %r11, %r11 #shld \$1, %r10, %r11
+	adcq \$0, %rdx
+	xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
+	addq %r9, %r9
+	movq %rdx, %r8
+	adcq %r10, %r10
+	adcq \$0, %rbx
 	mulq %rax
+	addq %rcx, %rax
+	movq 16($inp), %rbp
+	adcq \$0, %rdx
 	addq %rax, %r9
+	movq 24($inp), %rax
 	adcq %rdx, %r10
-	adcq \$0, %r11
+	adcq \$0, %rbx
 	movq %r9, 16(%rsp)
 	movq %r10, 24(%rsp)
-	shrq \$63, %rbx
 #third iteration
-	movq 16($inp), %r9
-	movq 24($inp), %rax
-	mulq %r9
+	mulq %rbp
 	addq %rax, %r12
 	movq 32($inp), %rax
 	movq %rdx, %rcx
 	adcq \$0, %rcx
-	mulq %r9
+	mulq %rbp
 	addq %rax, %r13
 	movq 40($inp), %rax
 	adcq \$0, %rdx
@@ -293,7 +292,7 @@ $code.=<<___;
 	movq %rdx, %rcx
 	adcq \$0, %rcx
-	mulq %r9
+	mulq %rbp
 	addq %rax, %r14
 	movq 48($inp), %rax
 	adcq \$0, %rdx
@@ -301,9 +300,7 @@ $code.=<<___;
 	movq %rdx, %rcx
 	adcq \$0, %rcx
-	mulq %r9
-	movq %r12, %r10
-	lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
+	mulq %rbp
 	addq %rax, %r15
 	movq 56($inp), %rax
 	adcq \$0, %rdx
@@ -311,36 +308,40 @@ $code.=<<___;
 	movq %rdx, %rcx
 	adcq \$0, %rcx
-	mulq %r9
-	shrq \$63, %r10
+	mulq %rbp
 	addq %rax, %r8
-	movq %r9, %rax
+	movq %rbp, %rax
 	adcq \$0, %rdx
 	addq %rcx, %r8
-	movq %rdx, %r9
-	adcq \$0, %r9
-	movq %r13, %rcx
-	leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
+	adcq \$0, %rdx
+	xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
+	addq %r11, %r11
+	movq %rdx, %r9
+	adcq %r12, %r12
+	adcq \$0, %rcx
 	mulq %rax
+	addq %rbx, %rax
+	movq 24($inp), %r10
+	adcq \$0, %rdx
 	addq %rax, %r11
+	movq 32($inp), %rax
 	adcq %rdx, %r12
-	adcq \$0, %r13
+	adcq \$0, %rcx
 	movq %r11, 32(%rsp)
 	movq %r12, 40(%rsp)
-	shrq \$63, %rcx
 #fourth iteration
-	movq 24($inp), %r10
-	movq 32($inp), %rax
+	mov %rax, %r11 # 32($inp)
 	mulq %r10
 	addq %rax, %r14
 	movq 40($inp), %rax
 	movq %rdx, %rbx
 	adcq \$0, %rbx
+	mov %rax, %r12 # 40($inp)
 	mulq %r10
 	addq %rax, %r15
 	movq 48($inp), %rax
@@ -349,9 +350,8 @@ $code.=<<___;
 	movq %rdx, %rbx
 	adcq \$0, %rbx
+	mov %rax, %rbp # 48($inp)
 	mulq %r10
-	movq %r14, %r12
-	leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
 	addq %rax, %r8
 	movq 56($inp), %rax
 	adcq \$0, %rdx
@@ -360,32 +360,33 @@ $code.=<<___;
 	adcq \$0, %rbx
 	mulq %r10
-	shrq \$63, %r12
 	addq %rax, %r9
 	movq %r10, %rax
 	adcq \$0, %rdx
 	addq %rbx, %r9
-	movq %rdx, %r10
-	adcq \$0, %r10
-	movq %r15, %rbx
-	leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
+	adcq \$0, %rdx
+	xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
+	addq %r13, %r13
+	movq %rdx, %r10
+	adcq %r14, %r14
+	adcq \$0, %rbx
 	mulq %rax
+	addq %rcx, %rax
+	adcq \$0, %rdx
 	addq %rax, %r13
+	movq %r12, %rax # 40($inp)
 	adcq %rdx, %r14
-	adcq \$0, %r15
+	adcq \$0, %rbx
 	movq %r13, 48(%rsp)
 	movq %r14, 56(%rsp)
-	shrq \$63, %rbx
 #fifth iteration
-	movq 32($inp), %r11
-	movq 40($inp), %rax
 	mulq %r11
 	addq %rax, %r8
-	movq 48($inp), %rax
+	movq %rbp, %rax # 48($inp)
 	movq %rdx, %rcx
 	adcq \$0, %rcx
@@ -393,97 +394,99 @@ $code.=<<___;
 	addq %rax, %r9
 	movq 56($inp), %rax
 	adcq \$0, %rdx
-	movq %r8, %r12
-	leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
 	addq %rcx, %r9
 	movq %rdx, %rcx
 	adcq \$0, %rcx
+	mov %rax, %r14 # 56($inp)
 	mulq %r11
-	shrq \$63, %r12
 	addq %rax, %r10
 	movq %r11, %rax
 	adcq \$0, %rdx
 	addq %rcx, %r10
-	movq %rdx, %r11
-	adcq \$0, %r11
-	movq %r9, %rcx
-	leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
+	adcq \$0, %rdx
+	xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
+	addq %r15, %r15
+	movq %rdx, %r11
+	adcq %r8, %r8
+	adcq \$0, %rcx
 	mulq %rax
+	addq %rbx, %rax
+	adcq \$0, %rdx
 	addq %rax, %r15
+	movq %rbp, %rax # 48($inp)
 	adcq %rdx, %r8
-	adcq \$0, %r9
+	adcq \$0, %rcx
 	movq %r15, 64(%rsp)
 	movq %r8, 72(%rsp)
-	shrq \$63, %rcx
 #sixth iteration
-	movq 40($inp), %r12
-	movq 48($inp), %rax
 	mulq %r12
 	addq %rax, %r10
-	movq 56($inp), %rax
+	movq %r14, %rax # 56($inp)
 	movq %rdx, %rbx
 	adcq \$0, %rbx
 	mulq %r12
 	addq %rax, %r11
 	movq %r12, %rax
-	movq %r10, %r15
-	leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
 	adcq \$0, %rdx
-	shrq \$63, %r15
 	addq %rbx, %r11
-	movq %rdx, %r12
-	adcq \$0, %r12
-	movq %r11, %rbx
-	leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
+	adcq \$0, %rdx
+	xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
+	addq %r9, %r9
+	movq %rdx, %r12
+	adcq %r10, %r10
+	adcq \$0, %rbx
 	mulq %rax
+	addq %rcx, %rax
+	adcq \$0, %rdx
 	addq %rax, %r9
+	movq %r14, %rax # 56($inp)
 	adcq %rdx, %r10
-	adcq \$0, %r11
+	adcq \$0, %rbx
 	movq %r9, 80(%rsp)
 	movq %r10, 88(%rsp)
 #seventh iteration
-	movq 48($inp), %r13
-	movq 56($inp), %rax
-	mulq %r13
+	mulq %rbp
 	addq %rax, %r12
-	movq %r13, %rax
-	movq %rdx, %r13
-	adcq \$0, %r13
-	xorq %r14, %r14
-	shlq \$1, %rbx
-	adcq %r12, %r12 #shld \$1, %rbx, %r12
-	adcq %r13, %r13 #shld \$1, %r12, %r13
-	adcq %r14, %r14 #shld \$1, %r13, %r14
+	movq %rbp, %rax
+	adcq \$0, %rdx
+	xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
+	addq %r11, %r11
+	movq %rdx, %r13
+	adcq %r12, %r12
+	adcq \$0, %rcx
 	mulq %rax
+	addq %rbx, %rax
+	adcq \$0, %rdx
 	addq %rax, %r11
+	movq %r14, %rax # 56($inp)
 	adcq %rdx, %r12
-	adcq \$0, %r13
+	adcq \$0, %rcx
 	movq %r11, 96(%rsp)
 	movq %r12, 104(%rsp)
 #eighth iteration
-	movq 56($inp), %rax
+	xorq %rbx, %rbx # rbx:r13 = r13 << 1
+	addq %r13, %r13
+	adcq \$0, %rbx
 	mulq %rax
-	addq %rax, %r13
+	addq %rcx, %rax
 	adcq \$0, %rdx
-	addq %rdx, %r14
-	movq %r13, 112(%rsp)
-	movq %r14, 120(%rsp)
+	addq %r13, %rax
+	adcq %rbx, %rdx
 	movq (%rsp), %r8
 	movq 8(%rsp), %r9
@@ -493,6 +496,10 @@ $code.=<<___;
 	movq 40(%rsp), %r13
 	movq 48(%rsp), %r14
 	movq 56(%rsp), %r15
+	movq %xmm1, %rbp
+	movq %rax, 112(%rsp)
+	movq %rdx, 120(%rsp)
 	call __rsaz_512_reduce
@@ -524,9 +531,9 @@ $code.=<<___;
 .Loop_sqrx:
 	movl $times,128+8(%rsp)
 	movq $out, %xmm0 # off-load
-	movq %rbp, %xmm1 # off-load
 #first iteration
 	mulx %rax, %r8, %r9
+	mov %rax, %rbx
 	mulx 16($inp), %rcx, %r10
 	xor %rbp, %rbp # cf=0, of=0
@@ -534,40 +541,39 @@ $code.=<<___;
 	mulx 24($inp), %rax, %r11
 	adcx %rcx, %r9
-	mulx 32($inp), %rcx, %r12
+	.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
 	adcx %rax, %r10
-	mulx 40($inp), %rax, %r13
+	.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
 	adcx %rcx, %r11
-	.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
+	mulx 48($inp), %rcx, %r14
 	adcx %rax, %r12
 	adcx %rcx, %r13
-	.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
+	mulx 56($inp), %rax, %r15
 	adcx %rax, %r14
 	adcx %rbp, %r15 # %rbp is 0
-	mov %r9, %rcx
-	shld \$1, %r8, %r9
-	shl \$1, %r8
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
-	adcx %rdx, %r8
-	mov 8($inp), %rdx
-	adcx %rbp, %r9
+	mulx %rdx, %rax, $out
+	mov %rbx, %rdx # 8($inp)
+	xor %rcx, %rcx
+	adox %r8, %r8
+	adcx $out, %r8
+	adox %rbp, %rcx
+	adcx %rbp, %rcx
 	mov %rax, (%rsp)
 	mov %r8, 8(%rsp)
 #second iteration
-	mulx 16($inp), %rax, %rbx
+	.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
 	adox %rax, %r10
 	adcx %rbx, %r11
-	.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
+	mulx 24($inp), $out, %r8
 	adox $out, %r11
-	.byte 0x66
 	adcx %r8, %r12
 	mulx 32($inp), %rax, %rbx
@@ -585,24 +591,25 @@ $code.=<<___;
 	.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
 	adox $out, %r15
 	adcx %rbp, %r8
+	mulx %rdx, %rax, $out
 	adox %rbp, %r8
-	mov %r11, %rbx
-	shld \$1, %r10, %r11
-	shld \$1, %rcx, %r10
-	xor %ebp,%ebp
-	mulx %rdx, %rax, %rcx
-	mov 16($inp), %rdx
+	.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
+	xor %rbx, %rbx
+	adcx %rcx, %rax
+	adox %r9, %r9
+	adcx %rbp, $out
+	adox %r10, %r10
 	adcx %rax, %r9
-	adcx %rcx, %r10
-	adcx %rbp, %r11
+	adox %rbp, %rbx
+	adcx $out, %r10
+	adcx %rbp, %rbx
 	mov %r9, 16(%rsp)
 	.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
 #third iteration
-	.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
+	mulx 24($inp), $out, %r9
 	adox $out, %r12
 	adcx %r9, %r13
@@ -610,7 +617,7 @@ $code.=<<___;
 	adox %rax, %r13
 	adcx %rcx, %r14
-	mulx 40($inp), $out, %r9
+	.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
 	adox $out, %r14
 	adcx %r9, %r15
@@ -618,27 +625,28 @@ $code.=<<___;
 	adox %rax, %r15
 	adcx %rcx, %r8
-	.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
+	mulx 56($inp), $out, %r9
 	adox $out, %r8
 	adcx %rbp, %r9
+	mulx %rdx, %rax, $out
 	adox %rbp, %r9
-	mov %r13, %rcx
-	shld \$1, %r12, %r13
-	shld \$1, %rbx, %r12
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
-	adcx %rax, %r11
-	adcx %rdx, %r12
 	mov 24($inp), %rdx
-	adcx %rbp, %r13
+	xor %rcx, %rcx
+	adcx %rbx, %rax
+	adox %r11, %r11
+	adcx %rbp, $out
+	adox %r12, %r12
+	adcx %rax, %r11
+	adox %rbp, %rcx
+	adcx $out, %r12
+	adcx %rbp, %rcx
 	mov %r11, 32(%rsp)
-	.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
+	mov %r12, 40(%rsp)
 #fourth iteration
-	.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
+	mulx 32($inp), %rax, %rbx
 	adox %rax, %r14
 	adcx %rbx, %r15
@@ -653,25 +661,25 @@ $code.=<<___;
 	mulx 56($inp), $out, %r10
 	adox $out, %r9
 	adcx %rbp, %r10
+	mulx %rdx, %rax, $out
 	adox %rbp, %r10
-	.byte 0x66
-	mov %r15, %rbx
-	shld \$1, %r14, %r15
-	shld \$1, %rcx, %r14
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
-	adcx %rax, %r13
-	adcx %rdx, %r14
 	mov 32($inp), %rdx
-	adcx %rbp, %r15
+	xor %rbx, %rbx
+	adcx %rcx, %rax
+	adox %r13, %r13
+	adcx %rbp, $out
+	adox %r14, %r14
+	adcx %rax, %r13
+	adox %rbp, %rbx
+	adcx $out, %r14
+	adcx %rbp, %rbx
 	mov %r13, 48(%rsp)
 	mov %r14, 56(%rsp)
 #fifth iteration
-	.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
+	mulx 40($inp), $out, %r11
 	adox $out, %r8
 	adcx %r11, %r9
@@ -682,18 +690,19 @@ $code.=<<___;
 	mulx 56($inp), $out, %r11
 	adox $out, %r10
 	adcx %rbp, %r11
+	mulx %rdx, %rax, $out
+	mov 40($inp), %rdx
 	adox %rbp, %r11
-	mov %r9, %rcx
-	shld \$1, %r8, %r9
-	shld \$1, %rbx, %r8
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
+	xor %rcx, %rcx
+	adcx %rbx, %rax
+	adox %r15, %r15
+	adcx %rbp, $out
+	adox %r8, %r8
 	adcx %rax, %r15
-	adcx %rdx, %r8
-	mov 40($inp), %rdx
-	adcx %rbp, %r9
+	adox %rbp, %rcx
+	adcx $out, %r8
+	adcx %rbp, %rcx
 	mov %r15, 64(%rsp)
 	mov %r8, 72(%rsp)
@@ -706,18 +715,19 @@ $code.=<<___;
 	.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
 	adox $out, %r11
 	adcx %rbp, %r12
+	mulx %rdx, %rax, $out
 	adox %rbp, %r12
-	mov %r11, %rbx
-	shld \$1, %r10, %r11
-	shld \$1, %rcx, %r10
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
-	adcx %rax, %r9
-	adcx %rdx, %r10
 	mov 48($inp), %rdx
-	adcx %rbp, %r11
+	xor %rbx, %rbx
+	adcx %rcx, %rax
+	adox %r9, %r9
+	adcx %rbp, $out
+	adox %r10, %r10
+	adcx %rax, %r9
+	adcx $out, %r10
+	adox %rbp, %rbx
+	adcx %rbp, %rbx
 	mov %r9, 80(%rsp)
 	mov %r10, 88(%rsp)
@@ -727,31 +737,31 @@ $code.=<<___;
 	adox %rax, %r12
 	adox %rbp, %r13
-	xor %r14, %r14
-	shld \$1, %r13, %r14
-	shld \$1, %r12, %r13
-	shld \$1, %rbx, %r12
-	xor %ebp, %ebp
-	mulx %rdx, %rax, %rdx
-	adcx %rax, %r11
-	adcx %rdx, %r12
+	mulx %rdx, %rax, $out
+	xor %rcx, %rcx
 	mov 56($inp), %rdx
-	adcx %rbp, %r13
+	adcx %rbx, %rax
+	adox %r11, %r11
+	adcx %rbp, $out
+	adox %r12, %r12
+	adcx %rax, %r11
+	adox %rbp, %rcx
+	adcx $out, %r12
+	adcx %rbp, %rcx
 	.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
 	.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
 #eighth iteration
 	mulx %rdx, %rax, %rdx
-	adox %rax, %r13
-	adox %rbp, %rdx
-	.byte 0x66
-	add %rdx, %r14
-	movq %r13, 112(%rsp)
-	movq %r14, 120(%rsp)
+	xor %rbx, %rbx
+	adcx %rcx, %rax
+	adox %r13, %r13
+	adcx %rbp, %rdx
+	adox %rbp, %rbx
+	adcx %r13, %rax
+	adcx %rdx, %rbx
 	movq %xmm0, $out
 	movq %xmm1, %rbp
@@ -765,6 +775,9 @@ $code.=<<___;
 	movq 48(%rsp), %r14
 	movq 56(%rsp), %r15
+	movq %rax, 112(%rsp)
+	movq %rbx, 120(%rsp)
 	call __rsaz_512_reducex
 	addq 64(%rsp), %r8