This commit is contained in:
Zhi Guan
2015-08-15 15:02:15 +08:00
parent 06df2fab54
commit 3bdc0ea895
2536 changed files with 417052 additions and 271997 deletions

View File

@@ -41,7 +41,7 @@ $j="s4";
$m1="s5";
$code=<<___;
#indef __linux__
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>

289
crypto/bn/asm/armv4-gf2m.pl Normal file
View File

@@ -0,0 +1,289 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
bne .LNEON
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
#endif
___
}
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -23,6 +23,24 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@@ -49,16 +67,40 @@ $_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 2
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
ldr $num,[sp,#3*4] @ load num
cmp $num,#2
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
@@ -89,9 +131,9 @@ bn_mul_mont:
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@@ -101,21 +143,21 @@ bn_mul_mont:
bne .L1st
adds $nlo,$nlo,$ahi
mov $nhi,#0
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@@ -129,13 +171,13 @@ bn_mul_mont:
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@@ -144,13 +186,13 @@ bn_mul_mont:
adds $nlo,$nlo,$ahi
mov $nhi,#0
adc $nhi,$nhi,#0
adds $nlo,$nlo,$tj
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
str $nlo,[$num] @ tp[num-1]=
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
@@ -188,14 +230,447 @@ bn_mul_mont:
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt: tst lr,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
{
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmull.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0
vmull.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmlal.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
add $tinptr,sp,#16
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.32 {$N0-$N1}, [$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero
sub $inner,$num,#8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
vld1.64 {$A7xB},[$tinptr,:128]!
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
.LNEON_inner:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.64 {$A7xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vmlal.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_inner
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
subs $outer,$outer,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,$temp,#16
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vshr.u64 $temp,$temp,#16
bne .LNEON_outer
mov $toutptr,sp
mov $inner,$num
.LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vld1.64 {$A7xB}, [$tinptr, :128]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
.LNEON_tail2:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
veor q0,q0,q0
sub r11,$bptr,sp @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;

851
crypto/bn/asm/ia64-mont.pl Normal file
View File

@@ -0,0 +1,851 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

426
crypto/bn/asm/mips-mont.pl Normal file
View File

@@ -0,0 +1,426 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
$code=<<___;
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU $aj,$bi
$LD $alo,$BNSZ($ap)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$MULTU $lo0,$n0
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU $nj,$m1
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo $nlo
mfhi $nhi
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $alo,$BNSZ($ap)
$LD $tj,($sp)
$MULTU $aj,$bi
$LD $nj,($np)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$ADDU $lo0,$tj
$MULTU $lo0,$n0
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU $nj,$m1
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo $nlo
mfhi $nhi
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,$sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: $LD $aj,($ap)
$PTR_ADD $ap,$BNSZ
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

2234
crypto/bn/asm/mips.pl Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,995 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplicatons [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths has to be even and vector
# addresses has to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$BN_SZ =$SIZE_T;
} else {
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$BN_SZ =$SIZE_T;
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
$BN_SZ=8;
$LEVEL="2.0";
last;
}
}
close CONF;
}
}
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32; # local variables
$tp="%r31";
$ti1="%r29";
$ti0="%r28";
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22"; # passed through stack in 32-bit
$num="%r21"; # passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";
$fp="%r3";
$hi1="%r2";
$hi0="%r1";
$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
$code.=<<___;
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $sub = sub {
my ($mod,$args) = @_;
my $orig = "sub$mod\t$args";
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
$opcode|=(1<<10); # e1
$opcode|=(1<<8); # e2
$opcode|=(1<<5); # d
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# flip word order in 64-bit mode...
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
# assemble 2.0 instructions in 32-bit mode...
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
s/\bbv\b/bve/gm if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View File

@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
$FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r14";
$j="r15";
$tp="r16";
$m0="r17";
$m1="r18";
$lo0="r19";
$hi0="r20";
$lo1="r21";
$hi1="r22";
$alo="r23";
$ahi="r24";
$nlo="r25";
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
@@ -108,42 +109,48 @@ $code=<<___;
.machine "any"
.text
.globl .bn_mul_mont
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont:
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE`
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r14,`4*$SIZE_T`($sp)
$PUSH r15,`5*$SIZE_T`($sp)
$PUSH r16,`6*$SIZE_T`($sp)
$PUSH r17,`7*$SIZE_T`($sp)
$PUSH r18,`8*$SIZE_T`($sp)
$PUSH r19,`9*$SIZE_T`($sp)
$PUSH r20,`10*$SIZE_T`($sp)
$PUSH r21,`11*$SIZE_T`($sp)
$PUSH r22,`12*$SIZE_T`($sp)
$PUSH r23,`13*$SIZE_T`($sp)
$PUSH r24,`14*$SIZE_T`($sp)
$PUSH r25,`15*$SIZE_T`($sp)
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@@ -205,8 +212,8 @@ L1st:
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
$LD $tj,$FRAME($sp) ; tp[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@@ -273,7 +280,7 @@ Linner:
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@@ -299,23 +306,28 @@ Lcopy: ; copy or in-place refresh
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
$POP r17,`7*$SIZE_T`($sp)
$POP r18,`8*$SIZE_T`($sp)
$POP r19,`9*$SIZE_T`($sp)
$POP r20,`10*$SIZE_T`($sp)
$POP r21,`11*$SIZE_T`($sp)
$POP r22,`12*$SIZE_T`($sp)
$POP r23,`13*$SIZE_T`($sp)
$POP r24,`14*$SIZE_T`($sp)
$POP r25,`15*$SIZE_T`($sp)
$POP $sp,0($sp)
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
.byte 0,12,4,0,0x80,12,6,0
.long 0
.size .bn_mul_mont_int,.-.bn_mul_mont_int
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@@ -389,7 +389,10 @@ $data=<<EOF;
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .bn_sqr_comba4,.-.bn_sqr_comba4
#
# NOTE: The following label name should be changed to
@@ -814,8 +817,10 @@ $data=<<EOF;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .bn_sqr_comba8,.-.bn_sqr_comba8
#
# NOTE: The following label name should be changed to
@@ -949,7 +954,7 @@ $data=<<EOF;
addze r11,r0
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
@@ -966,7 +971,10 @@ $data=<<EOF;
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .bn_mul_comba4,.-.bn_mul_comba4
#
# NOTE: The following label name should be changed to
@@ -1502,7 +1510,10 @@ $data=<<EOF;
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .bn_mul_comba8,.-.bn_mul_comba8
#
# NOTE: The following label name should be changed to
@@ -1550,8 +1561,10 @@ Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .bn_sub_words,.-.bn_sub_words
#
# NOTE: The following label name should be changed to
@@ -1594,7 +1607,10 @@ Lppcasm_add_mainloop:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .bn_add_words,.-.bn_add_words
#
# NOTE: The following label name should be changed to
@@ -1707,7 +1723,10 @@ Lppcasm_div8:
Lppcasm_div9:
or r3,r8,r0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .bn_div_words,.-.bn_div_words
#
# NOTE: The following label name should be changed to
@@ -1746,8 +1765,10 @@ Lppcasm_sqr_mainloop:
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .bn_sqr_words,.-.bn_sqr_words
#
# NOTE: The following label name should be changed to
@@ -1850,7 +1871,10 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size bn_mul_words,.-bn_mul_words
#
# NOTE: The following label name should be changed to
@@ -1973,7 +1997,10 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .bn_mul_add_words,.-.bn_mul_add_words
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;

File diff suppressed because it is too large Load Diff

1898
crypto/bn/asm/rsaz-avx2.pl Executable file

File diff suppressed because it is too large Load Diff

2144
crypto/bn/asm/rsaz-x86_64.pl Executable file

File diff suppressed because it is too large Load Diff

221
crypto/bn/asm/s390x-gf2m.pl Normal file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. And indeed, the module delivers 55%-90%(*) improvement
# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
# even higher, for example on z990 it was measured 80%-150%. ECDSA
# sign is modest 9%-12% faster. Keep in mind that these coefficients
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -32,6 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
@@ -60,34 +87,44 @@ $code.=<<___;
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,164($sp) # pull $num
sla $num,3 # $num to enumerate bytes
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
stg %r2,16($sp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
cghi $num,128 #
bhr %r14 # if($num>128) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
stmg %r3,%r15,24($sp)
lghi $rp,-160-8 # leave room for carry bit
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
stg %r0,0($sp) # back chain
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
@@ -95,6 +132,7 @@ bn_mul_mont:
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -106,12 +144,14 @@ bn_mul_mont:
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -119,22 +159,24 @@ bn_mul_mont:
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,160-8($j,$sp) # tp[j-1]=
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,160-8($j,$sp)
stg $AHI,160($j,$sp)
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,160($sp) # +=tp[0]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
@@ -142,6 +184,7 @@ bn_mul_mont:
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -153,14 +196,16 @@ bn_mul_mont:
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,160($j,$sp)# +=tp[j]
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -168,31 +213,33 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,160-8($j,$sp) # tp[j-1]=
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,160-8($j,$sp)
stg $AHI,160($j,$sp)
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
clg $bp,160+8+32($j,$sp) # compare to &bp[num]
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
lg $rp,160+8+16($j,$sp) # reincarnate rp
la $ap,160($sp)
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
@@ -207,19 +254,24 @@ bn_mul_mont:
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,160($j,$sp) # zap tp
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,160+8+48($j,$sp)
lmg %r6,%r15,0(%r1)
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

1222
crypto/bn/asm/sparct4-mont.pl Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for all SPARCv9 processors and one for VIS3-capable ones. Former
# delivers ~25-45% more, more for longer keys, heaviest DH and DSA
# verify operations on venerable UltraSPARC II. On T4 VIS3 code is
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.
$locals=16*8;
$tab="%l0";
@T=("%g2","%g3");
@i=("%g4","%g5");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl bn_GF2m_mul_2x2
.align 16
bn_GF2m_mul_2x2:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
andcc %g1, SPARCV9_VIS3, %g0
bz,pn %icc,.Lsoftware
nop
sllx %o1, 32, %o1
sllx %o3, 32, %o3
or %o2, %o1, %o1
or %o4, %o3, %o3
.word 0x95b262ab ! xmulx %o1, %o3, %o2
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
srlx %o2, 32, %o1 ! 13 cycles later
st %o2, [%o0+0]
st %o1, [%o0+4]
srlx %o4, 32, %o3
st %o4, [%o0+8]
retl
st %o3, [%o0+12]
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
sllx %i3,32,$b
or %i2,$a,$a
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
sllx $a,1,$a2
srax $a4,63,@i[1] ! broadcast 61st bit
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
srlx $a48,2,$a48
srax $a2,63,@i[0] ! broadcast 62nd bit
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
srax $a1,63,$lo ! broadcast 63rd bit
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
sllx $a1,3,$a8
and $b,$lo,$lo
and $b,@i[0],@i[0]
and $b,@i[1],@i[1]
stx %g0,[$tab+0*8] ! tab[0]=0
xor $a1,$a2,$a12
stx $a1,[$tab+1*8] ! tab[1]=a1
stx $a2,[$tab+2*8] ! tab[2]=a2
xor $a4,$a8,$a48
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
xor $a4,$a1,$a1
stx $a4,[$tab+4*8] ! tab[4]=a4
xor $a4,$a2,$a2
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
xor $a4,$a12,$a12
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
xor $a48,$a1,$a1
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
xor $a48,$a2,$a2
stx $a8,[$tab+8*8] ! tab[8]=a8
xor $a48,$a12,$a12
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
xor $a4,$a1,$a1
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
xor $a4,$a2,$a2
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
xor $a4,$a12,$a12
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
srlx $lo,1,$hi
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
sllx $lo,63,$lo
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
srlx @i[0],2,@T[0]
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
sllx @i[0],62,$a1
sllx $b,3,@i[0]
srlx @i[1],3,@T[1]
and @i[0],`0xf<<3`,@i[0]
sllx @i[1],61,$a2
ldx [$tab+@i[0]],@i[0]
srlx $b,4-3,@i[1]
xor @T[0],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
xor $a1,$lo,$lo
ldx [$tab+@i[1]],@i[1]
xor @T[1],$hi,$hi
xor @i[0],$lo,$lo
srlx $b,8-3,@i[0]
xor $a2,$lo,$lo
and @i[0],`0xf<<3`,@i[0]
___
for($n=1;$n<14;$n++) {
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
srlx $b,`($n+2)*4`-3,@i[1]
xor @T[1],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
sllx @i[0],`($n+1)*4`,@T[0]
xor @T[1],$hi,$hi
srlx @i[0],`64-($n+1)*4`,@T[1]
xor @T[0],$lo,$lo
xor @T[1],$hi,$hi
srlx $lo,32,%i1
st $lo,[%i0+0]
st %i1,[%i0+4]
srlx $hi,32,%i2
st $hi,[%i0+8]
st %i2,[%i0+12]
ret
restore
.type bn_GF2m_mul_2x2,#function
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

373
crypto/bn/asm/vis3-mont.pl Normal file
View File

@@ -0,0 +1,373 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
___
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # BN_ULONG *rp,
$ap="%o1"; # const BN_ULONG *ap,
$bp="%o2"; # const BN_ULONG *bp,
$np="%o3"; # const BN_ULONG *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is even
# and >=6
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0] |
# +-------------------------------+
# . .
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 ap[1..0] | converted ap[]
# +-------------------------------+
# | __int64 np[1..0] | converted np[]
# +-------------------------------+
# | __int64 ap[3..2] |
# . .
# . .
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp
ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0
ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj
ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj
sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! copy or in-place refresh
ld [$ap+0], $t2
ld [$ap+4], $t3
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

313
crypto/bn/asm/x86-gf2m.pl Normal file
View File

@@ -0,0 +1,313 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&sar (@i[1],31); # broardcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -527,8 +527,10 @@ $sbit=$num;
&jle (&label("sqradd"));
&mov ($carry,"edx");
&lea ("edx",&DWP(0,$sbit,"edx",2));
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
sub \$128+8,%rsp
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # boardcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a1^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
ret
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
mov OPENSSL_ia32cap_P(%rip),%rax
bt \$33,%rax
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
mov %r13,8*11(%rsp)
mov %r12,8*12(%rsp)
mov %rbp,8*13(%rsp)
mov %rbx,8*14(%rsp)
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
mov 8*11(%rsp),%r13
mov 8*12(%rsp),%r12
mov 8*13(%rsp),%rbp
mov 8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
ret
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
.Lin_prologue:
lea 8*17(%rax),%rax
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

3519
crypto/bn/asm/x86_64-mont5.pl Executable file

File diff suppressed because it is too large Load Diff