Merge remote-tracking branch 'origin/master'

# Conflicts:
#	README.md
This commit is contained in:
Zhi Guan
2017-02-14 16:12:29 +08:00
parent d2254170b8
commit 43fed1108d
3503 changed files with 320546 additions and 408546 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
##############################################################################
# #
@@ -149,7 +156,7 @@ $code.=<<___;
___
{
# This function recieves a pointer to an array of four affine points
# This function receives a pointer to an array of four affine points
# (X, Y, <1>) and rearanges the data for AVX2 execution, while
# converting it to 2^29 radix redundant form
@@ -301,7 +308,7 @@ ___
}
{
################################################################################
# This function recieves a pointer to an array of four AVX2 formatted points
# This function receives a pointer to an array of four AVX2 formatted points
# (X, Y, Z) convert the data to normal representation, and rearanges the data
my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
@@ -1909,7 +1916,7 @@ ___
}
{
################################################################################
# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
# int index0, int index1, int index2, int index3);
################################################################################
@@ -1919,10 +1926,10 @@ my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
$code.=<<___;
.globl ecp_nistz256_avx2_multi_select_w7
.type ecp_nistz256_avx2_multi_select_w7,\@function,6
.globl ecp_nistz256_avx2_multi_gather_w7
.type ecp_nistz256_avx2_multi_gather_w7,\@function,6
.align 32
ecp_nistz256_avx2_multi_select_w7:
ecp_nistz256_avx2_multi_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
@@ -2036,7 +2043,7 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
ret
.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
.extern OPENSSL_ia32cap_P
.globl ecp_nistz_avx2_eligible
@@ -2061,8 +2068,8 @@ $code.=<<___;
.globl ecp_nistz256_avx2_to_mont
.globl ecp_nistz256_avx2_from_mont
.globl ecp_nistz256_avx2_set1
.globl ecp_nistz256_avx2_multi_select_w7
.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
.globl ecp_nistz256_avx2_multi_gather_w7
.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
@@ -2070,10 +2077,10 @@ ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
ecp_nistz256_avx2_multi_select_w7:
ecp_nistz256_avx2_multi_gather_w7:
.byte 0x0f,0x0b # ud2
ret
.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
.globl ecp_nistz_avx2_eligible
.type ecp_nistz_avx2_eligible,\@abi-omnipotent

File diff suppressed because it is too large Load Diff

1866
crypto/ec/asm/ecp_nistz256-x86.pl Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
##############################################################################
# #
@@ -60,7 +67,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -81,7 +88,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$avx = ($ver>=3.0) + ($ver>=3.01);
$addx = ($ver>=3.03);
@@ -128,6 +135,7 @@ ecp_nistz256_mul_by_2:
push %r13
mov 8*0($a_ptr), $a0
xor $t4,$t4
mov 8*1($a_ptr), $a1
add $a0, $a0 # a0:a3+a0:a3
mov 8*2($a_ptr), $a2
@@ -138,7 +146,7 @@ ecp_nistz256_mul_by_2:
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
sbb $t4, $t4
adc \$0, $t4
sub 8*0($a_ptr), $a0
mov $a2, $t2
@@ -146,14 +154,14 @@ ecp_nistz256_mul_by_2:
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovz $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovz $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -250,12 +258,12 @@ ecp_nistz256_mul_by_3:
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovz $t2, $a2
cmovz $t3, $a3
cmovc $t0, $a0
cmovc $t1, $a1
cmovc $t2, $a2
cmovc $t3, $a3
xor $t4, $t4
add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
@@ -272,14 +280,14 @@ ecp_nistz256_mul_by_3:
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovz $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovz $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -318,14 +326,14 @@ ecp_nistz256_add:
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovz $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovz $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -1370,20 +1378,44 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w5
.type ecp_nistz256_select_w5,\@abi-omnipotent
# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_scatter_w5
.type ecp_nistz256_scatter_w5,\@abi-omnipotent
.align 32
ecp_nistz256_select_w5:
ecp_nistz256_scatter_w5:
lea -3($index,$index,2), $index
movdqa 0x00($in_t), %xmm0
shl \$5, $index
movdqa 0x10($in_t), %xmm1
movdqa 0x20($in_t), %xmm2
movdqa 0x30($in_t), %xmm3
movdqa 0x40($in_t), %xmm4
movdqa 0x50($in_t), %xmm5
movdqa %xmm0, 0x00($val,$index)
movdqa %xmm1, 0x10($val,$index)
movdqa %xmm2, 0x20($val,$index)
movdqa %xmm3, 0x30($val,$index)
movdqa %xmm4, 0x40($val,$index)
movdqa %xmm5, 0x50($val,$index)
ret
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
################################################################################
# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_gather_w5
.type ecp_nistz256_gather_w5,\@abi-omnipotent
.align 32
ecp_nistz256_gather_w5:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
jnz .Lavx2_select_w5
jnz .Lavx2_gather_w5
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
.LSEH_begin_ecp_nistz256_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
@@ -1460,27 +1492,46 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
.LSEH_end_ecp_nistz256_gather_w5:
___
$code.=<<___;
ret
.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w7
.type ecp_nistz256_select_w7,\@abi-omnipotent
# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_scatter_w7
.type ecp_nistz256_scatter_w7,\@abi-omnipotent
.align 32
ecp_nistz256_select_w7:
ecp_nistz256_scatter_w7:
movdqu 0x00($in_t), %xmm0
shl \$6, $index
movdqu 0x10($in_t), %xmm1
movdqu 0x20($in_t), %xmm2
movdqu 0x30($in_t), %xmm3
movdqa %xmm0, 0x00($val,$index)
movdqa %xmm1, 0x10($val,$index)
movdqa %xmm2, 0x20($val,$index)
movdqa %xmm3, 0x30($val,$index)
ret
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
################################################################################
# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_gather_w7
.type ecp_nistz256_gather_w7,\@abi-omnipotent
.align 32
ecp_nistz256_gather_w7:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
jnz .Lavx2_select_w7
jnz .Lavx2_gather_w7
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
.LSEH_begin_ecp_nistz256_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
@@ -1546,11 +1597,11 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
.LSEH_end_ecp_nistz256_gather_w7:
___
$code.=<<___;
ret
.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if ($avx>1) {
@@ -1561,16 +1612,16 @@ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
.align 32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
ecp_nistz256_avx2_gather_w5:
.Lavx2_gather_w5:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w5:
.LSEH_begin_ecp_nistz256_avx2_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
@@ -1648,11 +1699,11 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w5:
.LSEH_end_ecp_nistz256_avx2_gather_w5:
___
$code.=<<___;
ret
.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
___
}
if ($avx>1) {
@@ -1665,17 +1716,17 @@ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_avx2_select_w7
.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_avx2_gather_w7
.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
.align 32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
ecp_nistz256_avx2_gather_w7:
.Lavx2_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w7:
.LSEH_begin_ecp_nistz256_avx2_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
@@ -1768,21 +1819,21 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w7:
.LSEH_end_ecp_nistz256_avx2_gather_w7:
___
$code.=<<___;
ret
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
} else {
$code.=<<___;
.globl ecp_nistz256_avx2_select_w7
.type ecp_nistz256_avx2_select_w7,\@function,3
.globl ecp_nistz256_avx2_gather_w7
.type ecp_nistz256_avx2_gather_w7,\@function,3
.align 32
ecp_nistz256_avx2_select_w7:
ecp_nistz256_avx2_gather_w7:
.byte 0x0f,0x0b # ud2
ret
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
}
{{{
@@ -1840,13 +1891,14 @@ $code.=<<___;
.type __ecp_nistz256_add_toq,\@abi-omnipotent
.align 32
__ecp_nistz256_add_toq:
xor $t4,$t4
add 8*0($b_ptr), $a0
adc 8*1($b_ptr), $a1
mov $a0, $t0
adc 8*2($b_ptr), $a2
adc 8*3($b_ptr), $a3
mov $a1, $t1
sbb $t4, $t4
adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
@@ -1854,14 +1906,14 @@ __ecp_nistz256_add_toq:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovz $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovz $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -1929,13 +1981,14 @@ __ecp_nistz256_subq:
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_by_2q:
xor $t4, $t4
add $a0, $a0 # a0:a3+a0:a3
adc $a1, $a1
mov $a0, $t0
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
sbb $t4, $t4
adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
@@ -1943,14 +1996,14 @@ __ecp_nistz256_mul_by_2q:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $a0
cmovz $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovz $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovz $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -2001,6 +2054,7 @@ $code.=<<___;
push %r15
sub \$32*5+8, %rsp
.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
mov $a_ptr, $b_ptr # backup copy
movdqu 0x10($a_ptr), %xmm1
@@ -2240,16 +2294,14 @@ $code.=<<___;
mov $b_org, $a_ptr # reassign
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
por %xmm0, %xmm1
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
por %xmm2, %xmm3
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
por %xmm1, %xmm3
por %xmm4, %xmm5
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
pshufd \$0xb1, %xmm3, %xmm5
pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($a_ptr), %xmm1
movdqu 0x20($a_ptr), %xmm2
por %xmm3, %xmm5
@@ -2261,14 +2313,14 @@ $code.=<<___;
movdqa %xmm0, $in2_x(%rsp)
pshufd \$0x1e, %xmm5, %xmm4
movdqa %xmm1, $in2_x+0x10(%rsp)
por %xmm0, %xmm1
movq $r_ptr, %xmm0 # save $r_ptr
movdqu 0x40($a_ptr),%xmm0 # in2_z again
movdqu 0x50($a_ptr),%xmm1
movdqa %xmm2, $in2_y(%rsp)
movdqa %xmm3, $in2_y+0x10(%rsp)
por %xmm2, %xmm3
por %xmm4, %xmm5
pxor %xmm4, %xmm4
por %xmm1, %xmm3
por %xmm0, %xmm1
movq $r_ptr, %xmm0 # save $r_ptr
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
@@ -2279,8 +2331,8 @@ $code.=<<___;
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
pcmpeqd %xmm4, %xmm5
pshufd \$0xb1, %xmm3, %xmm4
por %xmm3, %xmm4
pshufd \$0xb1, %xmm1, %xmm4
por %xmm1, %xmm4
pshufd \$0, %xmm5, %xmm5 # in1infty
pshufd \$0x1e, %xmm4, %xmm3
por %xmm3, %xmm4
@@ -2291,6 +2343,7 @@ $code.=<<___;
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
@@ -2346,7 +2399,7 @@ $code.=<<___;
test $acc0, $acc0
jnz .Ladd_proceed$x # (in1infty || in2infty)?
test $acc1, $acc1
jz .Ladd_proceed$x # is_equal(S1,S2)?
jz .Ladd_double$x # is_equal(S1,S2)?
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
@@ -2358,6 +2411,13 @@ $code.=<<___;
movdqu %xmm0, 0x50($r_ptr)
jmp .Ladd_done$x
.align 32
.Ladd_double$x:
movq %xmm1, $a_ptr # restore $a_ptr
movq %xmm0, $r_ptr # restore $r_ptr
add \$`32*(18-5)`, %rsp # difference in frame sizes
jmp .Lpoint_double_shortcut$x
.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
@@ -2396,6 +2456,7 @@ $code.=<<___;
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
@@ -2403,7 +2464,7 @@ $code.=<<___;
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
sbb $t4, $t4
adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
@@ -2411,15 +2472,15 @@ $code.=<<___;
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $acc0
cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
cmovz $t1, $acc1
cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
cmovz $t2, $acc2
cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
cmovz $t3, $acc3
cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
@@ -2603,16 +2664,14 @@ $code.=<<___;
mov 0x40+8*3($a_ptr), $acc0
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
por %xmm0, %xmm1
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
por %xmm2, %xmm3
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
por %xmm1, %xmm3
por %xmm4, %xmm5
movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
pshufd \$0xb1, %xmm3, %xmm5
pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($b_ptr), %xmm1
movdqu 0x20($b_ptr), %xmm2
por %xmm3, %xmm5
@@ -2701,6 +2760,7 @@ $code.=<<___;
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
@@ -2708,7 +2768,7 @@ $code.=<<___;
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
sbb $t4, $t4
adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
@@ -2716,15 +2776,15 @@ $code.=<<___;
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
test $t4, $t4
sbb \$0, $t4
cmovz $t0, $acc0
cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
cmovz $t1, $acc1
cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
cmovz $t2, $acc2
cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
cmovz $t3, $acc3
cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
@@ -2876,14 +2936,14 @@ __ecp_nistz256_add_tox:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
sbb \$0, $t4
bt \$0, $t4
cmovnc $t0, $a0
cmovnc $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovnc $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovnc $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -2971,14 +3031,14 @@ __ecp_nistz256_mul_by_2x:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
sbb \$0, $t4
bt \$0, $t4
cmovnc $t0, $a0
cmovnc $t1, $a1
cmovc $t0, $a0
cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
cmovnc $t2, $a2
cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
cmovnc $t3, $a3
cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -2992,6 +3052,36 @@ ___
}
}}}
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;
foreach(<TABLE>) {
s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;
die "insane number of elements" if ($#arr != 64*16*37-1);
print <<___;
.text
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,\@object
.align 4096
ecp_nistz256_precomputed:
___
while (@line=splice(@arr,0,16)) {
print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
}
print <<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;