mirror of
https://github.com/guanzhi/GmSSL.git
synced 2026-06-24 06:03:40 +08:00
Merge remote-tracking branch 'origin/master'
# Conflicts: # README.md
This commit is contained in:
1865
crypto/ec/asm/ecp_nistz256-armv4.pl
Executable file
1865
crypto/ec/asm/ecp_nistz256-armv4.pl
Executable file
File diff suppressed because it is too large
Load Diff
1558
crypto/ec/asm/ecp_nistz256-armv8.pl
Normal file
1558
crypto/ec/asm/ecp_nistz256-armv8.pl
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env perl
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
##############################################################################
|
||||
# #
|
||||
@@ -149,7 +156,7 @@ $code.=<<___;
|
||||
___
|
||||
|
||||
{
|
||||
# This function recieves a pointer to an array of four affine points
|
||||
# This function receives a pointer to an array of four affine points
|
||||
# (X, Y, <1>) and rearanges the data for AVX2 execution, while
|
||||
# converting it to 2^29 radix redundant form
|
||||
|
||||
@@ -301,7 +308,7 @@ ___
|
||||
}
|
||||
{
|
||||
################################################################################
|
||||
# This function recieves a pointer to an array of four AVX2 formatted points
|
||||
# This function receives a pointer to an array of four AVX2 formatted points
|
||||
# (X, Y, Z) convert the data to normal representation, and rearanges the data
|
||||
|
||||
my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
|
||||
@@ -1909,7 +1916,7 @@ ___
|
||||
}
|
||||
{
|
||||
################################################################################
|
||||
# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
|
||||
# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
|
||||
# int index0, int index1, int index2, int index3);
|
||||
################################################################################
|
||||
|
||||
@@ -1919,10 +1926,10 @@ my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
|
||||
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
|
||||
|
||||
$code.=<<___;
|
||||
.globl ecp_nistz256_avx2_multi_select_w7
|
||||
.type ecp_nistz256_avx2_multi_select_w7,\@function,6
|
||||
.globl ecp_nistz256_avx2_multi_gather_w7
|
||||
.type ecp_nistz256_avx2_multi_gather_w7,\@function,6
|
||||
.align 32
|
||||
ecp_nistz256_avx2_multi_select_w7:
|
||||
ecp_nistz256_avx2_multi_gather_w7:
|
||||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
@@ -2036,7 +2043,7 @@ $code.=<<___ if ($win64);
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
|
||||
.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
|
||||
|
||||
.extern OPENSSL_ia32cap_P
|
||||
.globl ecp_nistz_avx2_eligible
|
||||
@@ -2061,8 +2068,8 @@ $code.=<<___;
|
||||
.globl ecp_nistz256_avx2_to_mont
|
||||
.globl ecp_nistz256_avx2_from_mont
|
||||
.globl ecp_nistz256_avx2_set1
|
||||
.globl ecp_nistz256_avx2_multi_select_w7
|
||||
.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
|
||||
.globl ecp_nistz256_avx2_multi_gather_w7
|
||||
.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
|
||||
ecp_nistz256_avx2_transpose_convert:
|
||||
ecp_nistz256_avx2_convert_transpose_back:
|
||||
ecp_nistz256_avx2_point_add_affine_x4:
|
||||
@@ -2070,10 +2077,10 @@ ecp_nistz256_avx2_point_add_affines_x4:
|
||||
ecp_nistz256_avx2_to_mont:
|
||||
ecp_nistz256_avx2_from_mont:
|
||||
ecp_nistz256_avx2_set1:
|
||||
ecp_nistz256_avx2_multi_select_w7:
|
||||
ecp_nistz256_avx2_multi_gather_w7:
|
||||
.byte 0x0f,0x0b # ud2
|
||||
ret
|
||||
.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
|
||||
.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
|
||||
|
||||
.globl ecp_nistz_avx2_eligible
|
||||
.type ecp_nistz_avx2_eligible,\@abi-omnipotent
|
||||
|
||||
3061
crypto/ec/asm/ecp_nistz256-sparcv9.pl
Executable file
3061
crypto/ec/asm/ecp_nistz256-sparcv9.pl
Executable file
File diff suppressed because it is too large
Load Diff
1866
crypto/ec/asm/ecp_nistz256-x86.pl
Executable file
1866
crypto/ec/asm/ecp_nistz256-x86.pl
Executable file
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env perl
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
##############################################################################
|
||||
# #
|
||||
@@ -60,7 +67,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
|
||||
@@ -81,7 +88,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
|
||||
$addx = ($1>=12);
|
||||
}
|
||||
|
||||
if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
|
||||
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
|
||||
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
|
||||
$avx = ($ver>=3.0) + ($ver>=3.01);
|
||||
$addx = ($ver>=3.03);
|
||||
@@ -128,6 +135,7 @@ ecp_nistz256_mul_by_2:
|
||||
push %r13
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4,$t4
|
||||
mov 8*1($a_ptr), $a1
|
||||
add $a0, $a0 # a0:a3+a0:a3
|
||||
mov 8*2($a_ptr), $a2
|
||||
@@ -138,7 +146,7 @@ ecp_nistz256_mul_by_2:
|
||||
adc $a2, $a2
|
||||
adc $a3, $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub 8*0($a_ptr), $a0
|
||||
mov $a2, $t2
|
||||
@@ -146,14 +154,14 @@ ecp_nistz256_mul_by_2:
|
||||
sbb 8*2($a_ptr), $a2
|
||||
mov $a3, $t3
|
||||
sbb 8*3($a_ptr), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -250,12 +258,12 @@ ecp_nistz256_mul_by_3:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb .Lpoly+8*3(%rip), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovz $t2, $a2
|
||||
cmovz $t3, $a3
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
cmovc $t2, $a2
|
||||
cmovc $t3, $a3
|
||||
|
||||
xor $t4, $t4
|
||||
add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
|
||||
@@ -272,14 +280,14 @@ ecp_nistz256_mul_by_3:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb .Lpoly+8*3(%rip), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -318,14 +326,14 @@ ecp_nistz256_add:
|
||||
sbb 8*2($a_ptr), $a2
|
||||
mov $a3, $t3
|
||||
sbb 8*3($a_ptr), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -1370,20 +1378,44 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
|
||||
|
||||
$code.=<<___;
|
||||
################################################################################
|
||||
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_select_w5
|
||||
.type ecp_nistz256_select_w5,\@abi-omnipotent
|
||||
# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_scatter_w5
|
||||
.type ecp_nistz256_scatter_w5,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_select_w5:
|
||||
ecp_nistz256_scatter_w5:
|
||||
lea -3($index,$index,2), $index
|
||||
movdqa 0x00($in_t), %xmm0
|
||||
shl \$5, $index
|
||||
movdqa 0x10($in_t), %xmm1
|
||||
movdqa 0x20($in_t), %xmm2
|
||||
movdqa 0x30($in_t), %xmm3
|
||||
movdqa 0x40($in_t), %xmm4
|
||||
movdqa 0x50($in_t), %xmm5
|
||||
movdqa %xmm0, 0x00($val,$index)
|
||||
movdqa %xmm1, 0x10($val,$index)
|
||||
movdqa %xmm2, 0x20($val,$index)
|
||||
movdqa %xmm3, 0x30($val,$index)
|
||||
movdqa %xmm4, 0x40($val,$index)
|
||||
movdqa %xmm5, 0x50($val,$index)
|
||||
|
||||
ret
|
||||
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
|
||||
|
||||
################################################################################
|
||||
# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_gather_w5
|
||||
.type ecp_nistz256_gather_w5,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_gather_w5:
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
mov OPENSSL_ia32cap_P+8(%rip), %eax
|
||||
test \$`1<<5`, %eax
|
||||
jnz .Lavx2_select_w5
|
||||
jnz .Lavx2_gather_w5
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x88(%rsp), %rax
|
||||
.LSEH_begin_ecp_nistz256_select_w5:
|
||||
.LSEH_begin_ecp_nistz256_gather_w5:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
|
||||
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
|
||||
@@ -1460,27 +1492,46 @@ $code.=<<___ if ($win64);
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_select_w5:
|
||||
.LSEH_end_ecp_nistz256_gather_w5:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
|
||||
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
|
||||
|
||||
################################################################################
|
||||
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_select_w7
|
||||
.type ecp_nistz256_select_w7,\@abi-omnipotent
|
||||
# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_scatter_w7
|
||||
.type ecp_nistz256_scatter_w7,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_select_w7:
|
||||
ecp_nistz256_scatter_w7:
|
||||
movdqu 0x00($in_t), %xmm0
|
||||
shl \$6, $index
|
||||
movdqu 0x10($in_t), %xmm1
|
||||
movdqu 0x20($in_t), %xmm2
|
||||
movdqu 0x30($in_t), %xmm3
|
||||
movdqa %xmm0, 0x00($val,$index)
|
||||
movdqa %xmm1, 0x10($val,$index)
|
||||
movdqa %xmm2, 0x20($val,$index)
|
||||
movdqa %xmm3, 0x30($val,$index)
|
||||
|
||||
ret
|
||||
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
|
||||
|
||||
################################################################################
|
||||
# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_gather_w7
|
||||
.type ecp_nistz256_gather_w7,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_gather_w7:
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
mov OPENSSL_ia32cap_P+8(%rip), %eax
|
||||
test \$`1<<5`, %eax
|
||||
jnz .Lavx2_select_w7
|
||||
jnz .Lavx2_gather_w7
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x88(%rsp), %rax
|
||||
.LSEH_begin_ecp_nistz256_select_w7:
|
||||
.LSEH_begin_ecp_nistz256_gather_w7:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
|
||||
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
|
||||
@@ -1546,11 +1597,11 @@ $code.=<<___ if ($win64);
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_select_w7:
|
||||
.LSEH_end_ecp_nistz256_gather_w7:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
|
||||
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
|
||||
___
|
||||
}
|
||||
if ($avx>1) {
|
||||
@@ -1561,16 +1612,16 @@ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
|
||||
|
||||
$code.=<<___;
|
||||
################################################################################
|
||||
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
|
||||
.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
|
||||
# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
|
||||
.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_avx2_select_w5:
|
||||
.Lavx2_select_w5:
|
||||
ecp_nistz256_avx2_gather_w5:
|
||||
.Lavx2_gather_w5:
|
||||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x88(%rsp), %rax
|
||||
.LSEH_begin_ecp_nistz256_avx2_select_w5:
|
||||
.LSEH_begin_ecp_nistz256_avx2_gather_w5:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
||||
@@ -1648,11 +1699,11 @@ $code.=<<___ if ($win64);
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_avx2_select_w5:
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w5:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
|
||||
.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
|
||||
___
|
||||
}
|
||||
if ($avx>1) {
|
||||
@@ -1665,17 +1716,17 @@ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
|
||||
$code.=<<___;
|
||||
|
||||
################################################################################
|
||||
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_avx2_select_w7
|
||||
.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
|
||||
# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
|
||||
.globl ecp_nistz256_avx2_gather_w7
|
||||
.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
|
||||
.align 32
|
||||
ecp_nistz256_avx2_select_w7:
|
||||
.Lavx2_select_w7:
|
||||
ecp_nistz256_avx2_gather_w7:
|
||||
.Lavx2_gather_w7:
|
||||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x88(%rsp), %rax
|
||||
.LSEH_begin_ecp_nistz256_avx2_select_w7:
|
||||
.LSEH_begin_ecp_nistz256_avx2_gather_w7:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
||||
@@ -1768,21 +1819,21 @@ $code.=<<___ if ($win64);
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_avx2_select_w7:
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w7:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
|
||||
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
|
||||
___
|
||||
} else {
|
||||
$code.=<<___;
|
||||
.globl ecp_nistz256_avx2_select_w7
|
||||
.type ecp_nistz256_avx2_select_w7,\@function,3
|
||||
.globl ecp_nistz256_avx2_gather_w7
|
||||
.type ecp_nistz256_avx2_gather_w7,\@function,3
|
||||
.align 32
|
||||
ecp_nistz256_avx2_select_w7:
|
||||
ecp_nistz256_avx2_gather_w7:
|
||||
.byte 0x0f,0x0b # ud2
|
||||
ret
|
||||
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
|
||||
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
|
||||
___
|
||||
}
|
||||
{{{
|
||||
@@ -1840,13 +1891,14 @@ $code.=<<___;
|
||||
.type __ecp_nistz256_add_toq,\@abi-omnipotent
|
||||
.align 32
|
||||
__ecp_nistz256_add_toq:
|
||||
xor $t4,$t4
|
||||
add 8*0($b_ptr), $a0
|
||||
adc 8*1($b_ptr), $a1
|
||||
mov $a0, $t0
|
||||
adc 8*2($b_ptr), $a2
|
||||
adc 8*3($b_ptr), $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $a0
|
||||
mov $a2, $t2
|
||||
@@ -1854,14 +1906,14 @@ __ecp_nistz256_add_toq:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -1929,13 +1981,14 @@ __ecp_nistz256_subq:
|
||||
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
|
||||
.align 32
|
||||
__ecp_nistz256_mul_by_2q:
|
||||
xor $t4, $t4
|
||||
add $a0, $a0 # a0:a3+a0:a3
|
||||
adc $a1, $a1
|
||||
mov $a0, $t0
|
||||
adc $a2, $a2
|
||||
adc $a3, $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $a0
|
||||
mov $a2, $t2
|
||||
@@ -1943,14 +1996,14 @@ __ecp_nistz256_mul_by_2q:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -2001,6 +2054,7 @@ $code.=<<___;
|
||||
push %r15
|
||||
sub \$32*5+8, %rsp
|
||||
|
||||
.Lpoint_double_shortcut$x:
|
||||
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
|
||||
mov $a_ptr, $b_ptr # backup copy
|
||||
movdqu 0x10($a_ptr), %xmm1
|
||||
@@ -2240,16 +2294,14 @@ $code.=<<___;
|
||||
mov $b_org, $a_ptr # reassign
|
||||
movdqa %xmm0, $in1_x(%rsp)
|
||||
movdqa %xmm1, $in1_x+0x10(%rsp)
|
||||
por %xmm0, %xmm1
|
||||
movdqa %xmm2, $in1_y(%rsp)
|
||||
movdqa %xmm3, $in1_y+0x10(%rsp)
|
||||
por %xmm2, %xmm3
|
||||
movdqa %xmm4, $in1_z(%rsp)
|
||||
movdqa %xmm5, $in1_z+0x10(%rsp)
|
||||
por %xmm1, %xmm3
|
||||
por %xmm4, %xmm5
|
||||
|
||||
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
|
||||
pshufd \$0xb1, %xmm3, %xmm5
|
||||
pshufd \$0xb1, %xmm5, %xmm3
|
||||
movdqu 0x10($a_ptr), %xmm1
|
||||
movdqu 0x20($a_ptr), %xmm2
|
||||
por %xmm3, %xmm5
|
||||
@@ -2261,14 +2313,14 @@ $code.=<<___;
|
||||
movdqa %xmm0, $in2_x(%rsp)
|
||||
pshufd \$0x1e, %xmm5, %xmm4
|
||||
movdqa %xmm1, $in2_x+0x10(%rsp)
|
||||
por %xmm0, %xmm1
|
||||
movq $r_ptr, %xmm0 # save $r_ptr
|
||||
movdqu 0x40($a_ptr),%xmm0 # in2_z again
|
||||
movdqu 0x50($a_ptr),%xmm1
|
||||
movdqa %xmm2, $in2_y(%rsp)
|
||||
movdqa %xmm3, $in2_y+0x10(%rsp)
|
||||
por %xmm2, %xmm3
|
||||
por %xmm4, %xmm5
|
||||
pxor %xmm4, %xmm4
|
||||
por %xmm1, %xmm3
|
||||
por %xmm0, %xmm1
|
||||
movq $r_ptr, %xmm0 # save $r_ptr
|
||||
|
||||
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
|
||||
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
|
||||
@@ -2279,8 +2331,8 @@ $code.=<<___;
|
||||
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
|
||||
|
||||
pcmpeqd %xmm4, %xmm5
|
||||
pshufd \$0xb1, %xmm3, %xmm4
|
||||
por %xmm3, %xmm4
|
||||
pshufd \$0xb1, %xmm1, %xmm4
|
||||
por %xmm1, %xmm4
|
||||
pshufd \$0, %xmm5, %xmm5 # in1infty
|
||||
pshufd \$0x1e, %xmm4, %xmm3
|
||||
por %xmm3, %xmm4
|
||||
@@ -2291,6 +2343,7 @@ $code.=<<___;
|
||||
mov 0x40+8*1($b_ptr), $acc6
|
||||
mov 0x40+8*2($b_ptr), $acc7
|
||||
mov 0x40+8*3($b_ptr), $acc0
|
||||
movq $b_ptr, %xmm1
|
||||
|
||||
lea 0x40-$bias($b_ptr), $a_ptr
|
||||
lea $Z1sqr(%rsp), $r_ptr # Z1^2
|
||||
@@ -2346,7 +2399,7 @@ $code.=<<___;
|
||||
test $acc0, $acc0
|
||||
jnz .Ladd_proceed$x # (in1infty || in2infty)?
|
||||
test $acc1, $acc1
|
||||
jz .Ladd_proceed$x # is_equal(S1,S2)?
|
||||
jz .Ladd_double$x # is_equal(S1,S2)?
|
||||
|
||||
movq %xmm0, $r_ptr # restore $r_ptr
|
||||
pxor %xmm0, %xmm0
|
||||
@@ -2358,6 +2411,13 @@ $code.=<<___;
|
||||
movdqu %xmm0, 0x50($r_ptr)
|
||||
jmp .Ladd_done$x
|
||||
|
||||
.align 32
|
||||
.Ladd_double$x:
|
||||
movq %xmm1, $a_ptr # restore $a_ptr
|
||||
movq %xmm0, $r_ptr # restore $r_ptr
|
||||
add \$`32*(18-5)`, %rsp # difference in frame sizes
|
||||
jmp .Lpoint_double_shortcut$x
|
||||
|
||||
.align 32
|
||||
.Ladd_proceed$x:
|
||||
`&load_for_sqr("$R(%rsp)", "$src0")`
|
||||
@@ -2396,6 +2456,7 @@ $code.=<<___;
|
||||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
||||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
||||
|
||||
xor $t4, $t4
|
||||
add $acc0, $acc0 # a0:a3+a0:a3
|
||||
lea $Rsqr(%rsp), $a_ptr
|
||||
adc $acc1, $acc1
|
||||
@@ -2403,7 +2464,7 @@ $code.=<<___;
|
||||
adc $acc2, $acc2
|
||||
adc $acc3, $acc3
|
||||
mov $acc1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $acc0
|
||||
mov $acc2, $t2
|
||||
@@ -2411,15 +2472,15 @@ $code.=<<___;
|
||||
sbb \$0, $acc2
|
||||
mov $acc3, $t3
|
||||
sbb $poly3, $acc3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $acc0
|
||||
cmovc $t0, $acc0
|
||||
mov 8*0($a_ptr), $t0
|
||||
cmovz $t1, $acc1
|
||||
cmovc $t1, $acc1
|
||||
mov 8*1($a_ptr), $t1
|
||||
cmovz $t2, $acc2
|
||||
cmovc $t2, $acc2
|
||||
mov 8*2($a_ptr), $t2
|
||||
cmovz $t3, $acc3
|
||||
cmovc $t3, $acc3
|
||||
mov 8*3($a_ptr), $t3
|
||||
|
||||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
||||
@@ -2603,16 +2664,14 @@ $code.=<<___;
|
||||
mov 0x40+8*3($a_ptr), $acc0
|
||||
movdqa %xmm0, $in1_x(%rsp)
|
||||
movdqa %xmm1, $in1_x+0x10(%rsp)
|
||||
por %xmm0, %xmm1
|
||||
movdqa %xmm2, $in1_y(%rsp)
|
||||
movdqa %xmm3, $in1_y+0x10(%rsp)
|
||||
por %xmm2, %xmm3
|
||||
movdqa %xmm4, $in1_z(%rsp)
|
||||
movdqa %xmm5, $in1_z+0x10(%rsp)
|
||||
por %xmm1, %xmm3
|
||||
por %xmm4, %xmm5
|
||||
|
||||
movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
|
||||
pshufd \$0xb1, %xmm3, %xmm5
|
||||
pshufd \$0xb1, %xmm5, %xmm3
|
||||
movdqu 0x10($b_ptr), %xmm1
|
||||
movdqu 0x20($b_ptr), %xmm2
|
||||
por %xmm3, %xmm5
|
||||
@@ -2701,6 +2760,7 @@ $code.=<<___;
|
||||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
||||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
||||
|
||||
xor $t4, $t4
|
||||
add $acc0, $acc0 # a0:a3+a0:a3
|
||||
lea $Rsqr(%rsp), $a_ptr
|
||||
adc $acc1, $acc1
|
||||
@@ -2708,7 +2768,7 @@ $code.=<<___;
|
||||
adc $acc2, $acc2
|
||||
adc $acc3, $acc3
|
||||
mov $acc1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $acc0
|
||||
mov $acc2, $t2
|
||||
@@ -2716,15 +2776,15 @@ $code.=<<___;
|
||||
sbb \$0, $acc2
|
||||
mov $acc3, $t3
|
||||
sbb $poly3, $acc3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $acc0
|
||||
cmovc $t0, $acc0
|
||||
mov 8*0($a_ptr), $t0
|
||||
cmovz $t1, $acc1
|
||||
cmovc $t1, $acc1
|
||||
mov 8*1($a_ptr), $t1
|
||||
cmovz $t2, $acc2
|
||||
cmovc $t2, $acc2
|
||||
mov 8*2($a_ptr), $t2
|
||||
cmovz $t3, $acc3
|
||||
cmovc $t3, $acc3
|
||||
mov 8*3($a_ptr), $t3
|
||||
|
||||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
||||
@@ -2876,14 +2936,14 @@ __ecp_nistz256_add_tox:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
sbb \$0, $t4
|
||||
|
||||
bt \$0, $t4
|
||||
cmovnc $t0, $a0
|
||||
cmovnc $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovnc $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovnc $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -2971,14 +3031,14 @@ __ecp_nistz256_mul_by_2x:
|
||||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
sbb \$0, $t4
|
||||
|
||||
bt \$0, $t4
|
||||
cmovnc $t0, $a0
|
||||
cmovnc $t1, $a1
|
||||
cmovc $t0, $a0
|
||||
cmovc $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovnc $t2, $a2
|
||||
cmovc $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovnc $t3, $a3
|
||||
cmovc $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
@@ -2992,6 +3052,36 @@ ___
|
||||
}
|
||||
}}}
|
||||
|
||||
########################################################################
|
||||
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
|
||||
#
|
||||
open TABLE,"<ecp_nistz256_table.c" or
|
||||
open TABLE,"<${dir}../ecp_nistz256_table.c" or
|
||||
die "failed to open ecp_nistz256_table.c:",$!;
|
||||
|
||||
use integer;
|
||||
|
||||
foreach(<TABLE>) {
|
||||
s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
|
||||
}
|
||||
close TABLE;
|
||||
|
||||
die "insane number of elements" if ($#arr != 64*16*37-1);
|
||||
|
||||
print <<___;
|
||||
.text
|
||||
.globl ecp_nistz256_precomputed
|
||||
.type ecp_nistz256_precomputed,\@object
|
||||
.align 4096
|
||||
ecp_nistz256_precomputed:
|
||||
___
|
||||
while (@line=splice(@arr,0,16)) {
|
||||
print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
|
||||
}
|
||||
print <<___;
|
||||
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
|
||||
Reference in New Issue
Block a user