/*
* Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
#include <gmssl/asm.h>
.text
.align 5
#define neg_p1 0xffffffff
#define neg_p3 0x100000000
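// SM2 prime: p = 2^256 - 2^224 - 2^96 + 2^64 - 1
// Lneg_p holds 2^256 - p = {1, 0xffffffff, 0, 0x100000000} as little-endian
// 64-bit limbs; since p < 2^256 < 2p, this value is also 1 in the
// Montgomery domain (2^256 mod p).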
Lneg_p:
.quad 1, neg_p1, 0, neg_p3
// 2^512 mod p
Lz256_2e512modp:
.quad 0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002
Lone:
.quad 1,0,0,0
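// SM2 group order (little-endian 64-bit limbs):
// n = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123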
Lmodn:
.quad 0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff
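// __sm2_z256_modp_add: r = a + b mod p, branch-free. A sketch of the
// technique in pseudo-C (exposition only, not part of the build):
//
//   (c0, s) = a + b;             // 257-bit sum, carry kept in x1
//   (c1, t) = s + (2^256 - p);   // t = a + b - p mod 2^256
//   r = (c0 | c1) ? t : s;       // csel on the accumulated carry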
.align 4
__sm2_z256_modp_add:
// carry, a = a + b
adds x14,x14,x8
adcs x15,x15,x9
adcs x16,x16,x10
adcs x17,x17,x11
adc x1,xzr,xzr
// carry, b = a + (2^256 - p) = (a + b - p) + 2^256
adds x8,x14,#1
adcs x9,x15,x12
adcs x10,x16,xzr
adcs x11,x17,x13
adc x1,x1,xzr
cmp x1,xzr
// if carry == 0, then a + b < p: return a (which holds a + b)
// else return b = a + b - p
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
csel x17,x17,x11,eq
stp x14,x15,[x0]
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modp_add)
.align 4
func(sm2_z256_modp_add):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
// load a
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
// load b
ldp x8,x9,[x2]
ldp x10,x11,[x2,#16]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_add
ldp x29,x30,[sp],#16
ret
.globl func(sm2_z256_modp_dbl)
.align 4
func(sm2_z256_modp_dbl):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
// load a
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
// b = a
mov x8,x14
mov x9,x15
mov x10,x16
mov x11,x17
// set (2^256 - p)
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_add
ldp x29,x30,[sp],#16
ret
.globl func(sm2_z256_modp_tri)
.align 4
func(sm2_z256_modp_tri):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
// load (2^256 - p)
mov x12,#neg_p1
mov x13,#neg_p3
// b = a
mov x8,x14
mov x9,x15
mov x10,x16
mov x11,x17
// c = a
mov x4,x14
mov x5,x15
mov x6,x16
mov x7,x17
// a = a + b = 2a
bl __sm2_z256_modp_add
// b = c = a
mov x8,x4
mov x9,x5
mov x10,x6
mov x11,x7
// a = a + b = 2a + a = 3a
bl __sm2_z256_modp_add
ldp x29,x30,[sp],#16
ret
// a - b (mod p)
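// Sketch in pseudo-C (exposition only):
//
//   (borrow, d) = a - b;         // borrow kept in x1
//   t = d - (2^256 - p);         // = a - b + p mod 2^256
//   r = borrow ? t : d;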
.align 4
__sm2_z256_modp_sub:
ldp x8,x9,[x2]
ldp x10,x11,[x2,#16]
// a = a - b
subs x14,x14,x8
sbcs x15,x15,x9
sbcs x16,x16,x10
sbcs x17,x17,x11
sbc x1,xzr,xzr
// b = a - (2^256 - p) = a - b + p - 2^256
subs x8,x14,#1
sbcs x9,x15,x12
sbcs x10,x16,xzr
sbcs x11,x17,x13
cmp x1,xzr
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
stp x14,x15,[x0]
csel x17,x17,x11,eq
stp x16,x17,[x0,#16]
ret
// b - a (mod p)
.align 4
__sm2_z256_modp_neg_sub:
ldp x8,x9,[x2]
ldp x10,x11,[x2,#16]
// a = b - a
subs x14,x8,x14
sbcs x15,x9,x15
sbcs x16,x10,x16
sbcs x17,x11,x17
sbc x1,xzr,xzr
// b = a - (2^256 - p) = b - a + p - 2^256
subs x8,x14,#1
sbcs x9,x15,x12
sbcs x10,x16,xzr
sbcs x11,x17,x13
cmp x1,xzr
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
stp x14,x15,[x0]
csel x17,x17,x11,eq
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modp_sub)
.align 4
func(sm2_z256_modp_sub):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_sub
ldp x29,x30,[sp],#16
ret
.globl func(sm2_z256_modp_neg)
.align 4
func(sm2_z256_modp_neg):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x2,x1
mov x14,xzr
mov x15,xzr
mov x16,xzr
mov x17,xzr
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_sub
ldp x29,x30,[sp],#16
ret
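// __sm2_z256_modp_mont_mul: r = a * b * 2^-256 mod p (Montgomery), with
// each product word interleaved with one reduction step. Because
// p = -1 (mod 2^64), the Montgomery quotient is simply the low limb, so
// no mu constant is needed. A sketch in pseudo-C (exposition only), acc
// being a 5-limb accumulator:
//
//   acc = 0;
//   for (i = 0; i < 4; i++) {
//       acc += a * b[i];            // 256x64-bit multiply-accumulate
//       t = acc[0];
//       acc = (acc + t * p) >> 64;  // low limb cancels: t*p = -t (mod 2^64)
//   }
//   r = (acc >= p) ? acc - p : acc;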
.align 4
__sm2_z256_modp_mont_mul:
// a * b0
mul x14,x4,x3 // a[0]*b[0]
umulh x8,x4,x3
mul x15,x5,x3 // a[1]*b[0]
umulh x9,x5,x3
mul x16,x6,x3 // a[2]*b[0]
umulh x10,x6,x3
mul x17,x7,x3 // a[3]*b[0]
umulh x11,x7,x3
ldr x3,[x2,#8] // b[1]
adds x15,x15,x8
adcs x16,x16,x9
adcs x17,x17,x10
adc x19,xzr,x11
mov x20,xzr
lsl x10,x14,#32
lsr x11,x14,#32
// p = 2^256 - 2^224 - 2^96 + 2^64 - 1
// R = 2^64
// p * a0 = (a0 * R^4 + a0 * R^1) - (a0 * 2^224 + a0 * 2^96 + a0)
//
//   [   a4   ][   a3   ][   a2   ][   a1   ][   a0   ]
//   [   a0   ]     0         0    [   a0   ]     0
// - [ a0>>32 ][ a0<<32 ][ a0>>32 ][ a0<<32 ][   a0   ]
//
// x10 = a0 << 32
// x11 = a0 >> 32
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8
mul x8,x4,x3 // lo(a[0]*b[i])
adcs x15,x16,x9
mul x9,x5,x3 // lo(a[1]*b[i])
adcs x16,x17,x10
mul x10,x6,x3 // lo(a[2]*b[i])
adcs x17,x19,x11
mul x11,x7,x3 // lo(a[3]*b[i])
adc x19,x20,xzr
adds x14,x14,x8
umulh x8,x4,x3 // hi(a[0]*b[i])
adcs x15,x15,x9
umulh x9,x5,x3 // hi(a[1]*b[i])
adcs x16,x16,x10
umulh x10,x6,x3 // hi(a[2]*b[i])
adcs x17,x17,x11
umulh x11,x7,x3 // hi(a[3]*b[i])
adc x19,x19,xzr
ldr x3,[x2,#8*(1+1)] // b[1+1]
adds x15,x15,x8 // accumulate high parts of multiplication
adcs x16,x16,x9
adcs x17,x17,x10
adcs x19,x19,x11
adc x20,xzr,xzr
lsl x10,x14,#32
lsr x11,x14,#32
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
mul x8,x4,x3 // lo(a[0]*b[i])
adcs x15,x16,x9
mul x9,x5,x3 // lo(a[1]*b[i])
adcs x16,x17,x10 // +=acc[0]*0xffff0001
mul x10,x6,x3 // lo(a[2]*b[i])
adcs x17,x19,x11
mul x11,x7,x3 // lo(a[3]*b[i])
adc x19,x20,xzr
adds x14,x14,x8 // accumulate low parts of multiplication
umulh x8,x4,x3 // hi(a[0]*b[i])
adcs x15,x15,x9
umulh x9,x5,x3 // hi(a[1]*b[i])
adcs x16,x16,x10
umulh x10,x6,x3 // hi(a[2]*b[i])
adcs x17,x17,x11
umulh x11,x7,x3 // hi(a[3]*b[i])
adc x19,x19,xzr
ldr x3,[x2,#8*(2+1)] // b[2+1]
adds x15,x15,x8 // accumulate high parts of multiplication
adcs x16,x16,x9
adcs x17,x17,x10
adcs x19,x19,x11
adc x20,xzr,xzr
lsl x10,x14,#32 // t0
lsr x11,x14,#32 // t1
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
mul x8,x4,x3 // lo(a[0]*b[i])
adcs x15,x16,x9
mul x9,x5,x3 // lo(a[1]*b[i])
adcs x16,x17,x10 // +=acc[0]*0xffff0001
mul x10,x6,x3 // lo(a[2]*b[i])
adcs x17,x19,x11
mul x11,x7,x3 // lo(a[3]*b[i])
adc x19,x20,xzr
adds x14,x14,x8 // accumulate low parts of multiplication
umulh x8,x4,x3 // hi(a[0]*b[i])
adcs x15,x15,x9
umulh x9,x5,x3 // hi(a[1]*b[i])
adcs x16,x16,x10
umulh x10,x6,x3 // hi(a[2]*b[i])
adcs x17,x17,x11
umulh x11,x7,x3 // hi(a[3]*b[i])
adc x19,x19,xzr
adds x15,x15,x8 // accumulate high parts of multiplication
adcs x16,x16,x9
adcs x17,x17,x10
adcs x19,x19,x11
adc x20,xzr,xzr
lsl x10,x14,#32 // t0
lsr x11,x14,#32 // t1
// last reduction
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
adcs x15,x16,x9
adcs x16,x17,x10 // +=acc[0]*0xffff0001
adcs x17,x19,x11
adc x19,x20,xzr
// if a >= p: return a - p
// else: return a
// carry, b = a + (2^256 - p)
adds x8,x14,#1
adcs x9,x15,x12
adcs x10,x16,xzr
adcs x11,x17,x13
adc x19,x19,xzr
cmp x19,xzr
// if carry == 0: a < p, keep a
// else: return b = a - p
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
csel x17,x17,x11,eq
stp x14,x15,[x0]
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modp_mont_mul)
.align 4
func(sm2_z256_modp_mont_mul):
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
// load a
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// load b0
ldr x3,[x2]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_mont_mul
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
.align 4
__sm2_z256_modp_mont_sqr:
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is 64-bit
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
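// After the full 512-bit square is assembled, four reduction rounds each
// fold the low limb back in (acc = (acc + acc[0]*p) >> 64, using the same
// shift identity as mont_mul); the upper 256 bits are then added and one
// conditional subtraction of p produces the result.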
mul x15,x5,x4 // a[1]*a[0]
umulh x9,x5,x4
mul x16,x6,x4 // a[2]*a[0]
umulh x10,x6,x4
mul x17,x7,x4 // a[3]*a[0]
umulh x19,x7,x4
adds x16,x16,x9 // accumulate high parts of multiplication
mul x8,x6,x5 // a[2]*a[1]
umulh x9,x6,x5
adcs x17,x17,x10
mul x10,x7,x5 // a[3]*a[1]
umulh x11,x7,x5
adc x19,x19,xzr // can't overflow
mul x20,x7,x6 // a[3]*a[2]
umulh x1,x7,x6
adds x9,x9,x10 // accumulate high parts of multiplication
mul x14,x4,x4 // a[0]*a[0]
adc x10,x11,xzr // can't overflow
adds x17,x17,x8 // accumulate low parts of multiplication
umulh x4,x4,x4
adcs x19,x19,x9
mul x9,x5,x5 // a[1]*a[1]
adcs x20,x20,x10
umulh x5,x5,x5
adc x1,x1,xzr // can't overflow
adds x15,x15,x15 // acc[1-6]*=2
mul x10,x6,x6 // a[2]*a[2]
adcs x16,x16,x16
umulh x6,x6,x6
adcs x17,x17,x17
mul x11,x7,x7 // a[3]*a[3]
adcs x19,x19,x19
umulh x7,x7,x7
adcs x20,x20,x20
adcs x1,x1,x1
adc x2,xzr,xzr
adds x15,x15,x4 // +a[i]*a[i]
adcs x16,x16,x9
adcs x17,x17,x5
adcs x19,x19,x10
adcs x20,x20,x6
lsl x10,x14,#32
adcs x1,x1,x11
lsr x11,x14,#32
adc x2,x2,x7
// now x2,x1,x20,x19,x17,x16,x15,x14 hold the 512-bit a^2
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
adcs x15,x16,x9
adcs x16,x17,x10 // +=acc[0]*0xffff0001
adc x17,x11,xzr // can't overflow
lsl x10,x14,#32
lsr x11,x14,#32
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
adcs x15,x16,x9
adcs x16,x17,x10 // +=acc[0]*0xffff0001
adc x17,x11,xzr // can't overflow
lsl x10,x14,#32
lsr x11,x14,#32
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
adcs x15,x16,x9
adcs x16,x17,x10 // +=acc[0]*0xffff0001
adc x17,x11,xzr // can't overflow
lsl x10,x14,#32
lsr x11,x14,#32
subs x8,x14,x10
sbcs x9,xzr,x11
sbcs x10,xzr,x10
sbc x11,x14,x11
adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
adcs x15,x16,x9
adcs x16,x17,x10 // +=acc[0]*0xffff0001
adc x17,x11,xzr // can't overflow
adds x14,x14,x19 // accumulate upper half
adcs x15,x15,x20
adcs x16,x16,x1
adcs x17,x17,x2
adc x19,xzr,xzr
// carry, b = a + (2^256 - p)
adds x8,x14,#1
adcs x9,x15,x12
adcs x10,x16,xzr
adcs x11,x17,x13
adc x19,x19,xzr
cmp x19,xzr
// if carry == 0: a < p, keep a
// else: return b = a - p
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
csel x17,x17,x11,eq
stp x14,x15,[x0]
stp x16,x17,[x0,#16]
// note: x4-x7 are clobbered; the result is in x14-x17 and stored to [x0]
ret
.globl func(sm2_z256_modp_mont_sqr)
.align 4
func(sm2_z256_modp_mont_sqr):
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_mont_sqr
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
// r = r^(2^n), i.e. apply mont_sqr to r in place n times (n in x1).
// __sm2_z256_modp_mont_sqr takes its input in x4,x5,x6,x7 and leaves the
// result in x14,x15,x16,x17 as well as [x0], so the result is copied back
// to x4-x7 between iterations.
.globl func(sm2_z256_modp_mont_esq)
.align 4
func(sm2_z256_modp_mont_esq):
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
ldp x4,x5,[x0]
ldp x6,x7,[x0,#16]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
// move the loop count out of x1, which mont_sqr clobbers (x18 is platform-reserved)
mov x3, x1
22:
// square: input in x4,x5,x6,x7
bl __sm2_z256_modp_mont_sqr
// copy the result back to x4,x5,x6,x7 for the next iteration
mov x4,x14
mov x5,x15
mov x6,x16
mov x7,x17
subs x3, x3, #1
b.ne 22b
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
// mont(a) = a * 2^256 (mod p) = mont_mul(a, 2^512 mod p)
.globl func(sm2_z256_modp_to_mont)
.align 6
func(sm2_z256_modp_to_mont):
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
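// swap args x0,x1 = x1,x0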
mov x3,x1
mov x1,x0
mov x0,x3
adr x2,Lz256_2e512modp
ldr x3,Lz256_2e512modp
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_mont_mul
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
// conversion out of the Montgomery domain reuses mont_mul with b == 1:
// mont(mont(a), 1) = aR * 1 * R^-1 (mod p) = a (mod p)
.globl func(sm2_z256_modp_from_mont)
.align 4
func(sm2_z256_modp_from_mont):
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
// load b = {1,0,0,0}
adr x2,Lone
// load b1 = 1
mov x3,#1
bl __sm2_z256_modp_mont_mul
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
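// __sm2_z256_modp_haf: r = a/2 mod p. If a is odd, a + p is computed
// first (an even, possibly 257-bit value whose top bit is kept in x1),
// then the whole value is shifted right by one.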
.align 4
__sm2_z256_modp_haf:
// a - (2^256 - p) == a + p - 2^256
subs x8,x14,#1
sbcs x9,x15,x12
sbcs x10,x16,xzr
sbcs x11,x17,x13
// (a + p - 2^256) + 2^256
adcs x1,xzr,xzr
// r = (a is even) ? a : (a - (2^256 - p) + 2^256)
tst x14,#1
csel x14,x14,x8,eq
csel x15,x15,x9,eq
csel x16,x16,x10,eq
csel x17,x17,x11,eq
csel x1,xzr,x1,eq
// r = r >> 1
lsr x14,x14,#1
orr x14,x14,x15,lsl#63
lsr x15,x15,#1
orr x15,x15,x16,lsl#63
lsr x16,x16,#1
orr x16,x16,x17,lsl#63
lsr x17,x17,#1
stp x14,x15,[x0]
orr x17,x17,x1,lsl#63
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modp_haf)
.align 4
func(sm2_z256_modp_haf):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
mov x12,#neg_p1
mov x13,#neg_p3
bl __sm2_z256_modp_haf
ldp x29,x30,[sp],#16
ret
.globl func(sm2_z256_point_dbl)
.align 5
func(sm2_z256_point_dbl):
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
sub sp,sp,#32*4 // 4 temporary 256-bit values
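// stack layout: S = sp+0, M = sp+32, Zsqr = sp+64, tmp0 = sp+96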
Ldouble_shortcut:
// a Jacobian point is three 256-bit coordinates, loaded as ldp pairs:
// X: bytes [0,16) and [16,32)
// Y: bytes [32,48) and [48,64)
// Z: bytes [64,80) and [80,96)
// x14-x17 = Y
ldp x14,x15,[x1,#32]
mov x21,x0
ldp x16,x17,[x1,#48]
mov x22,x1
// save x0,x1 into x21,x22 (x21 = out, x22 = in), because x0 and x1 are
// used as scratch by the __ subroutines;
// each __ helper writes its result to [x0]
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
// x8-x11 = x14-x17 = Y
mov x8,x14
mov x9,x15
// x4-x7 = Z (mont_sqr takes its input in x4-x7)
// x22 == x1
ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
mov x10,x16
mov x11,x17
ldp x6,x7,[x22,#64+16]
// S = T[0]
add x0,sp,#0
// 1. S = 2Y
bl __sm2_z256_modp_add // p256_mul_by_2(S, in_y);
// Zsqr = T[2]
add x0,sp,#64
// 2. Zsqr = Z1^2
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Zsqr, in_z);
// x8-x11 = X
ldp x8,x9,[x22]
ldp x10,x11,[x22,#16]
// x4-x7 = x14-x17
mov x4,x14 // put Zsqr aside for p256_sub
mov x5,x15
mov x6,x16
mov x7,x17
// t1 = M
// M = T[1]
add x0,sp,#32
// 6. M = X1 + Zsqr = X1 + Z1^2
bl __sm2_z256_modp_add // p256_add(M, Zsqr, in_x);
add x2,x22,#0
mov x14,x4 // restore Zsqr
mov x15,x5
ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
mov x16,x6
mov x17,x7
ldp x6,x7,[sp,#0+16]
add x0,sp,#64
// 7. Zsqr = X - Z^2
bl __sm2_z256_modp_neg_sub // p256_sub(Zsqr, in_x, Zsqr);
add x0,sp,#0
// 3. S = S^2 = 4*Y1^2
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(S, S);
ldr x3,[x22,#32]
ldp x4,x5,[x22,#64]
ldp x6,x7,[x22,#64+16]
add x2,x22,#32
add x0,sp,#96
// tmp0 = Z*Y
// 4. Z3 = Z1 * Y1
bl __sm2_z256_modp_mont_mul // p256_mul_mont(tmp0, in_z, in_y);
// x8-x11 = tmp0 (so the add below computes 2*tmp0)
mov x8,x14
mov x9,x15
ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
mov x10,x16
mov x11,x17
ldp x6,x7,[sp,#0+16]
add x0,x21,#64
// Z3 = 2YZ
bl __sm2_z256_modp_add // p256_mul_by_2(res_z, tmp0);
add x0,sp,#96
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(tmp0, S);
ldr x3,[sp,#64] // forward load for p256_mul_mont
ldp x4,x5,[sp,#32]
ldp x6,x7,[sp,#32+16]
add x0,x21,#32
bl __sm2_z256_modp_haf // p256_div_by_2(res_y, tmp0);
add x2,sp,#64
add x0,sp,#32
bl __sm2_z256_modp_mont_mul // p256_mul_mont(M, M, Zsqr);
mov x8,x14 // duplicate M
mov x9,x15
mov x10,x16
mov x11,x17
mov x4,x14 // put M aside
mov x5,x15
mov x6,x16
mov x7,x17
add x0,sp,#32
bl __sm2_z256_modp_add
mov x8,x4 // restore M
mov x9,x5
ldr x3,[x22] // forward load for p256_mul_mont
mov x10,x6
ldp x4,x5,[sp,#0]
mov x11,x7
ldp x6,x7,[sp,#0+16]
bl __sm2_z256_modp_add // p256_mul_by_3(M, M);
add x2,x22,#0
add x0,sp,#0
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S, S, in_x);
mov x8,x14
mov x9,x15
ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
mov x10,x16
mov x11,x17
ldp x6,x7,[sp,#32+16]
add x0,sp,#96
bl __sm2_z256_modp_add // p256_mul_by_2(tmp0, S);
add x0,x21,#0 // X
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(res_x, M);
add x2,sp,#96
bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, tmp0);
add x2,sp,#0
add x0,sp,#0
bl __sm2_z256_modp_neg_sub // p256_sub(S, S, res_x);
ldr x3,[sp,#32]
mov x4,x14 // copy S
mov x5,x15
mov x6,x16
mov x7,x17
add x2,sp,#32
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S, S, M);
add x2,x21,#32
add x0,x21,#32 // Y
bl __sm2_z256_modp_sub // p256_sub(res_y, S, res_y);
add sp,x29,#0 // destroy frame
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x29,x30,[sp],#96
ret
.globl func(sm2_z256_point_add)
.align 5
func(sm2_z256_point_add):
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#32*12
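// stack layout (12 temporaries):
// res_x = sp+0, res_y = sp+32, res_z = sp+64, H = sp+96
// Z1sqr/Hsqr = sp+128, R = sp+160, Z2sqr/Rsqr = sp+192, Hcub = sp+224
// U1 = sp+256, U2 = sp+288, S1 = sp+320, S2 = sp+352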
ldp x4,x5,[x2,#64] // in2_z
ldp x6,x7,[x2,#64+16]
mov x21,x0
mov x22,x1
mov x23,x2
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
orr x8,x4,x5
orr x10,x6,x7
orr x25,x8,x10
cmp x25,#0
csetm x25,ne // ~in2infty
add x0,sp,#192
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z2sqr, in2_z);
ldp x4,x5,[x22,#64] // in1_z
ldp x6,x7,[x22,#64+16]
orr x8,x4,x5
orr x10,x6,x7
orr x24,x8,x10
cmp x24,#0
csetm x24,ne // ~in1infty
add x0,sp,#128
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z1sqr, in1_z);
ldr x3,[x23,#64]
ldp x4,x5,[sp,#192]
ldp x6,x7,[sp,#192+16]
add x2,x23,#64
add x0,sp,#320
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S1, Z2sqr, in2_z);
ldr x3,[x22,#64]
ldp x4,x5,[sp,#128]
ldp x6,x7,[sp,#128+16]
add x2,x22,#64
add x0,sp,#352
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, Z1sqr, in1_z);
ldr x3,[x22,#32]
ldp x4,x5,[sp,#320]
ldp x6,x7,[sp,#320+16]
add x2,x22,#32
add x0,sp,#320
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S1, S1, in1_y);
ldr x3,[x23,#32]
ldp x4,x5,[sp,#352]
ldp x6,x7,[sp,#352+16]
add x2,x23,#32
add x0,sp,#352
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S2, in2_y);
add x2,sp,#320
ldr x3,[sp,#192] // forward load for p256_mul_mont
ldp x4,x5,[x22]
ldp x6,x7,[x22,#16]
add x0,sp,#160
bl __sm2_z256_modp_sub // p256_sub(R, S2, S1);
orr x14,x14,x15 // see if result is zero
orr x16,x16,x17
orr x26,x14,x16 // ~is_equal(S1,S2)
add x2,sp,#192
add x0,sp,#256
bl __sm2_z256_modp_mont_mul // p256_mul_mont(U1, in1_x, Z2sqr);
ldr x3,[sp,#128]
ldp x4,x5,[x23]
ldp x6,x7,[x23,#16]
add x2,sp,#128
add x0,sp,#288
bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, in2_x, Z1sqr);
add x2,sp,#256
ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
ldp x6,x7,[sp,#160+16]
add x0,sp,#96
bl __sm2_z256_modp_sub // p256_sub(H, U2, U1);
orr x14,x14,x15 // see if result is zero
orr x16,x16,x17
orr x14,x14,x16 // ~is_equal(U1,U2)
mvn x27,x24 // -1/0 -> 0/-1
mvn x28,x25 // -1/0 -> 0/-1
orr x14,x14,x27
orr x14,x14,x28
orr x14,x14,x26
cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
Ladd_double:
mov x1,x22
mov x0,x21
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
add sp,sp,#32*(12-4) // difference in stack frames
b Ldouble_shortcut
.align 4
Ladd_proceed:
add x0,sp,#192
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Rsqr, R);
ldr x3,[x22,#64]
ldp x4,x5,[sp,#96]
ldp x6,x7,[sp,#96+16]
add x2,x22,#64
add x0,sp,#64
bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, H, in1_z);
ldp x4,x5,[sp,#96]
ldp x6,x7,[sp,#96+16]
add x0,sp,#128
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Hsqr, H);
ldr x3,[x23,#64]
ldp x4,x5,[sp,#64]
ldp x6,x7,[sp,#64+16]
add x2,x23,#64
add x0,sp,#64
bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, res_z, in2_z);
ldr x3,[sp,#96]
ldp x4,x5,[sp,#128]
ldp x6,x7,[sp,#128+16]
add x2,sp,#96
add x0,sp,#224
bl __sm2_z256_modp_mont_mul // p256_mul_mont(Hcub, Hsqr, H);
ldr x3,[sp,#128]
ldp x4,x5,[sp,#256]
ldp x6,x7,[sp,#256+16]
add x2,sp,#128
add x0,sp,#288
bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, U1, Hsqr);
mov x8,x14
mov x9,x15
mov x10,x16
mov x11,x17
add x0,sp,#128
bl __sm2_z256_modp_add // p256_mul_by_2(Hsqr, U2);
add x2,sp,#192
add x0,sp,#0
bl __sm2_z256_modp_neg_sub // p256_sub(res_x, Rsqr, Hsqr);
add x2,sp,#224
bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, Hcub);
add x2,sp,#288
ldr x3,[sp,#224] // forward load for p256_mul_mont
ldp x4,x5,[sp,#320]
ldp x6,x7,[sp,#320+16]
add x0,sp,#32
bl __sm2_z256_modp_neg_sub // p256_sub(res_y, U2, res_x);
add x2,sp,#224
add x0,sp,#352
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S1, Hcub);
ldr x3,[sp,#160]
ldp x4,x5,[sp,#32]
ldp x6,x7,[sp,#32+16]
add x2,sp,#160
add x0,sp,#32
bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_y, res_y, R);
add x2,sp,#352
bl __sm2_z256_modp_sub // p256_sub(res_y, res_y, S2);
ldp x4,x5,[sp,#0] // res
ldp x6,x7,[sp,#0+16]
ldp x8,x9,[x23] // in2
ldp x10,x11,[x23,#16]
ldp x14,x15,[x22,#0] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#0+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
ldp x4,x5,[sp,#0+0+32] // res
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
ldp x6,x7,[sp,#0+0+48]
csel x14,x8,x14,ne
csel x15,x9,x15,ne
ldp x8,x9,[x23,#0+32] // in2
csel x16,x10,x16,ne
csel x17,x11,x17,ne
ldp x10,x11,[x23,#0+48]
stp x14,x15,[x21,#0]
stp x16,x17,[x21,#0+16]
ldp x14,x15,[x22,#32] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#32+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
ldp x4,x5,[sp,#0+32+32] // res
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
ldp x6,x7,[sp,#0+32+48]
csel x14,x8,x14,ne
csel x15,x9,x15,ne
ldp x8,x9,[x23,#32+32] // in2
csel x16,x10,x16,ne
csel x17,x11,x17,ne
ldp x10,x11,[x23,#32+48]
stp x14,x15,[x21,#32]
stp x16,x17,[x21,#32+16]
ldp x14,x15,[x22,#64] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#64+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
csel x14,x8,x14,ne
csel x15,x9,x15,ne
csel x16,x10,x16,ne
csel x17,x11,x17,ne
stp x14,x15,[x21,#64]
stp x16,x17,[x21,#64+16]
Ladd_done:
add sp,x29,#0 // destroy frame
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.globl func(sm2_z256_point_add_affine)
.align 5
func(sm2_z256_point_add_affine):
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
sub sp,sp,#32*10
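// stack layout (10 temporaries):
// res_x = sp+0, res_y = sp+32, res_z = sp+64, U2 = sp+96
// Z1sqr/S2 = sp+128, H = sp+160, R = sp+192, Hsqr = sp+224
// Hcub = sp+256, Rsqr = sp+288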
mov x21,x0
mov x22,x1
mov x23,x2
// load modp
mov x12,#neg_p1
mov x13,#neg_p3
ldp x4,x5,[x1,#64] // in1_z
ldp x6,x7,[x1,#64+16]
orr x8,x4,x5
orr x10,x6,x7
orr x24,x8,x10
cmp x24,#0
csetm x24,ne // ~in1infty
ldp x14,x15,[x2] // in2_x
ldp x16,x17,[x2,#16]
ldp x8,x9,[x2,#32] // in2_y
ldp x10,x11,[x2,#48]
orr x14,x14,x15
orr x16,x16,x17
orr x8,x8,x9
orr x10,x10,x11
orr x14,x14,x16
orr x8,x8,x10
orr x25,x14,x8
cmp x25,#0
csetm x25,ne // ~in2infty
add x0,sp,#128
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z1sqr, in1_z);
mov x4,x14
mov x5,x15
mov x6,x16
mov x7,x17
ldr x3,[x23]
add x2,x23,#0
add x0,sp,#96
bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, Z1sqr, in2_x);
add x2,x22,#0
ldr x3,[x22,#64] // forward load for p256_mul_mont
ldp x4,x5,[sp,#128]
ldp x6,x7,[sp,#128+16]
add x0,sp,#160
bl __sm2_z256_modp_sub // p256_sub(H, U2, in1_x);
add x2,x22,#64
add x0,sp,#128
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, Z1sqr, in1_z);
ldr x3,[x22,#64]
ldp x4,x5,[sp,#160]
ldp x6,x7,[sp,#160+16]
add x2,x22,#64
add x0,sp,#64
bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, H, in1_z);
ldr x3,[x23,#32]
ldp x4,x5,[sp,#128]
ldp x6,x7,[sp,#128+16]
add x2,x23,#32
add x0,sp,#128
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S2, in2_y);
add x2,x22,#32
ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
ldp x6,x7,[sp,#160+16]
add x0,sp,#192
bl __sm2_z256_modp_sub // p256_sub(R, S2, in1_y);
add x0,sp,#224
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Hsqr, H);
ldp x4,x5,[sp,#192]
ldp x6,x7,[sp,#192+16]
add x0,sp,#288
bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Rsqr, R);
ldr x3,[sp,#160]
ldp x4,x5,[sp,#224]
ldp x6,x7,[sp,#224+16]
add x2,sp,#160
add x0,sp,#256
bl __sm2_z256_modp_mont_mul // p256_mul_mont(Hcub, Hsqr, H);
ldr x3,[x22]
ldp x4,x5,[sp,#224]
ldp x6,x7,[sp,#224+16]
add x2,x22,#0
add x0,sp,#96
bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, in1_x, Hsqr);
mov x8,x14
mov x9,x15
mov x10,x16
mov x11,x17
add x0,sp,#224
bl __sm2_z256_modp_add // p256_mul_by_2(Hsqr, U2);
add x2,sp,#288
add x0,sp,#0
bl __sm2_z256_modp_neg_sub // p256_sub(res_x, Rsqr, Hsqr);
add x2,sp,#256
bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, Hcub);
add x2,sp,#96
ldr x3,[x22,#32] // forward load for p256_mul_mont
ldp x4,x5,[sp,#256]
ldp x6,x7,[sp,#256+16]
add x0,sp,#32
bl __sm2_z256_modp_neg_sub // p256_sub(res_y, U2, res_x);
add x2,x22,#32
add x0,sp,#128
bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, in1_y, Hcub);
ldr x3,[sp,#192]
ldp x4,x5,[sp,#32]
ldp x6,x7,[sp,#32+16]
add x2,sp,#192
add x0,sp,#32
bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_y, res_y, R);
add x2,sp,#128
bl __sm2_z256_modp_sub // p256_sub(res_y, res_y, S2);
ldp x4,x5,[sp,#0] // res
ldp x6,x7,[sp,#0+16]
ldp x8,x9,[x23] // in2
ldp x10,x11,[x23,#16]
ldp x14,x15,[x22,#0] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#0+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
ldp x4,x5,[sp,#0+0+32] // res
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
ldp x6,x7,[sp,#0+0+48]
csel x14,x8,x14,ne
csel x15,x9,x15,ne
ldp x8,x9,[x23,#0+32] // in2
csel x16,x10,x16,ne
csel x17,x11,x17,ne
ldp x10,x11,[x23,#0+48]
stp x14,x15,[x21,#0]
stp x16,x17,[x21,#0+16]
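// the affine input has implicit Z = 1; repoint x23 so that the Z-coordinate
// selection below reads Lneg_p = 2^256 - p = mont(1) at [x23,#64]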
adr x23,Lneg_p-64
ldp x14,x15,[x22,#32] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#32+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
ldp x4,x5,[sp,#0+32+32] // res
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
ldp x6,x7,[sp,#0+32+48]
csel x14,x8,x14,ne
csel x15,x9,x15,ne
ldp x8,x9,[x23,#32+32] // in2
csel x16,x10,x16,ne
csel x17,x11,x17,ne
ldp x10,x11,[x23,#32+48]
stp x14,x15,[x21,#32]
stp x16,x17,[x21,#32+16]
ldp x14,x15,[x22,#64] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#64+16]
csel x8,x4,x8,ne
csel x9,x5,x9,ne
csel x10,x6,x10,ne
csel x11,x7,x11,ne
cmp x25,#0 // ~, remember?
csel x14,x8,x14,ne
csel x15,x9,x15,ne
csel x16,x10,x16,ne
csel x17,x11,x17,ne
stp x14,x15,[x21,#64]
stp x16,x17,[x21,#64+16]
add sp,x29,#0 // destroy frame
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x29,x30,[sp],#80
ret
.align 4
__sm2_z256_modn_add:
// (carry, a) = a + b
adds x14,x14,x4
adcs x15,x15,x5
adcs x16,x16,x6
adcs x17,x17,x7
adc x1,xzr,xzr
// (borrow, b) = (carry, a) - p = a + b - p
subs x4,x14,x10
sbcs x5,x15,x11
sbcs x6,x16,x12
sbcs x7,x17,x13
sbcs xzr,x1,xzr
// if borrow (lo), b is not the answer
csel x14,x14,x4,lo
csel x15,x15,x5,lo
csel x16,x16,x6,lo
stp x14,x15,[x0]
csel x17,x17,x7,lo
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modn_add)
.align 4
func(sm2_z256_modn_add):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
bl __sm2_z256_modn_add
ldp x29,x30,[sp],#16
ret
.align 4
__sm2_z256_modn_sub:
// load b
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
// borrow, r = a - b
subs x14,x14,x4
sbcs x15,x15,x5
sbcs x16,x16,x6
sbcs x17,x17,x7
sbc x1,xzr,xzr
// b = r + p = a - b + p
adds x4,x14,x10
adcs x5,x15,x11
adcs x6,x16,x12
adcs x7,x17,x13
// return (borrow == 0) ? r : (a - b + p)
cmp x1,xzr
csel x14,x14,x4,eq
csel x15,x15,x5,eq
csel x16,x16,x6,eq
stp x14,x15,[x0]
csel x17,x17,x7,eq
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modn_sub)
.align 4
func(sm2_z256_modn_sub):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
bl __sm2_z256_modn_sub
ldp x29,x30,[sp],#16
ret
.globl func(sm2_z256_modn_neg)
.align 4
func(sm2_z256_modn_neg):
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
mov x2,x1
mov x14,xzr
mov x15,xzr
mov x16,xzr
mov x17,xzr
bl __sm2_z256_modn_sub
ldp x29,x30,[sp],#16
ret
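// __sm2_z256_modn_mont_mul: r = a * b * 2^-256 mod n, word-by-word
// Montgomery multiplication (CIOS-style). Unlike p, n has no special
// form, so each round uses the precomputed constant mu = -n^-1 mod 2^64.
// Sketch in pseudo-C (exposition only):
//
//   acc = 0;
//   for (i = 0; i < 4; i++) {
//       acc += a * b[i];
//       q = (mu * acc[0]) mod 2^64;   // makes acc + q*n divisible by 2^64
//       acc = (acc + q * n) >> 64;
//   }
//   r = (acc >= n) ? acc - n : acc;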
.align 4
__sm2_z256_modn_mont_mul:
// x4,x5,x6,x7 hold a0,a1,a2,a3
// x3 holds the current word of b (b0 on entry; b1,b2,b3 loaded from [x2])
// x14,x15,x16,x17 (plus x19,x20) accumulate c
// c = b0 * a, len(c) = 5
mul x14,x4,x3
umulh x21,x4,x3
mul x15,x5,x3
umulh x22,x5,x3
mul x16,x6,x3
umulh x23,x6,x3
mul x17,x7,x3
umulh x24,x7,x3
adds x15,x15,x21
adcs x16,x16,x22
adcs x17,x17,x23
adc x19,xzr,x24
// q = mu * c0 mod 2^64
mul x3,x9,x14
// c = (c + q * p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,xzr,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x15,x21
adcs x15,x16,x22
adcs x16,x17,x23
adcs x17,x19,x24
adc x19,x20,xzr
// load b1
ldr x3,[x2,#8]
// c += a * b1
// len(c) = 6
mul x21,x4,x3
mul x22,x5,x3
mul x23,x6,x3
mul x24,x7,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,xzr,xzr
umulh x21,x4,x3
umulh x22,x5,x3
umulh x23,x6,x3
umulh x24,x7,x3
adds x15,x15,x21
adcs x16,x16,x22
adcs x17,x17,x23
adcs x19,x19,x24
adc x20,x20,xzr
// q = mu * c0 mod 2^64
mul x3,x9,x14
// c = (c + q * p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,x20,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x15,x21
adcs x15,x16,x22
adcs x16,x17,x23
adcs x17,x19,x24
adc x19,x20,xzr
// load b2
ldr x3,[x2,#16]
// c += a * b2
// len(c) = 6
mul x21,x4,x3
mul x22,x5,x3
mul x23,x6,x3
mul x24,x7,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,xzr,xzr
umulh x21,x4,x3
umulh x22,x5,x3
umulh x23,x6,x3
umulh x24,x7,x3
adds x15,x15,x21
adcs x16,x16,x22
adcs x17,x17,x23
adcs x19,x19,x24
adc x20,x20,xzr
// q = mu * c0 mod 2^64
mul x3,x9,x14
// c = (c + q * p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,x20,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x15,x21
adcs x15,x16,x22
adcs x16,x17,x23
adcs x17,x19,x24
adc x19,x20,xzr
// load b3
ldr x3,[x2,#24]
// c += a * b3
mul x21,x4,x3
mul x22,x5,x3
mul x23,x6,x3
mul x24,x7,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,xzr,xzr
umulh x21,x4,x3
umulh x22,x5,x3
umulh x23,x6,x3
umulh x24,x7,x3
adds x15,x15,x21
adcs x16,x16,x22
adcs x17,x17,x23
adcs x19,x19,x24
adc x20,x20,xzr
// q = mu * c0 mod 2^64
mul x3,x9,x14
// c = (c + q * p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adcs x17,x17,x24
adcs x19,x19,xzr
adc x20,x20,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x15,x21
adcs x15,x16,x22
adcs x16,x17,x23
adcs x17,x19,x24
adc x19,x20,xzr
// (borrow, t) = c - p
// return borrow ? c : (c - p)
subs x21,x14,x10
sbcs x22,x15,x11
sbcs x23,x16,x12
sbcs x24,x17,x13
sbcs xzr,x19,xzr
// if borrow
csel x14,x14,x21,lo
csel x15,x15,x22,lo
csel x16,x16,x23,lo
csel x17,x17,x24,lo
// output
stp x14,x15,[x0]
stp x16,x17,[x0,#16]
ret
// mu = -n^-1 mod 2^64
// sage: n = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123
// sage: mu = -(IntegerModRing(2^64)(n))^-1
Lmodn_mu:
.quad 0x327f9e8872350975
.globl func(sm2_z256_modn_mont_mul)
.align 4
func(sm2_z256_modn_mont_mul):
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
// mu = -n^-1 mod 2^64
ldr x9,Lmodn_mu
// load modn
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
// load a
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// load b0
ldr x3,[x2]
bl __sm2_z256_modn_mont_mul
add sp,x29,#0
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x29,x30,[sp],#64
ret
// mont(mont(a), 1) = aR * 1 * R^-1 (mod n) = a (mod n)
.globl func(sm2_z256_modn_from_mont)
.align 4
func(sm2_z256_modn_from_mont):
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
// mu = -n^-1 mod 2^64
ldr x9,Lmodn_mu
// load n
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
// load a
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// b = {1,0,0,0}
adr x2,Lone
// b0 = 1
mov x3,#1
bl __sm2_z256_modn_mont_mul
add sp,x29,#0
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x29,x30,[sp],#64
ret
// 2^512 mod n = 0x1eb5e412a22b3d3b620fc84c3affe0d43464504ade6fa2fa901192af7c114f20
Lsm2_z256_modn_2e512:
.quad 0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b
// mont(a) = a * 2^256 (mod n) = mont_mul(a, 2^512 mod n)
.globl func(sm2_z256_modn_to_mont)
.align 6
func(sm2_z256_modn_to_mont):
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
// mu = -n^-1 mod 2^64
ldr x9,Lmodn_mu
// load modn
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
// swap args x0,x1 = x1,x0
mov x3,x1
mov x1,x0
mov x0,x3
// load a
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
// load b = 2^512 mod n
adr x2,Lsm2_z256_modn_2e512
// load b0
ldr x3,Lsm2_z256_modn_2e512
bl __sm2_z256_modn_mont_mul
add sp,x29,#0
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x29,x30,[sp],#64
ret
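// __sm2_z256_modn_mont_sqr: assemble the 512-bit square as in the modp
// version, then run four Montgomery rounds (q = mu*c0; c = (c + q*n) >> 64),
// add the upper half, and conditionally subtract n.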
.align 4
__sm2_z256_modn_mont_sqr:
// a^2 is assembled from:
//   the squares L(ai*ai), H(ai*ai) for i = 0..3, plus
//   2 * the cross products a0*a1, a0*a2, a0*a3, a1*a2, a1*a3, a2*a3
//   (low and high halves, each shifted to its limb position)
mul x15,x5,x4
umulh x22,x5,x4
mul x16,x6,x4
umulh x23,x6,x4
mul x17,x7,x4
umulh x19,x7,x4
adds x16,x16,x22
mul x21,x6,x5
umulh x22,x6,x5
adcs x17,x17,x23
mul x23,x7,x5
umulh x24,x7,x5
adc x19,x19,xzr
mul x20,x7,x6 // a[3]*a[2]
umulh x1,x7,x6
adds x22,x22,x23 // accumulate high parts of multiplication
mul x14,x4,x4 // a[0]*a[0]
adc x23,x24,xzr // can't overflow
adds x17,x17,x21 // accumulate low parts of multiplication
umulh x4,x4,x4
adcs x19,x19,x22
mul x22,x5,x5 // a[1]*a[1]
adcs x20,x20,x23
umulh x5,x5,x5
adc x1,x1,xzr // can't overflow
adds x15,x15,x15 // acc[1-6]*=2
mul x23,x6,x6 // a[2]*a[2]
adcs x16,x16,x16
umulh x6,x6,x6
adcs x17,x17,x17
mul x24,x7,x7 // a[3]*a[3]
adcs x19,x19,x19
umulh x7,x7,x7
adcs x20,x20,x20
adcs x1,x1,x1
adc x2,xzr,xzr
adds x15,x15,x4 // +a[i]*a[i]
adcs x16,x16,x22
adcs x17,x17,x5
adcs x19,x19,x23
adcs x20,x20,x6
adcs x1,x1,x24
adc x2,x2,x7
// round 0
// q = mu * c0 mod 2^64
mul x3,x9,x14
// C = (C + q*p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x14,x15,x22
adcs x15,x16,x23
adcs x16,x17,x24
adc x17,xzr,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adc x17,x17,x24
// round 1
// q = mu * c0 mod 2^64
mul x3,x9,x14
// C = (C + q*p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x14,x15,x22
adcs x15,x16,x23
adcs x16,x17,x24
adc x17,xzr,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adc x17,x17,x24
// round 2
// q = mu * c0 mod 2^64
mul x3,x9,x14
// C = (C + q*p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x14,x15,x22
adcs x15,x16,x23
adcs x16,x17,x24
adc x17,xzr,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adc x17,x17,x24
// round 3
// q = mu * c0 mod 2^64
mul x3,x9,x14
// C = (C + q*p) // 2^64
mul x21,x10,x3
mul x22,x11,x3
mul x23,x12,x3
mul x24,x13,x3
adds x14,x14,x21
adcs x14,x15,x22
adcs x15,x16,x23
adcs x16,x17,x24
adc x17,xzr,xzr
umulh x21,x10,x3
umulh x22,x11,x3
umulh x23,x12,x3
umulh x24,x13,x3
adds x14,x14,x21
adcs x15,x15,x22
adcs x16,x16,x23
adc x17,x17,x24
// add upper half
adds x14,x14,x19
adcs x15,x15,x20
adcs x16,x16,x1
adcs x17,x17,x2
adc x19,xzr,xzr
// if c >= n, c = c - n
subs x21,x14,x10
sbcs x22,x15,x11
sbcs x23,x16,x12
sbcs x24,x17,x13
sbcs xzr,x19,xzr
csel x14,x14,x21,lo
csel x15,x15,x22,lo
csel x16,x16,x23,lo
csel x17,x17,x24,lo
stp x14,x15,[x0]
stp x16,x17,[x0,#16]
ret
.globl func(sm2_z256_modn_mont_sqr)
.align 4
func(sm2_z256_modn_mont_sqr):
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
// mu = -n^-1 mod 2^64
ldr x9,Lmodn_mu
// load modn
ldr x10,Lmodn
ldr x11,Lmodn+8
ldr x12,Lmodn+16
ldr x13,Lmodn+24
// load a
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
bl __sm2_z256_modn_mont_sqr
add sp,x29,#0
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x29,x30,[sp],#64
ret