From f6456f9e281e97b162bd1147ae5b4d8e60964c1d Mon Sep 17 00:00:00 2001 From: Zhi Guan Date: Tue, 18 Jun 2024 09:24:38 +0800 Subject: [PATCH] Update sm2_z256_arm64.S --- src/sm2_z256_arm64.S | 2211 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2211 insertions(+) diff --git a/src/sm2_z256_arm64.S b/src/sm2_z256_arm64.S index d89c31af..b6138563 100644 --- a/src/sm2_z256_arm64.S +++ b/src/sm2_z256_arm64.S @@ -1 +1,2212 @@ +/* + * Copyright 2014-2024 The GmSSL Project. All Rights Reserved. + * * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include + + +.text + +.align 5 + +#define neg_p1 0xffffffff +#define neg_p3 0x100000000 + +Lneg_p: +.quad 1, neg_p1, 0, neg_p3 + + +// 2^512 mod p +Lz256_2e512modp: +.quad 0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002 + +Lone: +.quad 1,0,0,0 + + +Lmodn: +.quad 0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff + + + +.align 4 +__sm2_z256_modp_add: + + // carry, a = a + b + adds x14,x14,x8 + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr + + // carry, b = a + (2^256 - p) = (a + b - p) + 2^256 + adds x8,x14,#1 + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,x1,xzr + + cmp x1,xzr + + // if carry == 0, i.e. (a + b - p) < 0, return a == (a + b) + // else return b == (a + b - p) + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + + stp x14,x15,[x0] + stp x16,x17,[x0,#16] + + ret + + +.globl func(sm2_z256_modp_add) +.align 4 + +func(sm2_z256_modp_add): + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + // load a + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + // load b + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_add + + ldp x29,x30,[sp],#16 + ret + + +.globl func(sm2_z256_modp_dbl) +.align 4 + +func(sm2_z256_modp_dbl): + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + // load a + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + // b = a + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + // set (2^256 - p) + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_add + + ldp x29,x30,[sp],#16 + ret + + +.globl func(sm2_z256_modp_tri) +.align 4 +func(sm2_z256_modp_tri): + stp x29,x30,[sp,#-16]! 
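The three exported addition entry points (sm2_z256_modp_add, sm2_z256_modp_dbl, sm2_z256_modp_tri) all funnel into __sm2_z256_modp_add, which never branches on secret data: it adds the operands, then adds the precomputed 2^256 - p (the Lneg_p constant {1, 0xffffffff, 0, 0x100000000}), and the csel chain keeps the reduced value exactly when one of the two additions carried out of 2^256. Below is a minimal C sketch of the same select-on-carry pattern; the function name and uint64_t[4] limb layout are illustrative only, not the library's API.

#include <stdint.h>

/*
 * Reference sketch of the pattern used by __sm2_z256_modp_add (not the
 * assembly ABI): compute a + b, then (a + b) + (2^256 - p), and keep the
 * reduced value whenever either addition carried out of 2^256.
 * Limbs are little-endian 64-bit words; SM2_P_NEG is 2^256 - p.
 */
static const uint64_t SM2_P_NEG[4] = { 1, 0xffffffffULL, 0, 0x100000000ULL };

static void modp_add_ref(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t s[4], t[4], carry, mask;
	unsigned __int128 acc = 0;
	int i;

	for (i = 0; i < 4; i++) {               /* s = (a + b) mod 2^256 */
		acc += (unsigned __int128)a[i] + b[i];
		s[i] = (uint64_t)acc;
		acc >>= 64;
	}
	carry = (uint64_t)acc;                  /* carry out of a + b */

	acc = 0;
	for (i = 0; i < 4; i++) {               /* t = (a + b - p) mod 2^256 */
		acc += (unsigned __int128)s[i] + SM2_P_NEG[i];
		t[i] = (uint64_t)acc;
		acc >>= 64;
	}
	carry += (uint64_t)acc;                 /* carry out of s + (2^256 - p) */

	mask = 0 - (uint64_t)(carry != 0);      /* all-ones iff a + b >= p */
	for (i = 0; i < 4; i++)
		r[i] = (t[i] & mask) | (s[i] & ~mask);
}

Because 2^256 - p has only two limbs that need a register (0xffffffff and 0x100000000), the assembly folds the other two in with an immediate #1 and xzr, which is why the constant is kept as neg_p1/neg_p3 rather than as p itself.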
+ add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + // load (2^256 - p) + mov x12,#neg_p1 + mov x13,#neg_p3 + + // b = a + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + // c = a + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + // a = a + b = 2a + bl __sm2_z256_modp_add + + // b = c = a + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + // a = a + b = 2a + a = 3a + bl __sm2_z256_modp_add + + ldp x29,x30,[sp],#16 + ret + + +// a - b (mod p) +.align 4 +__sm2_z256_modp_sub: + + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + + // a = a - b + subs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr + + // b = a - (2^256 - p) = a - b + p - 2^256 + subs x8,x14,#1 + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + + cmp x1,xzr + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + ret + + +// b - a (mod p) +.align 4 +__sm2_z256_modp_neg_sub: + + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + + // a = b - a + subs x14,x8,x14 + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr + + // b = a - (2^256 - p) = b - a + p - 2^256 + subs x8,x14,#1 + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + + cmp x1,xzr + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + ret + + +.globl func(sm2_z256_modp_sub) +.align 4 +func(sm2_z256_modp_sub): + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_sub + + ldp x29,x30,[sp],#16 + ret + + +.globl func(sm2_z256_modp_neg) + +.align 4 +func(sm2_z256_modp_neg): + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x2,x1 + + mov x14,xzr + mov x15,xzr + mov x16,xzr + mov x17,xzr + + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_sub + + ldp x29,x30,[sp],#16 + ret + + +.align 4 +__sm2_z256_modp_mont_mul: + + // a * b0 + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + + lsl x10,x14,#32 + lsr x11,x14,#32 + + + // p = 2^256 - 2^224 - 2^96 + 2^64 - 1 + + // R = 2^64 + + // p * a0 = (a0 * R^4 + a0 * R^1) - (a0 * 2^32 * R^192 + a0 * 2^32 * R + a0) + + // [ a4 ][ a3 ][ a2 ][ a1 ][ a0 ] + // [ a0 ] 0 0 [ a0 ] 0 + // - [ a0>>32 ][ a0<<32 ][ a0 >> 32 ][ a0<<32 ][ a0 ] + + + // 这里 x10 = a0 << 32 + // x11 = a0 >> 32 + + //subs x10,x14,x8 + //sbc x11,x14,x9 + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + + adds x14,x15,x8 + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + + + ldr x3,[x2,#8*(1+1)] // b[1+1] + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + + lsl x10,x14,#32 + lsr x11,x14,#32 + + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + + adds x14,x15,x8 // +=acc[0]<<96 and omit 
acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + + + + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + + lsl x10,x14,#32 // t0 + lsr x11,x14,#32 // t1 + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + + lsl x10,x14,#32 // t0 + lsr x11,x14,#32 // t1 + // last reduction + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + // if a > p : return a - p + // else: return a + + // carry, b = a + (2^256 - p) + adds x8,x14,#1 + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x19,x19,xzr + + cmp x19,xzr + + // 如果 a + 2^256 - p 没有进位,说明 a < p, a - p 是个负数,说明我们直接返回a + // 如果进位了,那么返回b + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + + stp x14,x15,[x0] + stp x16,x17,[x0,#16] + ret + + +.globl func(sm2_z256_modp_mont_mul) + +.align 4 +func(sm2_z256_modp_mont_mul): + + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + // load a + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // load b0 + ldr x3,[x2] + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_mont_mul + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret + + +.align 4 +__sm2_z256_modp_mont_sqr: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
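The reduction rounds in __sm2_z256_modp_mont_mul above (and in the squaring routine that follows) never form q*p explicitly. Since p = 2^256 - 2^224 - 2^96 + 2^64 - 1 is -1 modulo 2^64, the Montgomery quotient for the low accumulator word is that word itself, and (p+1)/2^64 = 2^192 - 2^160 - 2^32 + 1, so folding acc[0] back in needs nothing beyond acc[0]<<32, acc[0]>>32 and a short borrow chain — exactly the subs/sbcs/sbc sequence repeated four times per multiplication. A hedged C sketch of one such reduction word follows; the helper name and the 5-word accumulator are illustrative.

#include <stdint.h>

/*
 * One word of the reduction used above: acc <- (acc + acc[0]*p) / 2^64 for the
 * SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1.  Because p = -1 (mod 2^64),
 * the quotient is acc[0] itself, and
 *   acc[0]*(p + 1)/2^64 = acc[0]*2^192 - acc[0]*2^160 - acc[0]*2^32 + acc[0]
 * is built from acc[0]<<32 and acc[0]>>32 alone.  acc[] is a little-endian
 * 5-word accumulator; the function name is illustrative.
 */
static void sm2_p_redc_word(uint64_t acc[5])
{
	uint64_t a0 = acc[0], lo = a0 << 32, hi = a0 >> 32;
	uint64_t t[4], b;
	unsigned __int128 c = 0;
	int i;

	/* t = a0 * (p + 1) / 2^64, mirroring the subs/sbcs/sbc sequence */
	t[0] = a0 - lo;
	b    = (a0 < lo);
	t[1] = 0 - hi - b;
	b    = ((hi | b) != 0);
	t[2] = 0 - lo - b;
	b    = ((lo | b) != 0);
	t[3] = a0 - hi - b;          /* never underflows: a0 >= a0 >> 32 */

	/* acc = (acc >> 64) + t, i.e. (acc + a0*p) / 2^64 */
	for (i = 0; i < 4; i++) {
		c += (unsigned __int128)acc[i + 1] + t[i];
		acc[i] = (uint64_t)c;
		c >>= 64;
	}
	acc[4] = (uint64_t)c;
}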
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + + + lsl x10,x14,#32 + adcs x1,x1,x11 + lsr x11,x14,#32 + adc x2,x2,x7 + + + // Now: x2, x1, x20, x19, x17, x16, x15, x14 就是 a^2 的结果 + + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + lsl x10,x14,#32 + lsr x11,x14,#32 + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + lsl x10,x14,#32 + lsr x11,x14,#32 + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + lsl x10,x14,#32 + lsr x11,x14,#32 + + subs x8,x14,x10 + sbcs x9,xzr,x11 + sbcs x10,xzr,x10 + sbc x11,x14,x11 + + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + // carry, b = a + (2^256 - p) + adds x8,x14,#1 + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x19,x19,xzr + + cmp x19,xzr + + // 如果 a + 2^256 - p 没有进位,说明 a < p, a - p 是个负数,说明我们直接返回a + // 如果进位了,那么返回b + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + + stp x14,x15,[x0] + stp x16,x17,[x0,#16] + + + // 如果要用于连续平方,最好最后的输出是x4,x5,x6,x7,并且不需要输出到[x0]内存上 + + ret + + +.globl func(sm2_z256_modp_mont_sqr) +.align 4 + +func(sm2_z256_modp_mont_sqr): + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_mont_sqr + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret + + + +// 计算r = r^(2^n) 也就是连续做n次平方 +// 这个函数调用__sm2_z256_modp_mont_sqr,输入是x4,x5,x6,x7, 输出是x14,x15,x16,x17,并且写入到[x0] +// 但是对于连续的平方,实际上我们不需要写到内存里,而且需要保证输入输出是一样的,需要对mont_sqr函数做一定的调整 +// 当然不调整的话开销也不算大 +.globl func(sm2_z256_modp_mont_esq) +.align 4 + +func(sm2_z256_modp_mont_esq): + stp x29,x30,[sp,#-32]! 
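sm2_z256_modp_mont_esq, whose body continues below, raises a Montgomery-form value to the power 2^n in place, i.e. it is n back-to-back Montgomery squarings — the kind of helper that the long squaring runs of an exponentiation or Fermat-inversion chain would call. The C equivalent is tiny; the squaring prototype here is inferred from this file's register usage (x0 = result, x1 = input) and may not match the real header.

#include <stdint.h>

/* Prototype inferred from the assembly wrapper above; treat as a sketch. */
void sm2_z256_modp_mont_sqr(uint64_t r[4], const uint64_t a[4]);

/* r <- r^(2^n) in the Montgomery domain, as sm2_z256_modp_mont_esq does. */
static void modp_mont_esq_ref(uint64_t r[4], uint64_t n)
{
	while (n--)
		sm2_z256_modp_mont_sqr(r, r);   /* square in place n times */
}

As the comments around the loop note, the assembly version still stores each intermediate to [x0] and copies x14..x17 back into x4..x7 every iteration; keeping the running value in registers would avoid that round trip, but the saving is modest.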
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x0] + ldp x6,x7,[x0,#16] + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + // x1 在sqr中已经被用了,因此就不能再用了,x18没有用过,这里实际上没有节省什么计算 + mov x3, x1 +22: + + // 这个函数的输入是x4,x5,x6,x7 + bl __sm2_z256_modp_mont_sqr + // 结束之后还应该继续把值放到x4,x5,x6,x7中 + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + subs x3, x3, #1 + b.ne 22b + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret + + + +// mont(a) = a * 2^256 (mod p) = mont_mul(a, 2^512 mod p) +.globl func(sm2_z256_modp_to_mont) + +.align 6 +func(sm2_z256_modp_to_mont): + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + mov x3,x1 + mov x1,x0 + mov x0,x3 + + adr x2,Lz256_2e512modp + ldr x3,Lz256_2e512modp + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_mont_mul + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret + + +// 这个函数中参与运算的b == 1,因此应该有更快的实现,但是似乎这个计算使用量不大 +// 因此没必要专门优化 +// mont(mont(a), 1) = aR * 1 * R^-1 (mod p) = a (mod p) +.globl func(sm2_z256_modp_from_mont) + +.align 4 +func(sm2_z256_modp_from_mont): + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + // load b = {1,0,0,0} + adr x2,Lone + // load b1 = 1 + mov x3,#1 + + bl __sm2_z256_modp_mont_mul + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + + ret + + +.align 4 +__sm2_z256_modp_haf: + + // a - (2^256 - p) == a + p - 2^256 + subs x8,x14,#1 + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + // (a + p - 2^256) + 2^256 + adcs x1,xzr,xzr + + // r = (a is even) ? a : (a - (2^256 - p) + 2^256) + tst x14,#1 + csel x14,x14,x8,eq + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + // r = r >> 1 + lsr x14,x14,#1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + ret + + +.globl func(sm2_z256_modp_haf) + +.align 4 +func(sm2_z256_modp_haf): + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + mov x12,#neg_p1 + mov x13,#neg_p3 + + bl __sm2_z256_modp_haf + + ldp x29,x30,[sp],#16 + ret + + + +.globl func(sm2_z256_point_dbl) + +.align 5 +func(sm2_z256_point_dbl): + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 //还是准备了4个临时变量 + +Ldouble_shortcut: + // Jacobian点一共3个元素 + // 0-16,16-32 + // 32-48,48-64 + // 64-80,80-96 + + // x14-x17 = Y + + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + + // x21, x22 分别保存了x0,x1,也就是说 x21 = out, x22 = in + // 为什么保存了x0,x1,难道这两个值被重复使用了吗? + // 每个 __foo 都需要将输出写到 x0 的地址上 + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + // x8-x11 = x14-x17 = Y + mov x8,x14 + + + mov x9,x15 + // x4-x7 = Z sqr 确实是将 x4-x7 作为输入参数的 + // x22 == x1 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + + + // S = T[0] + add x0,sp,#0 + + // 此时没有把输出写入到输出地址 + // 我们可以 + + + // 1. S = 2Y + bl __sm2_z256_modp_add // p256_mul_by_2(S, in_y); + + + + // Zsqr = T[2] + add x0,sp,#64 + + // 2. Zsqr = Z1^2 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Zsqr, in_z); + + + // x8-x11 = X + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + + // x4-x7 = x14-x17 这是什么值 + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + + // t1 = M + + // M = T[1] + add x0,sp,#32 + + // 6. 
M = X1 + Zsqr = X1 + Z1^2 + bl __sm2_z256_modp_add // p256_add(M, Zsqr, in_x); + + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + + + // 7. Zsqr = X - Z^2 + bl __sm2_z256_modp_neg_sub // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + + // 3. S = S^2 = 4*Y1^2 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + + + // tmp0 = Z*Y + + // 4. Z3 = Z1 * Y1 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(tmp0, in_z, in_y); + // 算完之后已经把结果写到内存了 + // 因此还必须再把数据读到寄存器才能继续算 + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + + + +// mov x0,x21 // 现在第一个位置就是一个z256了 +// add sp,x29,#0 +// ldp x19,x20,[x29,#16] +// ldp x21,x22,[x29,#32] +// ldp x29,x30,[sp],#96 +// ret + + // Z3 = 2YZ + bl __sm2_z256_modp_add // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __sm2_z256_modp_haf // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __sm2_z256_modp_add + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __sm2_z256_modp_add // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __sm2_z256_modp_add // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 // 输出X + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + + bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __sm2_z256_modp_neg_sub // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 // 这里输出的是Y + + + bl __sm2_z256_modp_sub // p256_sub(res_y, S, res_y); + + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + + ret + + + + + +.globl func(sm2_z256_point_add) + +.align 5 +func(sm2_z256_point_add): + stp x29,x30,[sp,#-96]! 
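The call schedule of sm2_z256_point_dbl above is the standard Jacobian doubling for a = -3 curves (SM2's a is p - 3): M = 3(X - Z^2)(X + Z^2) = 3X^2 + aZ^4, S = 4XY^2, X3 = M^2 - 2S, Y3 = M(S - X3) - 8Y^4, Z3 = 2YZ, with each coordinate stored in Montgomery form as four 64-bit limbs (X at offset 0, Y at 32, Z at 64). The C sketch below performs the same field operations in a slightly reordered but equivalent sequence; every prototype is inferred from register usage in this file and is not the authoritative declaration.

#include <stdint.h>

/* Field-level prototypes inferred from this file (x0 = result, x1/x2 = inputs). */
void sm2_z256_modp_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
void sm2_z256_modp_dbl(uint64_t r[4], const uint64_t a[4]);
void sm2_z256_modp_tri(uint64_t r[4], const uint64_t a[4]);
void sm2_z256_modp_sub(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
void sm2_z256_modp_haf(uint64_t r[4], const uint64_t a[4]);
void sm2_z256_modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
void sm2_z256_modp_mont_sqr(uint64_t r[4], const uint64_t a[4]);

/* Jacobian doubling with a = -3, same operations as sm2_z256_point_dbl above. */
static void point_dbl_ref(uint64_t X3[4], uint64_t Y3[4], uint64_t Z3[4],
                          const uint64_t X[4], const uint64_t Y[4], const uint64_t Z[4])
{
	uint64_t S[4], M[4], Zsqr[4], T[4];

	sm2_z256_modp_dbl(S, Y);                /* S = 2Y                     */
	sm2_z256_modp_mont_sqr(Zsqr, Z);        /* Zsqr = Z^2                 */
	sm2_z256_modp_mont_sqr(S, S);           /* S = 4Y^2                   */
	sm2_z256_modp_mont_mul(T, Z, Y);
	sm2_z256_modp_dbl(Z3, T);               /* Z3 = 2YZ                   */
	sm2_z256_modp_mont_sqr(T, S);           /* T = 16Y^4                  */
	sm2_z256_modp_haf(Y3, T);               /* Y3 = 8Y^4 (finished later) */
	sm2_z256_modp_add(M, X, Zsqr);          /* M = X + Z^2                */
	sm2_z256_modp_sub(Zsqr, X, Zsqr);       /* Zsqr = X - Z^2             */
	sm2_z256_modp_mont_mul(M, M, Zsqr);
	sm2_z256_modp_tri(M, M);                /* M = 3(X^2 - Z^4)           */
	sm2_z256_modp_mont_mul(S, S, X);        /* S = 4XY^2                  */
	sm2_z256_modp_dbl(T, S);                /* T = 8XY^2                  */
	sm2_z256_modp_mont_sqr(X3, M);
	sm2_z256_modp_sub(X3, X3, T);           /* X3 = M^2 - 2S              */
	sm2_z256_modp_sub(S, S, X3);
	sm2_z256_modp_mont_mul(S, S, M);
	sm2_z256_modp_sub(Y3, S, Y3);           /* Y3 = M(S - X3) - 8Y^4      */
}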
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + //ldr x12,Lpoly+8 + //ldr x13,Lpoly+24 + + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __sm2_z256_modp_sub // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __sm2_z256_modp_sub // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#32*(12-4) // difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add 
x0,sp,#128 + bl __sm2_z256_modp_add // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __sm2_z256_modp_neg_sub // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __sm2_z256_modp_neg_sub // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __sm2_z256_modp_sub // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + + + + +.globl func(sm2_z256_point_add_affine) + +.align 5 +func(sm2_z256_point_add_affine): + + stp x29,x30,[sp,#-80]! 
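In sm2_z256_point_add above, the degenerate cases are detected up front: csetm turns "Z != 0" into an all-ones/all-zero mask for each input, and if H = U2 - U1 and R = S2 - S1 are both zero while neither input is the point at infinity, the inputs are the same point and control falls into Ldouble_shortcut. The large csel block at the end then merges the computed sum with the untouched inputs branch-free, so that P + infinity = P and infinity + Q = Q; the same merge reappears in sm2_z256_point_add_affine below. A C sketch of that masked merge, with illustrative helper names:

#include <stdint.h>

/* All-ones iff z != 0, i.e. iff the point with this Z coordinate is finite. */
static uint64_t z256_nonzero_mask(const uint64_t z[4])
{
	uint64_t t = z[0] | z[1] | z[2] | z[3];
	return 0 - (uint64_t)(t != 0);
}

/* r = mask ? a : b, limb by limb -- the C analogue of the csel chains. */
static void z256_select(uint64_t r[4], uint64_t mask,
                        const uint64_t a[4], const uint64_t b[4])
{
	int i;
	for (i = 0; i < 4; i++)
		r[i] = (a[i] & mask) | (b[i] & ~mask);
}

/* Merge one output coordinate as the csel block before Ladd_done does:
 * out = (in2 at infinity) ? in1 : ((in1 at infinity) ? in2 : res).       */
static void point_merge_coord(uint64_t out[4], const uint64_t res[4],
                              const uint64_t in1[4], const uint64_t in1_z[4],
                              const uint64_t in2[4], const uint64_t in2_z[4])
{
	uint64_t t[4];

	z256_select(t, z256_nonzero_mask(in1_z), res, in2);
	z256_select(out, z256_nonzero_mask(in2_z), t, in1);
}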
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + + // load modp + mov x12,#neg_p1 + mov x13,#neg_p3 + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __sm2_z256_modp_sub // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __sm2_z256_modp_sub // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __sm2_z256_modp_mont_sqr // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __sm2_z256_modp_add // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __sm2_z256_modp_neg_sub // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __sm2_z256_modp_sub // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __sm2_z256_modp_neg_sub // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __sm2_z256_modp_mont_mul // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __sm2_z256_modp_sub // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + + + adr x23,Lneg_p-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + ret + + + + + +.align 4 +__sm2_z256_modn_add: + + // (carry, a) = a + b + adds x14,x14,x4 + adcs x15,x15,x5 + adcs x16,x16,x6 + adcs x17,x17,x7 + adc x1,xzr,xzr + + // (borrow, b) = (carry, a) - p = a + b - p + subs x4,x14,x10 + sbcs x5,x15,x11 + sbcs x6,x16,x12 + sbcs x7,x17,x13 + sbcs xzr,x1,xzr + + // if borrow (lo), b is not the answer + csel x14,x14,x4,lo + csel x15,x15,x5,lo + csel x16,x16,x6,lo + stp x14,x15,[x0] + csel x17,x17,x7,lo + stp x16,x17,[x0,#16] + ret + + +.globl func(sm2_z256_modn_add) +.align 4 +func(sm2_z256_modn_add): + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + bl __sm2_z256_modn_add + + ldp x29,x30,[sp],#16 + ret + + +.align 4 +__sm2_z256_modn_sub: + + // load b + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + + // borrow, r = a - b + subs x14,x14,x4 + sbcs x15,x15,x5 + sbcs x16,x16,x6 + sbcs x17,x17,x7 + sbc x1,xzr,xzr + + // b = r + p = a - b + p + adds x4,x14,x10 + adcs x5,x15,x11 + adcs x6,x16,x12 + adcs x7,x17,x13 + + // return (borrow == 0) ? r : (a - b + p) + cmp x1,xzr + + csel x14,x14,x4,eq + csel x15,x15,x5,eq + csel x16,x16,x6,eq + stp x14,x15,[x0] + csel x17,x17,x7,eq + stp x16,x17,[x0,#16] + ret + + +.globl func(sm2_z256_modn_sub) +.align 4 +func(sm2_z256_modn_sub): + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + bl __sm2_z256_modn_sub + + ldp x29,x30,[sp],#16 + ret + + +.globl func(sm2_z256_modn_neg) +.align 4 +func(sm2_z256_modn_neg): + + stp x29,x30,[sp,#-16]! 
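Arithmetic modulo the group order n (the Lmodn constant) cannot reuse the sparse 2^256 - p shortcut, so __sm2_z256_modn_add above takes the generic route: add with a carry word, subtract n across all four limbs and the carry, and select on the final borrow. A C sketch of the same logic, with n copied from Lmodn; the function name and limb layout are illustrative.

#include <stdint.h>

/* SM2 group order n, identical to the Lmodn constant above. */
static const uint64_t SM2_N[4] = {
	0x53bbf40939d54123ULL, 0x7203df6b21c6052bULL,
	0xffffffffffffffffULL, 0xfffffffeffffffffULL
};

/* r = (a + b) mod n, assuming a, b < n; mirrors __sm2_z256_modn_add. */
static void modn_add_ref(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t s[4], d[4], carry, borrow, mask;
	unsigned __int128 acc = 0;
	int i;

	for (i = 0; i < 4; i++) {                 /* s = (a + b) mod 2^256 */
		acc += (unsigned __int128)a[i] + b[i];
		s[i] = (uint64_t)acc;
		acc >>= 64;
	}
	carry = (uint64_t)acc;                    /* carry out of a + b */

	borrow = 0;
	for (i = 0; i < 4; i++) {                 /* d = s - n */
		d[i] = s[i] - SM2_N[i] - borrow;
		borrow = (s[i] < SM2_N[i]) || (borrow && s[i] == SM2_N[i]);
	}

	mask = 0 - (uint64_t)(carry >= borrow);   /* all-ones iff a + b >= n */
	for (i = 0; i < 4; i++)
		r[i] = (d[i] & mask) | (s[i] & ~mask);
}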
+ add x29,sp,#0 + + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + mov x2,x1 + + mov x14,xzr + mov x15,xzr + mov x16,xzr + mov x17,xzr + + bl __sm2_z256_modn_sub + + ldp x29,x30,[sp],#16 + ret + + +.align 4 +__sm2_z256_modn_mont_mul: + // x14,x15,x16,x17 as a0,a1,a2,a3 + // x4,x5,x6,x7 as b0,b1,b2,b3 + // x3 as b0,b1,b2,b3 + + // c = b0 * a, len(c) = 5 + mul x14,x4,x3 + umulh x21,x4,x3 + mul x15,x5,x3 + umulh x22,x5,x3 + mul x16,x6,x3 + umulh x23,x6,x3 + mul x17,x7,x3 + umulh x24,x7,x3 + adds x15,x15,x21 + adcs x16,x16,x22 + adcs x17,x17,x23 + adc x19,xzr,x24 + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // c = (c + q * p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,xzr,xzr + + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + + adds x14,x15,x21 + adcs x15,x16,x22 + adcs x16,x17,x23 + adcs x17,x19,x24 + adc x19,x20,xzr + + // load b1 + ldr x3,[x2,#8] + + // c += a * b1 + // len(c) = 6 + mul x21,x4,x3 + mul x22,x5,x3 + mul x23,x6,x3 + mul x24,x7,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,xzr,xzr + + umulh x21,x4,x3 + umulh x22,x5,x3 + umulh x23,x6,x3 + umulh x24,x7,x3 + + adds x15,x15,x21 + adcs x16,x16,x22 + adcs x17,x17,x23 + adcs x19,x19,x24 + adc x20,x20,xzr + + // mu * c0 mod 2^64 + mul x3,x9,x14 + + // c = (c + q * p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,x20,xzr + + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + + adds x14,x15,x21 + adcs x15,x16,x22 + adcs x16,x17,x23 + adcs x17,x19,x24 + adc x19,x20,xzr + + // load b2 + ldr x3,[x2,#16] + + // c += a * b1 + // len(c) = 6 + mul x21,x4,x3 + mul x22,x5,x3 + mul x23,x6,x3 + mul x24,x7,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,xzr,xzr + + umulh x21,x4,x3 + umulh x22,x5,x3 + umulh x23,x6,x3 + umulh x24,x7,x3 + + adds x15,x15,x21 + adcs x16,x16,x22 + adcs x17,x17,x23 + adcs x19,x19,x24 + adc x20,x20,xzr + + // mu * c0 mod 2^64 + mul x3,x9,x14 + + // c = (c + q * p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,x20,xzr + + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + + adds x14,x15,x21 + adcs x15,x16,x22 + adcs x16,x17,x23 + adcs x17,x19,x24 + adc x19,x20,xzr + + // load b3 + ldr x3,[x2,#24] + + // c += a * b1 + mul x21,x4,x3 + mul x22,x5,x3 + mul x23,x6,x3 + mul x24,x7,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,xzr,xzr + + umulh x21,x4,x3 + umulh x22,x5,x3 + umulh x23,x6,x3 + umulh x24,x7,x3 + + adds x15,x15,x21 + adcs x16,x16,x22 + adcs x17,x17,x23 + adcs x19,x19,x24 + adc x20,x20,xzr + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // c = (c + q * p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adcs x17,x17,x24 + adcs x19,x19,xzr + adc x20,x20,xzr + + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + + adds x14,x15,x21 + adcs x15,x16,x22 + adcs x16,x17,x23 + adcs x17,x19,x24 + adc x19,x20,xzr + + // 
(borrow, t) = c - p + // return borrow ? c : (c - p) + + subs x21,x14,x10 + sbcs x22,x15,x11 + sbcs x23,x16,x12 + sbcs x24,x17,x13 + sbcs xzr,x19,xzr + + // if borrow + csel x14,x14,x21,lo + csel x15,x15,x22,lo + csel x16,x16,x23,lo + csel x17,x17,x24,lo + + // output + stp x14,x15,[x0] + stp x16,x17,[x0,#16] + + ret + + + +// mu = -n^-1 mod 2^64 +// sage: n = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123 +// sage: mu = -(IntegerModRing(2^64)(n))^-1 +Lmodn_mu: +.quad 0x327f9e8872350975 + + +.globl func(sm2_z256_modn_mont_mul) +.align 4 + +func(sm2_z256_modn_mont_mul): + + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + // mu = -n^-1 mod 2^64 + ldr x9,Lmodn_mu + + // load modp + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + // load a + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // load b0 + ldr x3,[x2] + + bl __sm2_z256_modn_mont_mul + + add sp,x29,#0 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x29,x30,[sp],#64 + ret + + + +// mont(mont(a), 1) = aR * 1 * R^-1 (mod p) = a (mod p) +.globl func(sm2_z256_modn_from_mont) + +.align 4 +func(sm2_z256_modn_from_mont): + + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + // mu = -p^-1 mod 2^64 + ldr x9,Lmodn_mu + + // load p + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + // load a + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // b = {1,0,0,0} + adr x2,Lone + // b0 = 1 + mov x3,#1 + + bl __sm2_z256_modn_mont_mul + + add sp,x29,#0 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x29,x30,[sp],#64 + ret + + + +// 2^512 mod n = 0x1eb5e412a22b3d3b620fc84c3affe0d43464504ade6fa2fa901192af7c114f20 +Lsm2_z256_modn_2e512: +.quad 0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b + +// mont(a) = a * 2^256 (mod p) = mont_mul(a, 2^512 mod p) +.globl func(sm2_z256_modn_to_mont) +.align 6 + +func(sm2_z256_modn_to_mont): + + stp x29,x30,[sp,#-64]! 
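__sm2_z256_modn_mont_mul above is a textbook word-serial Montgomery multiplication: for each word b[i] it accumulates a*b[i], then adds q*n with q = mu*c[0] mod 2^64 (mu = Lmodn_mu = -n^-1 mod 2^64), which zeroes the low accumulator word so everything shifts down by one word; two spare top words (x19/x20) absorb the intermediate carries, and one conditional subtraction of n at the end suffices because the result stays below 2n. A hedged C reference of the same loop, using the constants defined above; the function name and the 6-word accumulator are illustrative.

#include <stdint.h>

/* SM2 group order n and mu = -n^-1 mod 2^64, as in Lmodn / Lmodn_mu above. */
static const uint64_t N[4] = {
	0x53bbf40939d54123ULL, 0x7203df6b21c6052bULL,
	0xffffffffffffffffULL, 0xfffffffeffffffffULL
};
static const uint64_t N_MU = 0x327f9e8872350975ULL;

/* r = a * b * 2^-256 mod n, for a, b < n; mirrors __sm2_z256_modn_mont_mul. */
static void modn_mont_mul_ref(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t c[6] = { 0 };     /* accumulator; c[4], c[5] play the role of x19/x20 */
	uint64_t q, d[4], borrow, mask;
	unsigned __int128 t;
	int i, j;

	for (i = 0; i < 4; i++) {
		t = 0;                                     /* c += a * b[i] */
		for (j = 0; j < 4; j++) {
			t += (unsigned __int128)a[j] * b[i] + c[j];
			c[j] = (uint64_t)t;
			t >>= 64;
		}
		t += c[4]; c[4] = (uint64_t)t; c[5] += (uint64_t)(t >> 64);

		q = c[0] * N_MU;                           /* q = mu * c[0] mod 2^64 */
		t = 0;                                     /* c += q * n; c[0] becomes 0 */
		for (j = 0; j < 4; j++) {
			t += (unsigned __int128)q * N[j] + c[j];
			c[j] = (uint64_t)t;
			t >>= 64;
		}
		t += c[4]; c[4] = (uint64_t)t; c[5] += (uint64_t)(t >> 64);

		for (j = 0; j < 5; j++)                    /* c /= 2^64 */
			c[j] = c[j + 1];
		c[5] = 0;
	}

	borrow = 0;                                    /* if c >= n then c -= n */
	for (j = 0; j < 4; j++) {
		d[j] = c[j] - N[j] - borrow;
		borrow = (c[j] < N[j]) || (borrow && c[j] == N[j]);
	}
	mask = 0 - (uint64_t)(c[4] >= borrow);
	for (j = 0; j < 4; j++)
		r[j] = (d[j] & mask) | (c[j] & ~mask);
}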
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + // mu = -p^-1 mod 2^64 + ldr x9,Lmodn_mu + + // load modp + ldr x10,Lmodn + ldr x11,Lmodn+8 + ldr x12,Lmodn+16 + ldr x13,Lmodn+24 + + // swap args x0,x1 = x1,x0 + mov x3,x1 + mov x1,x0 + mov x0,x3 + + // load a + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + // load b = 2^512 mod p + adr x2,Lsm2_z256_modn_2e512 + // load b0 + ldr x3,Lsm2_z256_modn_2e512 + + bl __sm2_z256_modn_mont_mul + + add sp,x29,#0 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x29,x30,[sp],#64 + ret + + +.align 4 +__sm2_z256_modn_mont_sqr: + + // L(a0*a0) H(a0*a0) L(a1*a1) H(a1*a1) L(a2*a2) H(a2*a2) L(a3*a3) H(a3*a3) + // 2* L(a0*a1) L(a0*a2) L(a0*a3) + // 2* H(a0*a1) H(a0*a2) H(a0*a3) + // 2* L(a1*a2) L(a1*a3) + // 2* H(a1*a2) H(a1*a3) + + mul x15,x5,x4 + umulh x22,x5,x4 + mul x16,x6,x4 + umulh x23,x6,x4 + mul x17,x7,x4 + umulh x19,x7,x4 + + adds x16,x16,x22 + mul x21,x6,x5 + umulh x22,x6,x5 + adcs x17,x17,x23 + mul x23,x7,x5 + umulh x24,x7,x5 + adc x19,x19,xzr + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x22,x22,x23 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x23,x24,xzr // can't overflow + + adds x17,x17,x21 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x22 + mul x22,x5,x5 // a[1]*a[1] + adcs x20,x20,x23 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x23,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x24,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x22 + adcs x17,x17,x5 + adcs x19,x19,x23 + adcs x20,x20,x6 + adcs x1,x1,x24 + adc x2,x2,x7 + + // round 0 + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // C = (C + q*p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + adds x14,x14,x21 + adcs x14,x15,x22 + adcs x15,x16,x23 + adcs x16,x17,x24 + adc x17,xzr,xzr + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adc x17,x17,x24 + + // round 1 + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // C = (C + q*p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + adds x14,x14,x21 + adcs x14,x15,x22 + adcs x15,x16,x23 + adcs x16,x17,x24 + adc x17,xzr,xzr + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adc x17,x17,x24 + + + // round 2 + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // C = (C + q*p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + adds x14,x14,x21 + adcs x14,x15,x22 + adcs x15,x16,x23 + adcs x16,x17,x24 + adc x17,xzr,xzr + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adc x17,x17,x24 + + // round 3 + + + // q = mu * c0 mod 2^64 + mul x3,x9,x14 + + // C = (C + q*p) // 2^64 + mul x21,x10,x3 + mul x22,x11,x3 + mul x23,x12,x3 + mul x24,x13,x3 + adds x14,x14,x21 + adcs x14,x15,x22 + adcs x15,x16,x23 + adcs x16,x17,x24 + adc x17,xzr,xzr + umulh x21,x10,x3 + umulh x22,x11,x3 + umulh x23,x12,x3 + umulh x24,x13,x3 + adds x14,x14,x21 + adcs x15,x15,x22 + adcs x16,x16,x23 + adc x17,x17,x24 + + // add upper half + adds x14,x14,x19 + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + // if c >= p, 
c = c - p
+	subs x21,x14,x10
+	sbcs x22,x15,x11
+	sbcs x23,x16,x12
+	sbcs x24,x17,x13
+	sbcs xzr,x19,xzr
+
+	csel x14,x14,x21,lo
+	csel x15,x15,x22,lo
+	csel x16,x16,x23,lo
+	csel x17,x17,x24,lo
+
+	stp x14,x15,[x0]
+	stp x16,x17,[x0,#16]
+
+	ret
+
+
+.globl func(sm2_z256_modn_mont_sqr)
+.align 4
+
+func(sm2_z256_modn_mont_sqr):
+	stp x29,x30,[sp,#-64]!
+	add x29,sp,#0
+	stp x19,x20,[sp,#16]
+	stp x21,x22,[sp,#32]
+	stp x23,x24,[sp,#48]
+
+	// mu = -n^-1 mod 2^64
+	ldr x9,Lmodn_mu
+
+	// load modn
+	ldr x10,Lmodn
+	ldr x11,Lmodn+8
+	ldr x12,Lmodn+16
+	ldr x13,Lmodn+24
+
+	// load a
+	ldp x4,x5,[x1]
+	ldp x6,x7,[x1,#16]
+
+	bl __sm2_z256_modn_mont_sqr
+
+	add sp,x29,#0
+	ldp x19,x20,[x29,#16]
+	ldp x21,x22,[x29,#32]
+	ldp x23,x24,[x29,#48]
+	ldp x29,x30,[sp],#64
+	ret
+
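Taken together, sm2_z256_modn_to_mont multiplies by 2^512 mod n to enter the Montgomery domain, sm2_z256_modn_mont_mul and _sqr work inside it, and sm2_z256_modn_from_mont multiplies by {1,0,0,0} to leave it. The sketch below shows how an ordinary product modulo n composes from these calls; the prototypes are only inferred from register usage in this file (note the x0/x1 swap at the top of to_mont, which suggests it takes the input first), so check the GmSSL headers for the authoritative declarations.

#include <stdint.h>

/* Prototypes inferred from the assembly wrappers in this file; sketch only. */
void sm2_z256_modn_to_mont(const uint64_t a[4], uint64_t r[4]);
void sm2_z256_modn_from_mont(uint64_t r[4], const uint64_t a[4]);
void sm2_z256_modn_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);

/* r = a * b mod n, with a, b, r in ordinary (non-Montgomery) form. */
static void modn_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t ta[4], tb[4], tr[4];

	sm2_z256_modn_to_mont(a, ta);            /* ta = a * 2^256 mod n */
	sm2_z256_modn_to_mont(b, tb);            /* tb = b * 2^256 mod n */
	sm2_z256_modn_mont_mul(tr, ta, tb);      /* tr = a * b * 2^256 mod n */
	sm2_z256_modn_from_mont(r, tr);          /* r  = a * b mod n */
}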