/*
 * Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 */

#include

.text
.align	5

#define neg_p1 0xffffffff
#define neg_p3 0x100000000

// 2^256 - p
Lneg_p:
.quad	1, neg_p1, 0, neg_p3

// 2^512 mod p
Lz256_2e512modp:
.quad	0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002

Lone:
.quad	1, 0, 0, 0

Lmodn:
.quad	0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff

.align	4
__sm2_z256_modp_add:
	// carry, a = a + b
	adds	x14,x14,x8
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr

	// carry, b = a + (2^256 - p) = (a + b - p) + 2^256
	adds	x8,x14,#1
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,x1,xzr
	cmp	x1,xzr

	// if carry == 0, i.e. (a + b - p) < 0, return a = (a + b)
	// else return b = (a + b - p)
	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq

	stp	x14,x15,[x0]
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modp_add)
.align	4
func(sm2_z256_modp_add):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	// load a
	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	// load b
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_add

	ldp	x29,x30,[sp],#16
	ret

.globl	func(sm2_z256_modp_dbl)
.align	4
func(sm2_z256_modp_dbl):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	// load a
	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	// b = a
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	// set (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_add

	ldp	x29,x30,[sp],#16
	ret

.globl	func(sm2_z256_modp_tri)
.align	4
func(sm2_z256_modp_tri):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	// b = a
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	// c = a
	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	// a = a + b = 2a
	bl	__sm2_z256_modp_add

	// b = c = a
	mov	x8,x4
	mov	x9,x5
	mov	x10,x6
	mov	x11,x7

	// a = a + b = 2a + a = 3a
	bl	__sm2_z256_modp_add

	ldp	x29,x30,[sp],#16
	ret

// a - b (mod p)
.align	4
__sm2_z256_modp_sub:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]

	// a = a - b
	subs	x14,x14,x8
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr

	// b = a - (2^256 - p) = a - b + p - 2^256
	subs	x8,x14,#1
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	cmp	x1,xzr

	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]
	ret

// b - a (mod p)
.align	4
__sm2_z256_modp_neg_sub:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]

	// a = b - a
	subs	x14,x8,x14
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr

	// b = a - (2^256 - p) = b - a + p - 2^256
	subs	x8,x14,#1
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	cmp	x1,xzr

	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modp_sub)
.align	4
func(sm2_z256_modp_sub):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_sub

	ldp	x29,x30,[sp],#16
	ret
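// The helpers above avoid loading the full modulus: only two limbs of
// 2^256 - p = { 1, 0xffffffff, 0, 0x100000000 } are non-trivial (kept in
// x12, x13), so a sum is reduced by adding 2^256 - p and selecting on the
// carry, and a difference by the mirror-image subtraction. A minimal C
// reference sketch of __sm2_z256_modp_add (assumed names; the assembly
// selects with csel so it stays constant-time):
//
//	#include <stdint.h>
//	typedef unsigned __int128 u128;
//
//	static const uint64_t NEG_P[4] = { 1, 0xffffffffULL, 0, 0x100000000ULL };
//
//	static uint64_t add4(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
//	{
//		u128 t = 0;
//		for (int i = 0; i < 4; i++) {
//			t += (u128)a[i] + b[i];
//			r[i] = (uint64_t)t;
//			t >>= 64;
//		}
//		return (uint64_t)t;	// carry out
//	}
//
//	// r = a + b (mod p), for a, b < p
//	static void modp_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
//	{
//		uint64_t s[4], t[4];
//		uint64_t c = add4(s, a, b);	// carry, s = a + b
//		c |= add4(t, s, NEG_P);		// carry, t = (a + b - p) + 2^256
//		for (int i = 0; i < 4; i++)
//			r[i] = c ? t[i] : s[i];	// any carry means a + b >= p
//	}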
.globl	func(sm2_z256_modp_neg)
.align	4
func(sm2_z256_modp_neg):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	// r = 0 - a (mod p)
	mov	x2,x1
	mov	x14,xzr
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr

	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_sub

	ldp	x29,x30,[sp],#16
	ret
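// __sm2_z256_modp_mont_mul below interleaves a 4x4 schoolbook product with
// four Montgomery reduction rounds. Because p = 2^256 - 2^224 - 2^96 +
// 2^64 - 1 satisfies p = -1 (mod 2^64), the per-round quotient is simply
// q = acc[0] (mu = -p^-1 mod 2^64 = 1), and q*p is assembled from shifts
// instead of multiplications. A plain-C sketch of the same algorithm
// (assumed names; the assembly is an unrolled, scheduled version of this):
//
//	#include <stdint.h>
//	typedef unsigned __int128 u128;
//
//	static const uint64_t P[4] = {
//		0xffffffffffffffffULL, 0xffffffff00000000ULL,
//		0xffffffffffffffffULL, 0xfffffffeffffffffULL
//	};
//
//	// r = a * b * 2^-256 mod p (inputs in Montgomery form)
//	static void modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
//	{
//		uint64_t acc[6] = {0};
//		for (int i = 0; i < 4; i++) {
//			u128 t = 0;
//			for (int j = 0; j < 4; j++) {	// acc += a * b[i]
//				t += (u128)a[j] * b[i] + acc[j];
//				acc[j] = (uint64_t)t;
//				t >>= 64;
//			}
//			t += acc[4];
//			acc[4] = (uint64_t)t;
//			acc[5] += (uint64_t)(t >> 64);
//			uint64_t q = acc[0];		// mu = 1
//			t = 0;
//			for (int j = 0; j < 4; j++) {	// acc += q * p (zeroes acc[0])
//				t += (u128)q * P[j] + acc[j];
//				acc[j] = (uint64_t)t;
//				t >>= 64;
//			}
//			t += acc[4];
//			acc[4] = (uint64_t)t;
//			acc[5] += (uint64_t)(t >> 64);
//			for (int j = 0; j < 5; j++)	// acc /= 2^64
//				acc[j] = acc[j + 1];
//			acc[5] = 0;
//		}
//		// acc < 2p here; subtract p once if acc >= p
//		uint64_t d[4], bw = 0;
//		for (int j = 0; j < 4; j++) {
//			u128 t = (u128)acc[j] - P[j] - bw;
//			d[j] = (uint64_t)t;
//			bw = (uint64_t)(t >> 64) & 1;
//		}
//		int ge = acc[4] | (bw ^ 1);		// acc >= p
//		for (int j = 0; j < 4; j++)
//			r[j] = ge ? d[j] : acc[j];
//	}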
.align	4
__sm2_z256_modp_mont_mul:
	// a * b0
	mul	x14,x4,x3	// a[0]*b[0]
	umulh	x8,x4,x3
	mul	x15,x5,x3	// a[1]*b[0]
	umulh	x9,x5,x3
	mul	x16,x6,x3	// a[2]*b[0]
	umulh	x10,x6,x3
	mul	x17,x7,x3	// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]	// b[1]

	adds	x15,x15,x8
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr

	lsl	x10,x14,#32
	lsr	x11,x14,#32

	// p = 2^256 - 2^224 - 2^96 + 2^64 - 1
	// R = 2^64
	// p * a0 = (a0 * R^4 + a0 * R) - (a0 * 2^32 * R^3 + a0 * 2^32 * R + a0)
	//   [   a4   ][   a3   ][   a2   ][   a1   ][   a0   ]
	//   [   a0   ]     0         0    [   a0   ]     0
	// - [ a0>>32 ][ a0<<32 ][ a0>>32 ][ a0<<32 ][   a0   ]
	// here x10 = a0 << 32, x11 = a0 >> 32
	//subs	x10,x14,x8
	//sbc	x11,x14,x9
	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	mul	x8,x4,x3	// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3	// lo(a[1]*b[i])
	adcs	x16,x17,x10
	mul	x10,x6,x3	// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3	// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8	// accumulate low parts of multiplication
	umulh	x8,x4,x3	// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3	// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3	// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3	// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]

	adds	x15,x15,x8	// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr

	lsl	x10,x14,#32
	lsr	x11,x14,#32

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	mul	x8,x4,x3	// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3	// lo(a[1]*b[i])
	adcs	x16,x17,x10
	mul	x10,x6,x3	// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3	// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8	// accumulate low parts of multiplication
	umulh	x8,x4,x3	// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3	// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3	// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3	// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]

	adds	x15,x15,x8	// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr

	lsl	x10,x14,#32	// t0
	lsr	x11,x14,#32	// t1

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	mul	x8,x4,x3	// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3	// lo(a[1]*b[i])
	adcs	x16,x17,x10
	mul	x10,x6,x3	// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3	// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8	// accumulate low parts of multiplication
	umulh	x8,x4,x3	// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3	// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3	// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3	// hi(a[3]*b[i])
	adc	x19,x19,xzr

	adds	x15,x15,x8	// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr

	lsl	x10,x14,#32	// t0
	lsr	x11,x14,#32	// t1

	// last reduction
	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	adcs	x15,x16,x9
	adcs	x16,x17,x10
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	// if a >= p: return a - p
	// else: return a
	// carry, b = a + (2^256 - p)
	adds	x8,x14,#1
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x19,x19,xzr
	cmp	x19,xzr

	// if a + 2^256 - p produced no carry, then a < p (a - p would be
	// negative), so return a; otherwise return b
	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq

	stp	x14,x15,[x0]
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modp_mont_mul)
.align	4
func(sm2_z256_modp_mont_mul):
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	// load a
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// load b0
	ldr	x3,[x2]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_mont_mul

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
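// Squaring reuses the structure above but halves the cross products: the
// off-diagonal terms a[i]*a[j] (i < j) are computed once and doubled, then
// the diagonal squares a[i]^2 are added, followed by the same four
// reduction rounds. A C sketch of the 512-bit square, companion to the
// modp_mont_mul sketch above (assumed names):
//
//	// c[0..7] = a^2
//	static void sqr512(uint64_t c[8], const uint64_t a[4])
//	{
//		u128 t;
//		for (int i = 0; i < 8; i++) c[i] = 0;
//		for (int i = 0; i < 3; i++) {	// off-diagonal products, once
//			t = 0;
//			for (int j = i + 1; j < 4; j++) {
//				t += (u128)a[i] * a[j] + c[i + j];
//				c[i + j] = (uint64_t)t;
//				t >>= 64;
//			}
//			c[i + 4] = (uint64_t)t;
//		}
//		uint64_t top = 0;		// double: c = 2*c
//		for (int i = 0; i < 8; i++) {
//			uint64_t hi = c[i] >> 63;
//			c[i] = (c[i] << 1) | top;
//			top = hi;
//		}
//		t = 0;				// add the diagonal a[i]^2
//		for (int i = 0; i < 4; i++) {
//			u128 s = (u128)a[i] * a[i];
//			t += (u128)c[2*i] + (uint64_t)s;
//			c[2*i] = (uint64_t)t;
//			t >>= 64;
//			t += (u128)c[2*i + 1] + (uint64_t)(s >> 64);
//			c[2*i + 1] = (uint64_t)t;
//			t >>= 64;
//		}
//	}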
.align	4
__sm2_z256_modp_mont_sqr:
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x]
	//
	// "can't overflow" below marks carrying into the high part of a
	// multiplication result, which can't overflow because the high
	// part can never be all ones.

	mul	x15,x5,x4	// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4	// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4	// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9	// accumulate high parts of multiplication
	mul	x8,x6,x5	// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5	// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr	// can't overflow

	mul	x20,x7,x6	// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10	// accumulate high parts of multiplication
	mul	x14,x4,x4	// a[0]*a[0]
	adc	x10,x11,xzr	// can't overflow

	adds	x17,x17,x8	// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5	// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr	// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6	// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7	// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4	// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x10,x14,#32
	adcs	x1,x1,x11
	lsr	x11,x14,#32
	adc	x2,x2,x7

	// now x2,x1,x20,x19,x17,x16,x15,x14 hold a^2

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	adcs	x15,x16,x9
	adcs	x16,x17,x10
	adc	x17,x11,xzr	// can't overflow

	lsl	x10,x14,#32
	lsr	x11,x14,#32

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	adcs	x15,x16,x9
	adcs	x16,x17,x10
	adc	x17,x11,xzr	// can't overflow

	lsl	x10,x14,#32
	lsr	x11,x14,#32

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	adcs	x15,x16,x9
	adcs	x16,x17,x10
	adc	x17,x11,xzr	// can't overflow

	lsl	x10,x14,#32
	lsr	x11,x14,#32

	subs	x8,x14,x10
	sbcs	x9,xzr,x11
	sbcs	x10,xzr,x10
	sbc	x11,x14,x11

	adds	x14,x15,x8	// acc = (acc + acc[0]*p) / 2^64
	adcs	x15,x16,x9
	adcs	x16,x17,x10
	adc	x17,x11,xzr	// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	// carry, b = a + (2^256 - p)
	adds	x8,x14,#1
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x19,x19,xzr
	cmp	x19,xzr

	// if a + 2^256 - p produced no carry, then a < p (a - p would be
	// negative), so return a; otherwise return b
	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq

	stp	x14,x15,[x0]
	stp	x16,x17,[x0,#16]
	// for chained squarings it would be better to leave the result in
	// x4,x5,x6,x7 and skip the store to [x0]
	ret

.globl	func(sm2_z256_modp_mont_sqr)
.align	4
func(sm2_z256_modp_mont_sqr):
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_mont_sqr

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret

// compute r = r^(2^n), i.e. n squarings in a row
// this calls __sm2_z256_modp_mont_sqr, whose input is x4,x5,x6,x7 and whose
// output is x14,x15,x16,x17 and is also stored to [x0]
// for chained squarings the store to memory is unnecessary and the output
// registers should match the input ones; that would need a variant of
// mont_sqr, though the overhead of leaving it as-is is small
.globl	func(sm2_z256_modp_mont_esq)
.align	4
func(sm2_z256_modp_mont_esq):
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x0]
	ldp	x6,x7,[x0,#16]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	// x1 is clobbered inside sqr, so it cannot hold the counter;
	// x18 is left untouched; nothing is really saved here
	mov	x3, x1

22:
	// input to __sm2_z256_modp_mont_sqr is x4,x5,x6,x7
	bl	__sm2_z256_modp_mont_sqr
	// after it returns, move the result back into x4,x5,x6,x7
	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	subs	x3, x3, #1
	b.ne	22b

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret

// mont(a) = a * 2^256 (mod p) = mont_mul(a, 2^512 mod p)
.globl	func(sm2_z256_modp_to_mont)
.align	6
func(sm2_z256_modp_to_mont):
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	// swap args: x0 = r, x1 = a
	mov	x3,x1
	mov	x1,x0
	mov	x0,x3

	// load b = 2^512 mod p, b0
	adr	x2,Lz256_2e512modp
	ldr	x3,Lz256_2e512modp

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_mont_mul

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret

// here b == 1, so a faster dedicated implementation is possible, but this
// conversion is not used much, so it is not worth special optimization
// mont(mont(a), 1) = aR * 1 * R^-1 (mod p) = a (mod p)
.globl	func(sm2_z256_modp_from_mont)
.align	4
func(sm2_z256_modp_from_mont):
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	// load b = {1,0,0,0}
	adr	x2,Lone
	// load b0 = 1
	mov	x3,#1

	bl	__sm2_z256_modp_mont_mul

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret

.align	4
__sm2_z256_modp_haf:
	// a - (2^256 - p) == a + p - 2^256
	subs	x8,x14,#1
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	// (a + p - 2^256) + 2^256
	adcs	x1,xzr,xzr

	// r = (a is even) ? a : (a - (2^256 - p) + 2^256) = a + p
	tst	x14,#1
	csel	x14,x14,x8,eq
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	// r = r >> 1
	lsr	x14,x14,#1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modp_haf)
.align	4
func(sm2_z256_modp_haf):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	mov	x12,#neg_p1
	mov	x13,#neg_p3

	bl	__sm2_z256_modp_haf

	ldp	x29,x30,[sp],#16
	ret
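// Point doubling below follows the standard Jacobian doubling shape for
// curves with a = -3 (SM2, like NIST P-256, has a = -3). In field-op
// pseudocode, with every operation mod p in the Montgomery domain, the
// sequence that the p256_*-style comments refer to is:
//
//	S    = (2*Y1)^2                   // 4*Y1^2
//	Z3   = 2*Y1*Z1
//	M    = 3*(X1 + Z1^2)*(X1 - Z1^2)  // 3*(X1^2 - Z1^4), uses a = -3
//	Y3'  = (S^2)/2                    // 8*Y1^4
//	S    = S*X1                       // 4*X1*Y1^2
//	X3   = M^2 - 2*S
//	Y3   = M*(S - X3) - Y3'
//
// The /2 step is the reason __sm2_z256_modp_haf exists.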
.globl	func(sm2_z256_point_dbl)
.align	5
func(sm2_z256_point_dbl):
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4	// four 256-bit temporaries

Ldouble_shortcut:
	// a Jacobian point has 3 coordinates, 32 bytes each:
	// X at [0,32), Y at [32,64), Z at [64,96)

	// x14-x17 = Y
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	// x21, x22 save x0, x1: x21 = out, x22 = in; they must be preserved
	// because every __foo helper writes its output to the address in x0

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	// x8-x11 = x14-x17 = Y
	mov	x8,x14
	mov	x9,x15
	// x4-x7 = Z: sqr takes x4-x7 as its input
	// x22 == x1
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]

	// S = T[0]
	add	x0,sp,#0	// output goes to a scratch slot, not the final address
	// 1. S = 2*Y1
	bl	__sm2_z256_modp_add	// p256_mul_by_2(S, in_y);

	// Zsqr = T[2]
	add	x0,sp,#64
	// 2. Zsqr = Z1^2
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Zsqr, in_z);

	// x8-x11 = X
	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]

	// x4-x7 = x14-x17 = Zsqr
	mov	x4,x14	// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	// M = T[1]
	add	x0,sp,#32
	// 6. M = X1 + Zsqr = X1 + Z1^2
	bl	__sm2_z256_modp_add	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4	// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]

	add	x0,sp,#64
	// 7. Zsqr = X1 - Z1^2
	bl	__sm2_z256_modp_neg_sub	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	// 3. S = S^2 = 4*Y1^2
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	// 4. tmp0 = Z1 * Y1
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(tmp0, in_z, in_y);

	// the result has already been stored to memory, so copy it between
	// registers before computing on it again
	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	// mov	x0,x21	// now the first slot holds a z256
	// add	sp,x29,#0
	// ldp	x19,x20,[x29,#16]
	// ldp	x21,x22,[x29,#32]
	// ldp	x29,x30,[sp],#96
	// ret
	// Z3 = 2YZ
	bl	__sm2_z256_modp_add	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__sm2_z256_modp_haf	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14	// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14	// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__sm2_z256_modp_add
	mov	x8,x4	// restore M
	mov	x9,x5
	ldr	x3,[x22]	// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__sm2_z256_modp_add	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__sm2_z256_modp_add	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0	// output X3
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__sm2_z256_modp_sub	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__sm2_z256_modp_neg_sub	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14	// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32	// output Y3
	bl	__sm2_z256_modp_sub	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0	// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	ret
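// General Jacobian addition, the shape implemented below (field-op
// pseudocode, everything mod p in the Montgomery domain):
//
//	Z1sqr = Z1^2,       Z2sqr = Z2^2
//	U1 = X1*Z2sqr,      U2 = X2*Z1sqr
//	S1 = Y1*Z2sqr*Z2,   S2 = Y2*Z1sqr*Z1
//	H = U2 - U1,        R = S2 - S1
//	X3 = R^2 - H^3 - 2*U1*H^2
//	Y3 = R*(U1*H^2 - X3) - S1*H^3
//	Z3 = Z1*Z2*H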
.globl	func(sm2_z256_point_add)
.align	5
func(sm2_z256_point_add):
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3
	//ldr	x12,Lpoly+8
	//ldr	x13,Lpoly+24

	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne	// ~in2infty
	add	x0,sp,#192
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne	// ~in1infty
	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__sm2_z256_modp_sub	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__sm2_z256_modp_sub	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	Ldouble_shortcut
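// The branch above: the generic formulas break down only when the inputs
// are the same finite point (H == 0 and R == 0), in which case the code
// falls through to Ldouble_shortcut. If H == 0 but R != 0 (P == -Q), then
// Z3 = Z1*Z2*H comes out zero, so the formulas naturally yield the point
// at infinity. The test in C terms (x24/x25 are all-ones/all-zero masks,
// x26 is nonzero iff S1 != S2):
//
//	int proceed = H_nonzero | in1_is_infinity | in2_is_infinity | R_nonzero;
//	if (!proceed)
//		return point_dbl(in1);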
.align	4
Ladd_proceed:
	add	x0,sp,#192
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__sm2_z256_modp_add	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__sm2_z256_modp_neg_sub	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__sm2_z256_modp_sub	// p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__sm2_z256_modp_neg_sub	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__sm2_z256_modp_sub	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]	// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]	// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]

	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]

	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

Ladd_done:
	add	sp,x29,#0	// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
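// Mixed addition: the second operand is affine (Z2 == 1), so Z2sqr, U1 = X1
// and S1 = Y1 need no computation, which saves roughly one squaring and
// four multiplications per addition compared with the general routine above:
//
//	U2 = X2*Z1^2,  S2 = Y2*Z1^3
//	H = U2 - X1,   R = S2 - Y1
//	X3 = R^2 - H^3 - 2*X1*H^2
//	Y3 = R*(X1*H^2 - X3) - Y1*H^3
//	Z3 = Z1*H
//
// When one input is infinity the result is selected at the end; for the
// in1-infinity case the affine in2 must be promoted to Jacobian form, whose
// Z is mont(1) = 2^256 - p, read from Lneg_p via the adr in the selection
// code below.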
.globl	func(sm2_z256_point_add_affine)
.align	5
func(sm2_z256_point_add_affine):
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2

	// load (2^256 - p)
	mov	x12,#neg_p1
	mov	x13,#neg_p3

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne	// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne	// ~in2infty

	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__sm2_z256_modp_sub	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__sm2_z256_modp_sub	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__sm2_z256_modp_mont_sqr	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__sm2_z256_modp_add	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__sm2_z256_modp_neg_sub	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__sm2_z256_modp_sub	// p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__sm2_z256_modp_neg_sub	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__sm2_z256_modp_mont_mul	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__sm2_z256_modp_sub	// p256_sub(res_y, res_y, S2);
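	// Branch-free result selection follows, as in sm2_z256_point_add:
	// x24 = ~in1infty and x25 = ~in2infty are all-ones/all-zero masks,
	// and each coordinate is picked with csel so the branch pattern is
	// input-independent. Per coordinate, in C terms:
	//
	//	tmp = in1_finite ? res : in2;	// cmp x24,#0 + csel ...,ne
	//	out = in2_finite ? tmp : in1;	// cmp x25,#0 + csel ...,ne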
	ldp	x4,x5,[sp,#0]	// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]	// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	// redirect in2 so that [x23,#64] reads Lneg_p = 2^256 - p = mont(1),
	// the Z coordinate of the affine in2 promoted to Jacobian form
	adr	x23,Lneg_p-64

	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]

	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0	// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0	// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0	// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
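// Arithmetic modulo the group order n starts here. n has no limb structure
// worth exploiting, so the full modulus is kept in x10-x13 (from Lmodn) and
// reduction is the textbook trial subtraction. A C sketch of
// __sm2_z256_modn_add (assumed names, reusing add4 from the mod-p sketch
// near the top of this file):
//
//	static const uint64_t N[4] = {
//		0x53bbf40939d54123ULL, 0x7203df6b21c6052bULL,
//		0xffffffffffffffffULL, 0xfffffffeffffffffULL
//	};
//
//	// r = a + b (mod n), for a, b < n
//	static void modn_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
//	{
//		uint64_t s[4], d[4], bw = 0;
//		uint64_t carry = add4(s, a, b);
//		for (int i = 0; i < 4; i++) {	// trial subtraction d = s - n
//			u128 t = (u128)s[i] - N[i] - bw;
//			d[i] = (uint64_t)t;
//			bw = (uint64_t)(t >> 64) & 1;
//		}
//		int keep = (bw && !carry);	// borrow and no carry: a + b < n
//		for (int i = 0; i < 4; i++)
//			r[i] = keep ? s[i] : d[i];
//	}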
.align	4
__sm2_z256_modn_add:
	// (carry, a) = a + b
	adds	x14,x14,x4
	adcs	x15,x15,x5
	adcs	x16,x16,x6
	adcs	x17,x17,x7
	adc	x1,xzr,xzr

	// (borrow, b) = (carry, a) - n = a + b - n
	subs	x4,x14,x10
	sbcs	x5,x15,x11
	sbcs	x6,x16,x12
	sbcs	x7,x17,x13
	sbcs	xzr,x1,xzr

	// if borrow (lo), b is not the answer
	csel	x14,x14,x4,lo
	csel	x15,x15,x5,lo
	csel	x16,x16,x6,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x7,lo
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modn_add)
.align	4
func(sm2_z256_modn_add):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	ldp	x4,x5,[x2]
	ldp	x6,x7,[x2,#16]

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	bl	__sm2_z256_modn_add

	ldp	x29,x30,[sp],#16
	ret

.align	4
__sm2_z256_modn_sub:
	// load b
	ldp	x4,x5,[x2]
	ldp	x6,x7,[x2,#16]

	// borrow, r = a - b
	subs	x14,x14,x4
	sbcs	x15,x15,x5
	sbcs	x16,x16,x6
	sbcs	x17,x17,x7
	sbc	x1,xzr,xzr

	// b = r + n = a - b + n
	adds	x4,x14,x10
	adcs	x5,x15,x11
	adcs	x6,x16,x12
	adcs	x7,x17,x13

	// return (borrow == 0) ? r : (a - b + n)
	cmp	x1,xzr
	csel	x14,x14,x4,eq
	csel	x15,x15,x5,eq
	csel	x16,x16,x6,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x7,eq
	stp	x16,x17,[x0,#16]
	ret

.globl	func(sm2_z256_modn_sub)
.align	4
func(sm2_z256_modn_sub):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	bl	__sm2_z256_modn_sub

	ldp	x29,x30,[sp],#16
	ret

.globl	func(sm2_z256_modn_neg)
.align	4
func(sm2_z256_modn_neg):
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	// r = 0 - a (mod n)
	mov	x2,x1
	mov	x14,xzr
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr

	bl	__sm2_z256_modn_sub

	ldp	x29,x30,[sp],#16
	ret
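// __sm2_z256_modn_mont_mul below runs the same CIOS loop as the mod-p
// multiplier, but n has no special form, so each round multiplies the low
// limb by the precomputed constant mu = -n^-1 mod 2^64 (Lmodn_mu further
// down) and then adds q*n with real multiplications. Relative to the
// modp_mont_mul sketch earlier, only the quotient line changes:
//
//	uint64_t q = 0x327f9e8872350975ULL * acc[0];	// q = mu * c0 mod 2^64
//	// then acc = (acc + q * N) >> 64, rounds i = 0..3 as before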
.align	4
__sm2_z256_modn_mont_mul:
	// x4,x5,x6,x7 = a0,a1,a2,a3
	// x3 = b[i], loaded one word at a time
	// x14,x15,x16,x17 (+ x19,x20) = accumulator c
	// x9 = mu, x10-x13 = n

	// c = b0 * a, len(c) = 5
	mul	x14,x4,x3
	umulh	x21,x4,x3
	mul	x15,x5,x3
	umulh	x22,x5,x3
	mul	x16,x6,x3
	umulh	x23,x6,x3
	mul	x17,x7,x3
	umulh	x24,x7,x3
	adds	x15,x15,x21
	adcs	x16,x16,x22
	adcs	x17,x17,x23
	adc	x19,xzr,x24

	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// c = (c + q * n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,xzr,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x15,x21
	adcs	x15,x16,x22
	adcs	x16,x17,x23
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// load b1
	ldr	x3,[x2,#8]

	// c += a * b1, len(c) = 6
	mul	x21,x4,x3
	mul	x22,x5,x3
	mul	x23,x6,x3
	mul	x24,x7,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,xzr,xzr
	umulh	x21,x4,x3
	umulh	x22,x5,x3
	umulh	x23,x6,x3
	umulh	x24,x7,x3
	adds	x15,x15,x21
	adcs	x16,x16,x22
	adcs	x17,x17,x23
	adcs	x19,x19,x24
	adc	x20,x20,xzr

	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// c = (c + q * n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,x20,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x15,x21
	adcs	x15,x16,x22
	adcs	x16,x17,x23
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// load b2
	ldr	x3,[x2,#16]

	// c += a * b2, len(c) = 6
	mul	x21,x4,x3
	mul	x22,x5,x3
	mul	x23,x6,x3
	mul	x24,x7,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,xzr,xzr
	umulh	x21,x4,x3
	umulh	x22,x5,x3
	umulh	x23,x6,x3
	umulh	x24,x7,x3
	adds	x15,x15,x21
	adcs	x16,x16,x22
	adcs	x17,x17,x23
	adcs	x19,x19,x24
	adc	x20,x20,xzr

	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// c = (c + q * n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,x20,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x15,x21
	adcs	x15,x16,x22
	adcs	x16,x17,x23
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// load b3
	ldr	x3,[x2,#24]

	// c += a * b3
	mul	x21,x4,x3
	mul	x22,x5,x3
	mul	x23,x6,x3
	mul	x24,x7,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,xzr,xzr
	umulh	x21,x4,x3
	umulh	x22,x5,x3
	umulh	x23,x6,x3
	umulh	x24,x7,x3
	adds	x15,x15,x21
	adcs	x16,x16,x22
	adcs	x17,x17,x23
	adcs	x19,x19,x24
	adc	x20,x20,xzr

	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// c = (c + q * n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adcs	x17,x17,x24
	adcs	x19,x19,xzr
	adc	x20,x20,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x15,x21
	adcs	x15,x16,x22
	adcs	x16,x17,x23
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// (borrow, t) = c - n
	// return borrow ? c : t
	subs	x21,x14,x10
	sbcs	x22,x15,x11
	sbcs	x23,x16,x12
	sbcs	x24,x17,x13
	sbcs	xzr,x19,xzr

	// if borrow, keep c
	csel	x14,x14,x21,lo
	csel	x15,x15,x22,lo
	csel	x16,x16,x23,lo
	csel	x17,x17,x24,lo

	// output
	stp	x14,x15,[x0]
	stp	x16,x17,[x0,#16]
	ret

// mu = -n^-1 mod 2^64
// sage: n = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123
// sage: mu = -(IntegerModRing(2^64)(n))^-1
Lmodn_mu:
.quad	0x327f9e8872350975

.globl	func(sm2_z256_modn_mont_mul)
.align	4
func(sm2_z256_modn_mont_mul):
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	// mu = -n^-1 mod 2^64
	ldr	x9,Lmodn_mu

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	// load a
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// load b0
	ldr	x3,[x2]

	bl	__sm2_z256_modn_mont_mul

	add	sp,x29,#0
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x29,x30,[sp],#64
	ret

// mont(mont(a), 1) = aR * 1 * R^-1 (mod n) = a (mod n)
.globl	func(sm2_z256_modn_from_mont)
.align	4
func(sm2_z256_modn_from_mont):
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	// mu = -n^-1 mod 2^64
	ldr	x9,Lmodn_mu

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	// load a
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// b = {1,0,0,0}
	adr	x2,Lone
	// b0 = 1
	mov	x3,#1

	bl	__sm2_z256_modn_mont_mul

	add	sp,x29,#0
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x29,x30,[sp],#64
	ret

// 2^512 mod n = 0x1eb5e412a22b3d3b620fc84c3affe0d43464504ade6fa2fa901192af7c114f20
Lsm2_z256_modn_2e512:
.quad	0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b
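// Domain conversions are plain Montgomery multiplications by constants:
// to_mont(a) = mont_mul(a, 2^512 mod n) = a*2^256 mod n, and
// from_mont(a) = mont_mul(a, 1) = a*2^-256 mod n. In C terms (assumed
// helper name for the routine above):
//
//	static const uint64_t R2_N[4] = {	// Lsm2_z256_modn_2e512
//		0x901192af7c114f20ULL, 0x3464504ade6fa2faULL,
//		0x620fc84c3affe0d4ULL, 0x1eb5e412a22b3d3bULL
//	};
//	static const uint64_t ONE[4] = { 1, 0, 0, 0 };
//
//	modn_mont_mul(am, a, R2_N);	// enter the Montgomery domain
//	modn_mont_mul(a, am, ONE);	// leave it again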
// mont(a) = a * 2^256 (mod n) = mont_mul(a, 2^512 mod n)
.globl	func(sm2_z256_modn_to_mont)
.align	6
func(sm2_z256_modn_to_mont):
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	// mu = -n^-1 mod 2^64
	ldr	x9,Lmodn_mu

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	// swap args x0,x1 = x1,x0
	mov	x3,x1
	mov	x1,x0
	mov	x0,x3

	// load a
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	// load b = 2^512 mod n
	adr	x2,Lsm2_z256_modn_2e512
	// load b0
	ldr	x3,Lsm2_z256_modn_2e512

	bl	__sm2_z256_modn_mont_mul

	add	sp,x29,#0
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x29,x30,[sp],#64
	ret

.align	4
__sm2_z256_modn_mont_sqr:
	//    L(a0*a0) H(a0*a0) L(a1*a1) H(a1*a1) L(a2*a2) H(a2*a2) L(a3*a3) H(a3*a3)
	// 2* L(a0*a1) L(a0*a2) L(a0*a3)
	// 2* H(a0*a1) H(a0*a2) H(a0*a3)
	// 2* L(a1*a2) L(a1*a3)
	// 2* H(a1*a2) H(a1*a3)
	mul	x15,x5,x4
	umulh	x22,x5,x4
	mul	x16,x6,x4
	umulh	x23,x6,x4
	mul	x17,x7,x4
	umulh	x19,x7,x4

	adds	x16,x16,x22
	mul	x21,x6,x5
	umulh	x22,x6,x5
	adcs	x17,x17,x23
	mul	x23,x7,x5
	umulh	x24,x7,x5
	adc	x19,x19,xzr

	mul	x20,x7,x6	// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x22,x22,x23	// accumulate high parts of multiplication
	mul	x14,x4,x4	// a[0]*a[0]
	adc	x23,x24,xzr	// can't overflow

	adds	x17,x17,x21	// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x22
	mul	x22,x5,x5	// a[1]*a[1]
	adcs	x20,x20,x23
	umulh	x5,x5,x5
	adc	x1,x1,xzr	// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x23,x6,x6	// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x24,x7,x7	// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4	// +a[i]*a[i]
	adcs	x16,x16,x22
	adcs	x17,x17,x5
	adcs	x19,x19,x23
	adcs	x20,x20,x6
	adcs	x1,x1,x24
	adc	x2,x2,x7

	// round 0
	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// C = (C + q*n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x14,x15,x22
	adcs	x15,x16,x23
	adcs	x16,x17,x24
	adc	x17,xzr,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adc	x17,x17,x24

	// round 1
	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// C = (C + q*n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x14,x15,x22
	adcs	x15,x16,x23
	adcs	x16,x17,x24
	adc	x17,xzr,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adc	x17,x17,x24

	// round 2
	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// C = (C + q*n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x14,x15,x22
	adcs	x15,x16,x23
	adcs	x16,x17,x24
	adc	x17,xzr,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adc	x17,x17,x24

	// round 3
	// q = mu * c0 mod 2^64
	mul	x3,x9,x14
	// C = (C + q*n) // 2^64
	mul	x21,x10,x3
	mul	x22,x11,x3
	mul	x23,x12,x3
	mul	x24,x13,x3
	adds	x14,x14,x21
	adcs	x14,x15,x22
	adcs	x15,x16,x23
	adcs	x16,x17,x24
	adc	x17,xzr,xzr
	umulh	x21,x10,x3
	umulh	x22,x11,x3
	umulh	x23,x12,x3
	umulh	x24,x13,x3
	adds	x14,x14,x21
	adcs	x15,x15,x22
	adcs	x16,x16,x23
	adc	x17,x17,x24

	// add upper half
	adds	x14,x14,x19
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	// if c >= n, c = c - n
	subs	x21,x14,x10
	sbcs	x22,x15,x11
	sbcs	x23,x16,x12
	sbcs	x24,x17,x13
	sbcs	xzr,x19,xzr
	csel	x14,x14,x21,lo
	csel	x15,x15,x22,lo
	csel	x16,x16,x23,lo
	csel	x17,x17,x24,lo

	stp	x14,x15,[x0]
	stp	x16,x17,[x0,#16]
	ret
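// A cheap self-check for the dedicated squaring path: for any input,
// mont_sqr(a) must agree with mont_mul(a, a). Hypothetical C harness
// using the exported wrappers:
//
//	uint64_t a[4] = { 1, 2, 3, 4 }, s1[4], s2[4];
//	sm2_z256_modn_mont_sqr(s1, a);
//	sm2_z256_modn_mont_mul(s2, a, a);
//	// expect: memcmp(s1, s2, sizeof(s1)) == 0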
.globl	func(sm2_z256_modn_mont_sqr)
.align	4
func(sm2_z256_modn_mont_sqr):
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	// mu = -n^-1 mod 2^64
	ldr	x9,Lmodn_mu

	// load n
	ldr	x10,Lmodn
	ldr	x11,Lmodn+8
	ldr	x12,Lmodn+16
	ldr	x13,Lmodn+24

	// load a
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	bl	__sm2_z256_modn_mont_sqr

	add	sp,x29,#0
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x29,x30,[sp],#64
	ret
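// For reference, the C prototypes these routines are expected to back look
// roughly like the following (the authoritative declarations live in the
// GmSSL sm2_z256 header):
//
//	void sm2_z256_modp_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
//	void sm2_z256_modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
//	void sm2_z256_modn_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);
//	void sm2_z256_point_dbl(SM2_Z256_POINT *R, const SM2_Z256_POINT *A);
//	void sm2_z256_point_add(SM2_Z256_POINT *R, const SM2_Z256_POINT *A, const SM2_Z256_POINT *B);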