diff --git a/CMakeLists.txt b/CMakeLists.txt
index a85ba0f6..cfc8c532 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,9 @@
 option(ENABLE_SM9_ARM64 "Enable SM9_Z256 ARMv8 assembly" OFF)
 option(ENABLE_GMUL_ARM64 "Enable GF(2^128) Multiplication AArch64 assembly" OFF)
+option(ENABLE_SM2_AMD64 "Enable SM2_Z256 X86_64 assembly" OFF)
+
+
 option(ENABLE_SM3_SSE "Enable SM3 SSE assembly implementation" OFF)
 option(ENABLE_SM4_CTR_AESNI_AVX "Enable SM4 CTR AESNI+AVX assembly implementation" OFF)
@@ -259,6 +262,13 @@ if (ENABLE_SM2_ARM64)
 	list(APPEND src src/sm2_z256_arm64.S)
 endif()

+if (ENABLE_SM2_AMD64)
+	message(STATUS "ENABLE_SM2_AMD64 is ON")
+	add_definitions(-DENABLE_SM2_AMD64)
+	enable_language(ASM)
+	list(APPEND src src/sm2_z256_amd64.S)
+endif()
+
 if (ENABLE_SM2_NEON)
 	message(STATUS "ENABLE_SM2_NEON is ON")
 	add_definitions(-DENABLE_SM2_NEON)
diff --git a/src/sm2_z256.c b/src/sm2_z256.c
index b927f2a4..756a36f8 100644
--- a/src/sm2_z256.c
+++ b/src/sm2_z256.c
@@ -400,7 +400,7 @@ const uint64_t SM2_Z256_NEG_P[4] = {
 	1, ((uint64_t)1 << 32) - 1, 0, ((uint64_t)1 << 32),
 };

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_modp_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
 {
 	uint64_t c;
@@ -481,7 +481,7 @@ const uint64_t SM2_Z256_P_PRIME[4] = {
 // mont(1) (mod p) = 2^256 mod p = 2^256 - p
 const uint64_t *SM2_Z256_MODP_MONT_ONE = SM2_Z256_NEG_P;

-#if defined(ENABLE_SM2_ARM64)
+#if defined(ENABLE_SM2_ARM64) || defined(ENABLE_SM2_AMD64)
 // src/sm2_z256_armv8.S
 #elif defined(ENABLE_SM2_Z256_NEON)
 #include
@@ -812,7 +812,7 @@ const uint64_t SM2_Z256_NEG_N[4] = {
 	0xac440bf6c62abedd, 0x8dfc2094de39fad4, 0x0000000000000000, 0x0000000100000000,
 };

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_modn_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
 {
 	uint64_t c;
@@ -868,7 +868,7 @@ const uint64_t *sm2_z256_order_minus_one(void) {

 const uint64_t *SM2_Z256_MODN_MONT_ONE = SM2_Z256_NEG_N;

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_modn_mont_mul(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
 {
 	sm2_z512_t z;
@@ -917,7 +917,7 @@ void sm2_z256_modn_mul(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
 	sm2_z256_modn_from_mont(r, r);
 }

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_modn_mont_sqr(sm2_z256_t r, const sm2_z256_t a)
 {
 	sm2_z256_modn_mont_mul(r, a, a);
@@ -1020,7 +1020,7 @@ void sm2_z256_modn_inv(sm2_z256_t r, const sm2_z256_t a)
 }


-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 // mont(mont(a), 1) = aR * 1 * R^-1 (mod n) = a (mod p)
 void sm2_z256_modn_from_mont(sm2_z256_t r, const sm2_z256_t a)
@@ -1149,7 +1149,7 @@ int sm2_z256_point_get_xy(const SM2_Z256_POINT *P, uint64_t x[4], uint64_t y[4])
 	return 1;
 }

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_point_dbl(SM2_Z256_POINT *R, const SM2_Z256_POINT *A)
 {
 	const uint64_t *X1 = A->X;
@@ -1475,7 +1475,7 @@ void sm2_z256_point_copy_affine(SM2_Z256_POINT *R, const SM2_Z256_AFFINE_POINT *
 	sm2_z256_copy(R->Z, SM2_Z256_MODP_MONT_ONE);
 }

-#ifndef ENABLE_SM2_ARM64
+#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
 void sm2_z256_point_add_affine(SM2_Z256_POINT *r, const SM2_Z256_POINT *a, const SM2_Z256_AFFINE_POINT *b)
 {
 	sm2_z256_t U2, S2;
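Every C-side change above applies the same pattern: each portable implementation is now compiled only when neither assembly backend is selected, while src/sm2_z256_arm64.S or src/sm2_z256_amd64.S exports the identical symbols otherwise, so no call site changes. Below is a minimal, self-contained sketch of that pattern; the sm2_z256_t typedef as uint64_t[4] and the toy carry-propagating add body (no real mod-p reduction) are assumptions for illustration, not the library's actual code.

```c
/* Backend-selection sketch mirroring the guards added in sm2_z256.c.
 * Assumptions: sm2_z256_t as four little-endian 64-bit limbs, and a toy
 * 256-bit add in place of the real modular addition. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sm2_z256_t[4];

#if !defined(ENABLE_SM2_ARM64) && !defined(ENABLE_SM2_AMD64)
/* Portable fallback: compiled only when no assembly backend is enabled. */
void sm2_z256_modp_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
{
	uint64_t c = 0; /* running carry */
	for (int i = 0; i < 4; i++) {
		uint64_t s = a[i] + c;
		c = (s < c);        /* carry out of a[i] + c */
		r[i] = s + b[i];
		c += (r[i] < b[i]); /* carry out of s + b[i] */
	}
	(void)c; /* the real code folds the final carry back mod p */
}
#else
/* The same symbol is exported by the .S file instead; callers are unchanged. */
extern void sm2_z256_modp_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b);
#endif

int main(void)
{
	sm2_z256_t a = {1, 0, 0, 0}, b = {2, 0, 0, 0}, r;
	sm2_z256_modp_add(r, a, b);
	printf("%llu\n", (unsigned long long)r[0]); /* prints 3 */
	return 0;
}
```

Either branch satisfies the same declaration, which is why exactly one provider of each symbol must be compiled per build, and why the CMake option and the preprocessor guard have to stay in sync.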
diff --git a/src/sm2_z256_amd64.S b/src/sm2_z256_amd64.S
index 5d88befc..707f83d6 100644
--- a/src/sm2_z256_amd64.S
+++ b/src/sm2_z256_amd64.S
@@ -31,10 +31,10 @@ L$Three:
 L$ONE_mont:
 .quad 0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000

-.globl _ecp_sm2z256_mul_by_2
+.globl func(sm2_z256_modp_dbl)
 .p2align 6
-_ecp_sm2z256_mul_by_2:
+func(sm2_z256_modp_dbl):

 	pushq %r12
 	pushq %r13
@@ -76,10 +76,10 @@ _ecp_sm2z256_mul_by_2:


-.globl _ecp_sm2z256_div_by_2
+.globl func(sm2_z256_modp_haf)
 .p2align 5
-_ecp_sm2z256_div_by_2:
+func(sm2_z256_modp_haf):

 	pushq %r12
 	pushq %r13
@@ -136,10 +136,10 @@ _ecp_sm2z256_div_by_2:


-.globl _ecp_sm2z256_mul_by_3
+.globl func(sm2_z256_modp_tri)
 .p2align 5
-_ecp_sm2z256_mul_by_3:
+func(sm2_z256_modp_tri):

 	pushq %r12
 	pushq %r13
@@ -202,10 +202,10 @@ _ecp_sm2z256_mul_by_3:


-.globl _ecp_sm2z256_add
+.globl func(sm2_z256_modp_add)
 .p2align 5
-_ecp_sm2z256_add:
+func(sm2_z256_modp_add):

 	pushq %r12
 	pushq %r13
@@ -248,10 +248,10 @@ _ecp_sm2z256_add:


-.globl _ecp_sm2z256_sub
+.globl func(sm2_z256_modp_sub)
 .p2align 5
-_ecp_sm2z256_sub:
+func(sm2_z256_modp_sub):

 	pushq %r12
 	pushq %r13
@@ -294,10 +294,10 @@ _ecp_sm2z256_sub:


-.globl _ecp_sm2z256_neg
+.globl func(sm2_z256_modp_neg)
 .p2align 5
-_ecp_sm2z256_neg:
+func(sm2_z256_modp_neg):

 	pushq %r12
 	pushq %r13
@@ -341,10 +341,11 @@ _ecp_sm2z256_neg:


-.globl _ecp_sm2z256_to_mont
+.globl func(sm2_z256_modp_to_mont)
 .p2align 5
-_ecp_sm2z256_to_mont:
+func(sm2_z256_modp_to_mont):
+	// FIXME: swap arg1 arg2
 	leaq L$RR(%rip),%rdx
 	jmp L$mul_mont
@@ -355,10 +356,10 @@ _ecp_sm2z256_to_mont:


-.globl _ecp_sm2z256_mul_mont
+.globl func(sm2_z256_modp_mont_mul)
 .p2align 5
-_ecp_sm2z256_mul_mont:
+func(sm2_z256_modp_mont_mul):
 L$mul_mont:
 	pushq %rbp
 	pushq %rbx
@@ -633,10 +634,10 @@ __ecp_sm2z256_mul_montq:


-.globl _ecp_sm2z256_sqr_mont
+.globl func(sm2_z256_modp_mont_sqr)
 .p2align 5
-_ecp_sm2z256_sqr_mont:
+func(sm2_z256_modp_mont_sqr):
 	pushq %rbp
 	pushq %rbx
 	pushq %r12
@@ -842,10 +843,10 @@ __ecp_sm2z256_sqr_montq:


-.globl _ecp_sm2z256_from_mont
+.globl func(sm2_z256_modp_from_mont)
 .p2align 5
-_ecp_sm2z256_from_mont:
+func(sm2_z256_modp_from_mont):

 	pushq %r12
 	pushq %r13
@@ -1218,10 +1219,16 @@ __ecp_sm2z256_mul_by_2q:
 	.byte 0xf3,0xc3

-.globl _ecp_sm2z256_point_double
+
+
+
+
+
+
+.globl func(sm2_z256_point_dbl)
 .p2align 5
-_ecp_sm2z256_point_double:
+func(sm2_z256_point_dbl):
 	pushq %rbp
 	pushq %rbx
 	pushq %r12
@@ -1420,10 +1427,17 @@ L$point_double_shortcutq:
 	popq %rbp
 	.byte 0xf3,0xc3

-.globl _ecp_sm2z256_point_add
+
+
+
+
+
+
+
+.globl func(sm2_z256_point_add)
 .p2align 5
-_ecp_sm2z256_point_add:
+func(sm2_z256_point_add):
 	pushq %rbp
 	pushq %rbx
 	pushq %r12
@@ -1816,10 +1830,14 @@ L$add_doneq:
 	popq %rbp
 	.byte 0xf3,0xc3

-.globl _ecp_sm2z256_point_add_affine
+
+
+
+
+.globl func(sm2_z256_point_add_affine)
 .p2align 5
-_ecp_sm2z256_point_add_affine:
+func(sm2_z256_point_add_affine):
 	pushq %rbp
 	pushq %rbx
 	pushq %r12
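After configuring with cmake -DENABLE_SM2_AMD64=ON, the renamed Montgomery entry points can be smoke-tested against the one constant that appears in both files: SM2_Z256_MODP_MONT_ONE in sm2_z256.c and L$ONE_mont above are both mont(1) = 2^256 - p. The sketch below is a hedged self-check, not part of the patch; the extern signatures mirror the C fallbacks shown in the diff, and sm2_z256_modp_to_mont is deliberately left out because the FIXME added above flags its argument order as still unresolved.

```c
/* Self-check sketch for the renamed modp Montgomery symbols, assuming the
 * library was built with ENABLE_SM2_AMD64=ON and links these symbols. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef uint64_t sm2_z256_t[4];

/* Exported by src/sm2_z256_amd64.S; signatures mirror the C fallbacks. */
extern void sm2_z256_modp_mont_mul(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b);
extern void sm2_z256_modp_from_mont(sm2_z256_t r, const sm2_z256_t a);

int main(void)
{
	/* mont(1) = 2^256 - p, i.e. L$ONE_mont / SM2_Z256_MODP_MONT_ONE. */
	const sm2_z256_t one_mont = {
		0x0000000000000001, 0x00000000ffffffff,
		0x0000000000000000, 0x0000000100000000,
	};
	sm2_z256_t r;

	/* mont_mul(a, b) = a*b*R^-1 (mod p), so mont(1) = R mod p squares
	 * to itself: R*R*R^-1 = R (mod p). */
	sm2_z256_modp_mont_mul(r, one_mont, one_mont);
	assert(memcmp(r, one_mont, sizeof(r)) == 0);

	/* from_mont strips the remaining factor of R: from_mont(mont(1)) = 1. */
	sm2_z256_modp_from_mont(r, r);
	assert(r[0] == 1 && r[1] == 0 && r[2] == 0 && r[3] == 0);
	return 0;
}
```

Both checks follow from the identities already stated in the sources above (mont(1) = 2^256 - p, and mont(mont(a), 1) = a), so they exercise the multiplier and the reduction without needing the not-yet-fixed to_mont path.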