Change cpu suffix and other file names

Zhi Guan
2024-05-12 23:17:35 +08:00
parent 58340393b1
commit dc80b0c137
19 changed files with 1479 additions and 3278 deletions

View File

@@ -95,7 +95,7 @@ void gf128_add(gf128_t r, const gf128_t a, const gf128_t b)
r[1] = a[1] ^ b[1];
}
-#ifndef ENABLE_GMUL_AARCH64
+#ifndef ENABLE_GMUL_ARM64
void gf128_mul(gf128_t r, const gf128_t a, const gf128_t b)
{
const uint64_t mask = (uint64_t)1 << 63;

View File

@@ -1,28 +0,0 @@
CC=gcc
CFLAGS=-fPIC -Wall
LDFLAGS=-shared
LIBS=-lgmssl -framework Security
TARGET=libsoft_sdf.so
OBJS=soft_sdf.o

all: $(TARGET)

$(OBJS): soft_sdf.c
	$(CC) $(CFLAGS) -c soft_sdf.c -o $@

$(TARGET): $(OBJS)
	$(CC) $(LDFLAGS) -o $@ $(OBJS) $(LIBS) -Wl,-exported_symbols_list,soft_sdf.exp

clean:
	rm -f $(OBJS) $(TARGET)

install:
	cp $(TARGET) /usr/local/lib
	ldconfig

uninstall:
	rm /usr/local/lib/$(TARGET)
	ldconfig

File diff suppressed because it is too large

View File

@@ -400,7 +400,7 @@ const uint64_t SM2_Z256_NEG_P[4] = {
1, ((uint64_t)1 << 32) - 1, 0, ((uint64_t)1 << 32),
};
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_modp_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
{
uint64_t c;
@@ -481,7 +481,7 @@ const uint64_t SM2_Z256_P_PRIME[4] = {
// mont(1) (mod p) = 2^256 mod p = 2^256 - p
const uint64_t *SM2_Z256_MODP_MONT_ONE = SM2_Z256_NEG_P;
-#if defined(ENABLE_SM2_Z256_ARMV8)
+#if defined(ENABLE_SM2_ARM64)
// src/sm2_z256_armv8.S
#elif defined(ENABLE_SM2_Z256_NEON)
#include <arm_neon.h>
@@ -812,7 +812,7 @@ const uint64_t SM2_Z256_NEG_N[4] = {
0xac440bf6c62abedd, 0x8dfc2094de39fad4, 0x0000000000000000, 0x0000000100000000,
};
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_modn_add(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
{
uint64_t c;
@@ -868,7 +868,7 @@ const uint64_t *sm2_z256_order_minus_one(void) {
const uint64_t *SM2_Z256_MODN_MONT_ONE = SM2_Z256_NEG_N;
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_modn_mont_mul(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
{
sm2_z512_t z;
@@ -917,7 +917,7 @@ void sm2_z256_modn_mul(sm2_z256_t r, const sm2_z256_t a, const sm2_z256_t b)
sm2_z256_modn_from_mont(r, r);
}
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_modn_mont_sqr(sm2_z256_t r, const sm2_z256_t a)
{
sm2_z256_modn_mont_mul(r, a, a);
@@ -1020,7 +1020,7 @@ void sm2_z256_modn_inv(sm2_z256_t r, const sm2_z256_t a)
}
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
// mont(mont(a), 1) = aR * 1 * R^-1 (mod n) = a (mod n)
void sm2_z256_modn_from_mont(sm2_z256_t r, const sm2_z256_t a)
@@ -1149,7 +1149,7 @@ int sm2_z256_point_get_xy(const SM2_Z256_POINT *P, uint64_t x[4], uint64_t y[4])
return 1;
}
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_point_dbl(SM2_Z256_POINT *R, const SM2_Z256_POINT *A)
{
const uint64_t *X1 = A->X;
@@ -1480,7 +1480,7 @@ void sm2_z256_point_copy_affine(SM2_Z256_POINT *R, const SM2_Z256_AFFINE_POINT *
sm2_z256_copy(R->Z, SM2_Z256_MODP_MONT_ONE);
}
-#ifndef ENABLE_SM2_Z256_ARMV8
+#ifndef ENABLE_SM2_ARM64
void sm2_z256_point_add_affine(SM2_Z256_POINT *r, const SM2_Z256_POINT *a, const SM2_Z256_AFFINE_POINT *b)
{
sm2_z256_t U2, S2;

src/sm4.c
View File

@@ -7,8 +7,8 @@
* http://www.apache.org/licenses/LICENSE-2.0
*/
#include <gmssl/sm4.h>
#include <gmssl/endian.h>
static uint32_t FK[4] = {
@@ -61,20 +61,6 @@ const uint8_t S[256] = {
0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48,
};
-#define GETU32(ptr) \
-	((uint32_t)(ptr)[0] << 24 | \
-	 (uint32_t)(ptr)[1] << 16 | \
-	 (uint32_t)(ptr)[2] << 8 | \
-	 (uint32_t)(ptr)[3])
-#define PUTU32(ptr,X) \
-	((ptr)[0] = (uint8_t)((X) >> 24), \
-	 (ptr)[1] = (uint8_t)((X) >> 16), \
-	 (ptr)[2] = (uint8_t)((X) >> 8), \
-	 (ptr)[3] = (uint8_t)(X))
-#define ROL32(X,n) (((X)<<(n)) | ((X)>>(32-(n))))
#define L32(X) \
((X) ^ \
ROL32((X), 2) ^ \
@@ -144,6 +130,7 @@ void sm4_set_decrypt_key(SM4_KEY *key, const uint8_t user_key[16])
}
}
+#if ENABLE_SMALL_FOOTPRINT
void sm4_encrypt(const SM4_KEY *key, const uint8_t in[16], uint8_t out[16])
{
uint32_t X0, X1, X2, X3, X4;
@@ -219,21 +206,19 @@ static void ctr_incr(uint8_t a[16]) {
}
}
-void sm4_ctr_encrypt(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t inlen, uint8_t *out)
+void sm4_ctr_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
uint8_t block[16];
-	size_t len, i;
+	int i;
-	while (inlen) {
-		len = inlen < 16 ? inlen : 16;
+	while (nblocks--) {
sm4_encrypt(key, ctr, block);
-		for (i = 0; i < len; i++) {
+		ctr_incr(ctr);
+		for (i = 0; i < 16; i++) {
out[i] = in[i] ^ block[i];
}
-		ctr_incr(ctr);
-		in += len;
-		out += len;
-		inlen -= len;
+		in += 16;
+		out += 16;
}
}
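Note on the renamed API: sm4_ctr_encrypt_blocks now processes whole 16-byte blocks only, while the old sm4_ctr_encrypt accepted an arbitrary inlen. A caller that still needs arbitrary lengths can keystream the tail itself — a minimal sketch (hypothetical helper, not part of this commit; ctr_incr() is the file-local counter increment shown above):

void sm4_ctr_encrypt_any(const SM4_KEY *key, uint8_t ctr[16],
	const uint8_t *in, size_t inlen, uint8_t *out)
{
	uint8_t block[16];
	size_t i, nblocks = inlen / 16;

	// whole blocks take the fast path; ctr is advanced once per block
	sm4_ctr_encrypt_blocks(key, ctr, in, nblocks, out);
	in += nblocks * 16;
	out += nblocks * 16;

	// partial tail: one more keystream block, xor only the remaining bytes
	if (inlen % 16) {
		sm4_encrypt(key, ctr, block);
		ctr_incr(ctr);
		for (i = 0; i < inlen % 16; i++) {
			out[i] = in[i] ^ block[i];
		}
	}
}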
@@ -246,20 +231,668 @@ static void ctr32_incr(uint8_t a[16]) {
}
}
-void sm4_ctr32_encrypt(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t inlen, uint8_t *out)
+void sm4_ctr32_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
uint8_t block[16];
-	size_t len, i;
+	int i;
-	while (inlen) {
-		len = inlen < 16 ? inlen : 16;
+	while (nblocks--) {
sm4_encrypt(key, ctr, block);
-		for (i = 0; i < len; i++) {
+		ctr32_incr(ctr);
+		for (i = 0; i < 16; i++) {
out[i] = in[i] ^ block[i];
}
-		ctr32_incr(ctr);
-		in += len;
-		out += len;
-		inlen -= len;
+		in += 16;
+		out += 16;
}
}
#else //!ENABLE_SMALL_FOOTPRINT
// T0[i] = L32(S[i] << 24)
const uint32_t T0[256] = {
0x8ed55b5b, 0xd0924242, 0x4deaa7a7, 0x06fdfbfb,
0xfccf3333, 0x65e28787, 0xc93df4f4, 0x6bb5dede,
0x4e165858, 0x6eb4dada, 0x44145050, 0xcac10b0b,
0x8828a0a0, 0x17f8efef, 0x9c2cb0b0, 0x11051414,
0x872bacac, 0xfb669d9d, 0xf2986a6a, 0xae77d9d9,
0x822aa8a8, 0x46bcfafa, 0x14041010, 0xcfc00f0f,
0x02a8aaaa, 0x54451111, 0x5f134c4c, 0xbe269898,
0x6d482525, 0x9e841a1a, 0x1e061818, 0xfd9b6666,
0xec9e7272, 0x4a430909, 0x10514141, 0x24f7d3d3,
0xd5934646, 0x53ecbfbf, 0xf89a6262, 0x927be9e9,
0xff33cccc, 0x04555151, 0x270b2c2c, 0x4f420d0d,
0x59eeb7b7, 0xf3cc3f3f, 0x1caeb2b2, 0xea638989,
0x74e79393, 0x7fb1cece, 0x6c1c7070, 0x0daba6a6,
0xedca2727, 0x28082020, 0x48eba3a3, 0xc1975656,
0x80820202, 0xa3dc7f7f, 0xc4965252, 0x12f9ebeb,
0xa174d5d5, 0xb38d3e3e, 0xc33ffcfc, 0x3ea49a9a,
0x5b461d1d, 0x1b071c1c, 0x3ba59e9e, 0x0cfff3f3,
0x3ff0cfcf, 0xbf72cdcd, 0x4b175c5c, 0x52b8eaea,
0x8f810e0e, 0x3d586565, 0xcc3cf0f0, 0x7d196464,
0x7ee59b9b, 0x91871616, 0x734e3d3d, 0x08aaa2a2,
0xc869a1a1, 0xc76aadad, 0x85830606, 0x7ab0caca,
0xb570c5c5, 0xf4659191, 0xb2d96b6b, 0xa7892e2e,
0x18fbe3e3, 0x47e8afaf, 0x330f3c3c, 0x674a2d2d,
0xb071c1c1, 0x0e575959, 0xe99f7676, 0xe135d4d4,
0x661e7878, 0xb4249090, 0x360e3838, 0x265f7979,
0xef628d8d, 0x38596161, 0x95d24747, 0x2aa08a8a,
0xb1259494, 0xaa228888, 0x8c7df1f1, 0xd73becec,
0x05010404, 0xa5218484, 0x9879e1e1, 0x9b851e1e,
0x84d75353, 0x00000000, 0x5e471919, 0x0b565d5d,
0xe39d7e7e, 0x9fd04f4f, 0xbb279c9c, 0x1a534949,
0x7c4d3131, 0xee36d8d8, 0x0a020808, 0x7be49f9f,
0x20a28282, 0xd4c71313, 0xe8cb2323, 0xe69c7a7a,
0x42e9abab, 0x43bdfefe, 0xa2882a2a, 0x9ad14b4b,
0x40410101, 0xdbc41f1f, 0xd838e0e0, 0x61b7d6d6,
0x2fa18e8e, 0x2bf4dfdf, 0x3af1cbcb, 0xf6cd3b3b,
0x1dfae7e7, 0xe5608585, 0x41155454, 0x25a38686,
0x60e38383, 0x16acbaba, 0x295c7575, 0x34a69292,
0xf7996e6e, 0xe434d0d0, 0x721a6868, 0x01545555,
0x19afb6b6, 0xdf914e4e, 0xfa32c8c8, 0xf030c0c0,
0x21f6d7d7, 0xbc8e3232, 0x75b3c6c6, 0x6fe08f8f,
0x691d7474, 0x2ef5dbdb, 0x6ae18b8b, 0x962eb8b8,
0x8a800a0a, 0xfe679999, 0xe2c92b2b, 0xe0618181,
0xc0c30303, 0x8d29a4a4, 0xaf238c8c, 0x07a9aeae,
0x390d3434, 0x1f524d4d, 0x764f3939, 0xd36ebdbd,
0x81d65757, 0xb7d86f6f, 0xeb37dcdc, 0x51441515,
0xa6dd7b7b, 0x09fef7f7, 0xb68c3a3a, 0x932fbcbc,
0x0f030c0c, 0x03fcffff, 0xc26ba9a9, 0xba73c9c9,
0xd96cb5b5, 0xdc6db1b1, 0x375a6d6d, 0x15504545,
0xb98f3636, 0x771b6c6c, 0x13adbebe, 0xda904a4a,
0x57b9eeee, 0xa9de7777, 0x4cbef2f2, 0x837efdfd,
0x55114444, 0xbdda6767, 0x2c5d7171, 0x45400505,
0x631f7c7c, 0x50104040, 0x325b6969, 0xb8db6363,
0x220a2828, 0xc5c20707, 0xf531c4c4, 0xa88a2222,
0x31a79696, 0xf9ce3737, 0x977aeded, 0x49bff6f6,
0x992db4b4, 0xa475d1d1, 0x90d34343, 0x5a124848,
0x58bae2e2, 0x71e69797, 0x64b6d2d2, 0x70b2c2c2,
0xad8b2626, 0xcd68a5a5, 0xcb955e5e, 0x624b2929,
0x3c0c3030, 0xce945a5a, 0xab76dddd, 0x867ff9f9,
0xf1649595, 0x5dbbe6e6, 0x35f2c7c7, 0x2d092424,
0xd1c61717, 0xd66fb9b9, 0xdec51b1b, 0x94861212,
0x78186060, 0x30f3c3c3, 0x897cf5f5, 0x5cefb3b3,
0xd23ae8e8, 0xacdf7373, 0x794c3535, 0xa0208080,
0x9d78e5e5, 0x56edbbbb, 0x235e7d7d, 0xc63ef8f8,
0x8bd45f5f, 0xe7c82f2f, 0xdd39e4e4, 0x68492121,
};
// T1[i] = L32(S[i] << 16)
const uint32_t T1[256] = {
0x5b8ed55b, 0x42d09242, 0xa74deaa7, 0xfb06fdfb,
0x33fccf33, 0x8765e287, 0xf4c93df4, 0xde6bb5de,
0x584e1658, 0xda6eb4da, 0x50441450, 0x0bcac10b,
0xa08828a0, 0xef17f8ef, 0xb09c2cb0, 0x14110514,
0xac872bac, 0x9dfb669d, 0x6af2986a, 0xd9ae77d9,
0xa8822aa8, 0xfa46bcfa, 0x10140410, 0x0fcfc00f,
0xaa02a8aa, 0x11544511, 0x4c5f134c, 0x98be2698,
0x256d4825, 0x1a9e841a, 0x181e0618, 0x66fd9b66,
0x72ec9e72, 0x094a4309, 0x41105141, 0xd324f7d3,
0x46d59346, 0xbf53ecbf, 0x62f89a62, 0xe9927be9,
0xccff33cc, 0x51045551, 0x2c270b2c, 0x0d4f420d,
0xb759eeb7, 0x3ff3cc3f, 0xb21caeb2, 0x89ea6389,
0x9374e793, 0xce7fb1ce, 0x706c1c70, 0xa60daba6,
0x27edca27, 0x20280820, 0xa348eba3, 0x56c19756,
0x02808202, 0x7fa3dc7f, 0x52c49652, 0xeb12f9eb,
0xd5a174d5, 0x3eb38d3e, 0xfcc33ffc, 0x9a3ea49a,
0x1d5b461d, 0x1c1b071c, 0x9e3ba59e, 0xf30cfff3,
0xcf3ff0cf, 0xcdbf72cd, 0x5c4b175c, 0xea52b8ea,
0x0e8f810e, 0x653d5865, 0xf0cc3cf0, 0x647d1964,
0x9b7ee59b, 0x16918716, 0x3d734e3d, 0xa208aaa2,
0xa1c869a1, 0xadc76aad, 0x06858306, 0xca7ab0ca,
0xc5b570c5, 0x91f46591, 0x6bb2d96b, 0x2ea7892e,
0xe318fbe3, 0xaf47e8af, 0x3c330f3c, 0x2d674a2d,
0xc1b071c1, 0x590e5759, 0x76e99f76, 0xd4e135d4,
0x78661e78, 0x90b42490, 0x38360e38, 0x79265f79,
0x8def628d, 0x61385961, 0x4795d247, 0x8a2aa08a,
0x94b12594, 0x88aa2288, 0xf18c7df1, 0xecd73bec,
0x04050104, 0x84a52184, 0xe19879e1, 0x1e9b851e,
0x5384d753, 0x00000000, 0x195e4719, 0x5d0b565d,
0x7ee39d7e, 0x4f9fd04f, 0x9cbb279c, 0x491a5349,
0x317c4d31, 0xd8ee36d8, 0x080a0208, 0x9f7be49f,
0x8220a282, 0x13d4c713, 0x23e8cb23, 0x7ae69c7a,
0xab42e9ab, 0xfe43bdfe, 0x2aa2882a, 0x4b9ad14b,
0x01404101, 0x1fdbc41f, 0xe0d838e0, 0xd661b7d6,
0x8e2fa18e, 0xdf2bf4df, 0xcb3af1cb, 0x3bf6cd3b,
0xe71dfae7, 0x85e56085, 0x54411554, 0x8625a386,
0x8360e383, 0xba16acba, 0x75295c75, 0x9234a692,
0x6ef7996e, 0xd0e434d0, 0x68721a68, 0x55015455,
0xb619afb6, 0x4edf914e, 0xc8fa32c8, 0xc0f030c0,
0xd721f6d7, 0x32bc8e32, 0xc675b3c6, 0x8f6fe08f,
0x74691d74, 0xdb2ef5db, 0x8b6ae18b, 0xb8962eb8,
0x0a8a800a, 0x99fe6799, 0x2be2c92b, 0x81e06181,
0x03c0c303, 0xa48d29a4, 0x8caf238c, 0xae07a9ae,
0x34390d34, 0x4d1f524d, 0x39764f39, 0xbdd36ebd,
0x5781d657, 0x6fb7d86f, 0xdceb37dc, 0x15514415,
0x7ba6dd7b, 0xf709fef7, 0x3ab68c3a, 0xbc932fbc,
0x0c0f030c, 0xff03fcff, 0xa9c26ba9, 0xc9ba73c9,
0xb5d96cb5, 0xb1dc6db1, 0x6d375a6d, 0x45155045,
0x36b98f36, 0x6c771b6c, 0xbe13adbe, 0x4ada904a,
0xee57b9ee, 0x77a9de77, 0xf24cbef2, 0xfd837efd,
0x44551144, 0x67bdda67, 0x712c5d71, 0x05454005,
0x7c631f7c, 0x40501040, 0x69325b69, 0x63b8db63,
0x28220a28, 0x07c5c207, 0xc4f531c4, 0x22a88a22,
0x9631a796, 0x37f9ce37, 0xed977aed, 0xf649bff6,
0xb4992db4, 0xd1a475d1, 0x4390d343, 0x485a1248,
0xe258bae2, 0x9771e697, 0xd264b6d2, 0xc270b2c2,
0x26ad8b26, 0xa5cd68a5, 0x5ecb955e, 0x29624b29,
0x303c0c30, 0x5ace945a, 0xddab76dd, 0xf9867ff9,
0x95f16495, 0xe65dbbe6, 0xc735f2c7, 0x242d0924,
0x17d1c617, 0xb9d66fb9, 0x1bdec51b, 0x12948612,
0x60781860, 0xc330f3c3, 0xf5897cf5, 0xb35cefb3,
0xe8d23ae8, 0x73acdf73, 0x35794c35, 0x80a02080,
0xe59d78e5, 0xbb56edbb, 0x7d235e7d, 0xf8c63ef8,
0x5f8bd45f, 0x2fe7c82f, 0xe4dd39e4, 0x21684921,
};
// T2[i] = L32(S[i] << 8)
const uint32_t T2[256] = {
0x5b5b8ed5, 0x4242d092, 0xa7a74dea, 0xfbfb06fd,
0x3333fccf, 0x878765e2, 0xf4f4c93d, 0xdede6bb5,
0x58584e16, 0xdada6eb4, 0x50504414, 0x0b0bcac1,
0xa0a08828, 0xefef17f8, 0xb0b09c2c, 0x14141105,
0xacac872b, 0x9d9dfb66, 0x6a6af298, 0xd9d9ae77,
0xa8a8822a, 0xfafa46bc, 0x10101404, 0x0f0fcfc0,
0xaaaa02a8, 0x11115445, 0x4c4c5f13, 0x9898be26,
0x25256d48, 0x1a1a9e84, 0x18181e06, 0x6666fd9b,
0x7272ec9e, 0x09094a43, 0x41411051, 0xd3d324f7,
0x4646d593, 0xbfbf53ec, 0x6262f89a, 0xe9e9927b,
0xccccff33, 0x51510455, 0x2c2c270b, 0x0d0d4f42,
0xb7b759ee, 0x3f3ff3cc, 0xb2b21cae, 0x8989ea63,
0x939374e7, 0xcece7fb1, 0x70706c1c, 0xa6a60dab,
0x2727edca, 0x20202808, 0xa3a348eb, 0x5656c197,
0x02028082, 0x7f7fa3dc, 0x5252c496, 0xebeb12f9,
0xd5d5a174, 0x3e3eb38d, 0xfcfcc33f, 0x9a9a3ea4,
0x1d1d5b46, 0x1c1c1b07, 0x9e9e3ba5, 0xf3f30cff,
0xcfcf3ff0, 0xcdcdbf72, 0x5c5c4b17, 0xeaea52b8,
0x0e0e8f81, 0x65653d58, 0xf0f0cc3c, 0x64647d19,
0x9b9b7ee5, 0x16169187, 0x3d3d734e, 0xa2a208aa,
0xa1a1c869, 0xadadc76a, 0x06068583, 0xcaca7ab0,
0xc5c5b570, 0x9191f465, 0x6b6bb2d9, 0x2e2ea789,
0xe3e318fb, 0xafaf47e8, 0x3c3c330f, 0x2d2d674a,
0xc1c1b071, 0x59590e57, 0x7676e99f, 0xd4d4e135,
0x7878661e, 0x9090b424, 0x3838360e, 0x7979265f,
0x8d8def62, 0x61613859, 0x474795d2, 0x8a8a2aa0,
0x9494b125, 0x8888aa22, 0xf1f18c7d, 0xececd73b,
0x04040501, 0x8484a521, 0xe1e19879, 0x1e1e9b85,
0x535384d7, 0x00000000, 0x19195e47, 0x5d5d0b56,
0x7e7ee39d, 0x4f4f9fd0, 0x9c9cbb27, 0x49491a53,
0x31317c4d, 0xd8d8ee36, 0x08080a02, 0x9f9f7be4,
0x828220a2, 0x1313d4c7, 0x2323e8cb, 0x7a7ae69c,
0xabab42e9, 0xfefe43bd, 0x2a2aa288, 0x4b4b9ad1,
0x01014041, 0x1f1fdbc4, 0xe0e0d838, 0xd6d661b7,
0x8e8e2fa1, 0xdfdf2bf4, 0xcbcb3af1, 0x3b3bf6cd,
0xe7e71dfa, 0x8585e560, 0x54544115, 0x868625a3,
0x838360e3, 0xbaba16ac, 0x7575295c, 0x929234a6,
0x6e6ef799, 0xd0d0e434, 0x6868721a, 0x55550154,
0xb6b619af, 0x4e4edf91, 0xc8c8fa32, 0xc0c0f030,
0xd7d721f6, 0x3232bc8e, 0xc6c675b3, 0x8f8f6fe0,
0x7474691d, 0xdbdb2ef5, 0x8b8b6ae1, 0xb8b8962e,
0x0a0a8a80, 0x9999fe67, 0x2b2be2c9, 0x8181e061,
0x0303c0c3, 0xa4a48d29, 0x8c8caf23, 0xaeae07a9,
0x3434390d, 0x4d4d1f52, 0x3939764f, 0xbdbdd36e,
0x575781d6, 0x6f6fb7d8, 0xdcdceb37, 0x15155144,
0x7b7ba6dd, 0xf7f709fe, 0x3a3ab68c, 0xbcbc932f,
0x0c0c0f03, 0xffff03fc, 0xa9a9c26b, 0xc9c9ba73,
0xb5b5d96c, 0xb1b1dc6d, 0x6d6d375a, 0x45451550,
0x3636b98f, 0x6c6c771b, 0xbebe13ad, 0x4a4ada90,
0xeeee57b9, 0x7777a9de, 0xf2f24cbe, 0xfdfd837e,
0x44445511, 0x6767bdda, 0x71712c5d, 0x05054540,
0x7c7c631f, 0x40405010, 0x6969325b, 0x6363b8db,
0x2828220a, 0x0707c5c2, 0xc4c4f531, 0x2222a88a,
0x969631a7, 0x3737f9ce, 0xeded977a, 0xf6f649bf,
0xb4b4992d, 0xd1d1a475, 0x434390d3, 0x48485a12,
0xe2e258ba, 0x979771e6, 0xd2d264b6, 0xc2c270b2,
0x2626ad8b, 0xa5a5cd68, 0x5e5ecb95, 0x2929624b,
0x30303c0c, 0x5a5ace94, 0xddddab76, 0xf9f9867f,
0x9595f164, 0xe6e65dbb, 0xc7c735f2, 0x24242d09,
0x1717d1c6, 0xb9b9d66f, 0x1b1bdec5, 0x12129486,
0x60607818, 0xc3c330f3, 0xf5f5897c, 0xb3b35cef,
0xe8e8d23a, 0x7373acdf, 0x3535794c, 0x8080a020,
0xe5e59d78, 0xbbbb56ed, 0x7d7d235e, 0xf8f8c63e,
0x5f5f8bd4, 0x2f2fe7c8, 0xe4e4dd39, 0x21216849,
};
// T3[i] = L32(S[i])
const uint32_t T3[256] = {
0xd55b5b8e, 0x924242d0, 0xeaa7a74d, 0xfdfbfb06,
0xcf3333fc, 0xe2878765, 0x3df4f4c9, 0xb5dede6b,
0x1658584e, 0xb4dada6e, 0x14505044, 0xc10b0bca,
0x28a0a088, 0xf8efef17, 0x2cb0b09c, 0x05141411,
0x2bacac87, 0x669d9dfb, 0x986a6af2, 0x77d9d9ae,
0x2aa8a882, 0xbcfafa46, 0x04101014, 0xc00f0fcf,
0xa8aaaa02, 0x45111154, 0x134c4c5f, 0x269898be,
0x4825256d, 0x841a1a9e, 0x0618181e, 0x9b6666fd,
0x9e7272ec, 0x4309094a, 0x51414110, 0xf7d3d324,
0x934646d5, 0xecbfbf53, 0x9a6262f8, 0x7be9e992,
0x33ccccff, 0x55515104, 0x0b2c2c27, 0x420d0d4f,
0xeeb7b759, 0xcc3f3ff3, 0xaeb2b21c, 0x638989ea,
0xe7939374, 0xb1cece7f, 0x1c70706c, 0xaba6a60d,
0xca2727ed, 0x08202028, 0xeba3a348, 0x975656c1,
0x82020280, 0xdc7f7fa3, 0x965252c4, 0xf9ebeb12,
0x74d5d5a1, 0x8d3e3eb3, 0x3ffcfcc3, 0xa49a9a3e,
0x461d1d5b, 0x071c1c1b, 0xa59e9e3b, 0xfff3f30c,
0xf0cfcf3f, 0x72cdcdbf, 0x175c5c4b, 0xb8eaea52,
0x810e0e8f, 0x5865653d, 0x3cf0f0cc, 0x1964647d,
0xe59b9b7e, 0x87161691, 0x4e3d3d73, 0xaaa2a208,
0x69a1a1c8, 0x6aadadc7, 0x83060685, 0xb0caca7a,
0x70c5c5b5, 0x659191f4, 0xd96b6bb2, 0x892e2ea7,
0xfbe3e318, 0xe8afaf47, 0x0f3c3c33, 0x4a2d2d67,
0x71c1c1b0, 0x5759590e, 0x9f7676e9, 0x35d4d4e1,
0x1e787866, 0x249090b4, 0x0e383836, 0x5f797926,
0x628d8def, 0x59616138, 0xd2474795, 0xa08a8a2a,
0x259494b1, 0x228888aa, 0x7df1f18c, 0x3bececd7,
0x01040405, 0x218484a5, 0x79e1e198, 0x851e1e9b,
0xd7535384, 0x00000000, 0x4719195e, 0x565d5d0b,
0x9d7e7ee3, 0xd04f4f9f, 0x279c9cbb, 0x5349491a,
0x4d31317c, 0x36d8d8ee, 0x0208080a, 0xe49f9f7b,
0xa2828220, 0xc71313d4, 0xcb2323e8, 0x9c7a7ae6,
0xe9abab42, 0xbdfefe43, 0x882a2aa2, 0xd14b4b9a,
0x41010140, 0xc41f1fdb, 0x38e0e0d8, 0xb7d6d661,
0xa18e8e2f, 0xf4dfdf2b, 0xf1cbcb3a, 0xcd3b3bf6,
0xfae7e71d, 0x608585e5, 0x15545441, 0xa3868625,
0xe3838360, 0xacbaba16, 0x5c757529, 0xa6929234,
0x996e6ef7, 0x34d0d0e4, 0x1a686872, 0x54555501,
0xafb6b619, 0x914e4edf, 0x32c8c8fa, 0x30c0c0f0,
0xf6d7d721, 0x8e3232bc, 0xb3c6c675, 0xe08f8f6f,
0x1d747469, 0xf5dbdb2e, 0xe18b8b6a, 0x2eb8b896,
0x800a0a8a, 0x679999fe, 0xc92b2be2, 0x618181e0,
0xc30303c0, 0x29a4a48d, 0x238c8caf, 0xa9aeae07,
0x0d343439, 0x524d4d1f, 0x4f393976, 0x6ebdbdd3,
0xd6575781, 0xd86f6fb7, 0x37dcdceb, 0x44151551,
0xdd7b7ba6, 0xfef7f709, 0x8c3a3ab6, 0x2fbcbc93,
0x030c0c0f, 0xfcffff03, 0x6ba9a9c2, 0x73c9c9ba,
0x6cb5b5d9, 0x6db1b1dc, 0x5a6d6d37, 0x50454515,
0x8f3636b9, 0x1b6c6c77, 0xadbebe13, 0x904a4ada,
0xb9eeee57, 0xde7777a9, 0xbef2f24c, 0x7efdfd83,
0x11444455, 0xda6767bd, 0x5d71712c, 0x40050545,
0x1f7c7c63, 0x10404050, 0x5b696932, 0xdb6363b8,
0x0a282822, 0xc20707c5, 0x31c4c4f5, 0x8a2222a8,
0xa7969631, 0xce3737f9, 0x7aeded97, 0xbff6f649,
0x2db4b499, 0x75d1d1a4, 0xd3434390, 0x1248485a,
0xbae2e258, 0xe6979771, 0xb6d2d264, 0xb2c2c270,
0x8b2626ad, 0x68a5a5cd, 0x955e5ecb, 0x4b292962,
0x0c30303c, 0x945a5ace, 0x76ddddab, 0x7ff9f986,
0x649595f1, 0xbbe6e65d, 0xf2c7c735, 0x0924242d,
0xc61717d1, 0x6fb9b9d6, 0xc51b1bde, 0x86121294,
0x18606078, 0xf3c3c330, 0x7cf5f589, 0xefb3b35c,
0x3ae8e8d2, 0xdf7373ac, 0x4c353579, 0x208080a0,
0x78e5e59d, 0xedbbbb56, 0x5e7d7d23, 0x3ef8f8c6,
0xd45f5f8b, 0xc82f2fe7, 0x39e4e4dd, 0x49212168,
};
#define ROUND(i, X0, X1, X2, X3, X4) \
X4 = X1 ^ X2 ^ X3 ^ rk[i]; \
X4 = T0[(uint8_t)(X4 >> 24)] ^ \
T1[(uint8_t)(X4 >> 16)] ^ \
T2[(uint8_t)(X4 >> 8)] ^ \
T3[(uint8_t)(X4 )] ^ \
X0
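The four tables trade the per-byte S-box plus linear layer for one lookup per byte: since L32 is linear over XOR and Tk[i] = L32(S[i] << (24 - 8k)), the four XORed lookups in ROUND equal L32(S32(X4)) from the small-footprint branch. A throwaway equivalence check (a sketch, assuming the S[], T0..T3 and L32 definitions above; not part of the commit):

static int sm4_ttable_selfcheck(void)
{
	uint32_t x = 0x01234567; // any test word works
	// direct path: per-byte S-box, then the linear transform L32
	uint32_t direct = L32(((uint32_t)S[x >> 24] << 24) |
	                      ((uint32_t)S[(x >> 16) & 0xff] << 16) |
	                      ((uint32_t)S[(x >> 8) & 0xff] << 8) |
	                       (uint32_t)S[x & 0xff]);
	// table path: exactly the lookup pattern used by ROUND above
	uint32_t tabled = T0[(uint8_t)(x >> 24)] ^ T1[(uint8_t)(x >> 16)]
	                ^ T2[(uint8_t)(x >> 8)] ^ T3[(uint8_t)x];
	return direct == tabled; // 1 on success
}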
void sm4_encrypt(const SM4_KEY *key, const unsigned char in[16], unsigned char out[16])
{
const uint32_t *rk = key->rk;
uint32_t X0, X1, X2, X3, X4;
X0 = GETU32(in );
X1 = GETU32(in + 4);
X2 = GETU32(in + 8);
X3 = GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, X0);
}
void sm4_encrypt_blocks(const SM4_KEY *key, const uint8_t *in, size_t nblocks, uint8_t *out)
{
const uint32_t *rk = key->rk;
uint32_t X0, X1, X2, X3, X4;
while (nblocks--) {
X0 = GETU32(in );
X1 = GETU32(in + 4);
X2 = GETU32(in + 8);
X3 = GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, X0);
in += 16;
out += 16;
}
}
void sm4_cbc_encrypt_blocks(const SM4_KEY *key, const uint8_t iv[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
const uint32_t *rk = key->rk;
uint32_t X0, X1, X2, X3, X4;
uint32_t X5;
X0 = GETU32(iv ); // X0 = IV0
X4 = GETU32(iv + 4); // X4 = IV1
X3 = GETU32(iv + 8); // X3 = IV2
X5 = GETU32(iv + 12); // X5 = IV3
while (nblocks--) {
X0 = X0 ^ GETU32(in );
X1 = X4 ^ GETU32(in + 4);
X2 = X3 ^ GETU32(in + 8);
X3 = X5 ^ GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, X0);
X5 = X2;
in += 16;
out += 16;
}
}
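The CBC encrypt loop never reloads the previous ciphertext block: the final four rounds leave the ciphertext words in X0, X4, X3, X2, the last word is parked in X5, and the XORs at the top of the next iteration consume exactly those registers. Stated as a loop invariant (illustrative comment only, names as in the code above):

/*
 * At the top of each while iteration:
 *   X0 == previous ciphertext word 0  (IV word 0 on entry)
 *   X4 == previous ciphertext word 1  (IV word 1)
 *   X3 == previous ciphertext word 2  (IV word 2)
 *   X5 == previous ciphertext word 3  (IV word 3)
 * so C_i = E_K(P_i ^ C_{i-1}) runs with no extra loads or stores.
 */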
void sm4_cbc_decrypt_blocks(const SM4_KEY *key, const uint8_t iv[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
const uint32_t *rk = key->rk;
uint32_t IV0, IV1, IV2, IV3;
uint32_t X0, X1, X2, X3, X4;
uint32_t C0, C1, C2, C3;
IV0 = GETU32(iv ); // X0 = IV0
IV1 = GETU32(iv + 4); // X4 = IV1
IV2 = GETU32(iv + 8); // X3 = IV2
IV3 = GETU32(iv + 12); // X5 = IV3
while (nblocks--) {
X0 = C0 = GETU32(in );
X1 = C1 = GETU32(in + 4);
X2 = C2 = GETU32(in + 8);
X3 = C3 = GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, IV3 ^ X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, IV2 ^ X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, IV1 ^ X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, IV0 ^ X0);
IV0 = C0;
IV1 = C1;
IV2 = C2;
IV3 = C3;
in += 16;
out += 16;
}
}
void sm4_ctr_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
const uint32_t *rk = key->rk;
uint32_t X0, X1, X2, X3, X4;
uint64_t C0, C1;
uint32_t D0, D1, D2, D3;
C0 = GETU64(ctr );
C1 = GETU64(ctr + 8);
while (nblocks--) {
X0 = (uint32_t)(C0 >> 32);
X1 = (uint32_t)(C0 );
X2 = (uint32_t)(C1 >> 32);
X3 = (uint32_t)(C1 );
D0 = GETU32(in );
D1 = GETU32(in + 4);
D2 = GETU32(in + 8);
D3 = GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, D3 ^ X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, D2 ^ X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, D1 ^ X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, D0 ^ X0);
C1++;
C0 = (C1 == 0) ? C0 + 1 : C0;
in += 16;
out += 16;
}
PUTU64(ctr , C0);
PUTU64(ctr + 8, C1);
}
void sm4_ctr32_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
const uint32_t *rk = key->rk;
uint32_t X0, X1, X2, X3, X4;
uint32_t C0, C1, C2, C3;
uint32_t D0, D1, D2, D3;
C0 = GETU32(ctr );
C1 = GETU32(ctr + 4);
C2 = GETU32(ctr + 8);
C3 = GETU32(ctr + 12);
while (nblocks--) {
X0 = C0;
X1 = C1;
X2 = C2;
X3 = C3++;
D0 = GETU32(in );
D1 = GETU32(in + 4);
D2 = GETU32(in + 8);
D3 = GETU32(in + 12);
ROUND( 0, X0, X1, X2, X3, X4);
ROUND( 1, X1, X2, X3, X4, X0);
ROUND( 2, X2, X3, X4, X0, X1);
ROUND( 3, X3, X4, X0, X1, X2);
ROUND( 4, X4, X0, X1, X2, X3);
ROUND( 5, X0, X1, X2, X3, X4);
ROUND( 6, X1, X2, X3, X4, X0);
ROUND( 7, X2, X3, X4, X0, X1);
ROUND( 8, X3, X4, X0, X1, X2);
ROUND( 9, X4, X0, X1, X2, X3);
ROUND(10, X0, X1, X2, X3, X4);
ROUND(11, X1, X2, X3, X4, X0);
ROUND(12, X2, X3, X4, X0, X1);
ROUND(13, X3, X4, X0, X1, X2);
ROUND(14, X4, X0, X1, X2, X3);
ROUND(15, X0, X1, X2, X3, X4);
ROUND(16, X1, X2, X3, X4, X0);
ROUND(17, X2, X3, X4, X0, X1);
ROUND(18, X3, X4, X0, X1, X2);
ROUND(19, X4, X0, X1, X2, X3);
ROUND(20, X0, X1, X2, X3, X4);
ROUND(21, X1, X2, X3, X4, X0);
ROUND(22, X2, X3, X4, X0, X1);
ROUND(23, X3, X4, X0, X1, X2);
ROUND(24, X4, X0, X1, X2, X3);
ROUND(25, X0, X1, X2, X3, X4);
ROUND(26, X1, X2, X3, X4, X0);
ROUND(27, X2, X3, X4, X0, X1);
ROUND(28, X3, X4, X0, X1, X2);
PUTU32(out + 12, D3 ^ X2);
ROUND(29, X4, X0, X1, X2, X3);
PUTU32(out + 8, D2 ^ X3);
ROUND(30, X0, X1, X2, X3, X4);
PUTU32(out + 4, D1 ^ X4);
ROUND(31, X1, X2, X3, X4, X0);
PUTU32(out, D0 ^ X0);
in += 16;
out += 16;
}
PUTU32(ctr + 12, C3);
}
#endif //ENABLE_SMALL_FOOTPRINT
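For reference, a minimal round trip through the renamed block APIs — a hypothetical test harness, not part of the commit, assuming the GmSSL headers and library are installed:

#include <stdio.h>
#include <gmssl/sm4.h>

int main(void)
{
	SM4_KEY enc_key, dec_key;
	uint8_t key[16] = {0}, iv[16] = {0};
	uint8_t buf[32] = "two blocks of example plaintext";
	uint8_t tmp[32];

	sm4_set_encrypt_key(&enc_key, key);
	sm4_set_decrypt_key(&dec_key, key);
	sm4_cbc_encrypt_blocks(&enc_key, iv, buf, 2, tmp); // 2 x 16-byte blocks
	sm4_cbc_decrypt_blocks(&dec_key, iv, tmp, 2, buf); // recovers plaintext
	printf("%.32s\n", buf);
	return 0;
}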

View File

@@ -1,302 +0,0 @@
/*
* Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
#include <gmssl/asm.h>
.align 7
LFK:
.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
LCK:
.long 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269
.long 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9
.long 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249
.long 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9
.long 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229
.long 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299
.long 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209
.long 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
LSBOX:
.byte 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7
.byte 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05
.byte 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3
.byte 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99
.byte 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a
.byte 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62
.byte 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95
.byte 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6
.byte 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba
.byte 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8
.byte 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b
.byte 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35
.byte 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2
.byte 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87
.byte 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52
.byte 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e
.byte 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5
.byte 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1
.byte 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55
.byte 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3
.byte 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60
.byte 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f
.byte 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f
.byte 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51
.byte 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f
.byte 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8
.byte 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd
.byte 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0
.byte 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e
.byte 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84
.byte 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20
.byte 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
// X0, X1, X2, X3 => X1, X2, X3, X0
Llshift:
.byte 4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3
.globl func(sm4_set_encrypt_key)
.align 4
func(sm4_set_encrypt_key):
// load const v16..v31 = SBox
adr x3, LSBOX
ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x3], #64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x3], #64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x3], #64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x3]
// load const v15 = [64, 64, ...]
movi v15.16b, #64
// load const v14 = lshift index
adr x3, Llshift
ld1 {v14.2d}, [x3]
// load const v13 = FK
adr x3, LFK
ld1 {v13.2d}, [x3]
// load const x5 = CK address
adr x15, LCK
// load user_key v1 = X0,X1,X2,X3
ld1 {v1.4s}, [x1]
rev32 v1.16b, v1.16b
// X = X ^ FK
eor v1.16b, v1.16b, v13.16b
// x4(w4) as X4, x5(w5) as tmp
// rounds = 32
mov x6, #32
1:
// w4 = X1 ^ X2 ^ X3 ^ CK[0]
mov w4, v1.s[1]
mov w5, v1.s[2]
eor w4, w4, w5
mov w5, v1.s[3]
eor w4, w4, w5
ldr w5, [x15], #4
eor w4, w4, w5
// sbox lookup, X4 = w4 = v3[0] = sbox(v2[0])
mov v2.s[0], w4
tbl v3.16b, {v16.16b,v17.16b,v18.16b,v19.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v20.16b,v21.16b,v22.16b,v23.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v24.16b,v25.16b,v26.16b,v27.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v28.16b,v29.16b,v30.16b,v31.16b}, v2.16b
mov w4, v3.s[0]
// X4 = X0 ^ X4 ^ (X4 <<< 13) ^ (X4 <<< 23)
mov w5, v1.s[0]
eor w5, w4, w5
eor w5, w5, w4, ror #32-23
eor w4, w5, w4, ror #32-13
// output rk[i]
str w4, [x0], #4
// X1,X2,X3,X0 = X0,X1,X2,X3
mov v1.s[0], w4
tbl v1.16b, {v1.16b}, v14.16b
// if --rounds != 0, goto label(1)
subs x6, x6, #1
b.ne 1b
ret
.globl func(sm4_set_decrypt_key)
.align 4
func(sm4_set_decrypt_key):
// load const v16..v31 = SBox
adr x3,LSBOX
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x3],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x3],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x3],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x3]
// load const v15 = [64, 64, ...]
movi v15.16b, #64
// load const v14 = lshift index
adr x3,Llshift
ld1 {v14.2d},[x3]
// load const v13 = FK
adr x3,LFK
ld1 {v13.2d},[x3]
// load const x5 = CK address
adr x15,LCK
// load user_key v1 = X0,X1,X2,X3
ld1 {v1.4s}, [x1]
rev32 v1.16b, v1.16b
// X = X ^ FK
eor v1.16b, v1.16b, v13.16b
// x4(w4) as X4, x5(w5) as tmp
// rounds = 32
mov x6, #32
// set rk offset (31 * 4 = 124)
add x0, x0, 124
2:
// w4 = X1 ^ X2 ^ X3 ^ CK[0]
mov w4, v1.s[1]
mov w5, v1.s[2]
eor w4, w4, w5
mov w5, v1.s[3]
eor w4, w4, w5
ldr w5, [x15], #4
eor w4, w4, w5
// sbox lookup, X4 = w4 = v3[0] = sbox(v2[0])
mov v2.s[0], w4
tbl v3.16b, {v16.16b,v17.16b,v18.16b,v19.16b},v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v20.16b,v21.16b,v22.16b,v23.16b},v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
mov w4, v3.s[0]
// X4 = X0 ^ X4 ^ (X4 <<< 13) ^ (X4 <<< 23)
mov w5, v1.s[0]
eor w5, w4, w5
eor w5, w5, w4, ror #32-23
eor w4, w5, w4, ror #32-13
// output rk[31 - i]
str w4, [x0], #-4
// X1,X2,X3,X0 = X0,X1,X2,X3
mov v1.s[0], w4
tbl v1.16b,{v1.16b},v14.16b
// if --rounds != 0, goto label(2)
subs x6, x6, #1
b.ne 2b
ret
.globl func(sm4_encrypt)
.align 5
func(sm4_encrypt):
// load sbox
adr x3, LSBOX
ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x3], #64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x3], #64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x3], #64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x3]
// load const v15 = [64, 64, ...]
movi v15.16b, #64
// load input block
ld1 {v1.4s}, [x1]
rev32 v1.16b, v1.16b
// w10,w11,w12,w13 = X0,X1,X2,X3
mov w10, v1.s[0]
mov w11, v1.s[1]
mov w12, v1.s[2]
mov w13, v1.s[3]
// w8,w9 as tmp
// round = 32
mov w6, #32
3:
// load rk[i]
ldr w3,[x0],4
// X4 = (X2 ^ X3) ^ (RK[0] ^ X1)
eor w8, w12, w13
eor w9, w3, w11
eor w8, w8, w9
// sbox lookup, X4 = SBOX(X4)
mov v2.s[0], w8
tbl v3.16b, {v16.16b,v17.16b,v18.16b,v19.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v20.16b,v21.16b,v22.16b,v23.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v24.16b,v25.16b,v26.16b,v27.16b}, v2.16b
sub v2.16b, v2.16b, v15.16b
tbx v3.16b, {v28.16b,v29.16b,v30.16b,v31.16b}, v2.16b
mov w3, v3.s[0]
// X0 = X0 ^ X4 ^ (X4 <<< 2) ^ (X4 <<< 10) ^ (X4 <<< 18) ^ (X4 <<< 24)
eor w8, w3, w3, ror #32-2
eor w8, w8, w3, ror #32-10
eor w8, w8, w3, ror #32-18
eor w8, w8, w3, ror #32-24
eor w8, w8, w10
mov w10, w11
mov w11, w12
mov w12, w13
mov w13, w8
subs w6, w6, #1
b.ne 3b
// output X3,X2,X1,X0
mov v1.s[0], w13
mov v1.s[1], w12
mov v1.s[2], w11
mov v1.s[3], w10
rev32 v1.16b, v1.16b
st1 {v1.4s}, [x2]
ret

src/sm4_arm64.c (new file)
View File

@@ -0,0 +1,404 @@
/*
* Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
#include <gmssl/sm4.h>
#include <gmssl/error.h>
#include <arm_neon.h>
static uint32_t FK[4] = {
0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc,
};
static uint32_t CK[32] = {
0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279,
};
const uint8_t S[256] = {
0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48,
};
#define GETU32(ptr) \
((uint32_t)(ptr)[0] << 24 | \
(uint32_t)(ptr)[1] << 16 | \
(uint32_t)(ptr)[2] << 8 | \
(uint32_t)(ptr)[3])
#define PUTU32(ptr,X) \
((ptr)[0] = (uint8_t)((X) >> 24), \
(ptr)[1] = (uint8_t)((X) >> 16), \
(ptr)[2] = (uint8_t)((X) >> 8), \
(ptr)[3] = (uint8_t)(X))
#define ROL32(X,n) (((X)<<(n)) | ((X)>>(32-(n))))
#define L32(X) \
((X) ^ \
ROL32((X), 2) ^ \
ROL32((X), 10) ^ \
ROL32((X), 18) ^ \
ROL32((X), 24))
#define L32_(X) \
((X) ^ \
ROL32((X), 13) ^ \
ROL32((X), 23))
#define S32(A) \
((S[((A) >> 24) ] << 24) | \
(S[((A) >> 16) & 0xff] << 16) | \
(S[((A) >> 8) & 0xff] << 8) | \
(S[((A)) & 0xff]))
void sm4_set_encrypt_key(SM4_KEY *key, const uint8_t user_key[16])
{
uint32_t X0, X1, X2, X3, X4;
int i;
X0 = GETU32(user_key ) ^ FK[0];
X1 = GETU32(user_key + 4) ^ FK[1];
X2 = GETU32(user_key + 8) ^ FK[2];
X3 = GETU32(user_key + 12) ^ FK[3];
for (i = 0; i < 32; i++) {
X4 = X1 ^ X2 ^ X3 ^ CK[i];
X4 = S32(X4);
X4 = X0 ^ L32_(X4);
key->rk[i] = X4;
X0 = X1;
X1 = X2;
X2 = X3;
X3 = X4;
}
}
void sm4_set_decrypt_key(SM4_KEY *key, const uint8_t user_key[16])
{
uint32_t X0, X1, X2, X3, X4;
int i;
X0 = GETU32(user_key ) ^ FK[0];
X1 = GETU32(user_key + 4) ^ FK[1];
X2 = GETU32(user_key + 8) ^ FK[2];
X3 = GETU32(user_key + 12) ^ FK[3];
for (i = 0; i < 32; i++) {
X4 = X1 ^ X2 ^ X3 ^ CK[i];
X4 = S32(X4);
X4 = X0 ^ L32_(X4);
key->rk[31 - i] = X4;
X0 = X1;
X1 = X2;
X2 = X3;
X3 = X4;
}
}
// const time sbox with neon tbl/tbx
void sm4_encrypt(const SM4_KEY *key, const unsigned char in[16], unsigned char out[16])
{
uint8x16x4_t S0 = vld1q_u8_x4(S);
uint8x16x4_t S1 = vld1q_u8_x4(S + 64);
uint8x16x4_t S2 = vld1q_u8_x4(S + 128);
uint8x16x4_t S3 = vld1q_u8_x4(S + 192);
uint8x16_t vx;
uint8x16_t vt;
uint32_t X0, X1, X2, X3, X4;
int i;
X0 = GETU32(in );
X1 = GETU32(in + 4);
X2 = GETU32(in + 8);
X3 = GETU32(in + 12);
for (i = 0; i < 32; i++) {
X4 = X1 ^ X2 ^ X3 ^ key->rk[i];
// const time X4 = S32(X4)
vx = vdupq_n_u32(X4);
vt = vqtbl4q_u8(S0, vx);
vt = vqtbx4q_u8(vt, S1, veorq_u8(vx, vdupq_n_u8(0x40)));
vt = vqtbx4q_u8(vt, S2, veorq_u8(vx, vdupq_n_u8(0x80)));
vx = vqtbx4q_u8(vt, S3, veorq_u8(vx, vdupq_n_u8(0xc0)));
X4 = vgetq_lane_u32(vx, 0);
X4 = X0 ^ L32(X4);
X0 = X1;
X1 = X2;
X2 = X3;
X3 = X4;
}
PUTU32(out , X3);
PUTU32(out + 4, X2);
PUTU32(out + 8, X1);
PUTU32(out + 12, X0);
}
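The tbl/tbx sequence above is a constant-time 256-byte table lookup: each instruction can only index 64 table bytes, so the index is rebased per 64-byte quarter (here via veorq with 0x40/0x80/0xc0, which equals subtraction for the in-range lanes), and tbx leaves out-of-range lanes untouched. A scalar model of the same selection logic (illustrative sketch with hypothetical helper names):

/* one vqtbx4q step: the 64-byte quarter table is consulted only when the
 * rebased index lands in 0..63; otherwise acc is kept unchanged */
static inline uint8_t tbx64(uint8_t acc, const uint8_t tab[64], uint8_t idx)
{
	return idx < 64 ? tab[idx] : acc;
}

static uint8_t sbox_lookup_model(uint8_t b) // equals S[b] for all b
{
	uint8_t acc = 0;
	acc = tbx64(acc, S,       b       ); // covers 0x00..0x3f
	acc = tbx64(acc, S + 64,  b ^ 0x40); // covers 0x40..0x7f
	acc = tbx64(acc, S + 128, b ^ 0x80); // covers 0x80..0xbf
	acc = tbx64(acc, S + 192, b ^ 0xc0); // covers 0xc0..0xff
	return acc;
}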
void sm4_encrypt_blocks(const SM4_KEY *key, const uint8_t *in, size_t nblocks, uint8_t *out)
{
while (nblocks--) {
sm4_encrypt(key, in, out);
in += 16;
out += 16;
}
}
void sm4_cbc_encrypt_blocks(const SM4_KEY *key, const uint8_t iv[16],
const uint8_t *in, size_t nblocks, uint8_t *out)
{
while (nblocks--) {
size_t i;
for (i = 0; i < 16; i++) {
out[i] = in[i] ^ iv[i];
}
sm4_encrypt(key, out, out);
iv = out;
in += 16;
out += 16;
}
}
void sm4_cbc_decrypt_blocks(const SM4_KEY *key, const uint8_t iv[16],
const uint8_t *in, size_t nblocks, uint8_t *out)
{
while (nblocks--) {
size_t i;
sm4_encrypt(key, in, out);
for (i = 0; i < 16; i++) {
out[i] ^= iv[i];
}
iv = in;
in += 16;
out += 16;
}
}
static void ctr_incr(uint8_t a[16]) {
int i;
for (i = 15; i >= 0; i--) {
a[i]++;
if (a[i]) break;
}
}
void sm4_ctr_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
uint8_t block[16];
int i;
while (nblocks--) {
sm4_encrypt(key, ctr, block);
ctr_incr(ctr);
for (i = 0; i < 16; i++) {
out[i] = in[i] ^ block[i];
}
in += 16;
out += 16;
}
}
#define vrolq_n_u32(words, N) \
vorrq_u32(vshlq_n_u32((words), (N)), vshrq_n_u32((words), 32 - (N)))
void sm4_ctr32_encrypt_4blocks(const SM4_KEY *key, uint8_t iv[16], const uint8_t *in, size_t n4blks, uint8_t *out)
{
uint8x16x4_t S0 = vld1q_u8_x4(S);
uint8x16x4_t S1 = vld1q_u8_x4(S + 64);
uint8x16x4_t S2 = vld1q_u8_x4(S + 128);
uint8x16x4_t S3 = vld1q_u8_x4(S + 192);
const uint32_t incr[4] = { 0, 1, 2, 3 };
uint32_t __attribute__((aligned(16))) buf[16];
uint8_t *cipher = (uint8_t *)buf;
uint32_t n;
uint32x4_t ctr;
uint32x4_t ctr0, ctr1, ctr2, ctr3;
uint32x4_t vi;
uint32x4_t fours;
uint32x4_t x0, x1, x2, x3, x4;
uint32x4_t rk, xt;
uint32x4x2_t x02, x13, x01, x23;
int i;
error_print();
vi = vld1q_u32(incr);
fours = vdupq_n_u32(4);
// compute low ctr32
n = GETU32(iv + 12);
n += (uint32_t)(4 * n4blks);
memcpy(buf, iv, 16);
ctr = vld1q_u32(buf);
ctr = vrev32q_u8(ctr);
error_print();
ctr0 = vdupq_n_u32(vgetq_lane_u32(ctr, 0));
ctr1 = vdupq_n_u32(vgetq_lane_u32(ctr, 1));
ctr2 = vdupq_n_u32(vgetq_lane_u32(ctr, 2));
ctr3 = vdupq_n_u32(vgetq_lane_u32(ctr, 3));
error_print();
ctr3 = vaddq_u32(ctr3, vi);
while (n4blks--) {
x0 = ctr0;
x1 = ctr1;
x2 = ctr2;
x3 = ctr3;
error_print();
for (i = 0; i < 32; i++) {
// X4 = X1 ^ X2 ^ X3 ^ RK[i]
rk = vdupq_n_u32(key->rk[i]);
x4 = veorq_u32(veorq_u32(x1, x2), veorq_u32(x3, rk));
// X4 = SBOX(X4)
xt = vqtbl4q_u8(S0, x4);
xt = vqtbx4q_u8(xt, S1, veorq_u8(x4, vdupq_n_u8(0x40)));
xt = vqtbx4q_u8(xt, S2, veorq_u8(x4, vdupq_n_u8(0x80)));
x4 = vqtbx4q_u8(xt, S3, veorq_u8(x4, vdupq_n_u8(0xc0)));
// X4 = L(X4)
xt = veorq_u32(x4, vrolq_n_u32(x4, 2));
xt = veorq_u32(xt, vrolq_n_u32(x4, 10));
xt = veorq_u32(xt, vrolq_n_u32(x4, 18));
x4 = veorq_u32(xt, vrolq_n_u32(x4, 24));
// X0, X1, X2, X3 = X1, X2, X3, X0^X4
x4 = veorq_u32(x0, x4);
x0 = x1;
x1 = x2;
x2 = x3;
x3 = x4;
}
// output x3,x2,x1,x0
x02 = vzipq_u32(x3, x1);
x13 = vzipq_u32(x2, x0);
x01 = vzipq_u32(x02.val[0], x13.val[0]);
x23 = vzipq_u32(x02.val[1], x13.val[1]);
x0 = vrev32q_u8(x01.val[0]);
vst1q_u32(buf, x0);
error_print();
x1 = vrev32q_u8(x01.val[1]);
vst1q_u32(buf + 4, x1);
error_print();
x2 = vrev32q_u8(x23.val[0]);
vst1q_u32(buf + 8, x2);
error_print();
x3 = vrev32q_u8(x23.val[1]);
vst1q_u32(buf + 12, x3);
error_print();
// xor with plaintext
for (i = 0; i < 16*4; i++) {
out[i] = in[i] ^ cipher[i];
}
// update ctr
ctr3 = vaddq_u32(ctr3, fours);
in += 64;
out += 64;
}
// update iv
PUTU32(iv + 12, n);
}
static void ctr32_incr(uint8_t a[16]) {
int i;
for (i = 15; i >= 12; i--) {
a[i]++;
if (a[i]) break;
}
}
void sm4_ctr32_encrypt_blocks(const SM4_KEY *key, uint8_t ctr[16], const uint8_t *in, size_t nblocks, uint8_t *out)
{
uint8_t block[16];
int i;
if (nblocks >= 4) {
sm4_ctr32_encrypt_4blocks(key, ctr, in, nblocks/4, out);
in += 64 * (nblocks/4);
out += 64 * (nblocks/4);
nblocks %= 4;
}
while (nblocks--) {
sm4_encrypt(key, ctr, block);
ctr32_incr(ctr);
for (i = 0; i < 16; i++) {
out[i] = in[i] ^ block[i];
}
in += 16;
out += 16;
}
}

src/sm4_avx2.c (new file)
View File

@@ -0,0 +1,173 @@
#include <openssl/sm4.h>
#include "internal/rotate.h"
#include "modes_lcl.h"
#include "sms4_lcl.h"
# include <immintrin.h>
# define GET_BLKS(x0, x1, x2, x3, in) \
t0 = _mm256_i32gather_epi32((int *)(in+4*0), vindex_4i, 4); \
t1 = _mm256_i32gather_epi32((int *)(in+4*1), vindex_4i, 4); \
t2 = _mm256_i32gather_epi32((int *)(in+4*2), vindex_4i, 4); \
t3 = _mm256_i32gather_epi32((int *)(in+4*3), vindex_4i, 4); \
x0 = _mm256_shuffle_epi8(t0, vindex_swap); \
x1 = _mm256_shuffle_epi8(t1, vindex_swap); \
x2 = _mm256_shuffle_epi8(t2, vindex_swap); \
x3 = _mm256_shuffle_epi8(t3, vindex_swap)
# define PUT_BLKS(out, x0, x1, x2, x3) \
t0 = _mm256_shuffle_epi8(x0, vindex_swap); \
t1 = _mm256_shuffle_epi8(x1, vindex_swap); \
t2 = _mm256_shuffle_epi8(x2, vindex_swap); \
t3 = _mm256_shuffle_epi8(x3, vindex_swap); \
_mm256_storeu_si256((__m256i *)(out+32*0), t0); \
_mm256_storeu_si256((__m256i *)(out+32*1), t1); \
_mm256_storeu_si256((__m256i *)(out+32*2), t2); \
_mm256_storeu_si256((__m256i *)(out+32*3), t3); \
x0 = _mm256_i32gather_epi32((int *)(out+8*0), vindex_read, 4); \
x1 = _mm256_i32gather_epi32((int *)(out+8*1), vindex_read, 4); \
x2 = _mm256_i32gather_epi32((int *)(out+8*2), vindex_read, 4); \
x3 = _mm256_i32gather_epi32((int *)(out+8*3), vindex_read, 4); \
_mm256_storeu_si256((__m256i *)(out+32*0), x0); \
_mm256_storeu_si256((__m256i *)(out+32*1), x1); \
_mm256_storeu_si256((__m256i *)(out+32*2), x2); \
_mm256_storeu_si256((__m256i *)(out+32*3), x3)
# define _mm256_rotl_epi32(a, i) _mm256_xor_si256( \
_mm256_slli_epi32(a, i), _mm256_srli_epi32(a, 32 - i))
# define INDEX_MASK_TBOX 0xff
# define ROUND_TBOX(x0, x1, x2, x3, x4, i) \
t0 = _mm256_set1_epi32(*(rk + i)); \
t1 = _mm256_xor_si256(x1, x2); \
t2 = _mm256_xor_si256(x3, t0); \
x4 = _mm256_xor_si256(t1, t2); \
t0 = _mm256_and_si256(x4, vindex_mask); \
t0 = _mm256_i32gather_epi32((int *)SMS4_T, t0, 4); \
t0 = _mm256_rotl_epi32(t0, 8); \
x4 = _mm256_srli_epi32(x4, 8); \
x0 = _mm256_xor_si256(x0, t0); \
t0 = _mm256_and_si256(x4, vindex_mask); \
t0 = _mm256_i32gather_epi32((int *)SMS4_T, t0, 4); \
t0 = _mm256_rotl_epi32(t0, 16); \
x4 = _mm256_srli_epi32(x4, 8); \
x0 = _mm256_xor_si256(x0, t0); \
t0 = _mm256_and_si256(x4, vindex_mask); \
t0 = _mm256_i32gather_epi32((int *)SMS4_T, t0, 4); \
t0 = _mm256_rotl_epi32(t0, 24); \
x4 = _mm256_srli_epi32(x4, 8); \
x0 = _mm256_xor_si256(x0, t0); \
t1 = _mm256_i32gather_epi32((int *)SMS4_T, x4, 4); \
x4 = _mm256_xor_si256(x0, t1)
# define INDEX_MASK_DBOX 0xffff
# define ROUND_DBOX(x0, x1, x2, x3, x4, i) \
t0 = _mm256_set1_epi32(*(rk + i)); \
t1 = _mm256_xor_si256(x1, x2); \
t2 = _mm256_xor_si256(x3, t0); \
x4 = _mm256_xor_si256(t1, t2); \
t0 = _mm256_srli_epi32(x4, 16); \
t1 = _mm256_i32gather_epi32((int *)SMS4_D, t0, 4); \
t2 = _mm256_and_si256(x4, vindex_mask); \
t3 = _mm256_i32gather_epi32((int *)SMS4_D, t2, 4); \
t0 = _mm256_rotl_epi32(t3, 16); \
x4 = _mm256_xor_si256(x0, t1); \
x4 = _mm256_xor_si256(x4, t0)
# define ROUND ROUND_TBOX
# define INDEX_MASK INDEX_MASK_TBOX
// Should this function be made fully self-contained?
void sm4_avx2_ecb_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t blocks, const sms4_key_t *key)
{
const int *rk = (int *)key->rk;
__m256i x0, x1, x2, x3, x4;
__m256i t0, t1, t2, t3;
__m256i vindex_4i = _mm256_setr_epi32(0,4,8,12,16,20,24,28);
__m256i vindex_mask = _mm256_set1_epi32(INDEX_MASK);
__m256i vindex_read = _mm256_setr_epi32(0,8,16,24,1,9,17,25);
__m256i vindex_swap = _mm256_setr_epi8(
3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,
3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
);
while (blocks >= 8) {
// read 8 blocks
t0 = _mm256_i32gather_epi32((int *)(in+4*0), vindex_4i, 4);
t1 = _mm256_i32gather_epi32((int *)(in+4*1), vindex_4i, 4);
t2 = _mm256_i32gather_epi32((int *)(in+4*2), vindex_4i, 4);
t3 = _mm256_i32gather_epi32((int *)(in+4*3), vindex_4i, 4);
x0 = _mm256_shuffle_epi8(t0, vindex_swap);
x1 = _mm256_shuffle_epi8(t1, vindex_swap);
x2 = _mm256_shuffle_epi8(t2, vindex_swap);
x3 = _mm256_shuffle_epi8(t3, vindex_swap);
// a plain loop would read better here
ROUNDS(x0, x1, x2, x3, x4);
PUT_BLKS(out, x0, x4, x3, x2);
in += 128;
out += 128;
blocks -= 8;
}
}
// this one should still give a decent speedup
void sms4_avx2_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t blocks, const sms4_key_t *key, const unsigned char iv[16])
{
const int *rk = (int *)key->rk;
__m256i x0, x1, x2, x3, x4;
__m256i t0, t1, t2, t3;
__m256i vindex_4i = _mm256_setr_epi32(0,4,8,12,16,20,24,28);
__m256i vindex_mask = _mm256_set1_epi32(INDEX_MASK);
__m256i vindex_read = _mm256_setr_epi32(0,8,16,24,1,9,17,25);
__m256i vindex_swap = _mm256_setr_epi8(
3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,
3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
);
__m256i incr = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
int c0 = (int)GETU32(iv );
int c1 = (int)GETU32(iv + 4);
int c2 = (int)GETU32(iv + 8);
int c3 = (int)GETU32(iv + 12);
while (blocks >= 8) {
x0 = _mm256_set1_epi32(c0);
x1 = _mm256_set1_epi32(c1);
x2 = _mm256_set1_epi32(c2);
x3 = _mm256_set1_epi32(c3);
x3 = _mm256_add_epi32(x3, incr);
ROUNDS(x0, x1, x2, x3, x4);
GET_BLKS(t0, t1, t2, t3, in);
x0 = _mm256_xor_si256(x0, t0);
x4 = _mm256_xor_si256(x4, t1);
x3 = _mm256_xor_si256(x3, t2);
x2 = _mm256_xor_si256(x2, t3);
PUT_BLKS(out, x0, x4, x3, x2);
c3 += 8;
in += 128;
out += 128;
blocks -= 8;
}
if (blocks) {
unsigned char ctr[16];
memcpy(ctr, iv, 12);
PUTU32(ctr + 12, c3);
sms4_ctr32_encrypt_blocks(in, out, blocks, key, ctr);
}
}
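ROUNDS is used above but never defined in this excerpt. It presumably expands to the 32 rounds with the usual five-register rotation — that schedule is what makes PUT_BLKS(out, x0, x4, x3, x2) correct, since the last four round outputs X35, X34, X33, X32 land in x0, x4, x3, x2. A plausible reconstruction (an assumption, not taken from the commit):

# define ROUNDS(x0, x1, x2, x3, x4) \
	ROUND(x0, x1, x2, x3, x4,  0); ROUND(x1, x2, x3, x4, x0,  1); \
	ROUND(x2, x3, x4, x0, x1,  2); ROUND(x3, x4, x0, x1, x2,  3); \
	ROUND(x4, x0, x1, x2, x3,  4); ROUND(x0, x1, x2, x3, x4,  5); \
	ROUND(x1, x2, x3, x4, x0,  6); ROUND(x2, x3, x4, x0, x1,  7); \
	ROUND(x3, x4, x0, x1, x2,  8); ROUND(x4, x0, x1, x2, x3,  9); \
	ROUND(x0, x1, x2, x3, x4, 10); ROUND(x1, x2, x3, x4, x0, 11); \
	ROUND(x2, x3, x4, x0, x1, 12); ROUND(x3, x4, x0, x1, x2, 13); \
	ROUND(x4, x0, x1, x2, x3, 14); ROUND(x0, x1, x2, x3, x4, 15); \
	ROUND(x1, x2, x3, x4, x0, 16); ROUND(x2, x3, x4, x0, x1, 17); \
	ROUND(x3, x4, x0, x1, x2, 18); ROUND(x4, x0, x1, x2, x3, 19); \
	ROUND(x0, x1, x2, x3, x4, 20); ROUND(x1, x2, x3, x4, x0, 21); \
	ROUND(x2, x3, x4, x0, x1, 22); ROUND(x3, x4, x0, x1, x2, 23); \
	ROUND(x4, x0, x1, x2, x3, 24); ROUND(x0, x1, x2, x3, x4, 25); \
	ROUND(x1, x2, x3, x4, x0, 26); ROUND(x2, x3, x4, x0, x1, 27); \
	ROUND(x3, x4, x0, x1, x2, 28); ROUND(x4, x0, x1, x2, x3, 29); \
	ROUND(x0, x1, x2, x3, x4, 30); ROUND(x1, x2, x3, x4, x0, 31)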

src/sm4_ce.c (new file)
View File

@@ -0,0 +1,88 @@
/*
* Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <arm_neon.h>
#include <gmssl/sm4.h>
static const uint32_t FK[4] = {
0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc,
};
static const uint32_t CK[32] = {
0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279,
};
void sm4_set_encrypt_key(SM4_KEY *sm4_key, const uint8_t key[16])
{
uint32x4_t rk;
uint32x4_t fk;
rk = vrev32q_u8(vld1q_u8(key));
rk = veorq_u32(rk, vld1q_u32(FK));
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK));
vst1q_u32(sm4_key->rk, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 4));
vst1q_u32(sm4_key->rk + 4, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 8));
vst1q_u32(sm4_key->rk + 8, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 12));
vst1q_u32(sm4_key->rk + 12, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 16));
vst1q_u32(sm4_key->rk + 16, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 20));
vst1q_u32(sm4_key->rk + 20, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 24));
vst1q_u32(sm4_key->rk + 24, rk);
rk = vsm4ekeyq_u32(rk, vld1q_u32(CK + 28));
vst1q_u32(sm4_key->rk + 28, rk);
}
void sm4_encrypt(const SM4_KEY *key, const unsigned char in[16], unsigned char out[16])
{
uint32x4_t x4, rk;
x4 = vld1q_u8(in);
x4 = vrev32q_u8(x4);
rk = vld1q_u32(key->rk);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 4);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 8);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 12);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 16);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 20);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 24);
x4 = vsm4eq_u32(x4, rk);
rk = vld1q_u32(key->rk + 28);
x4 = vsm4eq_u32(x4, rk);
x4 = vrev64q_u32(x4);
x4 = vextq_u32(x4, x4, 2);
x4 = vrev32q_u8(x4);
vst1q_u8(out, x4);
}
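vsm4ekeyq_u32 and vsm4eq_u32 are ACLE intrinsics for the optional ARMv8.2-A SM4 extension, so this file only compiles with something like -march=armv8.2-a+sm4. A guard of this shape can fail the build early on toolchains without the feature (a sketch, not part of the commit):

#if !defined(__ARM_FEATURE_SM4)
#error "sm4_ce.c needs the SM4 crypto extension, e.g. -march=armv8.2-a+sm4"
#endif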

View File

@@ -414,7 +414,7 @@ int sm9_z256_print(FILE *fp, int ind, int fmt, const char *label, const sm9_z256
}
-#ifndef ENABLE_SM9_Z256_ARMV8
+#ifndef ENABLE_SM9_ARM64
void sm9_z256_modp_add(sm9_z256_t r, const sm9_z256_t a, const sm9_z256_t b)
{
uint64_t c;
@@ -487,7 +487,7 @@ const uint64_t SM9_Z256_P_PRIME[4] = {
};
-#if defined(ENABLE_SM9_Z256_ARMV8)
+#if defined(ENABLE_SM9_ARM64)
// src/sm9_z256_armv8.S
#elif defined(ENABLE_SM9_Z256_NEON)
#include <arm_neon.h>
@@ -681,10 +681,10 @@ void sm9_z256_modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b
(void)sm9_z256_sub(r, r, SM9_Z256_P);
}
}
-#endif // ENABLE_SM9_Z256_ARMV8
+#endif // ENABLE_SM9_ARM64
-#ifndef ENABLE_SM9_Z256_ARMV8
+#ifndef ENABLE_SM9_ARM64
void sm9_z256_modp_to_mont(sm9_z256_t r, const sm9_z256_t a)
{
sm9_z256_modp_mont_mul(r, a, SM9_Z256_MODP_2e512);