diff --git a/src/sm3.c b/src/sm3.c index a82f9015..80c73b7f 100644 --- a/src/sm3.c +++ b/src/sm3.c @@ -10,129 +10,33 @@ #include #include -#include #include -#ifdef SM3_SSE3 -# include -# include - -# define _mm_rotl_epi32(X,i) \ - _mm_xor_si128(_mm_slli_epi32((X),(i)), _mm_srli_epi32((X),32-(i))) -#endif +#define GETU32(ptr) \ + ((uint32_t)(ptr)[0] << 24 | \ + (uint32_t)(ptr)[1] << 16 | \ + (uint32_t)(ptr)[2] << 8 | \ + (uint32_t)(ptr)[3]) +#define PUTU32(ptr,a) \ + ((ptr)[0] = (uint8_t)((a) >> 24), \ + (ptr)[1] = (uint8_t)((a) >> 16), \ + (ptr)[2] = (uint8_t)((a) >> 8), \ + (ptr)[3] = (uint8_t)(a)) #define ROTL(x,n) (((x)<<(n)) | ((x)>>(32-(n)))) -#define P0(x) ((x) ^ ROL32((x), 9) ^ ROL32((x),17)) -#define P1(x) ((x) ^ ROL32((x),15) ^ ROL32((x),23)) + +#define P0(x) ((x) ^ ROTL((x), 9) ^ ROTL((x),17)) +#define P1(x) ((x) ^ ROTL((x),15) ^ ROTL((x),23)) #define FF00(x,y,z) ((x) ^ (y) ^ (z)) #define FF16(x,y,z) (((x)&(y)) | ((x)&(z)) | ((y)&(z))) #define GG00(x,y,z) ((x) ^ (y) ^ (z)) #define GG16(x,y,z) ((((y)^(z)) & (x)) ^ (z)) -#define R(A, B, C, D, E, F, G, H, xx) \ - SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7); \ - SS2 = SS1 ^ ROL32(A, 12); \ - TT1 = FF##xx(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]); \ - TT2 = GG##xx(E, F, G) + H + SS1 + W[j]; \ - B = ROL32(B, 9); \ - H = TT1; \ - F = ROL32(F, 19); \ - D = P0(TT2); \ - j++ - -#define R8(A, B, C, D, E, F, G, H, xx) \ - R(A, B, C, D, E, F, G, H, xx); \ - R(H, A, B, C, D, E, F, G, xx); \ - R(G, H, A, B, C, D, E, F, xx); \ - R(F, G, H, A, B, C, D, E, xx); \ - R(E, F, G, H, A, B, C, D, xx); \ - R(D, E, F, G, H, A, B, C, xx); \ - R(C, D, E, F, G, H, A, B, xx); \ - R(B, C, D, E, F, G, H, A, xx) - - - -#define T00 0x79cc4519U -#define T16 0x7a879d8aU - -#define K0 0x79cc4519U -#define K1 0xf3988a32U -#define K2 0xe7311465U -#define K3 0xce6228cbU -#define K4 0x9cc45197U -#define K5 0x3988a32fU -#define K6 0x7311465eU -#define K7 0xe6228cbcU -#define K8 0xcc451979U -#define K9 0x988a32f3U -#define K10 0x311465e7U -#define K11 
0x6228cbceU -#define K12 0xc451979cU -#define K13 0x88a32f39U -#define K14 0x11465e73U -#define K15 0x228cbce6U -#define K16 0x9d8a7a87U -#define K17 0x3b14f50fU -#define K18 0x7629ea1eU -#define K19 0xec53d43cU -#define K20 0xd8a7a879U -#define K21 0xb14f50f3U -#define K22 0x629ea1e7U -#define K23 0xc53d43ceU -#define K24 0x8a7a879dU -#define K25 0x14f50f3bU -#define K26 0x29ea1e76U -#define K27 0x53d43cecU -#define K28 0xa7a879d8U -#define K29 0x4f50f3b1U -#define K30 0x9ea1e762U -#define K31 0x3d43cec5U -#define K32 0x7a879d8aU -#define K33 0xf50f3b14U -#define K34 0xea1e7629U -#define K35 0xd43cec53U -#define K36 0xa879d8a7U -#define K37 0x50f3b14fU -#define K38 0xa1e7629eU -#define K39 0x43cec53dU -#define K40 0x879d8a7aU -#define K41 0x0f3b14f5U -#define K42 0x1e7629eaU -#define K43 0x3cec53d4U -#define K44 0x79d8a7a8U -#define K45 0xf3b14f50U -#define K46 0xe7629ea1U -#define K47 0xcec53d43U -#define K48 0x9d8a7a87U -#define K49 0x3b14f50fU -#define K50 0x7629ea1eU -#define K51 0xec53d43cU -#define K52 0xd8a7a879U -#define K53 0xb14f50f3U -#define K54 0x629ea1e7U -#define K55 0xc53d43ceU -#define K56 0x8a7a879dU -#define K57 0x14f50f3bU -#define K58 0x29ea1e76U -#define K59 0x53d43cecU -#define K60 0xa7a879d8U -#define K61 0x4f50f3b1U -#define K62 0x9ea1e762U -#define K63 0x3d43cec5U static uint32_t K[64] = { - K0, K1, K2, K3, K4, K5, K6, K7, - K8, K9, K10, K11, K12, K13, K14, K15, - K16, K17, K18, K19, K20, K21, K22, K23, - K24, K25, K26, K27, K28, K29, K30, K31, - K32, K33, K34, K35, K36, K37, K38, K39, - K40, K41, K42, K43, K44, K45, K46, K47, - K48, K49, K50, K51, K52, K53, K54, K55, - K56, K57, K58, K59, K60, K61, K62, K63, - /* 0x79cc4519U, 0xf3988a32U, 0xe7311465U, 0xce6228cbU, 0x9cc45197U, 0x3988a32fU, 0x7311465eU, 0xe6228cbcU, 0xcc451979U, 0x988a32f3U, 0x311465e7U, 0x6228cbceU, @@ -149,10 +53,8 @@ static uint32_t K[64] = { 0xd8a7a879U, 0xb14f50f3U, 0x629ea1e7U, 0xc53d43ceU, 0x8a7a879dU, 0x14f50f3bU, 0x29ea1e76U, 0x53d43cecU, 0xa7a879d8U, 
0x4f50f3b1U, 0x9ea1e762U, 0x3d43cec5U, - */ }; -#ifndef SM3_AVX_BMI2 void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks) { uint32_t A; @@ -167,12 +69,6 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks) uint32_t SS1, SS2, TT1, TT2; int j; -#ifdef SM3_SSE3 - __m128i X, T, R; - __m128i M = _mm_setr_epi32(0, 0, 0, 0xffffffff); - __m128i V = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); -#endif - while (blocks--) { A = digest[0]; @@ -184,103 +80,44 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks) G = digest[6]; H = digest[7]; - -#ifdef SM3_SSE3 - - for (j = 0; j < 16; j += 4) { - X = _mm_loadu_si128((__m128i *)(data + j * 4)); - X = _mm_shuffle_epi8(X, V); - _mm_storeu_si128((__m128i *)(W + j), X); - } - - for (j = 16; j < 68; j += 4) { - /* X = (W[j - 3], W[j - 2], W[j - 1], 0) */ - X = _mm_loadu_si128((__m128i *)(W + j - 3)); - X = _mm_andnot_si128(M, X); - - X = _mm_rotl_epi32(X, 15); - T = _mm_loadu_si128((__m128i *)(W + j - 9)); - X = _mm_xor_si128(X, T); - T = _mm_loadu_si128((__m128i *)(W + j - 16)); - X = _mm_xor_si128(X, T); - - /* P1() */ - T = _mm_rotl_epi32(X, (23 - 15)); - T = _mm_xor_si128(T, X); - T = _mm_rotl_epi32(T, 15); - X = _mm_xor_si128(X, T); - - T = _mm_loadu_si128((__m128i *)(W + j - 13)); - T = _mm_rotl_epi32(T, 7); - X = _mm_xor_si128(X, T); - T = _mm_loadu_si128((__m128i *)(W + j - 6)); - X = _mm_xor_si128(X, T); - - /* W[j + 3] ^= P1(ROL32(W[j + 1], 15)) */ - R = _mm_shuffle_epi32(X, 0); - R = _mm_and_si128(R, M); - T = _mm_rotl_epi32(R, 15); - T = _mm_xor_si128(T, R); - T = _mm_rotl_epi32(T, 9); - R = _mm_xor_si128(R, T); - R = _mm_rotl_epi32(R, 6); - X = _mm_xor_si128(X, R); - - _mm_storeu_si128((__m128i *)(W + j), X); - } -#else - for (j = 0; j < 16; j++) + for (j = 0; j < 16; j++) { W[j] = GETU32(data + j*4); + } - for (; j < 68; j++) - W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROL32(W[j - 3], 15)) - ^ ROL32(W[j - 13], 7) ^ W[j - 6]; -#endif 
+ for (; j < 68; j++) { + W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROTL(W[j - 3], 15)) + ^ ROTL(W[j - 13], 7) ^ W[j - 6]; + } - - j = 0; - -#define FULL_UNROLL -#ifdef FULL_UNROLL - R8(A, B, C, D, E, F, G, H, 00); - R8(A, B, C, D, E, F, G, H, 00); - R8(A, B, C, D, E, F, G, H, 16); - R8(A, B, C, D, E, F, G, H, 16); - R8(A, B, C, D, E, F, G, H, 16); - R8(A, B, C, D, E, F, G, H, 16); - R8(A, B, C, D, E, F, G, H, 16); - R8(A, B, C, D, E, F, G, H, 16); -#else - for (; j < 16; j++) { - SS1 = ROL32((ROL32(A, 12) + E + K(j)), 7); - SS2 = SS1 ^ ROL32(A, 12); + for (j = 0; j < 16; j++) { + SS1 = ROTL((ROTL(A, 12) + E + K[j]), 7); + SS2 = SS1 ^ ROTL(A, 12); TT1 = FF00(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]); TT2 = GG00(E, F, G) + H + SS1 + W[j]; D = C; - C = ROL32(B, 9); + C = ROTL(B, 9); B = A; A = TT1; H = G; - G = ROL32(F, 19); + G = ROTL(F, 19); F = E; E = P0(TT2); } for (; j < 64; j++) { - SS1 = ROL32((ROL32(A, 12) + E + K(j)), 7); - SS2 = SS1 ^ ROL32(A, 12); + SS1 = ROTL((ROTL(A, 12) + E + K[j]), 7); + SS2 = SS1 ^ ROTL(A, 12); TT1 = FF16(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]); TT2 = GG16(E, F, G) + H + SS1 + W[j]; D = C; - C = ROL32(B, 9); + C = ROTL(B, 9); B = A; A = TT1; H = G; - G = ROL32(F, 19); + G = ROTL(F, 19); F = E; E = P0(TT2); } -#endif digest[0] ^= A; digest[1] ^= B; @@ -294,7 +131,6 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks) data += 64; } } -#endif void sm3_init(SM3_CTX *ctx) { @@ -357,10 +193,11 @@ void sm3_finish(SM3_CTX *ctx, uint8_t *digest) sm3_compress_blocks(ctx->digest, ctx->block, 1); memset(ctx->block, 0, SM3_BLOCK_SIZE - 8); } + PUTU32(ctx->block + 56, ctx->nblocks >> 23); PUTU32(ctx->block + 60, (ctx->nblocks << 9) + (ctx->num << 3)); sm3_compress_blocks(ctx->digest, ctx->block, 1); for (i = 0; i < 8; i++) { PUTU32(digest + i*4, ctx->digest[i]); }