From 8e8159ade60b9a4b0de042109281eefd78bca1cf Mon Sep 17 00:00:00 2001
From: Zhi Guan <guanzhi1980@gmail.com>
Date: Wed, 12 Apr 2023 12:04:30 +0800
Subject: [PATCH] Add optimized SM2 point operations for 64-bit CPU and GCC

---
 src/sm2_uint128.c       | 2126 +++++++++++++++++++++++++++++++++++++++
 tests/sm2_uint128test.c |  158 +++
 2 files changed, 2284 insertions(+)
 create mode 100644 src/sm2_uint128.c
 create mode 100644 tests/sm2_uint128test.c

diff --git a/src/sm2_uint128.c b/src/sm2_uint128.c
new file mode 100644
index 00000000..a9e10c6b
--- /dev/null
+++ b/src/sm2_uint128.c
@@ -0,0 +1,2126 @@
+/*
+ *  Copyright 2014-2023 The GmSSL Project. All Rights Reserved.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the License); you may
+ *  not use this file except in compliance with the License.
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ */
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+/* Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+/*
+ * A 64-bit implementation of the SM2 elliptic curve point multiplication
+ *
+ * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
+ * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
+ * work which got its smarts from Daniel J. Bernstein's work on the same.
+ */
+
+#include <stdio.h>
+# include <stdint.h>
+# include <string.h>
+#include <gmssl/sm2.h>
+#include <gmssl/hex.h>
+#include <gmssl/endian.h>
+#include <gmssl/error.h>
+
+
+typedef __uint128_t uint128_t;
+
+/*
+ * The underlying field. SM2-P256 operates over GF(2^256-2^224-2^96+2^64-1).
+ * We can serialise an element of this field into 32 bytes. We call this an
+ * felem_bytearray.
+ */
+typedef uint8_t felem_bytearray[32];
+
+
+/*-
+ * The representation of field elements.
+ * ------------------------------------
+ *
+ * We represent field elements with either four 128-bit values, eight 128-bit
+ * values, or four 64-bit values. The field element represented is:
+ *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
+ * or:
+ *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
+ *
+ * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
+ * apart, but are 128-bits wide, the most significant bits of each limb overlap
+ * with the least significant bits of the next.
+ *
+ * A field element with four limbs is an 'felem'. One with eight limbs is a
+ * 'longfelem'
+ *
+ * A field element with four, 64-bit values is called a 'smallfelem'. Small
+ * values are used as intermediate values before multiplication.
+ */
+
+# define NLIMBS 4
+
+typedef uint128_t limb;
+typedef limb felem[NLIMBS];
+typedef limb longfelem[NLIMBS * 2];
+typedef uint64_t smallfelem[NLIMBS];
+
+/* This is the value of the prime as four 64-bit words, little-endian. */
+static const uint64_t kPrime[4] = {
+	0xfffffffffffffffful, 0xffffffff00000000ul,
+	0xfffffffffffffffful, 0xfffffffefffffffful};
+static const uint64_t bottom63bits = 0x7ffffffffffffffful;
+
+
+/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
+static void flip_endian(uint8_t *out, const uint8_t *in)
+{
+	int i;
+	for (i = 0; i < 32; ++i) {
+		out[i] = in[31 - i];
+	}
+}
+
+
+/*-
+ * Field operations
+ * ----------------
+ */
+
+static void smallfelem_one(smallfelem out)
+{
+	out[0] = 1;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+}
+
+static void felem_one(felem out)
+{
+	out[0] = 1;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+}
+
+static void smallfelem_assign(smallfelem out, const smallfelem in)
+{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+}
+
+static void felem_assign(felem out, const felem in)
+{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+}
+
+/* felem_sum sets out = out + in. */
+static void felem_sum(felem out, const felem in)
+{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+}
+
+/* felem_small_sum sets out = out + in. */
+static void felem_small_sum(felem out, const smallfelem in)
+{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+}
+
+/* felem_scalar sets out = out * scalar */
+static void felem_scalar(felem out, const uint64_t scalar)
+{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+}
+
+/* longfelem_scalar sets out = out * scalar */
+static void longfelem_scalar(longfelem out, const uint64_t scalar)
+{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	out[4] *= scalar;
+	out[5] *= scalar;
+	out[6] *= scalar;
+	out[7] *= scalar;
+}
+
+# define two105m73m41 (((limb)1) << 105) - (((limb)1) << 73) - (((limb)1) << 41)
+# define two105m41 (((limb)1) << 105) - (((limb)1) << 41)
+# define two105m73 (((limb)1) << 105) - (((limb)1) << 73)
+
+/* zero105 is 0 mod p */
+static const felem zero105 =
+	{ two105m41, two105m73, two105m41, two105m73m41 };
+
+/*-
+ * smallfelem_neg sets |out| to |-small|
+ * On exit:
+ *   out[i] < out[i] + 2^105
+ */
+static void smallfelem_neg(felem out, const smallfelem small)
+{
+	/* In order to prevent underflow, we subtract from 0 mod p. */
+	out[0] = zero105[0] - small[0];
+	out[1] = zero105[1] - small[1];
+	out[2] = zero105[2] - small[2];
+	out[3] = zero105[3] - small[3];
+}
+
+/*-
+ * felem_diff subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^104
+ * On exit:
+ *   out[i] < out[i] + 2^105
+ */
+static void felem_diff(felem out, const felem in)
+{
+	/*
+	* In order to prevent underflow, we add 0 mod p before subtracting.
+	*/
+	out[0] += zero105[0];
+	out[1] += zero105[1];
+	out[2] += zero105[2];
+	out[3] += zero105[3];
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+}
+
+# define two107m75m43 (((limb)1) << 107) - (((limb)1) << 75) - (((limb)1) << 43)
+# define two107m43 (((limb)1) << 107) - (((limb)1) << 43)
+# define two107m75 (((limb)1) << 107) - (((limb)1) << 75)
+
+/* zero107 is 0 mod p */
+static const felem zero107 =
+	{ two107m43, two107m75, two107m43, two107m75m43 };
+
+/*-
+ * An alternative felem_diff for larger inputs |in|
+ * felem_diff_zero107 subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^106
+ * On exit:
+ *   out[i] < out[i] + 2^107
+ */
+static void felem_diff_zero107(felem out, const felem in)
+{
+	/*
+	* In order to prevent underflow, we add 0 mod p before subtracting.
+	*/
+	out[0] += zero107[0];
+	out[1] += zero107[1];
+	out[2] += zero107[2];
+	out[3] += zero107[3];
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+}
+
+/*-
+ * longfelem_diff subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 7*2^67
+ * On exit:
+ *   out[i] < out[i] + 2^70 + 2^40
+ */
+static void longfelem_diff(longfelem out, const longfelem in)
+{
+	static const limb two70m39m7m6 =
+		(((limb) 1) << 70) - (((limb) 1) << 39)- (((limb) 1) << 7) -
+		(((limb) 1) << 6);
+	static const limb two70m40p38 =
+		(((limb) 1) << 70) - (((limb) 1) << 40) + (((limb) 1) << 38);
+	static const limb two70m38m7 =
+		(((limb) 1) << 70) - (((limb) 1) << 38) - (((limb) 1) << 7);
+	static const limb two70m40m7m6 =
+		(((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 7) -
+		(((limb) 1) << 6);
+	static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
+
+	/* add 0 mod p to avoid underflow */
+	out[0] += two70m39m7m6;
+	out[1] += two70m40p38;
+	out[2] += two70m38m7;
+	out[3] += two70m40m7m6;
+	out[4] += two70m6;
+	out[5] += two70m6;
+	out[6] += two70m6;
+	out[7] += two70m6;
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	out[4] -= in[4];
+	out[5] -= in[5];
+	out[6] -= in[6];
+	out[7] -= in[7];
+}
+
+# define two64m32m0 (((limb)1) << 64) - (((limb)1) << 32) - 1
+# define two64m0 (((limb)1) << 64) - 1
+# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
+
+/* zero64 == p */
+static const felem zero64 = { two64m0, two64m32, two64m0, two64m32m0 };
+
+/*-
+ * felem_shrink converts an felem into a smallfelem. The result isn't quite
+ * minimal as the value may be greater than p.
+ *
+ * On entry:
+ *   in[i] < 2^109
+ * On exit:
+ *   out[i] < 2^64
+ */
+static void felem_shrink(smallfelem out, const felem in)
+{
+	felem tmp;
+	uint64_t a, b, mask;
+	int64_t high, low;
+
+	static const uint64_t kPrime3Test = 0x7ffffffefffffffful;
+
+	/* Carry 2->3 */
+	tmp[3] = zero64[3] + in[3] + ((uint64_t)(in[2] >> 64));
+	/* tmp[3] < 2^110 */
+
+	tmp[2] = zero64[2] + (uint64_t)in[2];
+	tmp[1] = zero64[1] + in[1];
+	tmp[0] = zero64[0] + in[0];
+	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
+
+	/*
+	* We perform two partial reductions where we eliminate the high-word of
+	* tmp[3]. We don't update the other words till the end.
+	*/
+	a = tmp[3] >> 64;           /* a < 2^46 */
+	tmp[3] = (uint64_t)tmp[3];
+	tmp[3] += ((limb) a) << 32;
+	/* tmp[3] < 2^79 */
+
+	b = a;
+	a = tmp[3] >> 64;           /* a < 2^15 */
+	b += a;                     /* b < 2^46 + 2^15 < 2^47 */
+	tmp[3] = (uint64_t)tmp[3];
+	tmp[3] += ((limb) a) << 32;
+	/* tmp[3] < 2^64 + 2^47 */
+
+	/*
+	* This adjusts the other two words to complete the two partial
+	* reductions.
+	*/
+	tmp[1] += (((limb) b) << 32);
+	tmp[1] -= b;
+	tmp[0] += b;
+
+	/*
+	* In order to make space in tmp[3] for the carry from 2 -> 3, we
+	* conditionally subtract kPrime if tmp[3] is large enough.
+	*/
+	high = tmp[3] >> 64;
+	/* As tmp[3] < 2^65, high is either 1 or 0 */
+	high <<= 63;
+	high >>= 63;
+	/*-
+	* high is:
+	*   all ones   if the high word of tmp[3] is 1
+	*   all zeros  if the high word of tmp[3] if 0 */
+	low = tmp[3];
+	mask = low >> 63;
+	/*-
+	* mask is:
+	*   all ones   if the MSB of low is 1
+	*   all zeros  if the MSB of low if 0 */
+	low &= bottom63bits;
+	low -= kPrime3Test;
+	/* if low was greater than kPrime3Test then the MSB is zero */
+	low = ~low;
+	low >>= 63;
+	/*-
+	* low is:
+	*   all ones   if low was > kPrime3Test
+	*   all zeros  if low was <= kPrime3Test */
+	mask = (mask & low) | high;
+	tmp[0] -= mask & kPrime[0];
+	tmp[1] -= mask & kPrime[1];
+	tmp[2] -= mask & kPrime[2];
+	tmp[3] -= mask & kPrime[3];
+	/* tmp[3] < 2**64 - 2**32 + 1 */
+
+	tmp[1] += ((uint64_t)(tmp[0] >> 64));
+	tmp[0] = (uint64_t)tmp[0];
+	tmp[2] += ((uint64_t)(tmp[1] >> 64));
+	tmp[1] = (uint64_t)tmp[1];
+	tmp[3] += ((uint64_t)(tmp[2] >> 64));
+	tmp[2] = (uint64_t)tmp[2];
+	/* tmp[i] < 2^64 */
+
+	out[0] = tmp[0];
+	out[1] = tmp[1];
+	out[2] = tmp[2];
+	out[3] = tmp[3];
+}
+
+/* smallfelem_expand converts a smallfelem to an felem */
+static void smallfelem_expand(felem out, const smallfelem in)
+{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+}
+
+/*-
+ * smallfelem_square sets |out| = |small|^2
+ * On entry:
+ *   small[i] < 2^64
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void smallfelem_square(longfelem out, const smallfelem small)
+{
+	limb a;
+	uint64_t high, low;
+
+	a = ((uint128_t) small[0]) * small[0];
+	low = a;
+	high = a >> 64;
+	out[0] = low;
+	out[1] = high;
+
+	a = ((uint128_t) small[0]) * small[1];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[1] += low;
+	out[2] = high;
+
+	a = ((uint128_t) small[0]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[2] *= 2;
+	out[3] = high;
+
+	a = ((uint128_t) small[0]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] = high;
+
+	a = ((uint128_t) small[1]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[3] *= 2;
+	out[4] += high;
+
+	a = ((uint128_t) small[1]) * small[1];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+	a = ((uint128_t) small[1]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[4] *= 2;
+	out[5] = high;
+
+	a = ((uint128_t) small[2]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[5] *= 2;
+	out[6] = high;
+	out[6] += high;
+
+	a = ((uint128_t) small[2]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+	a = ((uint128_t) small[3]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[6] += low;
+	out[7] = high;
+}
+
+/*-
+ * felem_square sets |out| = |in|^2
+ * On entry:
+ *   in[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_square(longfelem out, const felem in)
+{
+	uint64_t small[4];
+	felem_shrink(small, in);
+	smallfelem_square(out, small);
+}
+
+/*-
+ * smallfelem_mul sets |out| = |small1| * |small2|
+ * On entry:
+ *   small1[i] < 2^64
+ *   small2[i] < 2^64
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void smallfelem_mul(longfelem out, const smallfelem small1,
+                           const smallfelem small2)
+{
+	limb a;
+	uint64_t high, low;
+
+	a = ((uint128_t) small1[0]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[0] = low;
+	out[1] = high;
+
+	a = ((uint128_t) small1[0]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[2] = high;
+
+	a = ((uint128_t) small1[1]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[2] += high;
+
+	a = ((uint128_t) small1[0]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] = high;
+
+	a = ((uint128_t) small1[1]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+	a = ((uint128_t) small1[2]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+	a = ((uint128_t) small1[0]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] = high;
+
+	a = ((uint128_t) small1[1]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+	a = ((uint128_t) small1[2]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+	a = ((uint128_t) small1[3]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+	a = ((uint128_t) small1[1]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] = high;
+
+	a = ((uint128_t) small1[2]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+	a = ((uint128_t) small1[3]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+	a = ((uint128_t) small1[2]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[6] = high;
+
+	a = ((uint128_t) small1[3]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[6] += high;
+
+	a = ((uint128_t) small1[3]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[6] += low;
+	out[7] = high;
+}
+
+/*-
+ * felem_mul sets |out| = |in1| * |in2|
+ * On entry:
+ *   in1[i] < 2^109
+ *   in2[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_mul(longfelem out, const felem in1, const felem in2)
+{
+	smallfelem small1, small2;
+	felem_shrink(small1, in1);
+	felem_shrink(small2, in2);
+	smallfelem_mul(out, small1, small2);
+}
+
+/*-
+ * felem_small_mul sets |out| = |small1| * |in2|
+ * On entry:
+ *   small1[i] < 2^64
+ *   in2[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_small_mul(longfelem out, const smallfelem small1,
+                            const felem in2)
+{
+	smallfelem small2;
+	felem_shrink(small2, in2);
+	smallfelem_mul(out, small1, small2);
+}
+
+/*-
+ * Internal function for the different flavours of felem_reduce.
+ * felem_reduce_ reduces the higher coefficients in[4]-in[7].
+ * On entry:
+ *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
+ *   out[1] >= in[7] + 2^32*in[4]
+ *   out[2] >= in[5] + 2^32*in[5]
+ *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
+ * On exit:
+ *   out[0] <= out[0] + in[4] + 2^32*in[5]
+ *   out[1] <= out[1] + in[5] + 2^33*in[6]
+ *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
+ *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
+ */
+static void felem_reduce(felem out, const longfelem in)
+{
+	uint128_t a, b, c, d;
+	a = in[6] + in[7];
+	b = in[5] + in[7];
+	c = in[4] + in[7];
+	d = a + b;
+
+	out[3] = in[3] + ((in[4] + in[5] + a * 2) << 32) + in[7];
+	out[2] = in[2] + (b << 32) + a + in[7];
+	out[1] = in[1] + ((c + in[6]) << 32) - c;
+	out[0] = in[0] + (d << 32) + d + in[4];
+}
+
+/*
+ * subtract_uint64 sets *result = *result - v and *carry to one if the
+ * subtraction underflowed.
+ */
+static void subtract_uint64(uint64_t *result, uint64_t *carry, uint64_t v)
+{
+	uint128_t r = *result;
+	r -= v;
+	*carry = (r >> 64) & 1;
+	*result = (uint64_t)r;
+}
+
+/*
+ * felem_contract converts |in| to its unique, minimal representation. On
+ * entry: in[i] < 2^109
+ */
+static void felem_contract(smallfelem out, const felem in)
+{
+	unsigned int i;
+	uint64_t all_equal_so_far = 0, result = 0, carry;
+
+	felem_shrink(out, in);
+	/* small is minimal except that the value might be > p */
+
+	all_equal_so_far--;
+	/*
+	* We are doing a constant time test if out >= kPrime. We need to compare
+	* each uint64_t, from most-significant to least significant. For each one, if
+	* all words so far have been equal (m is all ones) then a non-equal
+	* result is the answer. Otherwise we continue.
+	*/
+	for (i = 3; i < 4; i--) {
+		uint64_t equal;
+		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
+		/*
+		* if out[i] > kPrime[i] then a will underflow and the high 64-bits
+		* will all be set.
+		*/
+		result |= all_equal_so_far & ((uint64_t)(a >> 64));
+
+		/*
+		* if kPrime[i] == out[i] then |equal| will be all zeros and the
+		* decrement will make it all ones.
+		*/
+		equal = kPrime[i] ^ out[i];
+		equal--;
+		equal &= equal << 32;
+		equal &= equal << 16;
+		equal &= equal << 8;
+		equal &= equal << 4;
+		equal &= equal << 2;
+		equal &= equal << 1;
+		equal = ((int64_t) equal) >> 63;
+
+		all_equal_so_far &= equal;
+	}
+
+	/*
+	* if all_equal_so_far is still all ones then the two values are equal
+	* and so out >= kPrime is true.
+	*/
+	result |= all_equal_so_far;
+
+	/* if out >= kPrime then we subtract kPrime. */
+	subtract_uint64(&out[0], &carry, result & kPrime[0]);
+	subtract_uint64(&out[1], &carry, carry);
+	subtract_uint64(&out[2], &carry, carry);
+	subtract_uint64(&out[3], &carry, carry);
+
+	subtract_uint64(&out[1], &carry, result & kPrime[1]);
+	subtract_uint64(&out[2], &carry, carry);
+	subtract_uint64(&out[3], &carry, carry);
+
+	subtract_uint64(&out[2], &carry, result & kPrime[2]);
+	subtract_uint64(&out[3], &carry, carry);
+
+	subtract_uint64(&out[3], &carry, result & kPrime[3]);
+}
+
+static void smallfelem_square_contract(smallfelem out, const smallfelem in)
+{
+	longfelem longtmp;
+	felem tmp;
+
+	smallfelem_square(longtmp, in);
+	felem_reduce(tmp, longtmp);
+	felem_contract(out, tmp);
+}
+
+static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
+                                    const smallfelem in2)
+{
+	longfelem longtmp;
+	felem tmp;
+
+	smallfelem_mul(longtmp, in1, in2);
+	felem_reduce(tmp, longtmp);
+	felem_contract(out, tmp);
+}
+
+/*-
+ * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
+ * otherwise.
+ * On entry:
+ *   small[i] < 2^64
+ */
+static limb smallfelem_is_zero(const smallfelem small)
+{
+	limb result;
+	uint64_t is_p;
+
+	uint64_t is_zero = small[0] | small[1] | small[2] | small[3];
+	is_zero--;
+	is_zero &= is_zero << 32;
+	is_zero &= is_zero << 16;
+	is_zero &= is_zero << 8;
+	is_zero &= is_zero << 4;
+	is_zero &= is_zero << 2;
+	is_zero &= is_zero << 1;
+	is_zero = ((int64_t) is_zero) >> 63;
+
+	is_p = (small[0] ^ kPrime[0]) |
+	(small[1] ^ kPrime[1]) |
+	(small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
+	is_p--;
+	is_p &= is_p << 32;
+	is_p &= is_p << 16;
+	is_p &= is_p << 8;
+	is_p &= is_p << 4;
+	is_p &= is_p << 2;
+	is_p &= is_p << 1;
+	is_p = ((int64_t) is_p) >> 63;
+
+	is_zero |= is_p;
+
+	result = is_zero;
+	result |= ((limb) is_zero) << 64;
+	return result;
+}
+
+static int smallfelem_is_zero_int(const smallfelem small)
+{
+	return (int)(smallfelem_is_zero(small) & ((limb) 1));
+}
+
+/*-
+ * felem_inv calculates |out| = |in|^{-1}
+ */
+static void felem_inv(felem out, const felem in)
+{
+	felem ftmp;
+	felem a1, a2, a3, a4, a5;
+	longfelem tmp;
+	unsigned int i;
+
+	felem_square(tmp, in);
+	felem_reduce(a1, tmp);
+	felem_mul(tmp, a1, in);
+	felem_reduce(a2, tmp);
+	felem_square(tmp, a2);
+	felem_reduce(ftmp, tmp);
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+	felem_mul(tmp, ftmp, a2);
+	felem_reduce(a3, tmp);
+	felem_square(tmp, a3);
+	felem_reduce(ftmp, tmp);
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+	felem_mul(tmp, ftmp, a3);
+	felem_reduce(a4, tmp);
+	felem_square(tmp, a4);
+	felem_reduce(a5, tmp);
+	for (i = 1; i < 8; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a4);
+	felem_reduce(a5, tmp);
+	for (i = 0; i < 8; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a4);
+	felem_reduce(a5, tmp);
+	for (i = 0; i < 4; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a5, tmp);
+	felem_square(tmp, a5);
+	felem_reduce(a5, tmp);
+	felem_square(tmp, a5);
+	felem_reduce(a5, tmp);
+	felem_mul(tmp, a5, a2);
+	felem_reduce(a5, tmp);
+	felem_square(tmp, a5);
+	felem_reduce(ftmp, tmp);
+	felem_mul(tmp, ftmp, in);
+	felem_reduce(a5, tmp);
+	felem_square(tmp, a5);
+	felem_reduce(a4, tmp);
+	felem_mul(tmp, a4, a1);
+	felem_reduce(a3, tmp);
+	felem_square(tmp, a4);
+	felem_reduce(a5, tmp);
+	for (i = 0; i < 30; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a4);
+	felem_reduce(a4, tmp);
+	felem_square(tmp, a4);
+	felem_reduce(a4, tmp);
+	felem_mul(tmp, a4, in);
+	felem_reduce(a4, tmp);
+	felem_mul(tmp, a4, a2);
+	felem_reduce(a3, tmp);
+	for (i = 0; i < 33; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a2, tmp);
+	felem_mul(tmp, a2, a3);
+	felem_reduce(a3, tmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a2, tmp);
+	felem_mul(tmp, a2, a3);
+	felem_reduce(a3, tmp);
+	felem_mul(tmp, a2, a4);
+	felem_reduce(a4, tmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a2, tmp);
+	felem_mul(tmp, a2, a3);
+	felem_reduce(a3, tmp);
+	felem_mul(tmp, a2, a4);
+	felem_reduce(a4, tmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a2, tmp);
+	felem_mul(tmp, a2, a3);
+	felem_reduce(a3, tmp);
+	felem_mul(tmp, a2, a4);
+	felem_reduce(a4, tmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a5, a3);
+	felem_reduce(a2, tmp);
+	felem_mul(tmp, a2, a3);
+	felem_reduce(a3, tmp);
+	felem_mul(tmp, a2, a4);
+	felem_reduce(a4, tmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, a5);
+		felem_reduce(a5, tmp);
+	}
+	felem_mul(tmp, a4, a5);
+	felem_reduce(out, tmp);
+}
+
+static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
+{
+	felem tmp;
+
+	smallfelem_expand(tmp, in);
+	felem_inv(tmp, tmp);
+	felem_contract(out, tmp);
+}
+
+
+static void felem_from_bytearray(felem out, const uint8_t in[32])
+{
+	out[3] = GETU64(in);
+	out[2] = GETU64(in + 8);
+	out[1] = GETU64(in + 16);
+	out[0] = GETU64(in + 24);
+}
+
+static void felem_get_bytearray(felem_bytearray out, const felem in)
+{
+	smallfelem small;
+
+	felem_contract(small, in);
+	PUTU64(out, small[3]);
+	PUTU64(out + 8, small[2]);
+	PUTU64(out + 16, small[1]);
+	PUTU64(out + 24, small[0]);
+}
+
+/*-
+ * Group operations
+ * ----------------
+ *
+ * Building on top of the field operations we have the operations on the
+ * elliptic curve group itself. Points on the curve are represented in Jacobian
+ * coordinates
+ */
+
+/*-
+ * point_double calculates 2*(x_in, y_in, z_in)
+ *
+ * The method is taken from:
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+ *
+ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
+ * while x_out == y_in is not (maybe this works, but it's not tested).
+ */
+static void
+point_double(felem x_out, felem y_out, felem z_out,
+             const felem x_in, const felem y_in, const felem z_in)
+{
+	longfelem tmp, tmp2;
+	felem delta, gamma, beta, alpha, ftmp, ftmp2;
+	smallfelem small1, small2;
+
+	felem_assign(ftmp, x_in);
+	/* ftmp[i] < 2^106 */
+	felem_assign(ftmp2, x_in);
+	/* ftmp2[i] < 2^106 */
+
+	/* delta = z^2 */
+	felem_square(tmp, z_in);
+	felem_reduce(delta, tmp);
+	/* delta[i] < 2^101 */
+
+	/* gamma = y^2 */
+	felem_square(tmp, y_in);
+	felem_reduce(gamma, tmp);
+
+	/* gamma[i] < 2^101 */
+	felem_shrink(small1, gamma);
+
+	/* beta = x*gamma */
+	felem_small_mul(tmp, small1, x_in);
+	felem_reduce(beta, tmp);
+	/* beta[i] < 2^101 */
+
+	/* alpha = 3*(x-delta)*(x+delta) */
+	felem_diff(ftmp, delta);
+	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
+
+	felem_sum(ftmp2, delta);
+	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
+
+	felem_scalar(ftmp2, 3);
+	/* ftmp2[i] < 3 * 2^107 < 2^109 */
+
+	felem_mul(tmp, ftmp, ftmp2);
+	felem_reduce(alpha, tmp);
+	felem_shrink(small2, alpha);
+	/* alpha[i] < 2^101 */
+
+	/* x' = alpha^2 - 8*beta */
+	smallfelem_square(tmp, small2);
+	felem_reduce(x_out, tmp);
+	felem_assign(ftmp, beta);
+	felem_scalar(ftmp, 8);
+	/* ftmp[i] < 8 * 2^101 = 2^104 */
+	felem_diff(x_out, ftmp);
+	/* x_out[i] < 2^105 + 2^101 < 2^106 */
+
+	/* z' = (y + z)^2 - gamma - delta */
+	felem_sum(delta, gamma);
+	/* delta[i] < 2^101 + 2^101 = 2^102 */
+	felem_assign(ftmp, y_in);
+	felem_sum(ftmp, z_in);
+	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
+	felem_square(tmp, ftmp);
+	felem_reduce(z_out, tmp);
+	felem_diff(z_out, delta);
+	/* z_out[i] < 2^105 + 2^101 < 2^106 */
+
+	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
+	felem_scalar(beta, 4);
+	/* beta[i] < 4 * 2^101 = 2^103 */
+	felem_diff_zero107(beta, x_out);
+	/* beta[i] < 2^107 + 2^103 < 2^108 */
+	felem_small_mul(tmp, small2, beta);
+	/* tmp[i] < 7 * 2^64 < 2^67 */
+	smallfelem_square(tmp2, small1);
+	/* tmp2[i] < 7 * 2^64 */
+	longfelem_scalar(tmp2, 8);
+	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
+
+	longfelem_diff(tmp, tmp2);
+	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
+	felem_reduce(y_out, tmp);
+	/* y_out[i] < 2^106 */
+}
+
+/*
+ * point_double_small is the same as point_double, except that it operates on
+ * smallfelems
+ */
+static void
+point_double_small(
+	smallfelem x_out, smallfelem y_out, smallfelem z_out,
+	const smallfelem x_in, const smallfelem y_in, const smallfelem z_in)
+{
+	felem felem_x_out, felem_y_out, felem_z_out;
+	felem felem_x_in, felem_y_in, felem_z_in;
+
+	smallfelem_expand(felem_x_in, x_in);
+	smallfelem_expand(felem_y_in, y_in);
+	smallfelem_expand(felem_z_in, z_in);
+
+	point_double(
+		felem_x_out, felem_y_out, felem_z_out,
+		felem_x_in, felem_y_in, felem_z_in);
+
+	felem_shrink(x_out, felem_x_out);
+	felem_shrink(y_out, felem_y_out);
+	felem_shrink(z_out, felem_z_out);
+}
+
+/* copy_conditional copies in to out iff mask is all ones. */
+static void copy_conditional(felem out, const felem in, limb mask)
+{
+	unsigned int i;
+	for (i = 0; i < NLIMBS; ++i) {
+		const limb tmp = mask & (in[i] ^ out[i]);
+		out[i] ^= tmp;
+	}
+}
+
+/* copy_small_conditional copies in to out iff mask is all ones. */
+static void copy_small_conditional(felem out, const smallfelem in, limb mask)
+{
+	unsigned int i;
+	const uint64_t mask64 = mask;
+	for (i = 0; i < NLIMBS; ++i) {
+		out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
+	}
+}
+
+/*-
+ * point_add calculates (x1, y1, z1) + (x2, y2, z2)
+ *
+ * The method is taken from:
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
+ * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
+ *
+ * This function includes a branch for checking whether the two input points
+ * are equal, (while not equal to the point at infinity). This case never
+ * happens during single point multiplication, so there is no timing leak for
+ * ECDH or ECDSA signing.
+ */
+static void point_add(felem x3, felem y3, felem z3,
+                      const felem x1, const felem y1, const felem z1,
+                      const int mixed,
+		const smallfelem x2,
+                      const smallfelem y2, const smallfelem z2)
+{
+	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
+	longfelem tmp, tmp2;
+	smallfelem small1, small2, small3, small4, small5;
+	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
+
+
+	felem_shrink(small3, z1);
+
+	z1_is_zero = smallfelem_is_zero(small3);
+	z2_is_zero = smallfelem_is_zero(z2);
+
+	/* ftmp = z1z1 = z1**2 */
+	smallfelem_square(tmp, small3);
+	felem_reduce(ftmp, tmp);
+	/* ftmp[i] < 2^101 */
+	felem_shrink(small1, ftmp);
+
+	if (!mixed) {
+		/* ftmp2 = z2z2 = z2**2 */
+		smallfelem_square(tmp, z2);
+		felem_reduce(ftmp2, tmp);
+		/* ftmp2[i] < 2^101 */
+		felem_shrink(small2, ftmp2);
+
+		felem_shrink(small5, x1);
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		smallfelem_mul(tmp, small5, small2);
+		felem_reduce(ftmp3, tmp);
+		/* ftmp3[i] < 2^101 */
+
+		/* ftmp5 = z1 + z2 */
+		felem_assign(ftmp5, z1);
+		felem_small_sum(ftmp5, z2);
+		/* ftmp5[i] < 2^107 */
+
+		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
+		felem_square(tmp, ftmp5);
+		felem_reduce(ftmp5, tmp);
+		/* ftmp2 = z2z2 + z1z1 */
+		felem_sum(ftmp2, ftmp);
+		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
+		felem_diff(ftmp5, ftmp2);
+		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
+
+		/* ftmp2 = z2 * z2z2 */
+		smallfelem_mul(tmp, small2, z2);
+		felem_reduce(ftmp2, tmp);
+
+		/* s1 = ftmp2 = y1 * z2**3 */
+		felem_mul(tmp, y1, ftmp2);
+		felem_reduce(ftmp6, tmp);
+		/* ftmp6[i] < 2^101 */
+	} else {
+		/*
+		* We'll assume z2 = 1 (special case z2 = 0 is handled later)
+		*/
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		felem_assign(ftmp3, x1);
+		/* ftmp3[i] < 2^106 */
+
+		/* ftmp5 = 2z1z2 */
+		felem_assign(ftmp5, z1);
+		felem_scalar(ftmp5, 2);
+		/* ftmp5[i] < 2*2^106 = 2^107 */
+
+		/* s1 = ftmp2 = y1 * z2**3 */
+		felem_assign(ftmp6, y1);
+		/* ftmp6[i] < 2^106 */
+		}
+
+		/* u2 = x2*z1z1 */
+		smallfelem_mul(tmp, x2, small1);
+		felem_reduce(ftmp4, tmp);
+
+		/* h = ftmp4 = u2 - u1 */
+		felem_diff_zero107(ftmp4, ftmp3);
+		/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
+		felem_shrink(small4, ftmp4);
+
+		x_equal = smallfelem_is_zero(small4);
+
+		/* z_out = ftmp5 * h */
+		felem_small_mul(tmp, small4, ftmp5);
+		felem_reduce(z_out, tmp);
+		/* z_out[i] < 2^101 */
+
+		/* ftmp = z1 * z1z1 */
+		smallfelem_mul(tmp, small1, small3);
+		felem_reduce(ftmp, tmp);
+
+		/* s2 = tmp = y2 * z1**3 */
+		felem_small_mul(tmp, y2, ftmp);
+		felem_reduce(ftmp5, tmp);
+
+		/* r = ftmp5 = (s2 - s1)*2 */
+		felem_diff_zero107(ftmp5, ftmp6);
+		/* ftmp5[i] < 2^107 + 2^107 = 2^108 */
+		felem_scalar(ftmp5, 2);
+		/* ftmp5[i] < 2^109 */
+		felem_shrink(small1, ftmp5);
+		y_equal = smallfelem_is_zero(small1);
+
+		if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
+		point_double(x3, y3, z3, x1, y1, z1);
+		return;
+	}
+
+	/* I = ftmp = (2h)**2 */
+	felem_assign(ftmp, ftmp4);
+	felem_scalar(ftmp, 2);
+	/* ftmp[i] < 2*2^108 = 2^109 */
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+
+	/* J = ftmp2 = h * I */
+	felem_mul(tmp, ftmp4, ftmp);
+	felem_reduce(ftmp2, tmp);
+
+	/* V = ftmp4 = U1 * I */
+	felem_mul(tmp, ftmp3, ftmp);
+	felem_reduce(ftmp4, tmp);
+
+	/* x_out = r**2 - J - 2V */
+	smallfelem_square(tmp, small1);
+	felem_reduce(x_out, tmp);
+	felem_assign(ftmp3, ftmp4);
+	felem_scalar(ftmp4, 2);
+	felem_sum(ftmp4, ftmp2);
+	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
+	felem_diff(x_out, ftmp4);
+	/* x_out[i] < 2^105 + 2^101 */
+
+	/* y_out = r(V-x_out) - 2 * s1 * J */
+	felem_diff_zero107(ftmp3, x_out);
+	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
+	felem_small_mul(tmp, small1, ftmp3);
+	felem_mul(tmp2, ftmp6, ftmp2);
+	longfelem_scalar(tmp2, 2);
+	/* tmp2[i] < 2*2^67 = 2^68 */
+	longfelem_diff(tmp, tmp2);
+	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
+	felem_reduce(y_out, tmp);
+	/* y_out[i] < 2^106 */
+
+	copy_small_conditional(x_out, x2, z1_is_zero);
+	copy_conditional(x_out, x1, z2_is_zero);
+	copy_small_conditional(y_out, y2, z1_is_zero);
+	copy_conditional(y_out, y1, z2_is_zero);
+	copy_small_conditional(z_out, z2, z1_is_zero);
+	copy_conditional(z_out, z1, z2_is_zero);
+	felem_assign(x3, x_out);
+	felem_assign(y3, y_out);
+	felem_assign(z3, z_out);
+}
+
+/*
+ * point_add_small is the same as point_add, except that it operates on
+ * smallfelems
+ */
+static void point_add_small(
+	smallfelem x3, smallfelem y3, smallfelem z3,
+	smallfelem x1, smallfelem y1, smallfelem z1,
+	smallfelem x2, smallfelem y2, smallfelem z2)
+{
+	felem felem_x3, felem_y3, felem_z3;
+	felem felem_x1, felem_y1, felem_z1;
+
+	smallfelem_expand(felem_x1, x1);
+	smallfelem_expand(felem_y1, y1);
+	smallfelem_expand(felem_z1, z1);
+
+	// mixed = 1 是一种特殊形式，这里面一个affine point被加到了jacobian上面
+	point_add(
+		felem_x3, felem_y3, felem_z3,
+		felem_x1, felem_y1, felem_z1,
+		0,
+		x2, y2, z2);
+
+	felem_shrink(x3, felem_x3);
+	felem_shrink(y3, felem_y3);
+	felem_shrink(z3, felem_z3);
+}
+
+/*-
+ * Base point pre computation
+ * --------------------------
+ *
+ * Two different sorts of precomputed tables are used in the following code.
+ * Each contain various points on the curve, where each point is three field
+ * elements (x, y, z).
+ *
+ * For the base point table, z is usually 1 (0 for the point at infinity).
+ * This table has 2 * 16 elements, starting with the following:
+ * index | bits    | point
+ * ------+---------+------------------------------
+ *     0 | 0 0 0 0 | 0G
+ *     1 | 0 0 0 1 | 1G
+ *     2 | 0 0 1 0 | 2^64G
+ *     3 | 0 0 1 1 | (2^64 + 1)G
+ *     4 | 0 1 0 0 | 2^128G
+ *     5 | 0 1 0 1 | (2^128 + 1)G
+ *     6 | 0 1 1 0 | (2^128 + 2^64)G
+ *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
+ *     8 | 1 0 0 0 | 2^192G
+ *     9 | 1 0 0 1 | (2^192 + 1)G
+ *    10 | 1 0 1 0 | (2^192 + 2^64)G
+ *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
+ *    12 | 1 1 0 0 | (2^192 + 2^128)G
+ *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
+ *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
+ *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
+ * followed by a copy of this with each element multiplied by 2^32.
+ *
+ * The reason for this is so that we can clock bits into four different
+ * locations when doing simple scalar multiplies against the base point,
+ * and then another four locations using the second 16 elements.
+ *
+ * Tables for other points have table[i] = iG for i in 0 .. 16.
+ */
+
+static const smallfelem g_pre_comp[2][16][3] = {
+	{{{0, 0, 0, 0},
+	 {0, 0, 0, 0},
+	 {0, 0, 0, 0}},
+	{{0x715a4589334c74c7, 0x8fe30bbff2660be1, 0x5f9904466a39c994, 0x32c4ae2c1f198119},
+	 {0x02df32e52139f0a0, 0xd0a9877cc62a4740, 0x59bdcee36b692153, 0xbc3736a2f4f6779c},
+	 {1, 0, 0, 0}},
+	{{0xe18bd546b5824517, 0x673891d791caa486, 0xba220b99df9f9a14, 0x95afbd1155c1da54},
+	 {0x8e4450eb334acdcb, 0xc3c7d1898a53f20d, 0x2eee750f4053017c, 0xe8a6d82c517388c2},
+	 {1, 0, 0, 0}},
+	{{0xf81c8da9b99fba55, 0x137f6c6149feef6e, 0xcb129aa494da9ad4, 0x82a0f5407d123db6},
+	 {0xfdeca00772c4dbc9, 0xa961b58f0cf58373, 0xecacab94e973f9c3, 0xf12fa4696a22ca3f},
+	 {1, 0, 0, 0}},
+	{{0xeae3d9a9d13a42ed, 0x2b2308f6484e1b38, 0x3db7b24888c21f3a, 0xb692e5b574d55da9},
+	 {0xd186469de295e5ab, 0xdb61ac1773438e6d, 0x5a924f85544926f9, 0xa175051b0f3fb613},
+	 {1, 0, 0, 0}},
+	{{0xa72d084f62c8d58b, 0xe3d6467deaf48fd7, 0x8fe75e5a128a56a7, 0xc0023fe7ff2b68bd},
+	 {0x64f67782316815f9, 0xb52b6d9b19a69cd2, 0x5d1ed6fa89cbbade, 0x796c910ee7f4ccdb},
+	 {1, 0, 0, 0}},
+	{{0x1b2150c1c5f13015, 0xdaaba91b5d952c9b, 0xe8cc24c3f546142, 0x75a34b243705f260},
+	 {0x77d195421cef1339, 0x636644aa0c3a0623, 0x4683df176eeb2444, 0x642ce3bd3535e74d},
+	 {1, 0, 0, 0}},
+	{{0x4a59ac2c6e7ecc08, 0xaf2b71164f191d63, 0x3622a87fb284554f, 0xd9eb397b441e9cd0},
+	 {0xa66b8a4893b6a54d, 0x26fb89a40b4a663a, 0xafa87501eedfc9f4, 0xf3f000bc66f98108},
+	 {1, 0, 0, 0}},
+	{{0xad8bc68ce031d616, 0x16888d8ee4003187, 0x44c0757f3bb8b600, 0x793fae7af0164245},
+	 {0x210cd042973f333b, 0x8666ff52dbd25f9, 0x65c5b129f5f7ad5d, 0xe03d7a8d19b3219a},
+	 {1, 0, 0, 0}},
+	{{0xd68bfbace0e00392, 0x261014f7d3445dc7, 0xd9f46b2714a071ee, 0x1b200af30810b682},
+	 {0x0d91d8b12ae69bcd, 0x74a08f17bf8cd981, 0xd822913cf0d2b82d, 0x248b7af0b05bfad2},
+	 {1, 0, 0, 0}},
+	{{0xba119a049e62f2e2, 0xf278e8a34df05ae5, 0xd269f3564eb5d180, 0x8e74ad0f4f957cb1},
+	 {0x112ff4dabd76e2dd, 0x91373f20630fdb7f, 0xf43eab474992904c, 0x55a5ccc7af3b6db4},
+	 {1, 0, 0, 0}},
+	{{0x5ad104a8bdd23de9, 0xf5a9e515eb71c2c1, 0x390542a0ba95c174, 0x4c55fb20426491bf},
+	 {0x91525735ef626289, 0xd2ed977f88f09635, 0xfd48731b7a8a8521, 0x08f89a03b8fdebea},
+	 {1, 0, 0, 0}},
+	{{0x7e8e61ea35eb8e2e, 0x1bb2700db98a762c, 0xd81ea23b7738c17c, 0xf9def2a46dba26a3},
+	 {0x183a7912d05e329f, 0x34664a0896ccde0e, 0x56c22652614283bb, 0x91692899d5ff0513},
+	 {1, 0, 0, 0}},
+	{{0x449d48d8f3bdbe19, 0xab95de03cc8510cb, 0xaef159463f8bfb25, 0xda72c379dae3ca8b},
+	 {0xcba9315ce82cc3ea, 0x4e524bac38a58020, 0x36ba2752538e348c, 0xb170d0da75ed450f},
+	 {1, 0, 0, 0}},
+	{{0x947af0f52b4f8da6, 0x7eda17d917827976, 0x5ba79a0c705853a0, 0xa5d9873b3fb2ddc7},
+	 {0xc2a48162a5fd9ce9, 0x80ee8ae526f25f02, 0xf60c8ef6633be6a9, 0xe2e23f0229a84a35},
+	 {1, 0, 0, 0}},
+	{{0xbc4945bd86bb6afb, 0x237eb711eba46fee, 0x7c1db58b7b86eb33, 0xd94eb728273b3ac7},
+	 {0xbe1717e59568d0a4, 0x4a6067cc45f70212, 0x19b32eb5afc2fb17, 0xbe3c1e7ac3ac9d3c},
+	 {1, 0, 0, 0}}},
+	{{{0, 0, 0, 0},
+	 {0, 0, 0, 0},
+	 {0, 0, 0, 0}},
+	{{0x68a88405ae53c1e9, 0x51e46707fd558656, 0x71e834cf86896c10, 0x3d251b54e10d581f},
+	 {0x1884d5b0eeb19032, 0xeeaf729853e526fe, 0x5931f6831a8d8c11, 0x87891d33fb98b4d8},
+	 {1, 0, 0, 0}},
+	{{0x9047673fcac14893, 0xf5df5d83bfb58659, 0x0a6230c81642e71a, 0xef14b33800777791},
+	 {0xcf1e99afa3386fca, 0x7ace937791313d53, 0x36fe159b6dcd01bb, 0xc9bc50d02e2b960a},
+	 {1, 0, 0, 0}},
+	{{0x716e5a7ee12e162d, 0xbbf9bb2c62dd5a00, 0xca235ccb4144dd05, 0xbcb7de0f8f70520e},
+	 {0x981e8964947cb8eb, 0x53c7102ea04de08d, 0xe9076332afc6a10d, 0x93d90f776b58c35d},
+	 {1, 0, 0, 0}},
+	{{0x834dbff6678337ee, 0xc607e811fef0785a, 0xaaefc62be30a298b, 0xeb5ca335326afad3},
+	 {0x9774fe1384af54a8, 0xca4b6ef5785388b4, 0x1346c82d66f6c642, 0xedcc0c2aaa2d53ce},
+	 {1, 0, 0, 0}},
+	{{0xb896b3f764b9e6f4, 0x47e4018c736fb3d0, 0xfc2fc86707413920, 0x1a8526428e1aeae7},
+	 {0x1386802650e2ae60, 0x7474dedc995384d0, 0x2c4cc396dd43b011, 0x63b0e9c7141de1b0},
+	 {1, 0, 0, 0}},
+	{{0xeb5fb3b369d17771, 0x1fe07b18933ed257, 0xdfc4c81ce3673912, 0x913614c66a91a647},
+	 {0x18aee853c0ba877f, 0x03109c2deceff091, 0x8532307e7e4ee08c, 0xcef0791a6e6ce0bb},
+	 {1, 0, 0, 0}},
+	{{0xf0e9f5d8057a4a0f, 0xbbf7f8b49f125aa9, 0x51e8fdd6283187c2, 0xe0997d4759d36298},
+	 {0x67ec3c5c6f4221c3, 0x3ea275dbc860722f, 0x152d01e23859f5e2, 0xfb57404312680f44},
+	 {1, 0, 0, 0}},
+	{{0x21ac3df849be2a1f, 0x11006e9fc51d112f, 0x9151aa584775c857, 0x5159d218ba04a8d9},
+	 {0x98b7d1a925fd1866, 0x8f4753cafc2ad9d8, 0x8eb91ec1569c05a9, 0x4abbd1ae27e13f11},
+	 {1, 0, 0, 0}},
+	{{0x616f6644b2c11f4c, 0x251cd7140e540758, 0xf927a40110f02017, 0x92ff3cc3c1c941b6},
+	 {0x3249906213f565fe, 0x4633e3ddeb9dbd4e, 0xea9a9d1ec402e6c2, 0xdc84ce34b14bb7cf},
+	 {1, 0, 0, 0}},
+	{{0xa93e23e5436ff69a, 0x52dcb0a79b63efce, 0x34f6538a9e90cb41, 0x9cac08f200234bc0},
+	 {0x6661825b5174a02d, 0x07d4d06de036be57, 0x589d74610ae6bd27, 0xa296f5577fc91a93},
+	 {1, 0, 0, 0}},
+	{{0x10acefa9d29721d0, 0x8b0f6b8bb5bcd340, 0x921d318c3d86785c, 0xd6916f3bc16aa378},
+	 {0x2a0d646a7ad84a0e, 0x7b93256c2fe7e97a, 0x5765e27626479e41, 0xae9da2272daaced3},
+	 {1, 0, 0, 0}},
+	{{0x56fdc215f7f34ac5, 0xebcb4ff2da3877d3, 0x1eb96792aba6b832, 0x807ce6bea24741aa},
+	 {0xff1c10109c721fb4, 0xd187d4bc796353a7, 0x7639ae749af2d303, 0xaff6d783d56c9286},
+	 {1, 0, 0, 0}},
+	{{0x6002d51b6290dd01, 0xcba3ab0099a836a5, 0x71776611e00d2528, 0xfaf2cb8c87fce119},
+	 {0xd445228bdf6882ae, 0xcbbfade17cbce919, 0x837b6335a2eb2453, 0x11ad7c4b8597f6b6},
+	 {1, 0, 0, 0}},
+	{{0x48de8f368cf2e399, 0x7ae3d25630a74277, 0xdef1a9a6c505323f, 0xe55f203b4b8d9672},
+	 {0xc58d8f0d9a1e6e97, 0xe160e6d4b2737a76, 0xd60bd087d47cbdd8, 0x687d41364d5fef53},
+	 {1, 0, 0, 0}},
+	{{0x83f21bbe056bbf9b, 0x4c2a9d120b4ba5ab, 0xff383d1845b64e4f, 0x8f13cc8d06dd7867},
+	 {0xf3a292d8424f0995, 0xfd2546eae7cbe44b, 0x67d14dee6c1e75a3, 0x53b49e6cc93fb5a8},
+	 {1, 0, 0, 0}}}
+};
+
+
+/*
+ * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
+ * (X/Z^2, Y/Z^3)
+ */
+static void make_point_affine(felem_bytearray x, felem_bytearray y,
+	const felem x_in, const felem y_in, const felem z_in)
+{
+	felem z_inv;
+	felem x_out;
+	felem y_out;
+	longfelem tmp;
+
+	// z_inv = 1/Z
+	felem_inv(z_inv, z_in);
+
+	// y_out = Y/Z
+	if (y) {
+		felem_mul(tmp, y_in, z_inv);
+		felem_reduce(y_out, tmp);
+	}
+
+	// z_inv = 1/Z^2
+	felem_square(tmp, z_inv);
+	felem_reduce(z_inv, tmp);
+
+	// x_out = X/Z^2
+	felem_mul(tmp, x_in, z_inv);
+	felem_reduce(x_out, tmp);
+	felem_get_bytearray(x, x_out);
+
+	if (y) {
+		// y_out = Y/Z^3
+		felem_mul(tmp, y_out, z_inv);
+		felem_reduce(y_out, tmp);
+		felem_get_bytearray(y, y_out);
+	}
+}
+
+static void make_points_affine(size_t num, smallfelem points[][3], smallfelem *tmp_felems)
+{
+	int i = 0;
+
+#define tmp_felem(I) tmp_felems[I]
+#define X(I) points[I][0]
+#define Y(I) points[I][1]
+#define Z(I) points[I][2]
+
+	if (!smallfelem_is_zero_int(Z(0))) {
+		smallfelem_assign(tmp_felem(0), Z(0));
+	} else {
+		smallfelem_one(tmp_felem(0));
+	}
+
+	for (i = 1; i < (int)num; i++) {
+		if (!smallfelem_is_zero_int(Z(i))) {
+			smallfelem_mul_contract(tmp_felem(i), tmp_felem(i - 1), Z(i));
+		} else {
+			smallfelem_assign(tmp_felem(i), tmp_felem(i - 1));
+		}
+	}
+	/*
+	* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any
+	* zero-valued factors: if Z(i) = 0, we essentially pretend that Z(i) = 1
+	*/
+
+	smallfelem_inv_contract(tmp_felem(num - 1), tmp_felem(num - 1));
+
+	for (i = num - 1; i >= 0; i--) {
+		if (i > 0) {
+			/*
+			* tmp_felem(i-1) is the product of Z(0) .. Z(i-1), tmp_felem(i)
+			* is the inverse of the product of Z(0) .. Z(i)
+			*/
+			/* 1/Z(i) */
+			smallfelem_mul_contract(tmp_felem(num), tmp_felem(i - 1), tmp_felem(i));
+		} else {
+			smallfelem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */
+		}
+
+		if (!smallfelem_is_zero_int(Z(i))) {
+			if (i > 0) {
+				/*
+				* For next iteration, replace tmp_felem(i-1) by its inverse
+				*/
+				smallfelem_mul_contract(tmp_felem(i - 1), tmp_felem(i), Z(i));
+			}
+
+			/*
+			* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1)
+			*/
+			smallfelem_square_contract(Z(i), tmp_felem(num)); /* 1/(Z^2) */
+			smallfelem_mul_contract(X(i), X(i), Z(i)); /* X/(Z^2) */
+			smallfelem_mul_contract(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
+			smallfelem_mul_contract(Y(i), Y(i), Z(i)); /* Y/(Z^3) */
+			smallfelem_one(Z(i));
+		} else {
+			if (i > 0) {
+				/*
+				* For next iteration, replace tmp_felem(i-1) by its inverse
+				*/
+				smallfelem_assign(tmp_felem(i - 1), tmp_felem(i));
+			}
+		}
+	}
+}
+
+#if 0
+// generate g_pre_comp[2][16][3]
+static int precompute_mult(const felem_bytearray x, const felem_bytearray y, smallfelem pre_comp[2][16][3])
+{
+	int ret = 0;
+	int i, j;
+
+	smallfelem tmp_smallfelems[32];
+
+	felem x_tmp;
+	felem y_tmp;
+	felem z_tmp;
+
+	felem_from_bytearray(x_tmp, x);
+	felem_from_bytearray(y_tmp, y);
+	felem_one(z_tmp);
+
+	// T0[1] = G
+	felem_shrink(pre_comp[0][1][0], x_tmp);
+	felem_shrink(pre_comp[0][1][1], y_tmp);
+	felem_shrink(pre_comp[0][1][2], z_tmp);
+
+	// T1[1] = 2^32 * T0[1] = 2^32  * G   T0[2] = 2^32 * T1[1] = 2^64  * G
+	// T1[2] = 2^32 * T0[2] = 2^96  * G   T0[4] = 2^32 * T1[2] = 2^128 * G
+	// T1[4] = 2^32 * T0[4] = 2^160 * G   T0[8] = 2^32 * T1[4] = 2^192 * G
+	// T1[8] = 2^32 * T0[8] = 2^224 * G
+
+	for (i = 1; i <= 8; i <<= 1) {
+
+		// T1[i] = 2^32 * T0[i]
+
+		point_double_small(
+			pre_comp[1][i][0],
+			pre_comp[1][i][1],
+			pre_comp[1][i][2],
+			pre_comp[0][i][0],
+			pre_comp[0][i][1],
+			pre_comp[0][i][2]);
+
+		for (j = 0; j < 31; ++j) {
+			point_double_small(
+				pre_comp[1][i][0],
+				pre_comp[1][i][1],
+				pre_comp[1][i][2],
+				pre_comp[1][i][0],
+				pre_comp[1][i][1],
+				pre_comp[1][i][2]);
+		}
+
+		if (i == 8) {
+			break;
+		}
+
+		// T0[2*i] = 2^32 * T1[i]
+		point_double_small(
+			pre_comp[0][2 * i][0],
+			pre_comp[0][2 * i][1],
+			pre_comp[0][2 * i][2],
+			pre_comp[1][i][0],
+			pre_comp[1][i][1],
+			pre_comp[1][i][2]);
+
+		for (j = 0; j < 31; ++j) {
+			point_double_small(
+				pre_comp[0][2 * i][0],
+				pre_comp[0][2 * i][1],
+				pre_comp[0][2 * i][2],
+				pre_comp[0][2 * i][0],
+				pre_comp[0][2 * i][1],
+				pre_comp[0][2 * i][2]);
+		}
+	}
+
+	// Ti[ 0] = O
+	// Ti[ 6] = Ti[ 4] + Ti[2]
+	// Ti[10] = Ti[ 8] + Ti[2]
+	// Ti[12] = Ti[ 8] + Ti[4]
+	// Ti[14] = Ti[12] + Ti[2]
+	// Ti[2*j + 1] = Ti[2*j] + Ti[1]
+
+	for (i = 0; i < 2; i++) {
+
+		memset(pre_comp[i][0], 0, sizeof(pre_comp[i][0]));
+
+		point_add_small(
+			pre_comp[i][6][0],
+			pre_comp[i][6][1],
+			pre_comp[i][6][2],
+			pre_comp[i][4][0],
+			pre_comp[i][4][1],
+			pre_comp[i][4][2],
+			pre_comp[i][2][0],
+			pre_comp[i][2][1],
+			pre_comp[i][2][2]);
+
+		point_add_small(
+			pre_comp[i][10][0],
+			pre_comp[i][10][1],
+			pre_comp[i][10][2],
+			pre_comp[i][8][0],
+			pre_comp[i][8][1],
+			pre_comp[i][8][2],
+			pre_comp[i][2][0],
+			pre_comp[i][2][1],
+			pre_comp[i][2][2]);
+
+		point_add_small(
+			pre_comp[i][12][0],
+			pre_comp[i][12][1],
+			pre_comp[i][12][2],
+			pre_comp[i][8][0],
+			pre_comp[i][8][1],
+			pre_comp[i][8][2],
+			pre_comp[i][4][0],
+			pre_comp[i][4][1],
+			pre_comp[i][4][2]);
+
+		point_add_small(
+			pre_comp[i][14][0],
+			pre_comp[i][14][1],
+			pre_comp[i][14][2],
+			pre_comp[i][12][0],
+			pre_comp[i][12][1],
+			pre_comp[i][12][2],
+			pre_comp[i][2][0],
+			pre_comp[i][2][1],
+			pre_comp[i][2][2]);
+
+		for (j = 1; j < 8; ++j) {
+			point_add_small(
+				pre_comp[i][2 * j + 1][0],
+				pre_comp[i][2 * j + 1][1],
+				pre_comp[i][2 * j + 1][2],
+				pre_comp[i][2 * j][0],
+				pre_comp[i][2 * j][1],
+				pre_comp[i][2 * j][2],
+				pre_comp[i][1][0],
+				pre_comp[i][1][1],
+				pre_comp[i][1][2]);
+		}
+	}
+
+	make_points_affine(31, &pre_comp[0][1], tmp_smallfelems);
+
+	#define smallfelem_print(a) printf("%016llx%016llx%016llx%016llx\n",(a)[3],(a)[2],(a)[1],(a)[0])
+	for (i = 0; i < 2; i++) {
+		for (j = 0; j < 16; j++) {
+			smallfelem_print(pre_comp[i][j][0]);
+			smallfelem_print(pre_comp[i][j][1]);
+			smallfelem_print(pre_comp[i][j][2]);
+			printf("\n");
+		}
+	}
+	return ret;
+}
+
+static int test_precompute_mult(void)
+{
+	static const felem_bytearray sm2_g_x = {
+		0x32, 0xC4, 0xAE, 0x2C, 0x1F, 0x19, 0x81, 0x19,
+		0x5F, 0x99, 0x04, 0x46, 0x6A, 0x39, 0xC9, 0x94,
+		0x8F, 0xE3, 0x0B, 0xBF, 0xF2, 0x66, 0x0B, 0xE1,
+		0x71, 0x5A, 0x45, 0x89, 0x33, 0x4C, 0x74, 0xC7,
+	};
+	static const felem_bytearray sm2_g_y = {
+		0xBC, 0x37, 0x36, 0xA2, 0xF4, 0xF6, 0x77, 0x9C,
+		0x59, 0xBD, 0xCE, 0xE3, 0x6B, 0x69, 0x21, 0x53,
+		0xD0, 0xA9, 0x87, 0x7C, 0xC6, 0x2A, 0x47, 0x40,
+		0x02, 0xDF, 0x32, 0xE5, 0x21, 0x39, 0xF0, 0xA0,
+	};
+	smallfelem pre_comp[2][16][3];
+	int i, j;
+
+	precompute_mult(sm2_g_x, sm2_g_y, pre_comp);
+
+	for (i = 0; i < 16; i++) {
+		for (j = 0; j < 3; j++) {
+			if (memcmp(&pre_comp[0][i][j], &g_pre_comp[0][i][j], sizeof(smallfelem)) != 0) {
+				fprintf(stderr, "%s %d: error\n", __FILE__, __LINE__);
+				return -1;
+			}
+			if (memcmp(&pre_comp[1][i][j], &g_pre_comp[1][i][j], sizeof(smallfelem)) != 0) {
+				fprintf(stderr, "%s %d: error\n", __FILE__, __LINE__);
+				return -1;
+			}
+		}
+	}
+
+	printf("%s ok\n", __FUNCTION__);
+	return 1;
+}
+#endif
+
+/*
+ * select_point selects the |idx|th point from a precomputation table and
+ * copies it to out.
+ */
+static void select_point(const uint64_t idx, unsigned int size, // size = 16， 也就是预计算表的大小
+                         const smallfelem pre_comp[16][3], smallfelem out[3])
+{
+	unsigned int j;
+	uint64_t *outlimbs = &out[0][0];
+
+#ifdef SM2_NO_CONST_TIME
+	const uint64_t *inlimbs = (uint64_t *)&pre_comp[idx][0][0];
+	for (j = 0; j < NLIMBS * 3; j++) {
+		outlimbs[j] = inlimbs[j];
+	}
+#else
+	int i;
+	memset(out, 0, sizeof(*out) * 3);
+
+	for (i = 0; i < size; i++) {
+		const uint64_t *inlimbs = (uint64_t *)&pre_comp[i][0][0];
+		uint64_t mask = i ^ idx; // 如果正好相等的话，mask == 0
+		mask |= mask >> 4;
+		mask |= mask >> 2;
+		mask |= mask >> 1;
+		mask &= 1;
+		mask--;
+		for (j = 0; j < NLIMBS * 3; j++) {
+			outlimbs[j] |= inlimbs[j] & mask;
+		}
+	}
+#endif
+}
+
+/* get_bit returns the |i|th bit in |in| */
+static char get_bit(const felem_bytearray in, int i)
+{
+	if ((i < 0) || (i >= 256))
+		return 0;
+	return (in[i >> 3] >> (i & 7)) & 1;
+}
+
+/*-
+ * This function looks at 5+1 scalar bits (5 current, 1 adjacent less
+ * significant bit), and recodes them into a signed digit for use in fast point
+ * multiplication: the use of signed rather than unsigned digits means that
+ * fewer points need to be precomputed, given that point inversion is easy
+ * (a precomputed point dP makes -dP available as well).
+ *
+ * BACKGROUND:
+ *
+ * Signed digits for multiplication were introduced by Booth ("A signed binary
+ * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
+ * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
+ * Booth's original encoding did not generally improve the density of nonzero
+ * digits over the binary representation, and was merely meant to simplify the
+ * handling of signed factors given in two's complement; but it has since been
+ * shown to be the basis of various signed-digit representations that do have
+ * further advantages, including the wNAF, using the following general approach:
+ *
+ * (1) Given a binary representation
+ *
+ *       b_k  ...  b_2  b_1  b_0,
+ *
+ *     of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
+ *     by using bit-wise subtraction as follows:
+ *
+ *        b_k b_(k-1)  ...  b_2  b_1  b_0
+ *      -     b_k      ...  b_3  b_2  b_1  b_0
+ *       -------------------------------------
+ *        s_k b_(k-1)  ...  s_3  s_2  s_1  s_0
+ *
+ *     A left-shift followed by subtraction of the original value yields a new
+ *     representation of the same value, using signed bits s_i = b_(i+1) - b_i.
+ *     This representation from Booth's paper has since appeared in the
+ *     literature under a variety of different names including "reversed binary
+ *     form", "alternating greedy expansion", "mutual opposite form", and
+ *     "sign-alternating {+-1}-representation".
+ *
+ *     An interesting property is that among the nonzero bits, values 1 and -1
+ *     strictly alternate.
+ *
+ * (2) Various window schemes can be applied to the Booth representation of
+ *     integers: for example, right-to-left sliding windows yield the wNAF
+ *     (a signed-digit encoding independently discovered by various researchers
+ *     in the 1990s), and left-to-right sliding windows yield a left-to-right
+ *     equivalent of the wNAF (independently discovered by various researchers
+ *     around 2004).
+ *
+ * To prevent leaking information through side channels in point multiplication,
+ * we need to recode the given integer into a regular pattern: sliding windows
+ * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
+ * decades older: we'll be using the so-called "modified Booth encoding" due to
+ * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
+ * (1961), pp. 67-91), in a radix-2^5 setting.  That is, we always combine five
+ * signed bits into a signed digit:
+ *
+ *       s_(4j + 4) s_(4j + 3) s_(4j + 2) s_(4j + 1) s_(4j)
+ *
+ * The sign-alternating property implies that the resulting digit values are
+ * integers from -16 to 16.
+ *
+ * Of course, we don't actually need to compute the signed digits s_i as an
+ * intermediate step (that's just a nice way to see how this scheme relates
+ * to the wNAF): a direct computation obtains the recoded digit from the
+ * six bits b_(4j + 4) ... b_(4j - 1).
+ *
+ * This function takes those five bits as an integer (0 .. 63), writing the
+ * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
+ * value, in the range 0 .. 8).  Note that this integer essentially provides the
+ * input bits "shifted to the left" by one position: for example, the input to
+ * compute the least significant recoded digit, given that there's no bit b_-1,
+ * has to be b_4 b_3 b_2 b_1 b_0 0.
+ *
+ */
+static void recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in)
+{
+	unsigned char s, d;
+
+	s = ~((in >> 5) - 1);
+
+	d = (1 << 6) - in - 1;
+	d = (d & s) | (in & ~s);
+	d = (d >> 1) + (d & 1);
+
+	*sign = s & 1;
+	*digit = d;
+}
+
+/*
+ * Interleaved point multiplication using precomputed point multiples: The
+ * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
+ * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
+ * generator, using certain (large) precomputed multiples in g_pre_comp.
+ * Output point (X, Y, Z) is stored in x_out, y_out, z_out
+ */
+static void batch_mul(
+	felem x_out, felem y_out, felem z_out,
+	const felem_bytearray scalars[],
+	const unsigned int num_points,
+	const felem_bytearray g_scalar,
+	const int mixed,
+	const smallfelem pre_comp[][17][3])
+{
+	int i, skip;
+	unsigned int num;
+	//unsigned int gen_mul = (g_scalar != NULL);
+	felem nq[3], ftmp;
+	smallfelem tmp[3];
+	uint64_t bits;
+	uint8_t sign, digit;
+
+	/* set nq to the point at infinity */
+	memset(nq, 0, sizeof(nq));
+
+	/*
+	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
+	* of the generator (two in each of the last 32 rounds) and additions of
+	* other points multiples (every 5th round).
+	*/
+	skip = 1; /* save two point operations in the first round */
+
+	for (i = (num_points ? 255 : 31); i >= 0; --i) {
+		/* double */
+		if (!skip) {
+			point_double(
+				nq[0], nq[1], nq[2],
+				nq[0], nq[1], nq[2]);
+		}
+
+		/* add multiples of the generator */
+		if (g_scalar && (i <= 31)) {
+			/* first, look 32 bits upwards */
+			bits = get_bit(g_scalar, i + 224) << 3;
+			bits |= get_bit(g_scalar, i + 160) << 2;
+			bits |= get_bit(g_scalar, i + 96) << 1;
+			bits |= get_bit(g_scalar, i + 32);
+
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[1], tmp); // 从G表中选出来的点是仿射
+
+			if (!skip) {
+				/* Arg 1 below is for "mixed" */
+				point_add(
+					nq[0], nq[1], nq[2],
+					nq[0], nq[1], nq[2],
+					1, tmp[0], tmp[1], tmp[2]);
+			} else {
+				smallfelem_expand(nq[0], tmp[0]);
+				smallfelem_expand(nq[1], tmp[1]);
+				smallfelem_expand(nq[2], tmp[2]);
+				skip = 0;
+			}
+
+			/* second, look at the current position */
+			bits = get_bit(g_scalar, i + 192) << 3;
+			bits |= get_bit(g_scalar, i + 128) << 2;
+			bits |= get_bit(g_scalar, i + 64) << 1;
+			bits |= get_bit(g_scalar, i);
+
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[0], tmp);
+
+			/* Arg 1 below is for "mixed" */
+			point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
+		}
+
+		/* do other additions every 5 doublings */
+		if (num_points && (i % 5 == 0)) {
+			/* loop over all scalars */
+
+			for (num = 0; num < num_points; ++num) {
+				bits = get_bit(scalars[num], i + 4) << 5;
+				bits |= get_bit(scalars[num], i + 3) << 4;
+				bits |= get_bit(scalars[num], i + 2) << 3;
+				bits |= get_bit(scalars[num], i + 1) << 2;
+				bits |= get_bit(scalars[num], i) << 1;
+				bits |= get_bit(scalars[num], i - 1);
+				recode_scalar_bits(&sign, &digit, bits);
+				/*
+				* select the point to add or subtract, in constant time
+				*/
+				select_point(digit, 17, pre_comp[num], tmp);
+				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
+						       * point */
+				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
+				felem_contract(tmp[1], ftmp);
+
+				if (!skip) {
+					point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], mixed, tmp[0], tmp[1], tmp[2]);
+				} else {
+					smallfelem_expand(nq[0], tmp[0]);
+					smallfelem_expand(nq[1], tmp[1]);
+					smallfelem_expand(nq[2], tmp[2]);
+					skip = 0;
+				}
+			}
+		}
+	}
+
+	felem_assign(x_out, nq[0]);
+	felem_assign(y_out, nq[1]);
+	felem_assign(z_out, nq[2]);
+}
+
+/*
+ * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
+ * values Result is stored in r (r can equal one of the inputs).
+ */
+int sm2_points_mul(
+	uint8_t r_x[32],
+	uint8_t r_y[32],
+	const uint8_t g_scalar[32],
+	size_t num_points,
+	const SM2_POINT points[],
+	const uint8_t scalars[][32])
+{
+	int ret = 0;
+	int mixed = (num_points >= 3) ? 1 : 0;
+
+	felem_bytearray g_secret;
+	smallfelem (*pre_comp)[17][3] = NULL;
+	smallfelem *tmp_smallfelems = NULL;
+	felem_bytearray *secrets = NULL;
+	felem x_out, y_out, z_out;
+	unsigned int i;
+	int j;
+
+	if (num_points > 0) {
+		if (!(secrets = malloc(sizeof(felem_bytearray) * num_points))) {
+			goto end;
+		}
+		if (!(pre_comp = malloc(sizeof(*pre_comp) * num_points))) {
+			goto end;
+		}
+
+		/*
+		* we treat NULL scalars as 0, and NULL points as points at infinity,
+		* i.e., they contribute nothing to the linear combination
+		*/
+		memset(secrets, 0, sizeof(*secrets) * num_points);
+		memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
+
+		for (i = 0; i < num_points; ++i) {
+
+			flip_endian(secrets[i], scalars[i]);
+
+			/* precompute multiples */
+			felem_from_bytearray(x_out, points[i].x);
+			felem_from_bytearray(y_out, points[i].y);
+			felem_one(z_out);
+
+			felem_shrink(pre_comp[i][1][0], x_out);
+			felem_shrink(pre_comp[i][1][1], y_out);
+			felem_shrink(pre_comp[i][1][2], z_out);
+
+			for (j = 2; j <= 16; ++j) {
+				if (j & 1) {
+					point_add_small(
+						pre_comp[i][j][0],
+						pre_comp[i][j][1],
+						pre_comp[i][j][2],
+						pre_comp[i][1][0],
+						pre_comp[i][1][1],
+						pre_comp[i][1][2],
+						pre_comp[i][j - 1][0],
+						pre_comp[i][j - 1][1],
+						pre_comp[i][j - 1][2]);
+				} else {
+					point_double_small(
+						pre_comp[i][j][0],
+						pre_comp[i][j][1],
+						pre_comp[i][j][2],
+						pre_comp[i][j / 2][0],
+						pre_comp[i][j / 2][1],
+						pre_comp[i][j / 2][2]);
+				}
+			}
+		}
+		if (mixed) {
+			if (!(tmp_smallfelems = malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1)))) {
+				error_print();
+				goto end;
+			}
+			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
+		}
+	}
+
+	/* the scalar for the generator */
+	if (g_scalar) {
+		flip_endian(g_secret, g_scalar);
+
+		/* do the multiplication with generator precomputation */
+		batch_mul(x_out, y_out, z_out,
+			//(const felem_bytearray(*))
+			secrets, num_points,
+			g_secret,
+			mixed, (const smallfelem(*)[17][3])pre_comp);
+
+	} else {
+		/* do the multiplication without generator precomputation */
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray(*))secrets, num_points,
+			NULL, mixed,
+			(const smallfelem(*)[17][3])pre_comp);
+	}
+
+	make_point_affine(r_x, r_y, x_out, y_out, z_out);
+	ret = 1;
+
+end:
+	if (pre_comp) free(pre_comp);
+	if (tmp_smallfelems) free(tmp_smallfelems);
+	if (secrets) free(secrets);
+	return ret;
+}
+
diff --git a/tests/sm2_uint128test.c b/tests/sm2_uint128test.c
new file mode 100644
index 00000000..e8d5b616
--- /dev/null
+++ b/tests/sm2_uint128test.c
@@ -0,0 +1,158 @@
+﻿/*
+ *  Copyright 2014-2023 The GmSSL Project. All Rights Reserved.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the License); you may
+ *  not use this file except in compliance with the License.
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <gmssl/sm2.h>
+#include <gmssl/error.h>
+
+
+int test_sm2_points_mul_g(void)
+{
+	const char *k_hex[] = {
+		"D49733EF7322BCA402FF2C55A76226D036293578228F05BB0A084E7FD679DA6A",
+		"EE459A3CEA02B64116CA83738284A8673141F2C0800791A5774B3B3326BCBAE9",
+		"112C903A44D865817C34050F32A7B7C5562FF21D5FB86F3EBC82E017E63AA650",
+	};
+	const char *P_hex[] = {
+		"DA0A26E5A2FD1A8338E6465CD73EACD360EFD0C0278DD3239174077FD1646D0D"
+		"7B5454054D4E799DE8AF8D2BB4E35D34B4C4A286BE6E631691E11A8D878EF0AA",
+		"FBEEA44629CCCE80894CCDD66594B391E62A476AD2F401CBF1B64728C6195402"
+		"45E6ACF90653E584C6F71A6A130B311B9582977751AE413697398C85D35C4DD4",
+		"7D40E2CDA2528106C41D90B654E8BEB52E66BAF2D1F49574992FE707F0BE6223"
+		"E4C5F597E924662F2A85C6065F05A2E554D856C8135C34A4BC82870D555F9084",
+	};
+	uint8_t k[32];
+	uint8_t P[64];
+	uint8_t x[64];
+	uint8_t *y = x + 32;
+	size_t len, i;
+
+	for (i = 0; i < sizeof(k_hex)/sizeof(k_hex[0]); i++) {
+		hex_to_bytes(k_hex[i], strlen(k_hex[i]), k, &len);
+		hex_to_bytes(P_hex[i], strlen(P_hex[i]), P, &len);
+
+		sm2_points_mul(x, y, k, 0, NULL, NULL);
+
+		if (memcmp(x, P, 64) != 0) {
+			error_print();
+			return -1;
+		}
+	}
+
+	printf("%s ok\n", __FUNCTION__);
+	return 1;
+}
+
+int test_sm2_points_mul(void)
+{
+	const char *k_hex[] = {
+		"84890E2149BF27124A123441ED574373C5506293AA087B319A44A8481CFF4201",
+		"C80DD28E6093883E8BB637CA8C73CDD43D562D4235E201A83FB5FEF6CE7B0E00",
+		"1BFC60372080CDCB4B11A798FE91D15FBB9ACD1B696850DFC84DA4B54B849D92",
+		"06104FD237BCA808B3D6039A0A859C7AD46E4FF8358F0E909A65CB0F5E63D07E",
+	};
+	const char *P_hex[] = {
+		"E658713DDCDB7C89431C2F19CE1187CA323620197534AFCFD322EE39CD18CDB8"
+		"D1FCD5160877086EF85F5C7CF2F850D0D3D57BA4B2547CB719EE5EDD3A2ACD54",
+		"01E9266592CF24B124A8CE2ACA03941A32FA05EADB7EB165207148F0468A32D0"
+		"AD576F985A6365EF43654C6F82B151153928C59D0A04BDB8865CBC3370093E7C",
+		"F77F4A004CA3ACFC1476D6BE6E74B3724D116CB43913B81CD18C9F67FB684FDD"
+		"A64D24AA17DAAFC359D368CA4FB5023552C1C9158C8C63359B8617C12031A723",
+		"A56B91BA232C11099C2545F637E0949E943631032592BE3AD063426A54B6D4B9"
+		"7E706E500A0D0BCE94A6574889544A8425791C5C1D5E80EBA73B1A4D2FBA85DF",
+	};
+	const char *Q_hex[] = {
+		"D9C7FE8529EC92CD4C371BF031DFBC939DB3A4571C0FBE3BFFAF53016B57A767"
+		"995BDFC720CD7504839275029A423172725B2EFD57448A45533727167578257C",
+		"9ED9C5F08B7853ABB486B9C62D6E8D76A014A7BBA1967710CF419FBA8D038F15"
+		"EB503473E3C401974BD2CD61BFC9B3F84E32B4137F116FBFEC551D604DD654CC",
+		"6FDEEC7B9405F75E1F6BD81524AD982A8752AD0093CA356535AFDDD37068D960"
+		"1D010E962355349956CC8F0408C5FA945A4F29FC4D7083F9325F392528C40693",
+		"2520E6EFB29A073E71B63ABBD93FA2899ABA1940DC942BF628861BD6697B3F6F"
+		"BEB40823F2365975B10AA0848E127C72D32F0924323B276413FA267717E0716B",
+	};
+	uint8_t k[32];
+	uint8_t P[64];
+	uint8_t Q[64];
+	uint8_t R[64];
+	size_t len, i;
+
+
+	for (i = 0; i < sizeof(k_hex)/sizeof(k_hex[0]); i++) {
+		hex_to_bytes(k_hex[i], strlen(k_hex[i]), k, &len);
+		hex_to_bytes(P_hex[i], strlen(P_hex[i]), P, &len);
+		hex_to_bytes(Q_hex[i], strlen(Q_hex[i]), Q, &len);
+
+		sm2_points_mul(R, R + 32, NULL, 1, (SM2_POINT *)P, &k);
+
+		if (memcmp(R, Q, 64) != 0) {
+			error_print();
+			return -1;
+		}
+	}
+
+	printf("%s ok\n", __FUNCTION__);
+	return 1;
+}
+
+int test_sm2_points_mul_mult(void)
+{
+	const char *s_hex =
+		"75B6973113B59FBF4EF91D3986B8EA3548B8DA538611154A4F91049645F36C74";
+	const char *k_hex[] = {
+		"0E7A0ABB57773AA55F4B829767654D451AB6E7AECF05FFA8DFF41134D5700D40",
+		"27F4E552EE458A90FB5A46BC9106799AAF9F51D873E188D148C61154539A272A",
+		"2C6C45FFBD27C2F9AFFCEB023F45E16B6A9415BF34A4D94599B6C35B03EAC1F7",
+	};
+	const char *P_hex[] = {
+		"E8346D7CCF6C071C3C8343A305F5AD9EDCE59EF75F64774732F2CED8F3B7143A"
+		"D0CF09FDFD9C69775BE47FBA558D3AD904D61D5DB6CC8D9C6F7CA484DFF0182B",
+		"E6679FBC4C0E460B1FAB57EA3CD653601BA68223062E6C53B65DF3DE4A566862"
+		"124EE9725C6A3FB427B05130B68FD963FB4762DC6EDCC0319B807DB290B3032D",
+		"FA6654F72E8DEDD9ED00EA74C81F0CAB37C6D28000E02A2648E7BEB7F22FC118"
+		"97F08B2357503E7BBCE7275C336FF1F713DD9B5BE66BFEEC18A6C837E59FD4E6",
+	};
+	const char *Q_hex =
+		"F5F2749B9D36AE0175DC63F3F97D9747B813F63D1475978E8792DD1AB7F13050"
+		"1B6951A69477F39550F5550FF783905F3B7305186F38D3A63A8ECCD6E28E1603";
+
+	uint8_t s[32];
+	uint8_t k[3][32];
+	SM2_POINT P[3];
+	uint8_t Q[64];
+	uint8_t R[64];
+	size_t len, i;
+
+	hex_to_bytes(s_hex, strlen(s_hex), s, &len);
+	for (i = 0; i < sizeof(k_hex)/sizeof(k_hex[0]); i++) {
+		hex_to_bytes(k_hex[i], strlen(k_hex[i]), k[i], &len);
+		hex_to_bytes(P_hex[i], strlen(P_hex[i]), (uint8_t *)&P[i], &len);
+	}
+	hex_to_bytes(Q_hex, strlen(Q_hex), Q, &len);
+
+	sm2_points_mul(R, R + 32, s, sizeof(k_hex)/sizeof(k_hex[0]), P, k);
+	if (memcmp(R, Q, 64) != 0) {
+		error_print();
+		return -1;
+	}
+
+	printf("%s ok\n", __FUNCTION__);
+	return 1;
+}
+
+int main(void)
+{
+	if (test_sm2_points_mul() != 1) { error_print(); return -1; }
+	if (test_sm2_points_mul_mult() != 1) { error_print(); return -1; }
+	printf("%s all tests passed\n", __FILE__);
+	return 0;
+}