mirror of
https://github.com/guanzhi/GmSSL.git
synced 2026-05-06 16:36:16 +08:00
103 lines
2.1 KiB
ArmAsm
103 lines
2.1 KiB
ArmAsm
/*
|
|
* Copyright 2014-2024 The GmSSL Project. All Rights Reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the License); you may
|
|
* not use this file except in compliance with the License.
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*/
|
|
|
|
|
|
#include <gmssl/asm.h>
|
|
|
|
|
|
/* GF(2^128) defined by f(x) = x^128 + x^7 + x^2 + x + 1

   f0 = x^128 mod f(x) = x^7 + x^2 + x + 1

   ext([a0,a1],[b0,b1],8) => [a1,b0]

   a * b
   = (a0 + a1 * x^64) * (b0 + b1 * x^64)
   = a0 * b0 + (a0 * b1 + a1 * b0) * x^64 + a1 * b1 * x^128
   = a0 * b0 + ((a0 + a1)*(b0 + b1) - a0*b0 - a1*b1) * x^64 + a1 * b1 * x^128
   = c + e * x^64 + d' * x^128    -- c = a0*b0, d' = a1*b1, e = (a0 + a1)*(b0 + b1) - c - d'
   = c + e0 * x^64 + e1 * x^128 + d' * x^128
   = c + e0 * x^64 + (d' + e1) * f0
   = c + e0 * x^64 + d * f0    -- d = d' + e1
   = c + e0 * x^64 + (d0 + d1 * x^64) * f0
   = c + e0 * x^64 + d0 * f0 + (d1 * f0) * x^64    -- w = d1 * f0
   = c + e0 * x^64 + d0 * f0 + (w0 + w1 * x^64) * x^64
   = c + e0 * x^64 + d0 * f0 + w0 * x^64 + w1 * x^128
   = c + e0 * x^64 + w0 * x^64 + d0 * f0 + w1 * f0
   = c + (e0 + w0) * x^64 + (d0 + w1) * f0
*/
|
|
.text

.globl	func(gf128_mul)
.align	4

/*
 * gf128_mul: r = a * b in GF(2^128), reduced by
 * f(x) = x^128 + x^7 + x^2 + x + 1.
 *
 * In:     x0 = address the 16-byte product is stored to (r)
 *         x1 = address of the first 16-byte operand  a = (a0, a1)
 *         x2 = address of the second 16-byte operand b = (b0, b1)
 *         The first 64-bit word in memory is the low half (a0 / b0).
 * Out:    16 bytes at [x0]
 * Writes: v0-v7 only. No general-purpose register is modified and the
 *         stack is not used (leaf routine).
 * Note:   both operands are loaded before the single store, so r may
 *         alias a or b.
 * NOTE(review): the C prototype is presumably gf128_mul(r, a, b) with the
 * arguments arriving in x0/x1/x2 -- confirm against the C header.
 *
 * Register roles:
 *   v0 = 0              v1 = a             v2 = b
 *   v3 = c, then the running result
 *   v4 = d', then d, then (d0 + w1) * f0
 *   v5 = a0 + a1, then e, then (e0 + w0) * x^64
 *   v6 = b0 + b1, then scratch / w
 *   v7 = (0x87, 0x87), i.e. f0 in both 64-bit lanes
 */
func(gf128_mul):
	// v1 = (a0, a1), v2 = (b0, b1)
	ld1	{v1.2d}, [x1]
	ld1	{v2.2d}, [x2]

	// v0 = 0; ext shifts it in wherever a 64-bit half has to be cleared
	movi	v0.16b, #0

	// v7 = (0x87, 0x87): fill every byte with 0x87, then keep only the
	// top byte of each 64-bit lane. 0x87 encodes f0 = x^7 + x^2 + x + 1.
	movi	v7.16b, #0x87
	ushr	v7.2d, v7.2d, #56

	// ---- Multiply (Karatsuba): 3*mul + 2*ext + 4*eor ----

	// v5 = a0 + a1 and v6 = b0 + b1 (the sum lands in both lanes)
	ext	v5.16b, v1.16b, v1.16b, #8
	ext	v6.16b, v2.16b, v2.16b, #8
	eor	v5.16b, v5.16b, v1.16b
	eor	v6.16b, v6.16b, v2.16b

	// v3 = c = a0 * b0
	pmull	v3.1q, v1.1d, v2.1d
	// v4 = d' = a1 * b1
	pmull2	v4.1q, v1.2d, v2.2d
	// v5 = (a0 + a1) * (b0 + b1)
	pmull	v5.1q, v5.1d, v6.1d

	// v5 = e = (a0 + a1) * (b0 + b1) - a1 * b1 - a0 * b0
	eor	v5.16b, v5.16b, v4.16b
	eor	v5.16b, v5.16b, v3.16b

	// ---- Reduce: 2*mul + 3*ext + 5*eor ----

	// v4 = d = d' + e1          (v6 = (e1, 0))
	ext	v6.16b, v5.16b, v0.16b, #8
	eor	v4.16b, v4.16b, v6.16b

	// v6 = w = d1 * f0 = (w0, w1)
	pmull2	v6.1q, v4.2d, v7.2d

	// v5 = e + w; its low lane is e0 + w0
	eor	v5.16b, v5.16b, v6.16b

	// v4 = (d0 + w1) * f0       (swap the halves of w so w1 meets d0)
	ext	v6.16b, v6.16b, v6.16b, #8
	eor	v4.16b, v4.16b, v6.16b
	pmull	v4.1q, v4.1d, v7.1d

	// v5 = (e0 + w0) * x^64     (low lane moved up, zero shifted in below)
	ext	v5.16b, v0.16b, v5.16b, #8

	// v3 = c + (e0 + w0) * x^64 + (d0 + w1) * f0
	eor	v3.16b, v3.16b, v5.16b
	eor	v3.16b, v3.16b, v4.16b

	// Output
	st1	{v3.2d}, [x0]

	ret
|
|
|
|
|