diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2016-12-26 12:58:59 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2016-12-26 12:58:59 +0800 |
commit | 389bb658ef48e597d27d9940a8cfe43314ab8f7c (patch) | |
tree | ce736493c683501f47ea712ed08c2d39aee115a6 | |
parent | 21324c9e0adf5c664c439bd5cc6f994211457c6a (diff) | |
download | dexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.tar.gz dexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.tar.zst dexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.zip |
add gmp version of Mont with not full prime
-rw-r--r-- | readme.md | 4 | ||||
-rw-r--r-- | src/fp.cpp | 9 | ||||
-rw-r--r-- | src/gen.cpp | 91 | ||||
-rw-r--r-- | src/low_func.hpp | 78 | ||||
-rw-r--r-- | src/low_func_llvm.hpp | 3 | ||||
-rw-r--r-- | src/proto.hpp | 1 |
6 files changed, 131 insertions, 55 deletions
@@ -89,7 +89,9 @@ make MCL_USE_LLVM=1 LLVM_VER=-3.8 ARCH=arm A benchmark of a BN curve over the 254-bit prime.
* x64, x86 ; Inte Core i7-6700 3.4GHz(Skylake)
- * `sudo cpufreq-set -g performance`
+ ```
+ sudo cpufreq-set -g performance
+ ```
* arm ; 900MHz quad-core ARM Cortex-A7 on Raspberry Pi2, Linux 4.4.11-v7+
* arm64 ; 1.2GHz ARM Cortex-A53 [HiKey](http://www.96boards.org/product/hikey/)
@@ -191,8 +191,13 @@ void setOp2(Op& op) op.fp_sub = Sub<N, false, Tag>::f; } if (op.isMont) { - op.fp_mul = Mont<N, Tag>::f; - op.fp_sqr = SqrMont<N, Tag>::f; + if (op.isFullBit) { + op.fp_mul = Mont<N, true, Tag>::f; + op.fp_sqr = SqrMont<N, true, Tag>::f; + } else { + op.fp_mul = Mont<N, false, Tag>::f; + op.fp_sqr = SqrMont<N, false, Tag>::f; + } op.fpDbl_mod = MontRed<N, Tag>::f; } else { op.fp_mul = Mul<N, Tag>::f; diff --git a/src/gen.cpp b/src/gen.cpp index 627c2d2..401a582 100644 --- a/src/gen.cpp +++ b/src/gen.cpp @@ -738,7 +738,7 @@ struct Code : public mcl::Generator { generic_fpDbl_mul(py, px, px); endFunc(); } - void gen_mcl_fp_mont() + void gen_mcl_fp_mont(bool isFullBit = true) { const int bu = bit + unit; const int bu2 = bit + unit * 2; @@ -747,39 +747,75 @@ struct Code : public mcl::Generator { Operand px(IntPtr, unit); Operand py(IntPtr, unit); Operand pp(IntPtr, unit); - std::string name = "mcl_fp_mont" + cybozu::itoa(N) + "L"; + std::string name = "mcl_fp_mont"; + if (!isFullBit) { + name += "NF"; + } + name += cybozu::itoa(N) + "L"; mcl_fp_montM[N] = Function(name, Void, pz, px, py, pp); mcl_fp_montM[N].setAlias(); verifyAndSetPrivate(mcl_fp_montM[N]); beginFunc(mcl_fp_montM[N]); Operand rp = load(getelementptr(pp, -1)); - Operand p = loadN(pp, N); Operand z, s, a; - for (uint32_t i = 0; i < N; i++) { - Operand y = load(getelementptr(py, i)); - Operand xy = call(mulPvM[bit], px, y); - Operand at; - if (i == 0) { - a = zext(xy, bu2); - at = trunc(xy, unit); - } else { - xy = zext(xy, bu2); - a = add(s, xy); - at = trunc(a, unit); + if (1 || isFullBit) { + for (uint32_t i = 0; i < N; i++) { + Operand y = load(getelementptr(py, i)); + Operand xy = call(mulPvM[bit], px, y); + Operand at; + if (i == 0) { + a = zext(xy, bu2); + at = trunc(xy, unit); + } else { + xy = zext(xy, bu2); + a = add(s, xy); + at = trunc(a, unit); + } + Operand q = mul(at, rp); + Operand pq = call(mulPvM[bit], pp, q); + pq = zext(pq, bu2); + Operand t = add(a, pq); + s = lshr(t, unit); } - Operand q = mul(at, rp); - Operand pq = call(mulPvM[bit], pp, q); - pq = zext(pq, bu2); - Operand t = add(a, pq); - s = lshr(t, unit); + s = trunc(s, bu); + Operand p = zext(loadN(pp, N), bu); + Operand vc = sub(s, p); + Operand c = trunc(lshr(vc, bit), 1); + z = select(c, s, vc); + z = trunc(z, bit); + storeN(z, pz); + } else { + for (uint32_t i = 0; i < N; i++) { + Operand y = load(getelementptr(py, i)); + Operand xy = call(mulPvM[bit], px, y); + Operand at; + if (i == 0) { + a = xy; + at = trunc(xy, unit); + Operand q = mul(at, rp); + Operand pq = call(mulPvM[bit], pp, q); + pq = zext(pq, bu2); + Operand t = add(a, pq); + s = lshr(t, unit); + } else { + xy = zext(xy, bu2); + a = add(s, xy); + at = trunc(a, unit); + Operand q = mul(at, rp); + Operand pq = call(mulPvM[bit], pp, q); + pq = zext(pq, bu2); + Operand t = add(a, pq); + s = lshr(t, unit); + } + } + s = trunc(s, bu); + Operand p = zext(loadN(pp, N), bu); + Operand vc = sub(s, p); + Operand c = trunc(lshr(vc, bit), 1); + z = select(c, s, vc); + z = trunc(z, bit); + storeN(z, pz); } - s = trunc(s, bu); - p = zext(p, bu); - Operand vc = sub(s, p); - Operand c = trunc(lshr(vc, bit), 1); - z = select(c, s, vc); - z = trunc(z, bit); - storeN(z, pz); ret(Void); endFunc(); } @@ -840,7 +876,8 @@ struct Code : public mcl::Generator { gen_mcl_fp_mulUnitPre(); gen_mcl_fpDbl_mulPre(); gen_mcl_fpDbl_sqrPre(); - gen_mcl_fp_mont(); + gen_mcl_fp_mont(true); + gen_mcl_fp_mont(false); gen_mcl_fp_montRed(); } void setBit(uint32_t bit) diff --git a/src/low_func.hpp b/src/low_func.hpp index c3c7450..2f8c4a0 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -507,7 +507,7 @@ const void3u MontRed<N, Tag>::f = MontRed<N, Tag>::func; z[N] <- Montgomery(x[N], y[N], p[N]) REMARK : assume p[-1] = rp */ -template<size_t N, class Tag = Gtag> +template<size_t N, bool isFullBit, class Tag = Gtag> struct Mont { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { @@ -517,26 +517,56 @@ struct Mont { MontRed<N, Tag>::f(z, xy, p); #else const Unit rp = p[-1]; - Unit buf[N * 2 + 2]; - Unit *c = buf; - MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0] - Unit q = c[0] * rp; - Unit t[N + 2]; - MulUnitPre<N, Tag>::f(t, p, q); // p * q - t[N + 1] = 0; // always zero - c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t); - c++; - for (size_t i = 1; i < N; i++) { - MulUnitPre<N, Tag>::f(t, x, y[i]); + if (isFullBit) { + Unit buf[N * 2 + 2]; + Unit *c = buf; + MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0] + Unit q = c[0] * rp; + Unit t[N + 2]; + MulUnitPre<N, Tag>::f(t, p, q); // p * q + t[N + 1] = 0; // always zero c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t); - q = c[0] * rp; - MulUnitPre<N, Tag>::f(t, p, q); - AddPre<N + 2, Tag>::f(c, c, t); c++; - } - if (c[N]) { - SubPre<N, Tag>::f(z, c, p); + for (size_t i = 1; i < N; i++) { + MulUnitPre<N, Tag>::f(t, x, y[i]); + c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t); + q = c[0] * rp; + MulUnitPre<N, Tag>::f(t, p, q); + AddPre<N + 2, Tag>::f(c, c, t); + c++; + } + if (c[N]) { + SubPre<N, Tag>::f(z, c, p); + } else { + if (SubPre<N, Tag>::f(z, c, p)) { + memcpy(z, c, N * sizeof(Unit)); + } + } } else { + Unit carry; + (void)carry; + Unit buf[N * 2 + 1]; + Unit *c = buf; + MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0] + Unit q = c[0] * rp; + Unit t[N + 1]; + MulUnitPre<N, Tag>::f(t, p, q); // p * q + carry = AddPre<N + 1, Tag>::f(c, c, t); + assert(carry == 0); + c++; + c[N] = 0; + for (size_t i = 1; i < N; i++) { + c[N + 1] = 0; + MulUnitPre<N, Tag>::f(t, x, y[i]); + carry = AddPre<N + 1, Tag>::f(c, c, t); + assert(carry == 0); + q = c[0] * rp; + MulUnitPre<N, Tag>::f(t, p, q); + carry = AddPre<N + 1, Tag>::f(c, c, t); + assert(carry == 0); + c++; + } + assert(c[N] == 0); if (SubPre<N, Tag>::f(z, c, p)) { memcpy(z, c, N * sizeof(Unit)); } @@ -546,11 +576,11 @@ struct Mont { static const void4u f; }; -template<size_t N, class Tag> -const void4u Mont<N, Tag>::f = Mont<N, Tag>::func; +template<size_t N, bool isFullBit, class Tag> +const void4u Mont<N, isFullBit, Tag>::f = Mont<N, isFullBit, Tag>::func; // z[N] <- Montgomery(x[N], x[N], p[N]) -template<size_t N, class Tag = Gtag> +template<size_t N, bool isFullBit, class Tag = Gtag> struct SqrMont { static inline void func(Unit *y, const Unit *x, const Unit *p) { @@ -559,13 +589,13 @@ struct SqrMont { SqrPre<N, Tag>::f(xx, x); MontRed<N, Tag>::f(y, xx, p); #else - Mont<N, Tag>::f(y, x, x, p); + Mont<N, isFullBit, Tag>::f(y, x, x, p); #endif } static const void3u f; }; -template<size_t N, class Tag> -const void3u SqrMont<N, Tag>::f = SqrMont<N, Tag>::func; +template<size_t N, bool isFullBit, class Tag> +const void3u SqrMont<N, isFullBit, Tag>::f = SqrMont<N, isFullBit, Tag>::func; // z[N] <- (x[N] * y[N]) % p[N] template<size_t N, class Tag = Gtag> diff --git a/src/low_func_llvm.hpp b/src/low_func_llvm.hpp index 4798cbe..ea7a19c 100644 --- a/src/low_func_llvm.hpp +++ b/src/low_func_llvm.hpp @@ -35,7 +35,8 @@ template<>const void4u Add<n, true, Ltag>::f = &mcl_fp_add ## n ## L; \ template<>const void4u Add<n, false, Ltag>::f = &mcl_fp_addNF ## n ## L; \ template<>const void4u Sub<n, true, Ltag>::f = &mcl_fp_sub ## n ## L; \ template<>const void4u Sub<n, false, Ltag>::f = &mcl_fp_subNF ## n ## L; \ -template<>const void4u Mont<n, Ltag>::f = &mcl_fp_mont ## n ## L; \ +template<>const void4u Mont<n, true, Ltag>::f = &mcl_fp_mont ## n ## L; \ +template<>const void4u Mont<n, false, Ltag>::f = &mcl_fp_mont ## n ## L; \ template<>const void3u MontRed<n, Ltag>::f = &mcl_fp_montRed ## n ## L; \ template<>const void4u DblAdd<n, Ltag>::f = &mcl_fpDbl_add ## n ## L; \ template<>const void4u DblSub<n, Ltag>::f = &mcl_fpDbl_sub ## n ## L; \ diff --git a/src/proto.hpp b/src/proto.hpp index 46dac04..18e8567 100644 --- a/src/proto.hpp +++ b/src/proto.hpp @@ -20,6 +20,7 @@ void mcl_fp_mulUnitPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, mcl void mcl_fpDbl_mulPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \ void mcl_fpDbl_sqrPre ## n ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x); \ void mcl_fp_mont ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \ +void mcl_fp_montNF ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \ void mcl_fp_montRed ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* p); \ void mcl_fpDbl_add ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \ void mcl_fpDbl_sub ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); |