diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2017-01-11 11:16:31 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2017-01-11 11:16:36 +0800 |
commit | bf035e3692e283b687d6ae2bb8e111b5c088b612 (patch) | |
tree | f10201f34fe46c61920bfc7643bd5144d7330668 | |
parent | 7b99668980fd9346e319d828f95b64758b64cfd0 (diff) | |
download | tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.tar.gz tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.tar.zst tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.zip |
a little faster fp2_mul for llvm
-rw-r--r-- | include/mcl/fp_tower.hpp | 11 | ||||
-rw-r--r-- | include/mcl/op.hpp | 2 | ||||
-rw-r--r-- | src/fp.cpp | 1 | ||||
-rw-r--r-- | src/fp_generator.hpp | 1 | ||||
-rw-r--r-- | src/low_func.hpp | 29 |
5 files changed, 43 insertions, 1 deletion
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index a0edde5..28f1772 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -214,7 +214,11 @@ public: if (op.isFastMod) { op.fp2_mul = fp2_mulW; } else if (!op.isFullBit) { - op.fp2_mul = fp2_mulUseDblUseNCW; + if (sizeof(Fp) * 8 == op.N * fp::UnitBitSize && op.fp2_mulNF) { + op.fp2_mul = fp2_mulNFW; + } else { + op.fp2_mul = fp2_mulUseDblUseNCW; + } } else { op.fp2_mul = fp2_mulUseDblW; } @@ -306,6 +310,11 @@ private: FpDbl::sub(d1, d1, d2); // ac - bd FpDbl::mod(pz[0], d1); // set z0 } + static void fp2_mulNFW(Unit *z, const Unit *x, const Unit *y) + { + const fp::Op& op = Fp::op_; + op.fp2_mulNF(z, x, y, op.p); + } static void fp2_mulUseDblUseNCW(Unit *z, const Unit *x, const Unit *y) { const Fp *px = reinterpret_cast<const Fp*>(x); diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 0e5cba9..159047a 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -123,6 +123,7 @@ struct Op { void3u fp2_add; void3u fp2_sub; void3u fp2_mul; + void4u fp2_mulNF; void2u fp2_neg; void2u fp2_inv; void2u fp2_sqr; @@ -188,6 +189,7 @@ struct Op { fp2_add = 0; fp2_sub = 0; fp2_mul = 0; + fp2_mulNF = 0; fp2_neg = 0; fp2_inv = 0; fp2_sqr = 0; @@ -211,6 +211,7 @@ void setOp2(Op& op) op.fpDbl_sub = DblSub<N, Tag>::f; op.fp_addPre = AddPre<N, Tag>::f; op.fp_subPre = SubPre<N, Tag>::f; + op.fp2_mulNF = Fp2MulNF<N, Tag>::f; SetFpDbl<N, enableFpDbl>::exec(op); } diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 9036f86..bc318bf 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -229,6 +229,7 @@ struct FpGenerator : Xbyak::CodeGenerator { } // setup fp_tower if (op.N > 4) return; + op.fp2_mulNF = 0; align(16); op.fpDbl_add = getCurr<void4u>(); gen_fpDbl_add(); diff --git a/src/low_func.hpp b/src/low_func.hpp index 8684131..cc75464 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -638,6 +638,35 @@ struct Sqr { template<size_t N, class Tag> const void3u Sqr<N, 
Tag>::f = Sqr<N, Tag>::func; +template<size_t N, class Tag = Gtag> +struct Fp2MulNF { + static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) + { + const Unit *const a = x; + const Unit *const b = x + N; + const Unit *const c = y; + const Unit *const d = y + N; + Unit d0[N * 2]; + Unit d1[N * 2]; + Unit d2[N * 2]; + Unit s[N]; + Unit t[N]; + AddPre<N, Tag>::f(s, a, b); + AddPre<N, Tag>::f(t, c, d); + MulPre<N, Tag>::f(d0, s, t); + MulPre<N, Tag>::f(d1, a, c); + MulPre<N, Tag>::f(d2, b, d); + SubPre<N * 2, Tag>::f(d0, d0, d1); + SubPre<N * 2, Tag>::f(d0, d0, d2); + MontRed<N, Tag>::f(z + N, d0, p); + DblSub<N, Tag>::f(d1, d1, d2, p); + MontRed<N, Tag>::f(z, d1, p); + } + static const void4u f; +}; +template<size_t N, class Tag> +const void4u Fp2MulNF<N, Tag>::f = Fp2MulNF<N, Tag>::func; + } } // mcl::fp #ifdef _WIN32 |