about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2017-01-11 11:16:31 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2017-01-11 11:16:36 +0800
commit bf035e3692e283b687d6ae2bb8e111b5c088b612 (patch)
tree f10201f34fe46c61920bfc7643bd5144d7330668
parent 7b99668980fd9346e319d828f95b64758b64cfd0 (diff)
download tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.tar.gz
tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.tar.zst
tangerine-mcl-bf035e3692e283b687d6ae2bb8e111b5c088b612.zip
a little faster fp2_mul for llvm
-rw-r--r-- include/mcl/fp_tower.hpp 11
-rw-r--r-- include/mcl/op.hpp 2
-rw-r--r-- src/fp.cpp 1
-rw-r--r-- src/fp_generator.hpp 1
-rw-r--r-- src/low_func.hpp 29
5 files changed, 43 insertions, 1 deletion
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index a0edde5..28f1772 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -214,7 +214,11 @@ public:
if (op.isFastMod) {
op.fp2_mul = fp2_mulW;
} else if (!op.isFullBit) {
- op.fp2_mul = fp2_mulUseDblUseNCW;
+ if (sizeof(Fp) * 8 == op.N * fp::UnitBitSize && op.fp2_mulNF) {
+ op.fp2_mul = fp2_mulNFW;
+ } else {
+ op.fp2_mul = fp2_mulUseDblUseNCW;
+ }
} else {
op.fp2_mul = fp2_mulUseDblW;
}
@@ -306,6 +310,11 @@ private:
FpDbl::sub(d1, d1, d2); // ac - bd
FpDbl::mod(pz[0], d1); // set z0
}
+ static void fp2_mulNFW(Unit *z, const Unit *x, const Unit *y) // three-pointer wrapper so op.fp2_mul (void3u) can dispatch to the four-argument op.fp2_mulNF (void4u)
+ {
+ const fp::Op& op = Fp::op_; // operation table of Fp; holds both the function pointer and p
+ op.fp2_mulNF(z, x, y, op.p); // forwards z, x, y unchanged and supplies op.p as the fourth argument; the setup code in this diff installs this wrapper only when op.fp2_mulNF is non-null
+ }
static void fp2_mulUseDblUseNCW(Unit *z, const Unit *x, const Unit *y)
{
const Fp *px = reinterpret_cast<const Fp*>(x);
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 0e5cba9..159047a 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -123,6 +123,7 @@ struct Op {
void3u fp2_add;
void3u fp2_sub;
void3u fp2_mul;
+ void4u fp2_mulNF;
void2u fp2_neg;
void2u fp2_inv;
void2u fp2_sqr;
@@ -188,6 +189,7 @@ struct Op {
fp2_add = 0;
fp2_sub = 0;
fp2_mul = 0;
+ fp2_mulNF = 0;
fp2_neg = 0;
fp2_inv = 0;
fp2_sqr = 0;
diff --git a/src/fp.cpp b/src/fp.cpp
index 1817a38..107b99d 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -211,6 +211,7 @@ void setOp2(Op& op)
op.fpDbl_sub = DblSub<N, Tag>::f;
op.fp_addPre = AddPre<N, Tag>::f;
op.fp_subPre = SubPre<N, Tag>::f;
+ op.fp2_mulNF = Fp2MulNF<N, Tag>::f;
SetFpDbl<N, enableFpDbl>::exec(op);
}
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index 9036f86..bc318bf 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -229,6 +229,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
}
// setup fp_tower
if (op.N > 4) return;
+ op.fp2_mulNF = 0;
align(16);
op.fpDbl_add = getCurr<void4u>();
gen_fpDbl_add();
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 8684131..cc75464 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -638,6 +638,35 @@ struct Sqr {
template<size_t N, class Tag>
const void3u Sqr<N, Tag>::f = Sqr<N, Tag>::func;
+template<size_t N, class Tag = Gtag>
+struct Fp2MulNF { // treats x as (a, b) and y as (c, d), N Units each; writes z[0..N) from ac - bd and z[N..2N) from (a + b)(c + d) - ac - bd. NOTE(review): reads as (a + b*i)(c + d*i) with i^2 = -1 -- confirm against fp_tower.hpp
+ static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+ {
+ const Unit *const a = x; // first N Units of x
+ const Unit *const b = x + N; // second N Units of x
+ const Unit *const c = y; // first N Units of y
+ const Unit *const d = y + N; // second N Units of y
+ Unit d0[N * 2]; // 2N-Unit temporary: (a + b)(c + d), later reduced to the cross term
+ Unit d1[N * 2]; // 2N-Unit temporary: a * c
+ Unit d2[N * 2]; // 2N-Unit temporary: b * d
+ Unit s[N]; // a + b, only N Units wide
+ Unit t[N]; // c + d, only N Units wide
+ AddPre<N, Tag>::f(s, a, b); // s = a + b; NOTE(review): presumably an unreduced add whose carry-out is not kept here -- the fp_tower.hpp hunk selects this path only when !op.isFullBit, confirm that is sufficient
+ AddPre<N, Tag>::f(t, c, d); // t = c + d, same caveat as above
+ MulPre<N, Tag>::f(d0, s, t); // d0 = s * t
+ MulPre<N, Tag>::f(d1, a, c); // d1 = a * c
+ MulPre<N, Tag>::f(d2, b, d); // d2 = b * d; x and y are not read after this line, z is written only below
+ SubPre<N * 2, Tag>::f(d0, d0, d1); // d0 -= ac, over 2N Units
+ SubPre<N * 2, Tag>::f(d0, d0, d2); // d0 -= bd, leaving ad + bc
+ MontRed<N, Tag>::f(z + N, d0, p); // second half of z from d0; NOTE(review): presumably Montgomery reduction of a 2N-Unit value modulo p -- inferred from the name, confirm
+ DblSub<N, Tag>::f(d1, d1, d2, p); // d1 = ac - bd; takes p, unlike SubPre -- presumably to correct a negative difference, confirm
+ MontRed<N, Tag>::f(z, d1, p); // first half of z from d1
+ }
+ static const void4u f; // function-pointer handle; src/fp.cpp in this diff assigns it to op.fp2_mulNF
+};
+template<size_t N, class Tag>
+const void4u Fp2MulNF<N, Tag>::f = Fp2MulNF<N, Tag>::func; // out-of-class definition binding f to func
+
} } // mcl::fp
#ifdef _WIN32