about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2016-11-14 16:33:26 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2016-11-14 16:33:26 +0800
commit b0f23bb8fd7bacee2e416b6949256f49e32b0e7b (patch)
tree 14f0980ae041961717a6651425001481d365baf2
parent f03372873edeb0879cae9473c66085a89e299c3e (diff)
download tangerine-mcl-b0f23bb8fd7bacee2e416b6949256f49e32b0e7b.tar.gz
tangerine-mcl-b0f23bb8fd7bacee2e416b6949256f49e32b0e7b.tar.zst
tangerine-mcl-b0f23bb8fd7bacee2e416b6949256f49e32b0e7b.zip
optimize divBy2
-rw-r--r-- include/mcl/fp.hpp 8
-rw-r--r-- include/mcl/op.hpp 2
-rw-r--r-- sample/rawbench.cpp 9
-rw-r--r-- src/fp.cpp 11
-rw-r--r-- src/fp_generator.hpp 17
-rw-r--r-- src/fp_proto.hpp 14
6 files changed, 36 insertions, 25 deletions
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index b6c9cc0..57e5cfa 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -341,7 +341,15 @@ public:
static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
static inline void divBy2(FpT& y, const FpT& x)
{
+#if 0
mul(y, x, inv2_); // QQQ : optimize later
+#else
+ bool odd = (x.v_[0] & 1) != 0;
+ op_.fp_shr1(y.v_, x.v_);
+ if (odd) {
+ op_.fp_addPre(y.v_, y.v_, op_.half);
+ }
+#endif
}
bool isZero() const { return op_.fp_isZero(v_); }
bool isOne() const { return fp::isEqualArray(v_, op_.oneRep, op_.N); }
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index cdf71c5..0e5cba9 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -92,6 +92,7 @@ struct Op {
bool (*fp_isZero)(const Unit*);
void1u fp_clear;
void2u fp_copy;
+ void2u fp_shr1;
void3u fp_neg;
void4u fp_add;
void4u fp_sub;
@@ -159,6 +160,7 @@ struct Op {
fp_isZero = 0;
fp_clear = 0;
fp_copy = 0;
+ fp_shr1 = 0;
fp_neg = 0;
fp_add = 0;
fp_sub = 0;
diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp
index 81f261a..2030c30 100644
--- a/sample/rawbench.cpp
+++ b/sample/rawbench.cpp
@@ -38,13 +38,8 @@ void benchRaw(const char *p, mcl::fp::Mode mode)
double fp2_sqrT, fp2_mulT;
CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy, op.p);
CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux, op.p);
- if (op.fp_addPre) {
- CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy);
- CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux);
- } else {
- fp_addPreT = 0;
- fp_subPreT = 0;
- }
+ CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy);
+ CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux);
CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux, op.p);
CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy, op.p);
CYBOZU_BENCH_T(fp_mulUnitT, op.fp_mulUnit, uz, ux, 12345678, op.p);
diff --git a/src/fp.cpp b/src/fp.cpp
index a789d96..2f5b12d 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -153,16 +153,17 @@ template<size_t N>
struct SetFpDbl<N, true> {
static inline void exec(Op& op)
{
- if (!op.isFullBit) {
+// if (!op.isFullBit) {
op.fpDbl_addPre = AddPre<N * 2, Ltag>::f;
op.fpDbl_subPre = SubPre<N * 2, Ltag>::f;
- }
+// }
}
};
template<size_t N, class Tag, bool enableFpDbl>
void setOpSub(Op& op)
{
+ op.fp_shr1 = Shr1<N, Tag>::f;
op.fp_neg = Neg<N, Tag>::f;
op.fp_add = Add<N, Tag>::f;
op.fp_sub = Sub<N, Tag>::f;
@@ -182,10 +183,8 @@ void setOpSub(Op& op)
op.fpN1_mod = N1_Mod<N, Tag>::f;
op.fpDbl_add = DblAdd<N, Tag>::f;
op.fpDbl_sub = DblSub<N, Tag>::f;
- if (!op.isFullBit) {
- op.fp_addPre = AddPre<N, Tag>::f;
- op.fp_subPre = SubPre<N, Tag>::f;
- }
+ op.fp_addPre = AddPre<N, Tag>::f;
+ op.fp_subPre = SubPre<N, Tag>::f;
SetFpDbl<N, enableFpDbl>::exec(op);
}
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index 18089cd..7947e2d 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -197,17 +197,12 @@ struct FpGenerator : Xbyak::CodeGenerator {
op.fp_sub = getCurr<void4u>();
gen_fp_sub();
- if (op.isFullBit) {
- op.fp_addPre = 0;
- op.fp_subPre = 0;
- } else {
- align(16);
- op.fp_addPre = getCurr<u3u>();
- gen_addSubPre(true, pn_);
- align(16);
- op.fp_subPre = getCurr<u3u>();
- gen_addSubPre(false, pn_);
- }
+ align(16);
+ op.fp_addPre = getCurr<u3u>();
+ gen_addSubPre(true, pn_);
+ align(16);
+ op.fp_subPre = getCurr<u3u>();
+ gen_addSubPre(false, pn_);
align(16);
shr1_ = getCurr<void2op>();
gen_shr1();
diff --git a/src/fp_proto.hpp b/src/fp_proto.hpp
index 6b538d7..7c755b6 100644
--- a/src/fp_proto.hpp
+++ b/src/fp_proto.hpp
@@ -43,7 +43,6 @@ template<size_t N, class Tag = Gtag>
struct AddPre {
static inline Unit func(Unit *z, const Unit *x, const Unit *y)
{
- if (N == 0) return 0;
return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
}
static const u3u f;
@@ -87,6 +86,19 @@ struct SubPre {
template<size_t N, class Tag>
const u3u SubPre<N, Tag>::f = SubPre<N, Tag>::func;
+// y[N] <- (x[N] >> 1)
+template<size_t N, class Tag = Gtag>
+struct Shr1 {
+ static inline void func(Unit *y, const Unit *x)
+ {
+ mpn_rshift((mp_limb_t*)y, (const mp_limb_t*)x, (int)N, 1);
+ }
+ static const void2u f;
+};
+
+template<size_t N, class Tag>
+const void2u Shr1<N, Tag>::f = Shr1<N, Tag>::func;
+
// y[N] <- (-x[N]) % p[N]
template<size_t N, class Tag = Gtag>
struct Neg {