about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: MITSUNARI Shigeo <herumi@nifty.com>, 2018-08-15 11:11:09 +0800
committer: MITSUNARI Shigeo <herumi@nifty.com>, 2018-08-15 11:11:09 +0800
commit: 7f54b72a942320dead6e912b869d9212f5d73dde (patch)
tree: 4b47962e99adeffbcdad2ffdaded1935ffa8dccc
parent: 7f7d59451e437ec9196e6d05b31606beed162ef5 (diff)
download: tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.tar.gz
tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.tar.zst
tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.zip
optimize Fp2::add/sub
-rw-r--r-- include/mcl/fp_tower.hpp 30
-rw-r--r-- include/mcl/op.hpp 6
-rw-r--r-- src/fp_generator.hpp 28
-rw-r--r-- test/bench.hpp 2
4 files changed, 44 insertions, 22 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 5424b37..116bd4b 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -237,8 +237,8 @@ public:
a = a_;
b = b_;
}
- static void add(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_add(z.a.v_, x.a.v_, y.a.v_); }
- static void sub(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_sub(z.a.v_, x.a.v_, y.a.v_); }
+ static void (*add)(Fp2T& z, const Fp2T& x, const Fp2T& y);
+ static void (*sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }
static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_mul(z.a.v_, x.a.v_, y.a.v_); }
static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); }
@@ -375,8 +375,10 @@ public:
// assert(Fp::maxSize <= 256);
xi_a_ = xi_a;
mcl::fp::Op& op = Fp::op_;
- op.fp2_add = fp2_addW;
- op.fp2_sub = fp2_subW;
+ add = (void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y))op.fp2_addA_;
+ if (add == 0) add = fp2_addC;
+ sub = (void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y))op.fp2_subA_;
+ if (sub == 0) sub = fp2_subC;
if (op.fp2Dbl_mulPre == 0) {
if (op.isFullBit) {
op.fp2Dbl_mulPre = FpDblT<Fp>::fp2Dbl_mulPreW;
@@ -475,21 +477,15 @@ private:
default Fp2T operator
Fp2T = Fp[i]/(i^2 + 1)
*/
- static void fp2_addW(Unit *z, const Unit *x, const Unit *y)
+ static void fp2_addC(Fp2T& z, const Fp2T& x, const Fp2T& y)
{
- const Fp *px = reinterpret_cast<const Fp*>(x);
- const Fp *py = reinterpret_cast<const Fp*>(y);
- Fp *pz = reinterpret_cast<Fp*>(z);
- Fp::add(pz[0], px[0], py[0]);
- Fp::add(pz[1], px[1], py[1]);
+ Fp::add(z.a, x.a, y.a);
+ Fp::add(z.b, x.b, y.b);
}
- static void fp2_subW(Unit *z, const Unit *x, const Unit *y)
+ static void fp2_subC(Fp2T& z, const Fp2T& x, const Fp2T& y)
{
- const Fp *px = reinterpret_cast<const Fp*>(x);
- const Fp *py = reinterpret_cast<const Fp*>(y);
- Fp *pz = reinterpret_cast<Fp*>(z);
- Fp::sub(pz[0], px[0], py[0]);
- Fp::sub(pz[1], px[1], py[1]);
+ Fp::sub(z.a, x.a, y.a);
+ Fp::sub(z.b, x.b, y.b);
}
static void fp2_negW(Unit *y, const Unit *x)
{
@@ -621,6 +617,8 @@ private:
}
};
+template<class Fp_> void (*Fp2T<Fp_>::add)(Fp2T& z, const Fp2T& x, const Fp2T& y);
+template<class Fp_> void (*Fp2T<Fp_>::sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x);
template<class Fp>
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 460c685..69d2987 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -183,6 +183,8 @@ struct Op {
void3u fp_subA_;
void3u fp_mulA_;
void2u fp_sqrA_;
+ void3u fp2_addA_;
+ void3u fp2_subA_;
void2u fp2_sqrA_;
size_t maxN;
size_t N;
@@ -219,7 +221,6 @@ struct Op {
x = a + bu
*/
int xi_a; // xi = xi_a + u
- void3u fp2_add;
void3u fp2_sub;
void3u fp2_mul;
void4u fp2_mulNF;
@@ -264,6 +265,8 @@ struct Op {
fp_subA_ = 0;
fp_mulA_ = 0;
fp_sqrA_ = 0;
+ fp2_addA_ = 0;
+ fp2_subA_ = 0;
fp2_sqrA_ = 0;
maxN = 0;
N = 0;
@@ -297,7 +300,6 @@ struct Op {
fp2Dbl_mulPre = 0;
xi_a = 0;
- fp2_add = 0;
fp2_sub = 0;
fp2_mul = 0;
fp2_mulNF = 0;
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index c570548..3c79700 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -370,14 +370,20 @@ struct Code : Xbyak::CodeGenerator {
}
if (op.N == 4 && !isFullBit_) {
align(16);
+ op.fp2_addA_ = getCurr<void3u>();
+ gen_fp2_add4();
+ align(16);
+ op.fp2_subA_ = getCurr<void3u>();
+ gen_fp2_sub4();
+ align(16);
op.fp2Dbl_mulPre = getCurr<void3u>();
gen_fp2Dbl_mulPre();
align(16);
op.fp2_mul = getCurr<void3u>();
- gen_fp2_mul();
+ gen_fp2_mul4();
align(16);
op.fp2_sqrA_ = getCurr<void2u>();
- gen_fp2_sqr();
+ gen_fp2_sqr4();
}
}
void gen_addSubPre(bool isAdd, int n)
@@ -2844,7 +2850,21 @@ private:
gen_raw_fp_sub(gp0 + 8 * 4, gp1 + 8 * 4, gp2 + 8 * 4, Pack(gt0, gt1, gt2, gt3, gt4, gt5, gt6, gt7), true);
}
- void gen_fp2_mul()
+ void gen_fp2_add4()
+ {
+ assert(!isFullBit_);
+ StackFrame sf(this, 3, 8);
+ gen_raw_fp_add(sf.p[0], sf.p[1], sf.p[2], sf.t, false);
+ gen_raw_fp_add(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false);
+ }
+ void gen_fp2_sub4()
+ {
+ assert(!isFullBit_);
+ StackFrame sf(this, 3, 8);
+ gen_raw_fp_sub(sf.p[0], sf.p[1], sf.p[2], sf.t, false);
+ gen_raw_fp_sub(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false);
+ }
+ void gen_fp2_mul4()
{
assert(!isFullBit_);
const RegExp z = rsp + 0 * 8;
@@ -2892,7 +2912,7 @@ private:
lea(gp1, ptr[d1]);
call(fpDbl_modL_);
}
- void gen_fp2_sqr()
+ void gen_fp2_sqr4()
{
assert(!isFullBit_);
const RegExp y = rsp + 0 * 8;
diff --git a/test/bench.hpp b/test/bench.hpp
index faec4cd..3807bf9 100644
--- a/test/bench.hpp
+++ b/test/bench.hpp
@@ -45,6 +45,7 @@ void testBench(const G1& P, const G2& Q)
CYBOZU_BENCH_C("hashAndMapToG2", C, hashAndMapToG2, QQ, "abc", 3);
#endif
CYBOZU_BENCH_C("Fp::add ", C3, Fp::add, x, x, y);
+ CYBOZU_BENCH_C("Fp::sub ", C3, Fp::sub, x, x, y);
CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y);
CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x);
CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x);
@@ -54,6 +55,7 @@ void testBench(const G1& P, const G2& Q)
yy.a = y;
yy.b = -5;
CYBOZU_BENCH_C("Fp2::add ", C3, Fp2::add, xx, xx, yy);
+ CYBOZU_BENCH_C("Fp2::sub ", C3, Fp2::sub, xx, xx, yy);
CYBOZU_BENCH_C("Fp2::mul ", C3, Fp2::mul, xx, xx, yy);
CYBOZU_BENCH_C("Fp2::sqr ", C3, Fp2::sqr, xx, xx);
CYBOZU_BENCH_C("Fp2::inv ", C3, Fp2::inv, xx, xx);