diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-15 11:11:09 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-15 11:11:09 +0800 |
commit | 7f54b72a942320dead6e912b869d9212f5d73dde (patch) | |
tree | 4b47962e99adeffbcdad2ffdaded1935ffa8dccc | |
parent | 7f7d59451e437ec9196e6d05b31606beed162ef5 (diff) | |
download | tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.tar.gz tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.tar.zst tangerine-mcl-7f54b72a942320dead6e912b869d9212f5d73dde.zip |
optimize Fp2::add/sub
-rw-r--r-- | include/mcl/fp_tower.hpp | 30 | ||||
-rw-r--r-- | include/mcl/op.hpp | 6 | ||||
-rw-r--r-- | src/fp_generator.hpp | 28 | ||||
-rw-r--r-- | test/bench.hpp | 2 |
4 files changed, 44 insertions, 22 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 5424b37..116bd4b 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -237,8 +237,8 @@ public: a = a_; b = b_; } - static void add(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_add(z.a.v_, x.a.v_, y.a.v_); } - static void sub(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_sub(z.a.v_, x.a.v_, y.a.v_); } + static void (*add)(Fp2T& z, const Fp2T& x, const Fp2T& y); + static void (*sub)(Fp2T& z, const Fp2T& x, const Fp2T& y); static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); } static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::op_.fp2_mul(z.a.v_, x.a.v_, y.a.v_); } static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); } @@ -375,8 +375,10 @@ public: // assert(Fp::maxSize <= 256); xi_a_ = xi_a; mcl::fp::Op& op = Fp::op_; - op.fp2_add = fp2_addW; - op.fp2_sub = fp2_subW; + add = (void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y))op.fp2_addA_; + if (add == 0) add = fp2_addC; + sub = (void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y))op.fp2_subA_; + if (sub == 0) sub = fp2_subC; if (op.fp2Dbl_mulPre == 0) { if (op.isFullBit) { op.fp2Dbl_mulPre = FpDblT<Fp>::fp2Dbl_mulPreW; @@ -475,21 +477,15 @@ private: default Fp2T operator Fp2T = Fp[i]/(i^2 + 1) */ - static void fp2_addW(Unit *z, const Unit *x, const Unit *y) + static void fp2_addC(Fp2T& z, const Fp2T& x, const Fp2T& y) { - const Fp *px = reinterpret_cast<const Fp*>(x); - const Fp *py = reinterpret_cast<const Fp*>(y); - Fp *pz = reinterpret_cast<Fp*>(z); - Fp::add(pz[0], px[0], py[0]); - Fp::add(pz[1], px[1], py[1]); + Fp::add(z.a, x.a, y.a); + Fp::add(z.b, x.b, y.b); } - static void fp2_subW(Unit *z, const Unit *x, const Unit *y) + static void fp2_subC(Fp2T& z, const Fp2T& x, const Fp2T& y) { - const Fp *px = reinterpret_cast<const Fp*>(x); - const Fp *py = reinterpret_cast<const Fp*>(y); - Fp *pz = reinterpret_cast<Fp*>(z); - Fp::sub(pz[0], px[0], py[0]); - Fp::sub(pz[1], px[1], py[1]); + Fp::sub(z.a, x.a, y.a); + Fp::sub(z.b, x.b, y.b); } static void fp2_negW(Unit *y, const Unit *x) { @@ -621,6 +617,8 @@ private: } }; +template<class Fp_> void (*Fp2T<Fp_>::add)(Fp2T& z, const Fp2T& x, const Fp2T& y); +template<class Fp_> void (*Fp2T<Fp_>::sub)(Fp2T& z, const Fp2T& x, const Fp2T& y); template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x); template<class Fp> diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 460c685..69d2987 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -183,6 +183,8 @@ struct Op { void3u fp_subA_; void3u fp_mulA_; void2u fp_sqrA_; + void3u fp2_addA_; + void3u fp2_subA_; void2u fp2_sqrA_; size_t maxN; size_t N; @@ -219,7 +221,6 @@ struct Op { x = a + bu */ int xi_a; // xi = xi_a + u - void3u fp2_add; void3u fp2_sub; void3u fp2_mul; void4u fp2_mulNF; @@ -264,6 +265,8 @@ struct Op { fp_subA_ = 0; fp_mulA_ = 0; fp_sqrA_ = 0; + fp2_addA_ = 0; + fp2_subA_ = 0; fp2_sqrA_ = 0; maxN = 0; N = 0; @@ -297,7 +300,6 @@ struct Op { fp2Dbl_mulPre = 0; xi_a = 0; - fp2_add = 0; fp2_sub = 0; fp2_mul = 0; fp2_mulNF = 0; diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index c570548..3c79700 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -370,14 +370,20 @@ struct Code : Xbyak::CodeGenerator { } if (op.N == 4 && !isFullBit_) { align(16); + op.fp2_addA_ = getCurr<void3u>(); + gen_fp2_add4(); + align(16); + op.fp2_subA_ = getCurr<void3u>(); + gen_fp2_sub4(); + align(16); op.fp2Dbl_mulPre = getCurr<void3u>(); gen_fp2Dbl_mulPre(); align(16); op.fp2_mul = getCurr<void3u>(); - gen_fp2_mul(); + gen_fp2_mul4(); align(16); op.fp2_sqrA_ = getCurr<void2u>(); - gen_fp2_sqr(); + gen_fp2_sqr4(); } } void gen_addSubPre(bool isAdd, int n) @@ -2844,7 +2850,21 @@ private: gen_raw_fp_sub(gp0 + 8 * 4, gp1 + 8 * 4, gp2 + 8 * 4, Pack(gt0, gt1, gt2, gt3, gt4, gt5, gt6, gt7), true); } - void gen_fp2_mul() + void gen_fp2_add4() + { + assert(!isFullBit_); + StackFrame sf(this, 3, 8); + gen_raw_fp_add(sf.p[0], sf.p[1], sf.p[2], sf.t, false); + gen_raw_fp_add(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false); + } + void gen_fp2_sub4() + { + assert(!isFullBit_); + StackFrame sf(this, 3, 8); + gen_raw_fp_sub(sf.p[0], sf.p[1], sf.p[2], sf.t, false); + gen_raw_fp_sub(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false); + } + void gen_fp2_mul4() { assert(!isFullBit_); const RegExp z = rsp + 0 * 8; @@ -2892,7 +2912,7 @@ private: lea(gp1, ptr[d1]); call(fpDbl_modL_); } - void gen_fp2_sqr() + void gen_fp2_sqr4() { assert(!isFullBit_); const RegExp y = rsp + 0 * 8; diff --git a/test/bench.hpp b/test/bench.hpp index faec4cd..3807bf9 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -45,6 +45,7 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("hashAndMapToG2", C, hashAndMapToG2, QQ, "abc", 3); #endif CYBOZU_BENCH_C("Fp::add ", C3, Fp::add, x, x, y); + CYBOZU_BENCH_C("Fp::sub ", C3, Fp::sub, x, x, y); CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y); CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x); @@ -54,6 +55,7 @@ void testBench(const G1& P, const G2& Q) yy.a = y; yy.b = -5; CYBOZU_BENCH_C("Fp2::add ", C3, Fp2::add, xx, xx, yy); + CYBOZU_BENCH_C("Fp2::sub ", C3, Fp2::sub, xx, xx, yy); CYBOZU_BENCH_C("Fp2::mul ", C3, Fp2::mul, xx, xx, yy); CYBOZU_BENCH_C("Fp2::sqr ", C3, Fp2::sqr, xx, xx); CYBOZU_BENCH_C("Fp2::inv ", C3, Fp2::inv, xx, xx); |