aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2018-08-29 13:01:45 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2018-08-29 14:05:05 +0800
commit b7c2d17e99a2b95a8eb941ccdd03c559d56a830f (patch)
tree 2a29098e24405be86d8688a11dc922b33052d8bd
parent 8919b73aa9bad85e431d4485378875454e757322 (diff)
download dexon-mcl-b7c2d17e99a2b95a8eb941ccdd03c559d56a830f.tar.gz
dexon-mcl-b7c2d17e99a2b95a8eb941ccdd03c559d56a830f.tar.zst
dexon-mcl-b7c2d17e99a2b95a8eb941ccdd03c559d56a830f.zip
add fp2Dbl_sqrPreA_
-rw-r--r-- include/mcl/op.hpp 2
-rw-r--r-- src/fp_generator.hpp 42
-rw-r--r-- test/bn_test.cpp 2
3 files changed, 43 insertions, 3 deletions
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 96771ea..57ceb52 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -197,7 +197,7 @@ struct Op {
void2u fpDbl_sqrPreA_;
void2u fpDbl_modA_;
void3u fp2Dbl_mulPreA_;
- void3u fp2Dbl_sqrPreA_;
+ void2u fp2Dbl_sqrPreA_;
size_t maxN;
size_t N;
size_t bitSize;
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index d955341..2e5f769 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -387,6 +387,9 @@ private:
op.fp2Dbl_mulPreA_ = getCurr<void3u>();
gen_fp2Dbl_mulPre(mulPreL);
align(16);
+ op.fp2Dbl_sqrPreA_ = getCurr<void2u>();
+ gen_fp2Dbl_sqrPre(mulPreL);
+ align(16);
op.fp2_mulA_ = getCurr<void3u>();
gen_fp2_mul4(fpDbl_modL);
align(16);
@@ -2867,7 +2870,44 @@ private:
gen_raw_sub(gp0, gp1, gp2, rax, 4);
gen_raw_fp_sub(gp0 + 8 * 4, gp1 + 8 * 4, gp2 + 8 * 4, Pack(gt0, gt1, gt2, gt3, gt4, gt5, gt6, gt7), true);
}
-
+ void gen_fp2Dbl_sqrPre(Label& mulPreL) // emits the routine installed as op.fp2Dbl_sqrPreA_ (void2u); gp0 is treated as the output pointer y and gp1 as the input pointer x
+ { // with x = (a, b) this presumably yields y[0] = (a - b) * (a + b) and y[1] = 2b * a, the unreduced square of a + b*i - TODO confirm against mulPreL
+ assert(!isFullBit_); // generation-time precondition; NOTE(review): presumably needed so 2b and a + b fit in 4 limbs without a carry-out - confirm
+ const RegExp y = rsp + 0 * 8; // stack slot that holds the saved gp0
+ const RegExp x = rsp + 1 * 8; // stack slot that holds the saved gp1
+ const Ext1 t1(FpByte_, rsp, 2 * 8); // temporary located right after the two saved pointers (presumably FpByte_ bytes wide)
+ const Ext1 t2(FpByte_, rsp, t1.next); // second temporary, located at t1.next
+ // three argument registers are declared although the entry point is void2u, because gp2 is needed when calling mulPreL
+ StackFrame sf(this, 3 /* not 2 */, 10 | UseRDX, t2.next); // last argument presumably reserves the stack area covering y, x, t1 and t2
+ mov(ptr [y], gp0); // save the argument registers; both are reloaded from these slots later
+ mov(ptr [x], gp1);
+ const Pack a = sf.t.sub(0, 4); // a and b: two groups of 4 registers taken from the frame's temporaries
+ const Pack b = sf.t.sub(4, 4);
+ load_rm(b, gp1 + FpByte_); // b = second component of x (offset FpByte_)
+ for (int i = 0; i < 4; i++) { // t1 = 2 * b, one 64-bit limb per iteration
+ mov(rax, b[i]);
+ if (i == 0) {
+ add(rax, rax); // lowest limb starts the carry chain
+ } else {
+ adc(rax, rax); // higher limbs absorb the carry; the mov instructions in between do not modify CF
+ }
+ mov(ptr [(const RegExp&)t1 + i * 8], rax);
+ }
+ load_rm(a, gp1); // a = first component of x
+ add_rr(a, b); // presumably a += b
+ store_mr(t2, a); // t2 = a + b
+ mov(gp0, ptr [y]);
+ add(gp0, FpByte_ * 2); // destination y + 2 * FpByte_, presumably the second double-width component of the output
+ lea(gp1, ptr [t1]);
+ mov(gp2, ptr [x]);
+ call(mulPreL); // mulPreL(y + 2 * FpByte_, t1, x) - presumably the unreduced product 2b * a
+ mov(gp0, ptr [x]);
+ gen_raw_fp_sub(t1, gp0, gp0 + FpByte_, sf.t, false); // presumably t1 = a - b (operands at x and x + FpByte_) - TODO confirm the meaning of the trailing false
+ mov(gp0, ptr [y]);
+ lea(gp1, ptr [t1]);
+ lea(gp2, ptr [t2]);
+ call(mulPreL); // mulPreL(y, t1, t2) - presumably the unreduced product (a - b) * (a + b)
+ }
void gen_fp2_add4()
{
assert(!isFullBit_);
diff --git a/test/bn_test.cpp b/test/bn_test.cpp
index af57309..929e235 100644
--- a/test/bn_test.cpp
+++ b/test/bn_test.cpp
@@ -358,7 +358,7 @@ CYBOZU_TEST_AUTO(naive)
#ifdef ONLY_BENCH
{
Fp12 e;
- for (int i = 0; i < 1000; i++) pairing(e, P, Q);
+ for (int i = 0; i < 10000; i++) { clk.begin(); pairing(e, P, Q); clk.end(); }
}
clk.put();
return;