diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-09 16:29:14 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-09 16:29:14 +0800 |
commit | d957961c79bc6e05602ec9ae6317d824fc1c4e67 (patch) | |
tree | d3ee37afbe867b5948bf8c107e0ac019d4549a06 | |
parent | bd9f75d265c4ab79e90c133af15532312efb76ca (diff) | |
download | tangerine-mcl-d957961c79bc6e05602ec9ae6317d824fc1c4e67.tar.gz tangerine-mcl-d957961c79bc6e05602ec9ae6317d824fc1c4e67.tar.zst tangerine-mcl-d957961c79bc6e05602ec9ae6317d824fc1c4e67.zip |
fp2_sqr is ok
-rw-r--r-- | include/mcl/fp_tower.hpp | 17 | ||||
-rw-r--r-- | src/fp_generator.hpp | 66 |
2 files changed, 37 insertions, 46 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 27b2bfc..955fcfd 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -547,11 +547,6 @@ private: */ static void fp2_sqrW(Unit *y, const Unit *x) { -#if 0 - Unit xx[8], copyX[8]; - memcpy(copyX, x, sizeof(copyX)); - Fp::getOp().fp2_sqrA_(xx, x); -#endif const Fp *px = reinterpret_cast<const Fp*>(x); Fp *py = reinterpret_cast<Fp*>(y); const Fp& a = px[0]; @@ -575,18 +570,6 @@ private: FpDbl::mod(py[0], d1); FpDbl::mod(py[1], d2); #endif -#if 0 - for (int i = 0; i < 8; i++) { - if (y[i] != xx[i]) { - printf("ERR %d %016llx %016llx\n", i, (long long)y[i], (long long)xx[i]); - printf("X\n"); - for (int j = 0; j < 8; j++) { - printf("%016llx ", (long long)copyX[i]); - } - puts(""); - } - } -#endif } /* xi = xi_a + i diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index e723e96..c570548 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -2900,48 +2900,56 @@ private: const Ext1 t1(FpByte_, rsp, 2 * 8); const Ext1 t2(FpByte_, rsp, t1.next); const Ext1 t3(FpByte_, rsp, t2.next); + bool nocarry = (p_[pn_ - 1] >> 62) == 0; StackFrame sf(this, 3, 10 | UseRDX, t3.next); mov(ptr [y], gp0); mov(ptr [x], gp1); // t1 = b + b lea(gp0, ptr [t1]); - for (int i = 0; i < 4; i++) { - mov(rax, ptr [gp1 + FpByte_ + i * 8]); - if (i == 0) { - add(rax, rax); - } else { - adc(rax, rax); + if (nocarry) { + for (int i = 0; i < 4; i++) { + mov(rax, ptr [gp1 + FpByte_ + i * 8]); + if (i == 0) { + add(rax, rax); + } else { + adc(rax, rax); + } + mov(ptr [gp0 + i * 8], rax); } - mov(ptr [gp0 + i * 8], rax); + } else { + gen_raw_fp_add(gp0, gp1 + FpByte_, gp1 + FpByte_, sf.t, false); } + // t1 = 2ab mov(gp1, gp0); mov(gp2, ptr [x]); call(fp_mulL_); -#if 0 - mov(gp0, ptr [x]); - gen_raw_fp_add(t2, gp0, gp0 + FpByte_, sf.t, false); - gen_raw_fp_sub(t3, gp0, gp0 + FpByte_, sf.t, false); -#else - Pack a = sf.t.sub(0, 4); - Pack b = sf.t.sub(4, 4); - mov(gp0, ptr [x]); - load_rm(a, gp0); - load_rm(b, gp0 + FpByte_); - for (int i = 0; i < 4; i++) { - mov(rax, a[i]); - if (i == 0) { - add(rax, b[i]); - } else { - adc(rax, b[i]); + if (nocarry) { + Pack a = sf.t.sub(0, 4); + Pack b = sf.t.sub(4, 4); + mov(gp0, ptr [x]); + load_rm(a, gp0); + load_rm(b, gp0 + FpByte_); + // t2 = a + b + for (int i = 0; i < 4; i++) { + mov(rax, a[i]); + if (i == 0) { + add(rax, b[i]); + } else { + adc(rax, b[i]); + } + mov(ptr [(RegExp)t2 + i * 8], rax); } - mov(ptr [(RegExp)t2 + i * 8], rax); + // t3 = a + p - b + mov(gp1, (size_t)p_); + add_rm(a, gp1); + sub_rr(a, b); + store_mr(t3, a); + } else { + mov(gp0, ptr [x]); + gen_raw_fp_add(t2, gp0, gp0 + FpByte_, sf.t, false); + gen_raw_fp_sub(t3, gp0, gp0 + FpByte_, sf.t, false); } - mov(gp1, (size_t)p_); - add_rm(a, gp1); - sub_rr(a, b); - store_mr(t3, a); -#endif mov(gp0, ptr [y]); lea(gp1, ptr [t2]); |