diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-07 19:12:45 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-08-07 19:12:45 +0800 |
commit | 42710833307dd9c863be16bddf3754c2ff92ecc9 (patch) | |
tree | e17059666656865387ba57f1a3dd8c8fc14f4b03 | |
parent | a7439e3638492ef85c75c1c8e5e88ad1878dfdd8 (diff) | |
download | tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.tar.gz tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.tar.zst tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.zip |
add fp2_sqr, but do not enable it
-rw-r--r-- | include/mcl/fp.hpp | 7 | ||||
-rw-r--r-- | include/mcl/fp_tower.hpp | 3 | ||||
-rw-r--r-- | include/mcl/op.hpp | 4 | ||||
-rw-r--r-- | src/fp_generator.hpp | 71 |
4 files changed, 80 insertions, 5 deletions
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index ed4eba4..e0b64ff 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -136,6 +136,8 @@ public: if (sub == 0) sub = subC; mul = (void (*)(FpT& z, const FpT& x, const FpT& y))op_.fp_mulA_; if (mul == 0) mul = mulC; + sqr = (void (*)(FpT& y, const FpT& x))op_.fp_sqrA_; + if (sqr == 0) sqr = sqrC; #endif *pb = true; } @@ -359,10 +361,13 @@ public: static inline void subC(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); } static void (*mul)(FpT& z, const FpT& x, const FpT& y); static inline void mulC(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); } + static void (*sqr)(FpT& y, const FpT& x); + static inline void sqrC(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); } #else static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); } static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); } static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); } + static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); } #endif static inline void addPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_addPre(z.v_, x.v_, y.v_); } static inline void subPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_subPre(z.v_, x.v_, y.v_); } @@ -373,7 +378,6 @@ public: } static inline void inv(FpT& y, const FpT& x) { op_.fp_invOp(y.v_, x.v_, op_); } static inline void neg(FpT& y, const FpT& x) { op_.fp_neg(y.v_, x.v_, op_.p); } - static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); } static inline void divBy2(FpT& y, const FpT& x) { #if 0 @@ -584,6 +588,7 @@ template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAut template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::add)(FpT& z, const FpT& x, const FpT& y); template<class tag, size_t maxBitSize> void (*FpT<tag, 
maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y); template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y); +template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sqr)(FpT& y, const FpT& x); #endif } // mcl diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 318003e..89d7fa0 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -402,6 +402,9 @@ public: op.fp2_sqr = fp2_sqrW; if (xi_a == 1) { op.fp2_mul_xi = fp2_mul_xi_1_1i; + if (op.fp2_sqrA_) { + op.fp2_sqr = op.fp2_sqrA_; + } } else { op.fp2_mul_xi = fp2_mul_xiW; } diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 1d3db0b..24dadc9 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -182,6 +182,8 @@ struct Op { void3u fp_addA_; void3u fp_subA_; void3u fp_mulA_; + void2u fp_sqrA_; + void2u fp2_sqrA_; size_t maxN; size_t N; size_t bitSize; @@ -262,6 +264,8 @@ struct Op { fp_addA_ = 0; fp_subA_ = 0; fp_mulA_ = 0; + fp_sqrA_ = 0; + fp2_sqrA_ = 0; maxN = 0; N = 0; bitSize = 0; diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 69f1bc6..6024043 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -216,6 +216,7 @@ struct Code : Xbyak::CodeGenerator { // the following labels assume sf(this, 3, 10 | UseRDX) Label mulPreL_; Label fpDbl_modL_; + Label fp_mulL_; Code(uint8_t *mem, size_t codeSize) : CodeGenerator(codeSize, mem) @@ -294,12 +295,12 @@ struct Code : Xbyak::CodeGenerator { mulUnit_ = getCurr<uint3opI>(); gen_mulUnit(); align(16); - mul_ = getCurr<void4u>(); - op.fp_mul = mul_; + op.fp_mul = getCurr<void4u>(); op.fp_mulA_ = getCurr<void3u>(); gen_mul(); align(16); op.fp_sqr = getCurr<void3u>(); + op.fp_sqrA_ = getCurr<void2u>(); gen_sqr(); if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); @@ -374,6 +375,9 @@ struct Code : Xbyak::CodeGenerator { align(16); op.fp2_mul = getCurr<void3u>(); gen_fp2_mul(); + 
align(16); +// op.fp2_sqrA_ = getCurr<void2u>(); +// gen_fp2_sqr(); } } void gen_addSubPre(bool isAdd, int n) @@ -1045,7 +1049,7 @@ struct Code : Xbyak::CodeGenerator { #else mov(rdx, rsi); #endif - jmp((void*)mul_); + jmp((const void*)op_->fp_mulA_); } /* input (pz[], px[], py[]) @@ -1092,7 +1096,9 @@ struct Code : Xbyak::CodeGenerator { */ void gen_montMul4(const uint64_t *p, uint64_t pp) { - StackFrame sf(this, 3, 10 | UseRDX); + StackFrame sf(this, 3, 10 | UseRDX, 0, false); + call(fp_mulL_); + sf.close(); const Reg64& p0 = sf.p[0]; const Reg64& p1 = sf.p[1]; const Reg64& p2 = sf.p[2]; @@ -1108,6 +1114,7 @@ struct Code : Xbyak::CodeGenerator { const Reg64& t8 = sf.t[8]; const Reg64& t9 = sf.t[9]; + L(fp_mulL_); movq(xm0, p0); // save p0 mov(p0, (uint64_t)p); movq(xm1, p2); @@ -1140,6 +1147,7 @@ struct Code : Xbyak::CodeGenerator { movq(p0, xm0); // load p0 store_mr(p0, Pack(t3, t2, t1, t0)); + ret(); } /* input (z, x, y) = (p0, p1, p2) @@ -2884,6 +2892,61 @@ private: lea(gp1, ptr[d1]); call(fpDbl_modL_); } + void gen_fp2_sqr() + { + assert(!isFullBit_); + const RegExp y = rsp + 0 * 8; + const RegExp x = rsp + 1 * 8; + const Ext1 t1(FpByte_, rsp, 2 * 8); + const Ext1 t2(FpByte_, rsp, t1.next); + const Ext1 t3(FpByte_, rsp, t2.next); + StackFrame sf(this, 3, 10 | UseRDX, t3.next); + mov(ptr [y], gp0); + mov(ptr [x], gp1); + // t1 = b + b + lea(gp0, ptr [t1]); + for (int i = 0; i < 4; i++) { + mov(rax, ptr [gp1 + FpByte_ + i * 8]); + if (i == 0) { + add(rax, rax); + } else { + adc(rax, rax); + } + mov(ptr [gp0 + i * 8], rax); + } + mov(gp1, gp0); + mov(gp2, ptr [x]); + call(fp_mulL_); + + Pack a = sf.t.sub(0, 4); + Pack b = sf.t.sub(4, 4); + mov(gp0, ptr [x]); + load_rm(a, gp0); + load_rm(b, gp0 + FpByte_); + for (int i = 0; i < 4; i++) { + mov(rax, a[i]); + if (i == 0) { + add(rax, b[i]); + } else { + adc(rax, b[i]); + } + mov(ptr [(RegExp)t2 + i * 8], rax); + } + mov(gp1, (size_t)p_); + add_rm(a, gp1); + sub_rr(a, b); + store_mr(t3, a); + + mov(gp0, ptr [y]); + 
lea(gp1, ptr [t2]); + lea(gp2, ptr [t3]); + call(fp_mulL_); + mov(gp0, ptr [y]); + for (int i = 0; i < 4; i++) { + mov(rax, ptr [(RegExp)t1 + i * 8]); + mov(ptr [gp0 + FpByte_ + i * 8], rax); + } + } }; struct FpGenerator { |