diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-29 16:00:44 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-29 16:00:44 +0800 |
commit | dbf3defbea2fb92737ceebfac3ea6e7af4b81187 (patch) | |
tree | f566e96fb79e76fce35e01eeabbf50ad932a9853 | |
parent | 6df454fdd16d47ff37d44d07188809d64860ff91 (diff) | |
download | tangerine-mcl-dbf3defbea2fb92737ceebfac3ea6e7af4b81187.tar.gz tangerine-mcl-dbf3defbea2fb92737ceebfac3ea6e7af4b81187.tar.zst tangerine-mcl-dbf3defbea2fb92737ceebfac3ea6e7af4b81187.zip |
move Label outside Code
-rw-r--r-- | src/fp_generator.hpp | 77 |
1 files changed, 43 insertions, 34 deletions
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 581a0de..4408203 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -8,6 +8,7 @@ */ #if CYBOZU_HOST == CYBOZU_HOST_INTEL #define XBYAK_NO_OP_NAMES +#define XBYAK_DISABLE_AVX512 #include "xbyak/xbyak_util.h" #if MCL_SIZEOF_UNIT == 8 @@ -192,7 +193,11 @@ struct FpGenerator : Xbyak::CodeGenerator { const Reg64& gt8; const Reg64& gt9; const mcl::fp::Op *op_; - Label *pL_; // valid only in init_inner + Label pL_; // pointer to p + // the following labels assume sf(this, 3, 10 | UseRDX) + Label mulPreL; + Label fpDbl_modL; + Label fp_mulL; const uint64_t *p_; uint64_t rp_; int pn_; @@ -264,18 +269,12 @@ struct FpGenerator : Xbyak::CodeGenerator { private: void init_inner(Op& op) { - // the following labels assume sf(this, 3, 10 | UseRDX) - Label mulPreL; - Label fpDbl_modL; - Label fp_mulL; - Label pL; // label to p_ op_ = &op; - pL_ = &pL; /* first 4096-byte is data area remain is code area */ - L(pL); + L(pL_); p_ = reinterpret_cast<const uint64_t*>(getCurr()); for (size_t i = 0; i < op.N; i++) { dq(op.p[i]); @@ -393,15 +392,16 @@ private: } } if (op.N > 4) return; + align(16); + op.fp_mul = getCurr<void4u>(); // used in toMont/fromMont + op.fp_mulA_ = getCurr<void3u>(); + gen_mul(); + if (op.N > 4) return; if ((useMulx_ && op.N == 2) || op.N == 3 || op.N == 4) { align(16); op.fpDbl_sqrPreA_ = getCurr<void2u>(); gen_fpDbl_sqrPre(op); } - align(16); - op.fp_mul = getCurr<void4u>(); // used in toMont/fromMont - op.fp_mulA_ = getCurr<void3u>(); - gen_mul(fp_mulL); // if (op.N > 4) return; align(16); op.fp_sqrA_ = getCurr<void2u>(); @@ -423,16 +423,16 @@ private: gen_fp2_neg4(); align(16); op.fp2Dbl_mulPreA_ = getCurr<void3u>(); - gen_fp2Dbl_mulPre(mulPreL); + gen_fp2Dbl_mulPre(); align(16); op.fp2Dbl_sqrPreA_ = getCurr<void2u>(); - gen_fp2Dbl_sqrPre(mulPreL); + gen_fp2Dbl_sqrPre(); align(16); op.fp2_mulA_ = getCurr<void3u>(); - gen_fp2_mul4(fpDbl_modL); + gen_fp2_mul4(); align(16); op.fp2_sqrA_ = getCurr<void2u>(); - gen_fp2_sqr4(fp_mulL); + gen_fp2_sqr4(); align(16); op.fp2_mul_xiA_ = getCurr<void2u>(); gen_fp2_mul_xi4(); @@ -687,13 +687,13 @@ private: Label exit; if (isFullBit_) { jnc("@f"); - mov(t2[0], *pL_); // t2 is not used + mov(t2[0], pL_); // t2 is not used sub_rm(t1, t2[0]); jmp(exit); L("@@"); } mov_rr(t2, t1); - sub_rm(t2, rip + *pL_); + sub_rm(t2, rip + pL_); for (int i = 0; i < 6; i++) { cmovnc(t1[i], t2[i]); } @@ -819,7 +819,7 @@ private: jmp is faster than and-mask without jmp */ jnc("@f"); - add_rm(t, rip + *pL_); + add_rm(t, rip + pL_); L("@@"); store_mr(pz + offset, t); } @@ -879,7 +879,7 @@ private: shr(*t0, c); mov(ptr [pz + (pn_ - 1) * 8], *t0); } - void gen_mul(Label& fp_mulL) + void gen_mul() { if (op_->primeMode == PM_NIST_P192) { StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); @@ -889,9 +889,18 @@ private: if (pn_ == 3) { gen_montMul3(p_, rp_); } else if (pn_ == 4) { - gen_montMul4(fp_mulL, p_, rp_); -// } else if (pn_ == 6 && useAdx_) { -// gen_montMul6(fp_mulL, p_, rp_); + gen_montMul4(p_, rp_); +#if 0 + } else if (pn_ == 6 && useAdx_) { +// gen_montMul6(p_, rp_); + StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8); + mov(ptr[rsp + 12 * 8], gp0); + mov(gp0, rsp); + call(mulPreL); // gp0, x, y + mov(gp0, ptr[rsp + 12 * 8]); + mov(gp1, rsp); + call(fpDbl_modL); +#endif } else if (pn_ <= 9) { gen_montMulN(p_, rp_, pn_); } else { @@ -1259,7 +1268,7 @@ private: z[0..3] <- montgomery(x[0..3], y[0..3]) destroy gt0, ..., gt9, xm0, xm1, p2 */ - void gen_montMul4(Label& fp_mulL, const uint64_t *p, uint64_t pp) + void gen_montMul4(const uint64_t *p, uint64_t pp) { StackFrame sf(this, 3, 10 | UseRDX, 0, false); call(fp_mulL); @@ -1938,7 +1947,7 @@ private: mov(z, ptr [xy + 0 * 8]); mov(a, rp_); mul(z); - lea(t0, ptr [rip + *pL_]); + lea(t0, ptr [rip + pL_]); load_rm(Pack(t7, t6, t5, t4, t3, t2, t1), xy); mov(d, a); // q mulPackAddShr(Pack(t7, t6, t5, t4, t3, t2, t1), t0, t10); @@ -1952,7 +1961,7 @@ private: mov(a, rp_); mul(t2); movq(xm1, t0); // save - lea(t0, ptr [rip + *pL_]); + lea(t0, ptr [rip + pL_]); mov(d, a); movq(xm2, t10); mulPackAddShr(Pack(t8, t7, t6, t5, t4, t3, t2), t0, t10); @@ -1965,7 +1974,7 @@ private: // z = [t1:t0:t10:t9:t8:t7:t6:t5:t4:t3] mov(a, rp_); mul(t3); - lea(t2, ptr [rip + *pL_]); + lea(t2, ptr [rip + pL_]); mov(d, a); movq(xm2, t10); mulPackAddShr(Pack(t9, t8, t7, t6, t5, t4, t3), t2, t10); @@ -1976,7 +1985,7 @@ private: // z = [t1:t0:t10:t9:t8:t7:t6:t5:t4] mov(a, rp_); mul(t4); - lea(t2, ptr [rip + *pL_]); + lea(t2, ptr [rip + pL_]); mov(d, a); mulPackAddShr(Pack(t10, t9, t8, t7, t6, t5, t4), t2, t3); adc(t0, rax); @@ -1984,14 +1993,14 @@ private: // z = [t1:t0:t10:t9:t8:t7:t6:t5] mov(a, rp_); mul(t5); - lea(t2, ptr [rip + *pL_]); + lea(t2, ptr [rip + pL_]); mov(d, a); mulPackAddShr(Pack(t0, t10, t9, t8, t7, t6, t5), t2, t3); adc(t1, a); // z = [t1:t0:t10:t9:t8:t7:t6] mov(a, rp_); mul(t6); - lea(t2, ptr [rip + *pL_]); + lea(t2, ptr [rip + pL_]); mov(d, a); mulPackAddShr(Pack(t1, t0, t10, t9, t8, t7, t6), t2, t3, true); // z = [t1:t0:t10:t9:t8:t7] @@ -3085,7 +3094,7 @@ private: } } } - void gen_fp2Dbl_mulPre(Label& mulPreL) + void gen_fp2Dbl_mulPre() { assert(!isFullBit_); const RegExp z = rsp + 0 * 8; @@ -3138,7 +3147,7 @@ private: gen_raw_sub(gp0, gp1, gp2, rax, 4); gen_raw_fp_sub(gp0 + 8 * 4, gp1 + 8 * 4, gp2 + 8 * 4, Pack(gt0, gt1, gt2, gt3, gt4, gt5, gt6, gt7), true); } - void gen_fp2Dbl_sqrPre(Label& mulPreL) + void gen_fp2Dbl_sqrPre() { assert(!isFullBit_); const RegExp y = rsp + 0 * 8; @@ -3246,7 +3255,7 @@ private: gen_raw_neg(sf.p[0], sf.p[1], sf.t); gen_raw_neg(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.t); } - void gen_fp2_mul4(Label& fpDbl_modL) + void gen_fp2_mul4() { assert(!isFullBit_); const RegExp z = rsp + 0 * 8; @@ -3294,7 +3303,7 @@ private: lea(gp1, ptr[d1]); call(fpDbl_modL); } - void gen_fp2_sqr4(Label& fp_mulL) + void gen_fp2_sqr4() { assert(!isFullBit_); const RegExp y = rsp + 0 * 8; |