about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2018-02-17 02:56:28 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2018-02-17 02:56:28 +0800
commit 0181a098fd64d68a831e1e8d013e29f60ad2f96f (patch)
tree 04fc932cabaa1eb4924532f7b6386ae3d0486f52
parent 8f3c1b82df92eb0acad317233fe1fd051ccc241c (diff)
download dexon-mcl-0181a098fd64d68a831e1e8d013e29f60ad2f96f.tar.gz
dexon-mcl-0181a098fd64d68a831e1e8d013e29f60ad2f96f.tar.zst
dexon-mcl-0181a098fd64d68a831e1e8d013e29f60ad2f96f.zip
test mulPre6
-rw-r--r-- src/fp_generator.hpp 26
1 files changed, 24 insertions, 2 deletions
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index c22cf4d..d2ac96a 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -178,7 +178,6 @@ struct FpGenerator : Xbyak::CodeGenerator {
*/
void init(Op& op)
{
-// if (op.N < 2) throw cybozu::Exception("mcl:FpGenerator:small pn") << op.N;
op_ = &op;
p_ = op.p;
rp_ = fp::getMontgomeryCoeff(p_[0]);
@@ -254,7 +253,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
op.fpDbl_mod = getCurr<void3u>();
gen_fpDbl_mod(op);
}
- if ((useMulx_ && op.N == 2) || op.N == 3 || op.N == 4) {
+ if ((useMulx_ && op.N == 2) || op.N == 3 || op.N == 4 || (useAdx_ && op.N == 6)) {
align(16);
op.fpDbl_mulPre = getCurr<void3u>();
gen_fpDbl_mulPre();
@@ -1566,6 +1565,24 @@ struct FpGenerator : Xbyak::CodeGenerator {
mov(ptr [pz + 8 * 7], d);
#endif
}
+ void mulPre6(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t) // One mulPack plus five mulPackAdd steps over px + 8 * i (i = 0..5) against py, writing at pz + 8 * i, then store_mr at pz + 8 * 6. Presumably a 6-limb x 6-limb -> 12-limb product without reduction; confirm against mulPack/mulPackAdd.
+ {
+ const Reg64& t0 = t[0];
+ const Reg64& t1 = t[1];
+ const Reg64& t2 = t[2];
+ const Reg64& t3 = t[3];
+ const Reg64& t4 = t[4];
+ const Reg64& t5 = t[5];
+ const Reg64& t6 = t[6]; // t[0]..t[6] are all read here, so t must supply at least seven registers
+ // Register rotation: the one register missing from each call's Pack is passed as the extra argument, and the bracketed comment after each call is the Pack handed to the following call.
+ mulPack(pz, px, py, Pack(t5, t4, t3, t2, t1, t0)); // [t5:t4:t3:t2:t1:t0]
+ mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t6, Pack(t5, t4, t3, t2, t1, t0)); // [t6:t5:t4:t3:t2:t1]
+ mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t6, t5, t4, t3, t2, t1)); // [t0:t6:t5:t4:t3:t2]
+ mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, Pack(t0, t6, t5, t4, t3, t2)); // [t1:t0:t6:t5:t4:t3]
+ mulPackAdd(pz + 8 * 4, px + 8 * 4, py, t2, Pack(t1, t0, t6, t5, t4, t3)); // [t2:t1:t0:t6:t5:t4]
+ mulPackAdd(pz + 8 * 5, px + 8 * 5, py, t3, Pack(t2, t1, t0, t6, t5, t4)); // [t3:t2:t1:t0:t6:t5]
+ store_mr(pz + 8 * 6, Pack(t3, t2, t1, t0, t6, t5)); // hands the last six-register Pack to store_mr at pz + 8 * 6; presumably these are the upper words of the result — confirm store_mr's word order
+ }
void gen_fpDbl_sqrPre(mcl::fp::Op& op)
{
if (useMulx_ && pn_ == 2) {
@@ -1601,6 +1618,11 @@ struct FpGenerator : Xbyak::CodeGenerator {
} else if (pn_ == 4) {
StackFrame sf(this, 3, 10 | UseRDX);
mulPre4(sf.p[0], sf.p[1], sf.p[2], sf.t);
+#if 0 // slow?
+ } else if (pn_ == 6 && useAdx_) {
+ StackFrame sf(this, 3, 7 | UseRDX);
+ mulPre6(sf.p[0], sf.p[1], sf.p[2], sf.t);
+#endif
}
}
static inline void debug_put_inner(const uint64_t *ptr, int n)