aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2018-08-07 19:12:45 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2018-08-07 19:12:45 +0800
commit 42710833307dd9c863be16bddf3754c2ff92ecc9 (patch)
tree e17059666656865387ba57f1a3dd8c8fc14f4b03
parent a7439e3638492ef85c75c1c8e5e88ad1878dfdd8 (diff)
download tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.tar.gz
tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.tar.zst
tangerine-mcl-42710833307dd9c863be16bddf3754c2ff92ecc9.zip
add fp2_sqr, but do not enable it yet
-rw-r--r-- include/mcl/fp.hpp 7
-rw-r--r-- include/mcl/fp_tower.hpp 3
-rw-r--r-- include/mcl/op.hpp 4
-rw-r--r-- src/fp_generator.hpp 71
4 files changed, 80 insertions, 5 deletions
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index ed4eba4..e0b64ff 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -136,6 +136,8 @@ public:
if (sub == 0) sub = subC;
mul = (void (*)(FpT& z, const FpT& x, const FpT& y))op_.fp_mulA_;
if (mul == 0) mul = mulC;
+ sqr = (void (*)(FpT& y, const FpT& x))op_.fp_sqrA_;
+ if (sqr == 0) sqr = sqrC;
#endif
*pb = true;
}
@@ -359,10 +361,13 @@ public:
static inline void subC(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
static void (*mul)(FpT& z, const FpT& x, const FpT& y);
static inline void mulC(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
+ static void (*sqr)(FpT& y, const FpT& x);
+ static inline void sqrC(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
#else
static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
+ static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
#endif
static inline void addPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_addPre(z.v_, x.v_, y.v_); }
static inline void subPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_subPre(z.v_, x.v_, y.v_); }
@@ -373,7 +378,6 @@ public:
}
static inline void inv(FpT& y, const FpT& x) { op_.fp_invOp(y.v_, x.v_, op_); }
static inline void neg(FpT& y, const FpT& x) { op_.fp_neg(y.v_, x.v_, op_.p); }
- static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
static inline void divBy2(FpT& y, const FpT& x)
{
#if 0
@@ -584,6 +588,7 @@ template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAut
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::add)(FpT& z, const FpT& x, const FpT& y);
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y);
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y);
+template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sqr)(FpT& y, const FpT& x);
#endif
} // mcl
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 318003e..89d7fa0 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -402,6 +402,9 @@ public:
op.fp2_sqr = fp2_sqrW;
if (xi_a == 1) {
op.fp2_mul_xi = fp2_mul_xi_1_1i;
+ if (op.fp2_sqrA_) {
+ op.fp2_sqr = op.fp2_sqrA_;
+ }
} else {
op.fp2_mul_xi = fp2_mul_xiW;
}
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 1d3db0b..24dadc9 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -182,6 +182,8 @@ struct Op {
void3u fp_addA_;
void3u fp_subA_;
void3u fp_mulA_;
+ void2u fp_sqrA_;
+ void2u fp2_sqrA_;
size_t maxN;
size_t N;
size_t bitSize;
@@ -262,6 +264,8 @@ struct Op {
fp_addA_ = 0;
fp_subA_ = 0;
fp_mulA_ = 0;
+ fp_sqrA_ = 0;
+ fp2_sqrA_ = 0;
maxN = 0;
N = 0;
bitSize = 0;
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index 69f1bc6..6024043 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -216,6 +216,7 @@ struct Code : Xbyak::CodeGenerator {
// the following labels assume sf(this, 3, 10 | UseRDX)
Label mulPreL_;
Label fpDbl_modL_;
+ Label fp_mulL_;
Code(uint8_t *mem, size_t codeSize)
: CodeGenerator(codeSize, mem)
@@ -294,12 +295,12 @@ struct Code : Xbyak::CodeGenerator {
mulUnit_ = getCurr<uint3opI>();
gen_mulUnit();
align(16);
- mul_ = getCurr<void4u>();
- op.fp_mul = mul_;
+ op.fp_mul = getCurr<void4u>();
op.fp_mulA_ = getCurr<void3u>();
gen_mul();
align(16);
op.fp_sqr = getCurr<void3u>();
+ op.fp_sqrA_ = getCurr<void2u>();
gen_sqr();
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16);
@@ -374,6 +375,9 @@ struct Code : Xbyak::CodeGenerator {
align(16);
op.fp2_mul = getCurr<void3u>();
gen_fp2_mul();
+ align(16);
+// op.fp2_sqrA_ = getCurr<void2u>();
+// gen_fp2_sqr();
}
}
void gen_addSubPre(bool isAdd, int n)
@@ -1045,7 +1049,7 @@ struct Code : Xbyak::CodeGenerator {
#else
mov(rdx, rsi);
#endif
- jmp((void*)mul_);
+ jmp((const void*)op_->fp_mulA_);
}
/*
input (pz[], px[], py[])
@@ -1092,7 +1096,9 @@ struct Code : Xbyak::CodeGenerator {
*/
void gen_montMul4(const uint64_t *p, uint64_t pp)
{
- StackFrame sf(this, 3, 10 | UseRDX);
+ StackFrame sf(this, 3, 10 | UseRDX, 0, false);
+ call(fp_mulL_);
+ sf.close();
const Reg64& p0 = sf.p[0];
const Reg64& p1 = sf.p[1];
const Reg64& p2 = sf.p[2];
@@ -1108,6 +1114,7 @@ struct Code : Xbyak::CodeGenerator {
const Reg64& t8 = sf.t[8];
const Reg64& t9 = sf.t[9];
+ L(fp_mulL_);
movq(xm0, p0); // save p0
mov(p0, (uint64_t)p);
movq(xm1, p2);
@@ -1140,6 +1147,7 @@ struct Code : Xbyak::CodeGenerator {
movq(p0, xm0); // load p0
store_mr(p0, Pack(t3, t2, t1, t0));
+ ret();
}
/*
input (z, x, y) = (p0, p1, p2)
@@ -2884,6 +2892,61 @@ private:
lea(gp1, ptr[d1]);
call(fpDbl_modL_);
}
+ void gen_fp2_sqr() // Emits JIT code for an Fp2 squaring: gp0 = output pointer, gp1 = input pointer; the input is read as two parts, "a" at offset 0 and "b" at offset FpByte_. NOTE(review): the call that would generate this is still commented out in this commit.
+ {
+ assert(!isFullBit_); // presumably ensures the unreduced sums below (b + b, a + b, a + p - b) cannot carry out of the top limb — TODO confirm
+ const RegExp y = rsp + 0 * 8; // stack slot holding the saved output pointer (gp0)
+ const RegExp x = rsp + 1 * 8; // stack slot holding the saved input pointer (gp1)
+ const Ext1 t1(FpByte_, rsp, 2 * 8); // scratch area starting at rsp + 16 (presumably FpByte_ bytes); receives b + b, then the first product
+ const Ext1 t2(FpByte_, rsp, t1.next); // scratch area placed after t1; receives a + b
+ const Ext1 t3(FpByte_, rsp, t2.next); // scratch area placed after t2; receives a + p - b
+ StackFrame sf(this, 3, 10 | UseRDX, t3.next); // same (3, 10 | UseRDX) frame shape the fp_mulL_ label assumes; t3.next is presumably the number of stack bytes to reserve — TODO confirm
+ mov(ptr [y], gp0); // save output pointer
+ mov(ptr [x], gp1); // save input pointer
+ // t1 = b + b (plain multi-word addition; no modular reduction is emitted here)
+ lea(gp0, ptr [t1]); // gp0 = &t1
+ for (int i = 0; i < 4; i++) { // NOTE(review): limb count is hard-coded to 4 qwords
+ mov(rax, ptr [gp1 + FpByte_ + i * 8]); // rax = i-th qword of b
+ if (i == 0) {
+ add(rax, rax); // lowest limb starts the carry chain
+ } else {
+ adc(rax, rax); // higher limbs fold in the carry from the previous limb
+ }
+ mov(ptr [gp0 + i * 8], rax); // store i-th qword of t1
+ }
+ mov(gp1, gp0); // gp1 = &t1, so the multiply below reads and writes t1
+ mov(gp2, ptr [x]); // gp2 = input pointer, i.e. the "a" part at offset 0
+ call(fp_mulL_); // enters the body labelled inside gen_montMul4; presumably t1 = (b + b) * a — TODO confirm argument order
+
+ Pack a = sf.t.sub(0, 4); // first four temporaries of the frame
+ Pack b = sf.t.sub(4, 4); // next four temporaries of the frame
+ mov(gp0, ptr [x]); // reload the input pointer from its stack slot
+ load_rm(a, gp0); // a = qwords at x (presumably a register <- memory load)
+ load_rm(b, gp0 + FpByte_); // b = qwords at x + FpByte_
+ for (int i = 0; i < 4; i++) { // t2 = a + b, computed through rax so the a and b packs stay unchanged
+ mov(rax, a[i]);
+ if (i == 0) {
+ add(rax, b[i]); // lowest limb starts the carry chain
+ } else {
+ adc(rax, b[i]); // higher limbs fold in the carry
+ }
+ mov(ptr [(RegExp)t2 + i * 8], rax); // store i-th qword of t2
+ }
+ mov(gp1, (size_t)p_); // gp1 = address of p_ (presumably the modulus limbs — TODO confirm)
+ add_rm(a, gp1); // a += value at gp1; presumably done so the following subtraction cannot underflow
+ sub_rr(a, b); // a -= b
+ store_mr(t3, a); // t3 = a + p - b
+
+ mov(gp0, ptr [y]); // destination = first part of the output
+ lea(gp1, ptr [t2]); // first multiplicand = a + b
+ lea(gp2, ptr [t3]); // second multiplicand = a + p - b
+ call(fp_mulL_); // presumably output part 0 = (a + b) * (a + p - b) — TODO confirm
+ mov(gp0, ptr [y]); // reload the output pointer after the call
+ for (int i = 0; i < 4; i++) { // copy t1 (the first product) into the output part at offset FpByte_; again 4 qwords
+ mov(rax, ptr [(RegExp)t1 + i * 8]);
+ mov(ptr [gp0 + FpByte_ + i * 8], rax);
+ }
+ } // no explicit ret() is emitted here; presumably the StackFrame destructor emits the epilogue — TODO confirm
};
struct FpGenerator {