aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <herumi@nifty.com>2018-12-10 16:58:03 +0800
committerMITSUNARI Shigeo <herumi@nifty.com>2018-12-10 16:58:03 +0800
commit5a16675a178c873d76ba8c66426e01591d1ca903 (patch)
treef1de74cc82402e51b3fa14ea981afa78c52b7a66
parent38fb035e24ec090ddf9e2f8051ee0e7f3f41b703 (diff)
downloaddexon-mcl-5a16675a178c873d76ba8c66426e01591d1ca903.tar.gz
dexon-mcl-5a16675a178c873d76ba8c66426e01591d1ca903.tar.zst
dexon-mcl-5a16675a178c873d76ba8c66426e01591d1ca903.zip
a little optimization of Fp12::mul
-rw-r--r--include/mcl/fp_tower.hpp203
1 files changed, 117 insertions, 86 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index c065094..c68a5c2 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -764,6 +764,8 @@ template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];
template<class Fp> Fp2T<Fp> Fp2T<Fp>::g2[Fp2T<Fp>::gN];
template<class Fp> Fp2T<Fp> Fp2T<Fp>::g3[Fp2T<Fp>::gN];
+template<class Fp>
+struct Fp6DblT;
/*
Fp6T = Fp2[v] / (v^3 - xi)
x = a + b v + c v^2
@@ -774,6 +776,7 @@ struct Fp6T : public fp::Serializable<Fp6T<_Fp>,
typedef _Fp Fp;
typedef Fp2T<Fp> Fp2;
typedef Fp2DblT<Fp> Fp2Dbl;
+ typedef Fp6DblT<Fp> Fp6Dbl;
typedef Fp BaseFp;
Fp2 a, b, c;
Fp6T() { }
@@ -904,91 +907,7 @@ struct Fp6T : public fp::Serializable<Fp6T<_Fp>,
y.b += t1; // c^2 xi + 2ab
y.c -= t1; // b^2 + 2ac
}
- /*
- x = a + bv + cv^2, y = d + ev + fv^2, v^3 = xi
- xy = (ad + (bf + ce)xi) + ((ae + bd) + cf xi)v + ((af + cd) + be)v^2
- bf + ce = (b + c)(e + f) - be - cf
- ae + bd = (a + b)(e + d) - ad - be
- af + cd = (a + c)(d + f) - ad - cf
- */
- static void mul(Fp6T& z, const Fp6T& x, const Fp6T& y)
- {
-//clk.begin();
- const Fp2& a = x.a;
- const Fp2& b = x.b;
- const Fp2& c = x.c;
- const Fp2& d = y.a;
- const Fp2& e = y.b;
- const Fp2& f = y.c;
-#if 1
- Fp2Dbl AD, BE, CF;
- Fp2Dbl::mulPre(AD, a, d);
- Fp2Dbl::mulPre(BE, b, e);
- Fp2Dbl::mulPre(CF, c, f);
-
- Fp2 t1, t2, t3, t4;
- Fp2::add(t1, b, c);
- Fp2::add(t2, e, f);
- Fp2Dbl T1;
- Fp2Dbl::mulPre(T1, t1, t2);
- Fp2Dbl::sub(T1, T1, BE);
- Fp2Dbl::sub(T1, T1, CF);
- Fp2Dbl::mul_xi(T1, T1);
-
- Fp2::add(t2, a, b);
- Fp2::add(t3, e, d);
- Fp2Dbl T2;
- Fp2Dbl::mulPre(T2, t2, t3);
- Fp2Dbl::sub(T2, T2, AD);
- Fp2Dbl::sub(T2, T2, BE);
-
- Fp2::add(t3, a, c);
- Fp2::add(t4, d, f);
- Fp2Dbl T3;
- Fp2Dbl::mulPre(T3, t3, t4);
- Fp2Dbl::sub(T3, T3, AD);
- Fp2Dbl::sub(T3, T3, CF);
-
- Fp2Dbl::add(AD, AD, T1);
- Fp2Dbl::mod(z.a, AD);
- Fp2Dbl::mul_xi(CF, CF);
- Fp2Dbl::add(CF, CF, T2);
- Fp2Dbl::mod(z.b, CF);
- Fp2Dbl::add(T3, T3, BE);
- Fp2Dbl::mod(z.c, T3);
-#else
- Fp2 ad, be, cf;
- Fp2::mul(ad, a, d);
- Fp2::mul(be, b, e);
- Fp2::mul(cf, c, f);
-
- Fp2 t1, t2, t3, t4;
- Fp2::add(t1, b, c);
- Fp2::add(t2, e, f);
- t1 *= t2;
- t1 -= be;
- t1 -= cf;
- Fp2::mul_xi(t1, t1);
-
- Fp2::add(t2, a, b);
- Fp2::add(t3, e, d);
- t2 *= t3;
- t2 -= ad;
- t2 -= be;
-
- Fp2::add(t3, a, c);
- Fp2::add(t4, d, f);
- t3 *= t4;
- t3 -= ad;
- t3 -= cf;
-
- Fp2::add(z.a, ad, t1);
- Fp2::mul_xi(z.b, cf);
- z.b += t2;
- Fp2::add(z.c, t3, be);
-#endif
-//clk.end();
- }
+ static inline void mul(Fp6T& z, const Fp6T& x, const Fp6T& y);
/*
x = a + bv + cv^2, v^3 = xi
y = 1/x = p/q where
@@ -1030,6 +949,94 @@ struct Fp6T : public fp::Serializable<Fp6T<_Fp>,
}
};
+template<class Fp>
+struct Fp6DblT {
+ typedef Fp2T<Fp> Fp2;
+ typedef Fp6T<Fp> Fp6;
+ typedef Fp2DblT<Fp> Fp2Dbl;
+ typedef Fp6DblT<Fp> Fp6Dbl;
+ typedef fp::Unit Unit;
+ Fp2Dbl a, b, c;
+ static void add(Fp6Dbl& z, const Fp6Dbl& x, const Fp6Dbl& y)
+ {
+ Fp2Dbl::add(z.a, x.a, y.a);
+ Fp2Dbl::add(z.b, x.b, y.b);
+ Fp2Dbl::add(z.c, x.c, y.c);
+ }
+ static void sub(Fp6Dbl& z, const Fp6Dbl& x, const Fp6Dbl& y)
+ {
+ Fp2Dbl::sub(z.a, x.a, y.a);
+ Fp2Dbl::sub(z.b, x.b, y.b);
+ Fp2Dbl::sub(z.c, x.c, y.c);
+ }
+ /*
+ x = a + bv + cv^2, y = d + ev + fv^2, v^3 = xi
+ xy = (ad + (bf + ce)xi) + ((ae + bd) + cf xi)v + ((af + cd) + be)v^2
+ bf + ce = (b + c)(e + f) - be - cf
+ ae + bd = (a + b)(e + d) - ad - be
+ af + cd = (a + c)(d + f) - ad - cf
+ */
+ static void mulPre(Fp6DblT& z, const Fp6& x, const Fp6& y)
+ {
+//clk.begin();
+ const Fp2& a = x.a;
+ const Fp2& b = x.b;
+ const Fp2& c = x.c;
+ const Fp2& d = y.a;
+ const Fp2& e = y.b;
+ const Fp2& f = y.c;
+ Fp2Dbl& za = z.a;
+ Fp2Dbl& zb = z.b;
+ Fp2Dbl& zc = z.c;
+ Fp2Dbl BE;
+ Fp2Dbl::mulPre(za, a, d);
+ Fp2Dbl::mulPre(BE, b, e);
+ Fp2Dbl::mulPre(zb, c, f);
+
+ Fp2 t1, t2, t3, t4;
+ Fp2::add(t1, b, c);
+ Fp2::add(t2, e, f);
+ Fp2Dbl T1;
+ Fp2Dbl::mulPre(T1, t1, t2);
+ Fp2Dbl::sub(T1, T1, BE);
+ Fp2Dbl::sub(T1, T1, zb);
+ Fp2Dbl::mul_xi(T1, T1);
+
+ Fp2::add(t2, a, b);
+ Fp2::add(t3, e, d);
+ Fp2Dbl T2;
+ Fp2Dbl::mulPre(T2, t2, t3);
+ Fp2Dbl::sub(T2, T2, za);
+ Fp2Dbl::sub(T2, T2, BE);
+
+ Fp2::add(t3, a, c);
+ Fp2::add(t4, d, f);
+ Fp2Dbl::mulPre(zc, t3, t4);
+ Fp2Dbl::sub(zc, zc, za);
+ Fp2Dbl::sub(zc, zc, zb);
+
+ Fp2Dbl::add(za, za, T1);
+ Fp2Dbl::mul_xi(zb, zb);
+ Fp2Dbl::add(zb, zb, T2);
+ Fp2Dbl::add(zc, zc, BE);
+//clk.end();
+ }
+ static void mod(Fp6& y, const Fp6Dbl& x)
+ {
+ Fp2Dbl::mod(y.a, x.a);
+ Fp2Dbl::mod(y.b, x.b);
+ Fp2Dbl::mod(y.c, x.c);
+ }
+};
+
+template<class Fp>
+inline void Fp6T<Fp>::mul(Fp6T<Fp>& z, const Fp6T<Fp>& x, const Fp6T<Fp>& y)
+{
+ Fp6DblT<Fp> Z;
+ Fp6DblT<Fp>::mulPre(Z, x, y);
+ Fp6DblT<Fp>::mod(z, Z);
+}
+
/*
Fp12T = Fp6[w] / (w^2 - v)
x = a + b w
@@ -1039,6 +1046,8 @@ struct Fp12T : public fp::Serializable<Fp12T<Fp>,
fp::Operator<Fp12T<Fp> > > {
typedef Fp2T<Fp> Fp2;
typedef Fp6T<Fp> Fp6;
+ typedef Fp2DblT<Fp> Fp2Dbl;
+ typedef Fp6DblT<Fp> Fp6Dbl;
typedef Fp BaseFp;
Fp6 a, b;
Fp12T() {}
@@ -1105,6 +1114,14 @@ struct Fp12T : public fp::Serializable<Fp12T<Fp>,
Fp2::add(z.b, x.a, y.b);
Fp2::add(z.a, t, y.a);
}
+ static void mulVadd(Fp6Dbl& z, const Fp6Dbl& x, const Fp6Dbl& y)
+ {
+ Fp2Dbl t;
+ Fp2Dbl::mul_xi(t, x.c);
+ Fp2Dbl::add(z.c, x.b, y.c);
+ Fp2Dbl::add(z.b, x.a, y.b);
+ Fp2Dbl::add(z.a, t, y.a);
+ }
/*
x = a + bw, y = c + dw, w^2 = v
z = xy = (a + bw)(c + dw) = (ac + bdv) + (ad + bc)w
@@ -1114,19 +1131,33 @@ struct Fp12T : public fp::Serializable<Fp12T<Fp>,
*/
static void mul(Fp12T& z, const Fp12T& x, const Fp12T& y)
{
+ // 4.7Kclk -> 4.55Kclk
const Fp6& a = x.a;
const Fp6& b = x.b;
const Fp6& c = y.a;
const Fp6& d = y.b;
- Fp6 t1, t2, ac, bd;
+ Fp6 t1, t2;
Fp6::add(t1, a, b);
Fp6::add(t2, c, d);
+#if 1
+ Fp6Dbl T, AC, BD;
+ Fp6Dbl::mulPre(AC, a, c);
+ Fp6Dbl::mulPre(BD, b, d);
+ mulVadd(T, BD, AC);
+ Fp6Dbl::mod(z.a, T);
+ Fp6Dbl::mulPre(T, t1, t2); // (a + b)(c + d)
+ Fp6Dbl::sub(T, T, AC);
+ Fp6Dbl::sub(T, T, BD);
+ Fp6Dbl::mod(z.b, T);
+#else
+ Fp6 ac, bd;
t1 *= t2; // (a + b)(c + d)
Fp6::mul(ac, a, c);
Fp6::mul(bd, b, d);
mulVadd(z.a, bd, ac);
t1 -= ac;
Fp6::sub(z.b, t1, bd);
+#endif
}
/*
x = a + bw, w^2 = v