aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <herumi@nifty.com>2016-12-26 12:58:59 +0800
committerMITSUNARI Shigeo <herumi@nifty.com>2016-12-26 12:58:59 +0800
commit389bb658ef48e597d27d9940a8cfe43314ab8f7c (patch)
treece736493c683501f47ea712ed08c2d39aee115a6
parent21324c9e0adf5c664c439bd5cc6f994211457c6a (diff)
downloaddexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.tar.gz
dexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.tar.zst
dexon-mcl-389bb658ef48e597d27d9940a8cfe43314ab8f7c.zip
add gmp version of Mont with not full prime
-rw-r--r--readme.md4
-rw-r--r--src/fp.cpp9
-rw-r--r--src/gen.cpp91
-rw-r--r--src/low_func.hpp78
-rw-r--r--src/low_func_llvm.hpp3
-rw-r--r--src/proto.hpp1
6 files changed, 131 insertions, 55 deletions
diff --git a/readme.md b/readme.md
index eee9e2f..1f8a481 100644
--- a/readme.md
+++ b/readme.md
@@ -89,7 +89,9 @@ make MCL_USE_LLVM=1 LLVM_VER=-3.8 ARCH=arm
A benchmark of a BN curve over the 254-bit prime.
* x64, x86 ; Inte Core i7-6700 3.4GHz(Skylake)
- * `sudo cpufreq-set -g performance`
+ ```
+ sudo cpufreq-set -g performance
+ ```
* arm ; 900MHz quad-core ARM Cortex-A7 on Raspberry Pi2, Linux 4.4.11-v7+
* arm64 ; 1.2GHz ARM Cortex-A53 [HiKey](http://www.96boards.org/product/hikey/)
diff --git a/src/fp.cpp b/src/fp.cpp
index 42299c9..aab1945 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -191,8 +191,13 @@ void setOp2(Op& op)
op.fp_sub = Sub<N, false, Tag>::f;
}
if (op.isMont) {
- op.fp_mul = Mont<N, Tag>::f;
- op.fp_sqr = SqrMont<N, Tag>::f;
+ if (op.isFullBit) {
+ op.fp_mul = Mont<N, true, Tag>::f;
+ op.fp_sqr = SqrMont<N, true, Tag>::f;
+ } else {
+ op.fp_mul = Mont<N, false, Tag>::f;
+ op.fp_sqr = SqrMont<N, false, Tag>::f;
+ }
op.fpDbl_mod = MontRed<N, Tag>::f;
} else {
op.fp_mul = Mul<N, Tag>::f;
diff --git a/src/gen.cpp b/src/gen.cpp
index 627c2d2..401a582 100644
--- a/src/gen.cpp
+++ b/src/gen.cpp
@@ -738,7 +738,7 @@ struct Code : public mcl::Generator {
generic_fpDbl_mul(py, px, px);
endFunc();
}
- void gen_mcl_fp_mont()
+ void gen_mcl_fp_mont(bool isFullBit = true)
{
const int bu = bit + unit;
const int bu2 = bit + unit * 2;
@@ -747,39 +747,75 @@ struct Code : public mcl::Generator {
Operand px(IntPtr, unit);
Operand py(IntPtr, unit);
Operand pp(IntPtr, unit);
- std::string name = "mcl_fp_mont" + cybozu::itoa(N) + "L";
+ std::string name = "mcl_fp_mont";
+ if (!isFullBit) {
+ name += "NF";
+ }
+ name += cybozu::itoa(N) + "L";
mcl_fp_montM[N] = Function(name, Void, pz, px, py, pp);
mcl_fp_montM[N].setAlias();
verifyAndSetPrivate(mcl_fp_montM[N]);
beginFunc(mcl_fp_montM[N]);
Operand rp = load(getelementptr(pp, -1));
- Operand p = loadN(pp, N);
Operand z, s, a;
- for (uint32_t i = 0; i < N; i++) {
- Operand y = load(getelementptr(py, i));
- Operand xy = call(mulPvM[bit], px, y);
- Operand at;
- if (i == 0) {
- a = zext(xy, bu2);
- at = trunc(xy, unit);
- } else {
- xy = zext(xy, bu2);
- a = add(s, xy);
- at = trunc(a, unit);
+ if (1 || isFullBit) {
+ for (uint32_t i = 0; i < N; i++) {
+ Operand y = load(getelementptr(py, i));
+ Operand xy = call(mulPvM[bit], px, y);
+ Operand at;
+ if (i == 0) {
+ a = zext(xy, bu2);
+ at = trunc(xy, unit);
+ } else {
+ xy = zext(xy, bu2);
+ a = add(s, xy);
+ at = trunc(a, unit);
+ }
+ Operand q = mul(at, rp);
+ Operand pq = call(mulPvM[bit], pp, q);
+ pq = zext(pq, bu2);
+ Operand t = add(a, pq);
+ s = lshr(t, unit);
}
- Operand q = mul(at, rp);
- Operand pq = call(mulPvM[bit], pp, q);
- pq = zext(pq, bu2);
- Operand t = add(a, pq);
- s = lshr(t, unit);
+ s = trunc(s, bu);
+ Operand p = zext(loadN(pp, N), bu);
+ Operand vc = sub(s, p);
+ Operand c = trunc(lshr(vc, bit), 1);
+ z = select(c, s, vc);
+ z = trunc(z, bit);
+ storeN(z, pz);
+ } else {
+ for (uint32_t i = 0; i < N; i++) {
+ Operand y = load(getelementptr(py, i));
+ Operand xy = call(mulPvM[bit], px, y);
+ Operand at;
+ if (i == 0) {
+ a = xy;
+ at = trunc(xy, unit);
+ Operand q = mul(at, rp);
+ Operand pq = call(mulPvM[bit], pp, q);
+ pq = zext(pq, bu2);
+ Operand t = add(a, pq);
+ s = lshr(t, unit);
+ } else {
+ xy = zext(xy, bu2);
+ a = add(s, xy);
+ at = trunc(a, unit);
+ Operand q = mul(at, rp);
+ Operand pq = call(mulPvM[bit], pp, q);
+ pq = zext(pq, bu2);
+ Operand t = add(a, pq);
+ s = lshr(t, unit);
+ }
+ }
+ s = trunc(s, bu);
+ Operand p = zext(loadN(pp, N), bu);
+ Operand vc = sub(s, p);
+ Operand c = trunc(lshr(vc, bit), 1);
+ z = select(c, s, vc);
+ z = trunc(z, bit);
+ storeN(z, pz);
}
- s = trunc(s, bu);
- p = zext(p, bu);
- Operand vc = sub(s, p);
- Operand c = trunc(lshr(vc, bit), 1);
- z = select(c, s, vc);
- z = trunc(z, bit);
- storeN(z, pz);
ret(Void);
endFunc();
}
@@ -840,7 +876,8 @@ struct Code : public mcl::Generator {
gen_mcl_fp_mulUnitPre();
gen_mcl_fpDbl_mulPre();
gen_mcl_fpDbl_sqrPre();
- gen_mcl_fp_mont();
+ gen_mcl_fp_mont(true);
+ gen_mcl_fp_mont(false);
gen_mcl_fp_montRed();
}
void setBit(uint32_t bit)
diff --git a/src/low_func.hpp b/src/low_func.hpp
index c3c7450..2f8c4a0 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -507,7 +507,7 @@ const void3u MontRed<N, Tag>::f = MontRed<N, Tag>::func;
z[N] <- Montgomery(x[N], y[N], p[N])
REMARK : assume p[-1] = rp
*/
-template<size_t N, class Tag = Gtag>
+template<size_t N, bool isFullBit, class Tag = Gtag>
struct Mont {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
@@ -517,26 +517,56 @@ struct Mont {
MontRed<N, Tag>::f(z, xy, p);
#else
const Unit rp = p[-1];
- Unit buf[N * 2 + 2];
- Unit *c = buf;
- MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
- Unit q = c[0] * rp;
- Unit t[N + 2];
- MulUnitPre<N, Tag>::f(t, p, q); // p * q
- t[N + 1] = 0; // always zero
- c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
- c++;
- for (size_t i = 1; i < N; i++) {
- MulUnitPre<N, Tag>::f(t, x, y[i]);
+ if (isFullBit) {
+ Unit buf[N * 2 + 2];
+ Unit *c = buf;
+ MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
+ Unit q = c[0] * rp;
+ Unit t[N + 2];
+ MulUnitPre<N, Tag>::f(t, p, q); // p * q
+ t[N + 1] = 0; // always zero
c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
- q = c[0] * rp;
- MulUnitPre<N, Tag>::f(t, p, q);
- AddPre<N + 2, Tag>::f(c, c, t);
c++;
- }
- if (c[N]) {
- SubPre<N, Tag>::f(z, c, p);
+ for (size_t i = 1; i < N; i++) {
+ MulUnitPre<N, Tag>::f(t, x, y[i]);
+ c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
+ q = c[0] * rp;
+ MulUnitPre<N, Tag>::f(t, p, q);
+ AddPre<N + 2, Tag>::f(c, c, t);
+ c++;
+ }
+ if (c[N]) {
+ SubPre<N, Tag>::f(z, c, p);
+ } else {
+ if (SubPre<N, Tag>::f(z, c, p)) {
+ memcpy(z, c, N * sizeof(Unit));
+ }
+ }
} else {
+ Unit carry;
+ (void)carry;
+ Unit buf[N * 2 + 1];
+ Unit *c = buf;
+ MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
+ Unit q = c[0] * rp;
+ Unit t[N + 1];
+ MulUnitPre<N, Tag>::f(t, p, q); // p * q
+ carry = AddPre<N + 1, Tag>::f(c, c, t);
+ assert(carry == 0);
+ c++;
+ c[N] = 0;
+ for (size_t i = 1; i < N; i++) {
+ c[N + 1] = 0;
+ MulUnitPre<N, Tag>::f(t, x, y[i]);
+ carry = AddPre<N + 1, Tag>::f(c, c, t);
+ assert(carry == 0);
+ q = c[0] * rp;
+ MulUnitPre<N, Tag>::f(t, p, q);
+ carry = AddPre<N + 1, Tag>::f(c, c, t);
+ assert(carry == 0);
+ c++;
+ }
+ assert(c[N] == 0);
if (SubPre<N, Tag>::f(z, c, p)) {
memcpy(z, c, N * sizeof(Unit));
}
@@ -546,11 +576,11 @@ struct Mont {
static const void4u f;
};
-template<size_t N, class Tag>
-const void4u Mont<N, Tag>::f = Mont<N, Tag>::func;
+template<size_t N, bool isFullBit, class Tag>
+const void4u Mont<N, isFullBit, Tag>::f = Mont<N, isFullBit, Tag>::func;
// z[N] <- Montgomery(x[N], x[N], p[N])
-template<size_t N, class Tag = Gtag>
+template<size_t N, bool isFullBit, class Tag = Gtag>
struct SqrMont {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
@@ -559,13 +589,13 @@ struct SqrMont {
SqrPre<N, Tag>::f(xx, x);
MontRed<N, Tag>::f(y, xx, p);
#else
- Mont<N, Tag>::f(y, x, x, p);
+ Mont<N, isFullBit, Tag>::f(y, x, x, p);
#endif
}
static const void3u f;
};
-template<size_t N, class Tag>
-const void3u SqrMont<N, Tag>::f = SqrMont<N, Tag>::func;
+template<size_t N, bool isFullBit, class Tag>
+const void3u SqrMont<N, isFullBit, Tag>::f = SqrMont<N, isFullBit, Tag>::func;
// z[N] <- (x[N] * y[N]) % p[N]
template<size_t N, class Tag = Gtag>
diff --git a/src/low_func_llvm.hpp b/src/low_func_llvm.hpp
index 4798cbe..ea7a19c 100644
--- a/src/low_func_llvm.hpp
+++ b/src/low_func_llvm.hpp
@@ -35,7 +35,8 @@ template<>const void4u Add<n, true, Ltag>::f = &mcl_fp_add ## n ## L; \
template<>const void4u Add<n, false, Ltag>::f = &mcl_fp_addNF ## n ## L; \
template<>const void4u Sub<n, true, Ltag>::f = &mcl_fp_sub ## n ## L; \
template<>const void4u Sub<n, false, Ltag>::f = &mcl_fp_subNF ## n ## L; \
-template<>const void4u Mont<n, Ltag>::f = &mcl_fp_mont ## n ## L; \
+template<>const void4u Mont<n, true, Ltag>::f = &mcl_fp_mont ## n ## L; \
+template<>const void4u Mont<n, false, Ltag>::f = &mcl_fp_mont ## n ## L; \
template<>const void3u MontRed<n, Ltag>::f = &mcl_fp_montRed ## n ## L; \
template<>const void4u DblAdd<n, Ltag>::f = &mcl_fpDbl_add ## n ## L; \
template<>const void4u DblSub<n, Ltag>::f = &mcl_fpDbl_sub ## n ## L; \
diff --git a/src/proto.hpp b/src/proto.hpp
index 46dac04..18e8567 100644
--- a/src/proto.hpp
+++ b/src/proto.hpp
@@ -20,6 +20,7 @@ void mcl_fp_mulUnitPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, mcl
void mcl_fpDbl_mulPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
void mcl_fpDbl_sqrPre ## n ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x); \
void mcl_fp_mont ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
+void mcl_fp_montNF ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
void mcl_fp_montRed ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* p); \
void mcl_fpDbl_add ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
void mcl_fpDbl_sub ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p);