from mie

author: MITSUNARI Shigeo <herumi@nifty.com> 2015-05-05 09:52:44 +0800
committer: MITSUNARI Shigeo <herumi@nifty.com> 2015-05-05 09:52:44 +0800
commit: 14fd1d125d21f47c2539a0e820e2a417e0c715f8 (patch)
tree: 4fccb395f51a3f9c1202a783665ce396b649eeb9
download: tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.tar.gz
tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.tar.zst
tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.zip
39 files changed, 8447 insertions, 0 deletions
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..bfe54da
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,47 @@
+
+Copyright (c) 2015 MITSUNARI Shigeo
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+Neither the name of the copyright owner nor the names of its contributors may
+be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+ソースコード形式かバイナリ形式か、変更するかしないかを問わず、以下の条件を満た
+す場合に限り、再頒布および使用が許可されます。
+
+ソースコードを再頒布する場合、上記の著作権表示、本条件一覧、および下記免責条項
+を含めること。
+バイナリ形式で再頒布する場合、頒布物に付属のドキュメント等の資料に、上記の著作
+権表示、本条件一覧、および下記免責条項を含めること。
+書面による特別の許可なしに、本ソフトウェアから派生した製品の宣伝または販売促進
+に、著作権者の名前またはコントリビューターの名前を使用してはならない。
+本ソフトウェアは、著作権者およびコントリビューターによって「現状のまま」提供さ
+れており、明示黙示を問わず、商業的な使用可能性、および特定の目的に対する適合性
+に関する暗黙の保証も含め、またそれに限定されない、いかなる保証もありません。
+著作権者もコントリビューターも、事由のいかんを問わず、 損害発生の原因いかんを
+問わず、かつ責任の根拠が契約であるか厳格責任であるか（過失その他の）不法行為で
+あるかを問わず、仮にそのような損害が発生する可能性を知らされていたとしても、
+本ソフトウェアの使用によって発生した（代替品または代用サービスの調達、使用の
+喪失、データの喪失、利益の喪失、業務の中断も含め、またそれに限定されない）直接
+損害、間接損害、偶発的な損害、特別損害、懲罰的損害、または結果損害について、
+一切責任を負わないものとします。
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..383af2b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+include common.mk
+
+all:
+	$(MKDIR) bin
+	$(MAKE) -C test
+	$(MAKE) -C sample
+
+test:
+	$(MAKE) -C test test
+
+sample:
+	$(MAKE) -C sample test
+
+clean:
+#	$(MAKE) -C src clean
+	$(MAKE) -C test clean
+	$(MAKE) -C sample clean
+
+.PHONY: sample
+
diff --git a/common.mk b/common.mk
new file mode 100644
index 0000000..b861db1
--- /dev/null
+++ b/common.mk
@@ -0,0 +1,105 @@
+GCC_VER=$(shell $(PRE)$(CC) -dumpversion)
+UNAME_S=$(shell uname -s)
+ifeq ($(UNAME_S),Linux)
+  OS=Linux
+endif
+ifneq ($(UNAME_S),Darwin)
+  LDFLAGS += -lrt
+endif
+CP = cp -f
+AR = ar r
+MKDIR=mkdir -p
+RM=rm -fr
+CFLAGS_OPT+=-fomit-frame-pointer -DNDEBUG
+ifeq ($(CXX),clang++)
+  CFLAGS_OPT+=-O3
+else
+  ifeq ($(shell expr $(GCC_VER) \> 4.6.0),1)
+    CFLAGS_OPT+=-Ofast
+  else
+    CFLAGS_OPT+=-O3
+  endif
+endif
+CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith
+CFLAGS+= -g -D_FILE_OFFSET_BITS=64
+CFLAGS+=$(CFLAGS_WARN)
+BIT?=64
+ifeq ($(BIT),32)
+  CPU?=x86
+else
+  ifeq ($(BIT),64)
+    CPU?=x64
+  endif
+endif
+ifeq ($(BIT),0)
+	BIT_OPT=
+else
+	BIT_OPT=-m$(BIT)
+endif
+ifeq ($(MARCH),)
+ifeq ($(shell expr $(GCC_VER) \> 4.2.1),1)
+	CFLAGS+=-march=native
+endif
+else
+	CFLAGS+=$(MARCH)
+endif
+
+DEBUG=1
+ifeq ($(RELEASE),1)
+	DEBUG=0
+endif
+
+ifeq ($(DEBUG),0)
+  CFLAGS+=$(CFLAGS_OPT)
+  OBJDIR=release
+  OBJSUF=
+else
+  ifeq ($(OS),Linux)
+    LDFLAGS+=-rdynamic
+  endif
+  OBJDIR=debug
+  OBJSUF=d
+endif
+
+####################################################
+
+LDFLAGS += -lpthread -m$(BIT) -lgmp -lgmpxx
+
+####################################################
+
+TOPDIR:=$(realpath $(dir $(lastword $(MAKEFILE_LIST))))/
+EXTDIR:=$(TOPDIR)../cybozulib_ext/
+CFLAGS+= -I$(TOPDIR)include -I$(TOPDIR)../cybozulib/include/ -I$(TOPDIR)../xbyak/ $(BIT_OPT) $(INC_DIR)
+LDFLAGS+= -L$(TOPDIR)lib $(BIT_OPT) -Wl,-rpath,'$$ORIGIN/../lib' $(LD_DIR)
+
+MKDEP = sh -ec '$(PRE)$(CC) -MM $(CFLAGS) $< | sed "s@\($*\)\.o[ :]*@$(OBJDIR)/\1.o $@ : @g" > $@; [ -s $@ ] || rm -f $@; touch $@'
+
+CLEAN=$(RM) $(TARGET) $(OBJDIR)
+
+define UNIT_TEST
+sh -ec 'for i in $(TARGET); do $$i|grep "ctest:name"; done' > result.txt
+grep -v "ng=0, exception=0" result.txt || echo "all unit tests are ok"
+endef
+
+define SAMPLE_TEST
+sh -ec 'for i in $(TARGET); do $$i; done'
+endef
+
+.SUFFIXES: .cpp .d .exe
+
+$(OBJDIR)/%.o: %.cpp
+	$(PRE)$(CXX) -c $< -o $@ $(CFLAGS)
+
+$(OBJDIR)/%.d: %.cpp $(OBJDIR)
+	@$(MKDEP)
+
+$(TOPDIR)bin/%$(OBJSUF).exe: $(OBJDIR)/%.o $(LIBS)
+	$(PRE)$(CXX) $< -o $@ $(LIBS) $(LDFLAGS)
+
+OBJS=$(addprefix $(OBJDIR)/,$(SRC:.cpp=.o))
+
+DEPEND_FILE=$(addprefix $(OBJDIR)/, $(SRC:.cpp=.d))
+TEST_FILE=$(addprefix $(TOPDIR)bin/, $(SRC:.cpp=$(OBJSUF).exe))
+
+.PHONY: test
+
diff --git a/common.props b/common.props
new file mode 100644
index 0000000..8ec3f67
--- /dev/null
+++ b/common.props
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ImportGroup Label="PropertySheets" />
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <OutDir>$(SolutionDir)bin\</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <AdditionalIncludeDirectories>$(SolutionDir)../cybozulib/include;$(SolutionDir)../cybozulib_ext/mpir/include;$(SolutionDir)include</AdditionalIncludeDirectories>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level4</WarningLevel>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <PrecompiledHeaderFile />
+      <PrecompiledHeaderOutputFile />
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions);NOMINMAX</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <AdditionalLibraryDirectories>$(SolutionDir)../cybozulib_ext/mpir/lib</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup />
+</Project>
+\ No newline at end of file
diff --git a/debug.props b/debug.props
new file mode 100644
index 0000000..d261c8d
--- /dev/null
+++ b/debug.props
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ImportGroup Label="PropertySheets" />
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <TargetName>$(ProjectName)d</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup />
+</Project>
+\ No newline at end of file
diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp
new file mode 100644
index 0000000..8b70b70
--- /dev/null
+++ b/include/mcl/ec.hpp
@@ -0,0 +1,585 @@
+#pragma once
+/**
+	@file
+	@brief elliptic curve
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <sstream>
+#include <cybozu/exception.hpp>
+#include <cybozu/bitvector.hpp>
+#include <mcl/operator.hpp>
+#include <mcl/power.hpp>
+#include <mcl/gmp_util.hpp>
+
+namespace mcl {
+
+#define MCL_EC_USE_AFFINE 0
+#define MCL_EC_USE_PROJ 1
+#define MCL_EC_USE_JACOBI 2
+
+//#define MCL_EC_COORD MCL_EC_USE_JACOBI
+//#define MCL_EC_COORD MCL_EC_USE_PROJ
+#ifndef MCL_EC_COORD
+	#define MCL_EC_COORD MCL_EC_USE_PROJ
+#endif
+/*
+	elliptic curve
+	y^2 = x^3 + ax + b (affine)
+	y^2 = x^3 + az^4 + bz^6 (Jacobi) x = X/Z^2, y = Y/Z^3
+*/
+template<class _Fp>
+class EcT : public ope::addsub<EcT<_Fp>,
+	ope::comparable<EcT<_Fp>,
+	ope::hasNegative<EcT<_Fp> > > > {
+	enum {
+		zero,
+		minus3,
+		generic
+	};
+public:
+	typedef _Fp Fp;
+	typedef typename Fp::BlockType BlockType;
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+	Fp x, y;
+	bool inf_;
+#else
+	mutable Fp x, y, z;
+#endif
+	static Fp a_;
+	static Fp b_;
+	static int specialA_;
+	static bool compressedExpression_;
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+	EcT() : inf_(true) {}
+#else
+	EcT() { z.clear(); }
+#endif
+	EcT(const Fp& _x, const Fp& _y)
+	{
+		set(_x, _y);
+	}
+	void normalize() const
+	{
+#if MCL_EC_COORD == MCL_EC_USE_JACOBI
+		if (isZero() || z == 1) return;
+		Fp rz, rz2;
+		Fp::inv(rz, z);
+		rz2 = rz * rz;
+		x *= rz2;
+		y *= rz2 * rz;
+		z = 1;
+#elif MCL_EC_COORD == MCL_EC_USE_PROJ
+		if (isZero() || z == 1) return;
+		Fp rz;
+		Fp::inv(rz, z);
+		x *= rz;
+		y *= rz;
+		z = 1;
+#endif
+	}
+
+	static inline void setParam(const std::string& astr, const std::string& bstr)
+	{
+		a_.fromStr(astr);
+		b_.fromStr(bstr);
+		if (a_.isZero()) {
+			specialA_ = zero;
+		} else if (a_ == -3) {
+			specialA_ = minus3;
+		} else {
+			specialA_ = generic;
+		}
+	}
+	static inline bool isValid(const Fp& _x, const Fp& _y)
+	{
+		return _y * _y == (_x * _x + a_) * _x + b_;
+	}
+	void set(const Fp& _x, const Fp& _y, bool verify = true)
+	{
+		if (verify && !isValid(_x, _y)) throw cybozu::Exception("ec:EcT:set") << _x << _y;
+		x = _x; y = _y;
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		inf_ = false;
+#else
+		z = 1;
+#endif
+	}
+	void clear()
+	{
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		inf_ = true;
+#else
+		z = 0;
+#endif
+		x.clear();
+		y.clear();
+	}
+
+	static inline void dbl(EcT& R, const EcT& P, bool verifyInf = true)
+	{
+		if (verifyInf) {
+			if (P.isZero()) {
+				R.clear(); return;
+			}
+		}
+#if MCL_EC_COORD == MCL_EC_USE_JACOBI
+		Fp S, M, t, y2;
+		Fp::square(y2, P.y);
+		Fp::mul(S, P.x, y2);
+		S += S;
+		S += S;
+		Fp::square(M, P.x);
+		switch (specialA_) {
+		case zero:
+			Fp::add(t, M, M);
+			M += t;
+			break;
+		case minus3:
+			Fp::square(t, P.z);
+			Fp::square(t, t);
+			M -= t;
+			Fp::add(t, M, M);
+			M += t;
+			break;
+		case generic:
+		default:
+			Fp::square(t, P.z);
+			Fp::square(t, t);
+			t *= a_;
+			t += M;
+			M += M;
+			M += t;
+			break;
+		}
+		Fp::square(R.x, M);
+		R.x -= S;
+		R.x -= S;
+		Fp::mul(R.z, P.y, P.z);
+		R.z += R.z;
+		Fp::square(y2, y2);
+		y2 += y2;
+		y2 += y2;
+		y2 += y2;
+		Fp::sub(R.y, S, R.x);
+		R.y *= M;
+		R.y -= y2;
+#elif MCL_EC_COORD == MCL_EC_USE_PROJ
+		Fp w, t, h;
+		switch (specialA_) {
+		case zero:
+			Fp::square(w, P.x);
+			Fp::add(t, w, w);
+			w += t;
+			break;
+		case minus3:
+			Fp::square(w, P.x);
+			Fp::square(t, P.z);
+			w -= t;
+			Fp::add(t, w, w);
+			w += t;
+			break;
+		case generic:
+		default:
+			Fp::square(w, P.z);
+			w *= a_;
+			Fp::square(t, P.x);
+			w += t;
+			w += t;
+			w += t; // w = a z^2 + 3x^2
+			break;
+		}
+		Fp::mul(R.z, P.y, P.z); // s = yz
+		Fp::mul(t, R.z, P.x);
+		t *= P.y; // xys
+		t += t;
+		t += t; // 4(xys) ; 4B
+		Fp::square(h, w);
+		h -= t;
+		h -= t; // w^2 - 8B
+		Fp::mul(R.x, h, R.z);
+		t -= h; // h is free
+		t *= w;
+		Fp::square(w, P.y);
+		R.x += R.x;
+		R.z += R.z;
+		Fp::square(h, R.z);
+		w *= h;
+		R.z *= h;
+		Fp::sub(R.y, t, w);
+		R.y -= w;
+#else
+		Fp t, s;
+		Fp::square(t, P.x);
+		Fp::add(s, t, t);
+		t += s;
+		t += a_;
+		Fp::add(s, P.y, P.y);
+		t /= s;
+		Fp::square(s, t);
+		s -= P.x;
+		Fp x3;
+		Fp::sub(x3, s, P.x);
+		Fp::sub(s, P.x, x3);
+		s *= t;
+		Fp::sub(R.y, s, P.y);
+		R.x = x3;
+		R.inf_ = false;
+#endif
+	}
+	static inline void add(EcT& R, const EcT& P, const EcT& Q)
+	{
+		if (P.isZero()) { R = Q; return; }
+		if (Q.isZero()) { R = P; return; }
+#if MCL_EC_COORD == MCL_EC_USE_JACOBI
+		Fp r, U1, S1, H, H3;
+		Fp::square(r, P.z);
+		Fp::square(S1, Q.z);
+		Fp::mul(U1, P.x, S1);
+		Fp::mul(H, Q.x, r);
+		H -= U1;
+		r *= P.z;
+		S1 *= Q.z;
+		S1 *= P.y;
+		Fp::mul(r, Q.y, r);
+		r -= S1;
+		if (H.isZero()) {
+			if (r.isZero()) {
+				dbl(R, P, false);
+			} else {
+				R.clear();
+			}
+			return;
+		}
+		Fp::mul(R.z, P.z, Q.z);
+		R.z *= H;
+		Fp::square(H3, H); // H^2
+		Fp::square(R.y, r); // r^2
+		U1 *= H3; // U1 H^2
+		H3 *= H; // H^3
+		R.y -= U1;
+		R.y -= U1;
+		Fp::sub(R.x, R.y, H3);
+		U1 -= R.x;
+		U1 *= r;
+		H3 *= S1;
+		Fp::sub(R.y, U1, H3);
+#elif MCL_EC_COORD == MCL_EC_USE_PROJ
+		Fp r, PyQz, v, A, vv;
+		Fp::mul(r, P.x, Q.z);
+		Fp::mul(PyQz, P.y, Q.z);
+		Fp::mul(A, Q.y, P.z);
+		Fp::mul(v, Q.x, P.z);
+		v -= r;
+		if (v.isZero()) {
+			Fp::add(vv, A, PyQz);
+			if (vv.isZero()) {
+				R.clear();
+			} else {
+				dbl(R, P, false);
+			}
+			return;
+		}
+		Fp::sub(R.y, A, PyQz);
+		Fp::square(A, R.y);
+		Fp::square(vv, v);
+		r *= vv;
+		vv *= v;
+		Fp::mul(R.z, P.z, Q.z);
+		A *= R.z;
+		R.z *= vv;
+		A -= vv;
+		vv *= PyQz;
+		A -= r;
+		A -= r;
+		Fp::mul(R.x, v, A);
+		r -= A;
+		R.y *= r;
+		R.y -= vv;
+#else
+		Fp t;
+		Fp::neg(t, Q.y);
+		if (P.y == t) { R.clear(); return; }
+		Fp::sub(t, Q.x, P.x);
+		if (t.isZero()) {
+			dbl(R, P, false);
+			return;
+		}
+		Fp s;
+		Fp::sub(s, Q.y, P.y);
+		Fp::div(t, s, t);
+		R.inf_ = false;
+		Fp x3;
+		Fp::square(x3, t);
+		x3 -= P.x;
+		x3 -= Q.x;
+		Fp::sub(s, P.x, x3);
+		s *= t;
+		Fp::sub(R.y, s, P.y);
+		R.x = x3;
+#endif
+	}
+	static inline void sub(EcT& R, const EcT& P, const EcT& Q)
+	{
+#if 0
+		if (P.inf_) { neg(R, Q); return; }
+		if (Q.inf_) { R = P; return; }
+		if (P.y == Q.y) { R.clear(); return; }
+		Fp t;
+		Fp::sub(t, Q.x, P.x);
+		if (t.isZero()) {
+			dbl(R, P, false);
+			return;
+		}
+		Fp s;
+		Fp::add(s, Q.y, P.y);
+		Fp::neg(s, s);
+		Fp::div(t, s, t);
+		R.inf_ = false;
+		Fp x3;
+		Fp::mul(x3, t, t);
+		x3 -= P.x;
+		x3 -= Q.x;
+		Fp::sub(s, P.x, x3);
+		s *= t;
+		Fp::sub(R.y, s, P.y);
+		R.x = x3;
+#else
+		EcT nQ;
+		neg(nQ, Q);
+		add(R, P, nQ);
+#endif
+	}
+	static inline void neg(EcT& R, const EcT& P)
+	{
+		if (P.isZero()) {
+			R.clear();
+			return;
+		}
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		R.inf_ = false;
+		R.x = P.x;
+		Fp::neg(R.y, P.y);
+#else
+		R.x = P.x;
+		Fp::neg(R.y, P.y);
+		R.z = P.z;
+#endif
+	}
+	template<class N>
+	static inline void power(EcT& z, const EcT& x, const N& y)
+	{
+		power_impl::power(z, x, y);
+	}
+	/*
+		0 <= P for any P
+		(Px, Py) <= (P'x, P'y) iff Px < P'x or Px == P'x and Py <= P'y
+	*/
+	static inline int compare(const EcT& P, const EcT& Q)
+	{
+		P.normalize();
+		Q.normalize();
+		if (P.isZero()) {
+			if (Q.isZero()) return 0;
+			return -1;
+		}
+		if (Q.isZero()) return 1;
+		int c = _Fp::compare(P.x, Q.x);
+		if (c > 0) return 1;
+		if (c < 0) return -1;
+		return _Fp::compare(P.y, Q.y);
+	}
+	bool isZero() const
+	{
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		return inf_;
+#else
+		return z.isZero();
+#endif
+	}
+	friend inline std::ostream& operator<<(std::ostream& os, const EcT& self)
+	{
+		if (self.isZero()) {
+			return os << '0';
+		} else {
+			self.normalize();
+			os << self.x.toStr(16) << '_';
+			if (compressedExpression_) {
+				return os << Fp::isYodd(self.y);
+			} else {
+				return os << self.y.toStr(16);
+			}
+		}
+	}
+	friend inline std::istream& operator>>(std::istream& is, EcT& self)
+	{
+		std::string str;
+		is >> str;
+		if (str == "0") {
+			self.clear();
+		} else {
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+			self.inf_ = false;
+#else
+			self.z = 1;
+#endif
+			size_t pos = str.find('_');
+			if (pos == std::string::npos) throw cybozu::Exception("EcT:operator>>:bad format") << str;
+			str[pos] = '\0';
+			self.x.fromStr(&str[0], 16);
+			if (compressedExpression_) {
+				const char c = str[pos + 1];
+				if ((c == '0' || c == '1') && str.size() == pos + 2) {
+					getYfromX(self.y, self.x, c == '1');
+				} else {
+					str[pos] = '_';
+					throw cybozu::Exception("EcT:operator>>:bad y") << str;
+				}
+			} else {
+				self.y.fromStr(&str[pos + 1], 16);
+			}
+		}
+		return is;
+	}
+	static inline void setCompressedExpression(bool compressedExpression)
+	{
+		compressedExpression_ = compressedExpression;
+	}
+	/*
+		append to bv(not clear bv)
+	*/
+	void appendToBitVec(cybozu::BitVector& bv) const
+	{
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		#error "not implemented"
+#else
+		normalize();
+		const size_t bitLen = _Fp::getModBitLen();
+		/*
+				elem |x|y|z|
+				size  n n 1 if not compressed
+				size  n 1 1 if compressed
+		*/
+		const size_t maxBitLen = compressedExpression_ ? (bitLen + 1 + 1) : (bitLen * 2 + 1);
+		if (isZero()) {
+			bv.resize(bv.size() + maxBitLen);
+			return;
+		}
+		x.appendToBitVec(bv);
+		if (compressedExpression_) {
+			bv.append(Fp::isYodd(y), 1);
+		} else {
+			y.appendToBitVec(bv);
+		}
+		bv.append(1, 1); // z = 1
+#endif
+	}
+	void fromBitVec(const cybozu::BitVector& bv)
+	{
+#if MCL_EC_COORD == MCL_EC_USE_AFFINE
+		#error "not implemented"
+#else
+		const size_t bitLen = _Fp::getModBitLen();
+		const size_t maxBitLen = compressedExpression_ ? (bitLen + 1 + 1) : (bitLen * 2 + 1);
+		if (bv.size() != maxBitLen) {
+			throw cybozu::Exception("EcT:fromBitVec:bad size") << bv.size() << maxBitLen;
+		}
+		if (!bv.get(maxBitLen - 1)) { // if z = 0
+			clear();
+			return;
+		}
+		cybozu::BitVector t;
+		bv.extract(t, 0, bitLen);
+		x.fromBitVec(t);
+		if (compressedExpression_) {
+			bool odd = bv.get(bitLen); // y
+			getYfromX(y, x, odd);
+		} else {
+			bv.extract(t, bitLen, bitLen);
+			y.fromBitVec(t);
+		}
+		z = 1;
+#endif
+	}
+	static inline size_t getBitVecSize()
+	{
+		const size_t bitLen = _Fp::getModBitLen();
+		if (compressedExpression_) {
+			return bitLen + 2;
+		} else {
+			return bitLen * 2 + 1;;
+		}
+	}
+	static inline void getYfromX(Fp& y, const Fp& x, bool isYodd)
+	{
+		Fp t;
+		Fp::square(t, x);
+		t += a_;
+		t *= x;
+		t += b_;
+		Fp::squareRoot(y, t);
+		if (Fp::isYodd(y) ^ isYodd) {
+			Fp::neg(y, y);
+		}
+	}
+};
+
+template<class T>
+struct TagMultiGr<EcT<T> > {
+	static void square(EcT<T>& z, const EcT<T>& x)
+	{
+		EcT<T>::dbl(z, x);
+	}
+	static void mul(EcT<T>& z, const EcT<T>& x, const EcT<T>& y)
+	{
+		EcT<T>::add(z, x, y);
+	}
+	static void inv(EcT<T>& z, const EcT<T>& x)
+	{
+		EcT<T>::neg(z, x);
+	}
+	static void div(EcT<T>& z, const EcT<T>& x, const EcT<T>& y)
+	{
+		EcT<T>::sub(z, x, y);
+	}
+	static void init(EcT<T>& x)
+	{
+		x.clear();
+	}
+};
+
+template<class _Fp> _Fp EcT<_Fp>::a_;
+template<class _Fp> _Fp EcT<_Fp>::b_;
+template<class _Fp> int EcT<_Fp>::specialA_;
+template<class _Fp> bool EcT<_Fp>::compressedExpression_;
+
+struct EcParam {
+	const char *name;
+	const char *p;
+	const char *a;
+	const char *b;
+	const char *gx;
+	const char *gy;
+	const char *n;
+	size_t bitLen; // bit length of p
+};
+
+} // mcl
+
+namespace std { CYBOZU_NAMESPACE_TR1_BEGIN
+template<class T> struct hash;
+
+template<class _Fp>
+struct hash<mcl::EcT<_Fp> > {
+	size_t operator()(const mcl::EcT<_Fp>& P) const
+	{
+		if (P.isZero()) return 0;
+		P.normalize();
+		uint64_t v = hash<_Fp>()(P.x);
+		v = hash<_Fp>()(P.y, v);
+		return static_cast<size_t>(v);
+	}
+};
+
+CYBOZU_NAMESPACE_TR1_END } // std
diff --git a/include/mcl/ecparam.hpp b/include/mcl/ecparam.hpp
new file mode 100644
index 0000000..a5206e9
--- /dev/null
+++ b/include/mcl/ecparam.hpp
@@ -0,0 +1,161 @@
+#pragma once
+/**
+	@file
+	@brief Elliptic curve parameter
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <mcl/ec.hpp>
+
+namespace mcl { namespace ecparam {
+
+const struct mcl::EcParam secp160k1 = {
+	"secp160k1",
+	"0xfffffffffffffffffffffffffffffffeffffac73",
+	"0",
+	"7",
+	"0x3b4c382ce37aa192a4019e763036f4f5dd4d7ebb",
+	"0x938cf935318fdced6bc28286531733c3f03c4fee",
+	"0x100000000000000000001b8fa16dfab9aca16b6b3",
+	160
+};
+// p=2^160 + 7
+const struct mcl::EcParam p160_1 = {
+	"p160_1",
+	"0x10000000000000000000000000000000000000007",
+	"10",
+	"1343632762150092499701637438970764818528075565078",
+	"1",
+	"1236612389951462151661156731535316138439983579284",
+	"1461501637330902918203683518218126812711137002561",
+	161
+};
+const struct mcl::EcParam secp192k1 = {
+	"secp192k1",
+	"0xfffffffffffffffffffffffffffffffffffffffeffffee37",
+	"0",
+	"3",
+	"0xdb4ff10ec057e9ae26b07d0280b7f4341da5d1b1eae06c7d",
+	"0x9b2f2f6d9c5628a7844163d015be86344082aa88d95e2f9d",
+	"0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d",
+	192
+};
+const struct mcl::EcParam secp224k1 = {
+	"secp224k1",
+	"0xfffffffffffffffffffffffffffffffffffffffffffffffeffffe56d",
+	"0",
+	"5",
+	"0xa1455b334df099df30fc28a169a467e9e47075a90f7e650eb6b7a45c",
+	"0x7e089fed7fba344282cafbd6f7e319f7c0b0bd59e2ca4bdb556d61a5",
+	"0x10000000000000000000000000001dce8d2ec6184caf0a971769fb1f7",
+	224
+};
+const struct mcl::EcParam secp256k1 = {
+	"secp256k1",
+	"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f",
+	"0",
+	"7",
+	"0x79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798",
+	"0x483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8",
+	"0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141",
+	256
+};
+const struct mcl::EcParam secp384r1 = {
+	"secp384r1",
+	"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff",
+	"-3",
+	"0xb3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875ac656398d8a2ed19d2a85c8edd3ec2aef",
+	"0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",
+	"0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",
+	"0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973",
+	384
+};
+const struct mcl::EcParam secp521r1 = {
+	"secp521r1",
+	"0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+	"-3",
+	"0x51953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00",
+	"0xc6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
+	"0x11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
+	"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409",
+	521
+};
+const struct mcl::EcParam NIST_P192 = {
+	"NIST_P192",
+	"0xfffffffffffffffffffffffffffffffeffffffffffffffff",
+	"-3",
+	"0x64210519e59c80e70fa7e9ab72243049feb8deecc146b9b1",
+	"0x188da80eb03090f67cbf20eb43a18800f4ff0afd82ff1012",
+	"0x07192b95ffc8da78631011ed6b24cdd573f977a11e794811",
+	"0xffffffffffffffffffffffff99def836146bc9b1b4d22831",
+	192
+};
+const struct mcl::EcParam NIST_P224 = {
+	"NIST_P224",
+	"0xffffffffffffffffffffffffffffffff000000000000000000000001",
+	"-3",
+	"0xb4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4",
+	"0xb70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21",
+	"0xbd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34",
+	"0xffffffffffffffffffffffffffff16a2e0b8f03e13dd29455c5c2a3d",
+	224
+};
+const struct mcl::EcParam NIST_P256 = {
+	"NIST_P256",
+	"0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff",
+	"-3",
+	"0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b",
+	"0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296",
+	"0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5",
+	"0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551",
+	256
+};
+// same secp384r1
+const struct mcl::EcParam NIST_P384 = {
+	"NIST_P384",
+	"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff",
+	"-3",
+	"0xb3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875ac656398d8a2ed19d2a85c8edd3ec2aef",
+	"0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",
+	"0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",
+	"0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973",
+	384
+};
+// same secp521r1
+const struct mcl::EcParam NIST_P521 = {
+	"NIST_P521",
+	"0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+	"-3",
+	"0x051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00",
+	"0xc6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
+	"0x11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
+	"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409",
+	521
+};
+
+} // mcl::ecparam
+
+static inline const mcl::EcParam* getEcParam(const std::string& name)
+{
+	static const mcl::EcParam *tbl[] = {
+		&ecparam::secp160k1,
+		&ecparam::secp192k1,
+		&ecparam::secp224k1,
+		&ecparam::secp256k1,
+		&ecparam::secp384r1,
+		&ecparam::secp521r1,
+
+		&ecparam::NIST_P192,
+		&ecparam::NIST_P224,
+		&ecparam::NIST_P256,
+		&ecparam::NIST_P384,
+		&ecparam::NIST_P521,
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		if (name == tbl[i]->name) return tbl[i];
+	}
+	throw cybozu::Exception("mcl::getEcParam:not support name") << name;
+}
+
+} // mcl
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
new file mode 100644
index 0000000..a8b980b
--- /dev/null
+++ b/include/mcl/fp.hpp
@@ -0,0 +1,446 @@
+#pragma once
+/**
+	@file
+	@brief finite field class
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <sstream>
+#include <vector>
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4127)
+	#ifndef NOMINMAX
+		#define NOMINMAX
+	#endif
+#endif
+#if defined(_WIN64) || defined(__x86_64__)
+//	#define USE_MONT_FP
+#endif
+#include <cybozu/hash.hpp>
+#include <cybozu/itoa.hpp>
+#include <cybozu/atoi.hpp>
+#include <cybozu/bitvector.hpp>
+#include <mcl/fp_base.hpp>
+#include <mcl/fp_util.hpp>
+#include <mcl/gmp_util.hpp>
+#include <mcl/power.hpp>
+
+#ifndef MCL_FP_BLOCK_MAX_BIT_N
+	#define MCL_FP_BLOCK_MAX_BIT_N 521
+#endif
+
+namespace mcl {
+
+struct Block {
+	typedef fp::Unit Unit;
+	const Unit *p; // pointer to original FpT.v_
+	size_t n;
+	static const size_t UnitByteN = sizeof(Unit);
+	static const size_t maxUnitN = (MCL_FP_BLOCK_MAX_BIT_N + UnitByteN * 8 - 1) / (UnitByteN * 8);
+	Unit v_[maxUnitN];
+};
+
+template<class tag = fp::TagDefault, size_t maxBitN = MCL_FP_BLOCK_MAX_BIT_N>
+class FpT {
+	typedef fp::Unit Unit;
+	static const size_t UnitByteN = sizeof(Unit);
+	static const size_t maxUnitN = (maxBitN + UnitByteN * 8 - 1) / (UnitByteN * 8);
+	static fp::Op op_;
+	static mcl::SquareRoot sq_;
+	static size_t pBitLen_;
+	template<class tag2, size_t maxBitN2> friend class FpT;
+	Unit v_[maxUnitN];
+public:
+	// return pointer to array v_[]
+	const Unit *getUnit() const { return v_; }
+	size_t getUnitN() const { return op_.N; }
+	typedef Unit BlockType;
+	void dump() const
+	{
+		const size_t N = op_.N;
+		for (size_t i = 0; i < N; i++) {
+			printf("%016llx ", (long long)v_[N - 1 - i]);
+		}
+		printf("\n");
+	}
+	static inline void setModulo(const std::string& mstr, int base = 0)
+	{
+		bool isMinus;
+		mpz_class mp;
+		inFromStr(mp, &isMinus, mstr, base);
+		if (isMinus) throw cybozu::Exception("mcl:FpT:setModulo:mstr is not minus") << mstr;
+		pBitLen_ = Gmp::getBitLen(mp);
+		if (pBitLen_ > maxBitN) throw cybozu::Exception("mcl:FpT:setModulo:too large bitLen") << pBitLen_ << maxBitN;
+		Unit p[maxUnitN] = {};
+		const size_t n = Gmp::getRaw(p, maxUnitN, mp);
+		if (n == 0) throw cybozu::Exception("mcl:FpT:setModulo:bad mstr") << mstr;
+#ifdef USE_MONT_FP
+		if (pBitLen_ <= 128) {  op_ = fp::MontFp<tag, 128>::init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 160) { static fp::MontFp<tag, 160> f; op_ = f.init(p); }
+#endif
+		else if (pBitLen_ <= 192) { static fp::MontFp<tag, 192> f; op_ = f.init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 224) { static fp::MontFp<tag, 224> f; op_ = f.init(p); }
+#endif
+		else if (pBitLen_ <= 256) { static fp::MontFp<tag, 256> f; op_ = f.init(p); }
+		else if (pBitLen_ <= 384) { static fp::MontFp<tag, 384> f; op_ = f.init(p); }
+		else if (pBitLen_ <= 448) { static fp::MontFp<tag, 448> f; op_ = f.init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 544) { static fp::MontFp<tag, 544> f; op_ = f.init(p); }
+#else
+		else if (pBitLen_ <= 576) { static fp::MontFp<tag, 576> f; op_ = f.init(p); }
+#endif
+		else { static fp::MontFp<tag, maxBitN> f; op_ = f.init(p); }
+#else
+		if (pBitLen_ <= 128) {  op_ = fp::FixedFp<tag, 128>::init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 160) { static fp::FixedFp<tag, 160> f; op_ = f.init(p); }
+#endif
+		else if (pBitLen_ <= 192) { static fp::FixedFp<tag, 192> f; op_ = f.init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 224) { static fp::FixedFp<tag, 224> f; op_ = f.init(p); }
+#endif
+		else if (pBitLen_ <= 256) { static fp::FixedFp<tag, 256> f; op_ = f.init(p); }
+		else if (pBitLen_ <= 384) { static fp::FixedFp<tag, 384> f; op_ = f.init(p); }
+		else if (pBitLen_ <= 448) { static fp::FixedFp<tag, 448> f; op_ = f.init(p); }
+#if CYBOZU_OS_BIT == 32
+		else if (pBitLen_ <= 544) { static fp::FixedFp<tag, 544> f; op_ = f.init(p); }
+#else
+		else if (pBitLen_ <= 576) { static fp::FixedFp<tag, 576> f; op_ = f.init(p); }
+#endif
+		else { static fp::FixedFp<tag, maxBitN> f; op_ = f.init(p); }
+#endif
+		assert(op_.N <= maxUnitN);
+		sq_.set(mp);
+	}
+	static inline void getModulo(std::string& pstr)
+	{
+		Gmp::toStr(pstr, op_.mp);
+	}
+	static inline bool isYodd(const FpT& x)
+	{
+		Block b;
+		x.getBlock(b);
+		return (b.p[0] & 1) == 1;
+	}
+	static inline bool squareRoot(FpT& y, const FpT& x)
+	{
+		mpz_class mx, my;
+		x.toGmp(mx);
+		bool b = sq_.get(my, mx);
+		if (!b) return false;
+		y.fromGmp(my);
+		return true;
+	}
+	FpT() {}
+	FpT(const FpT& x)
+	{
+		op_.copy(v_, x.v_);
+	}
+	FpT& operator=(const FpT& x)
+	{
+		op_.copy(v_, x.v_);
+		return *this;
+	}
+	void clear()
+	{
+		op_.clear(v_);
+	}
+	FpT(int64_t x) { operator=(x); }
+	explicit FpT(const std::string& str, int base = 0)
+	{
+		fromStr(str, base);
+	}
+	FpT& operator=(int64_t x)
+	{
+		clear();
+		if (x) {
+			int64_t y = x < 0 ? -x : x;
+			if (sizeof(Unit) == 8) {
+				v_[0] = y;
+			} else {
+				v_[0] = (uint32_t)y;
+				v_[1] = (uint32_t)(y >> 32);
+			}
+			if (x < 0) neg(*this, *this);
+			toMont(*this, *this);
+		}
+		return *this;
+	}
+	void toMont(FpT& y, const FpT& x)
+	{
+		if (op_.toMont) op_.toMont(y.v_, x.v_);
+	}
+	void fromMont(FpT& y, const FpT& x)
+	{
+		if (op_.fromMont) op_.fromMont(y.v_, x.v_);
+	}
+	void fromStr(const std::string& str, int base = 0)
+	{
+		bool isMinus;
+		mpz_class x;
+		inFromStr(x, &isMinus, str, base);
+		if (x >= op_.mp) throw cybozu::Exception("fp:FpT:fromStr:large str") << str;
+		fp::local::toArray(v_, op_.N, x.get_mpz_t());
+		if (isMinus) {
+			neg(*this, *this);
+		}
+		toMont(*this, *this);
+	}
+	// alias of fromStr
+	void set(const std::string& str, int base = 0) { fromStr(str, base); }
+	template<class S>
+	void setRaw(const S *inBuf, size_t n)
+	{
+		const size_t byteN = sizeof(S) * n;
+		const size_t fpByteN = sizeof(Unit) * op_.N;
+		if (byteN > fpByteN) throw cybozu::Exception("setRaw:bad n") << n << fpByteN;
+		assert(byteN <= fpByteN);
+		memcpy(v_, inBuf, byteN);
+		memset((char *)v_ + byteN, 0, fpByteN - byteN);
+		if (!isValid()) throw cybozu::Exception("setRaw:large value");
+		toMont(*this, *this);
+	}
+	template<class S>
+	size_t getRaw(S *outBuf, size_t n) const
+	{
+		const size_t byteN = sizeof(S) * n;
+		const size_t fpByteN = sizeof(Unit) * op_.N;
+		if (byteN < fpByteN) throw cybozu::Exception("getRaw:bad n") << n << fpByteN;
+		assert(byteN >= fpByteN);
+		Block b;
+		getBlock(b);
+		memcpy(outBuf, b.p, fpByteN);
+		const size_t writeN = (fpByteN + sizeof(S) - 1) / sizeof(S);
+		memset((char *)outBuf + fpByteN, 0, writeN * sizeof(S) - fpByteN);
+		return writeN;
+	}
+	void getBlock(Block& b) const
+	{
+		assert(maxUnitN <= Block::maxUnitN);
+		b.n = op_.N;
+		if (op_.fromMont) {
+			op_.fromMont(b.v_, v_);
+			b.p = &b.v_[0];
+		} else {
+			b.p = &v_[0];
+		}
+	}
+	template<class RG>
+	void setRand(RG& rg)
+	{
+		fp::getRandVal(v_, rg, op_.p, pBitLen_);
+		fromMont(*this, *this);
+	}
+	static inline void toStr(std::string& str, const Unit *x, size_t n, int base = 10, bool withPrefix = false)
+	{
+		switch (base) {
+		case 10:
+			{
+				mpz_class t;
+				Gmp::setRaw(t, x, n);
+				Gmp::toStr(str, t, 10);
+			}
+			return;
+		case 16:
+			mcl::fp::toStr16(str, x, n, withPrefix);
+			return;
+		case 2:
+			mcl::fp::toStr2(str, x, n, withPrefix);
+			return;
+		default:
+			throw cybozu::Exception("fp:FpT:toStr:bad base") << base;
+		}
+	}
+	void toStr(std::string& str, int base = 10, bool withPrefix = false) const
+	{
+		Block b;
+		getBlock(b);
+		toStr(str, b.p, b.n, base, withPrefix);
+	}
+	std::string toStr(int base = 10, bool withPrefix = false) const
+	{
+		std::string str;
+		toStr(str, base, withPrefix);
+		return str;
+	}
+	void toGmp(mpz_class& x) const
+	{
+		Block b;
+		getBlock(b);
+		Gmp::setRaw(x, b.p, b.n);
+	}
+	mpz_class toGmp() const
+	{
+		mpz_class x;
+		toGmp(x);
+		return x;
+	}
+	void fromGmp(const mpz_class& x)
+	{
+		setRaw(Gmp::getBlock(x), Gmp::getBlockSize(x));
+	}
+	static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.add(z.v_, x.v_, y.v_); }
+	static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.sub(z.v_, x.v_, y.v_); }
+	static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.mul(z.v_, x.v_, y.v_); }
+	static inline void inv(FpT& y, const FpT& x) { op_.inv(y.v_, x.v_); }
+	static inline void neg(FpT& y, const FpT& x) { op_.neg(y.v_, x.v_); }
+	static inline void square(FpT& y, const FpT& x) { op_.square(y.v_, x.v_); }
+	static inline void div(FpT& z, const FpT& x, const FpT& y)
+	{
+		FpT rev;
+		inv(rev, y);
+		mul(z, x, rev);
+	}
+	static inline void powerArray(FpT& z, const FpT& x, const Unit *y, size_t yn)
+	{
+		FpT out(1);
+		FpT t(x);
+		for (size_t i = 0; i < yn; i++) {
+			const Unit v = y[i];
+			int m = (int)sizeof(Unit) * 8;
+			if (i == yn - 1) {
+				while (m > 0 && (v & (Unit(1) << (m - 1))) == 0) {
+					m--;
+				}
+			}
+			for (int j = 0; j < m; j++) {
+				if (v & (Unit(1) << j)) {
+					out *= t;
+				}
+				t *= t;
+			}
+		}
+		z = out;
+	}
+	template<class tag2, size_t maxBitN2>
+	static inline void power(FpT& z, const FpT& x, const FpT<tag2, maxBitN2>& y)
+	{
+		Block b;
+		y.getBlock(b);
+		powerArray(z, x, b.p, b.n);
+	}
+	static inline void power(FpT& z, const FpT& x, int y)
+	{
+		if (y < 0) throw cybozu::Exception("FpT:power with negative y is not support") << y;
+		const Unit u = y;
+		powerArray(z, x, &u, 1);
+	}
+	static inline void power(FpT& z, const FpT& x, const mpz_class& y)
+	{
+		if (y < 0) throw cybozu::Exception("FpT:power with negative y is not support") << y;
+		powerArray(z, x, Gmp::getBlock(y), Gmp::getBlockSize(x));
+	}
+	bool isZero() const { return op_.isZero(v_); }
+	/*
+		append to bv(not clear bv)
+	*/
+	void appendToBitVec(cybozu::BitVector& bv) const
+	{
+		Block b;
+		getBlock(b);
+		bv.append(b.p, pBitLen_);
+	}
+	bool isValid() const
+	{
+		return fp::local::compareArray(v_, op_.p, op_.N) < 0;
+	}
+	void fromBitVec(const cybozu::BitVector& bv)
+	{
+		if (bv.size() != pBitLen_) throw cybozu::Exception("FpT:fromBitVec:bad size") << bv.size() << pBitLen_;
+		setRaw(bv.getBlock(), bv.getBlockSize());
+	}
+	static inline size_t getModBitLen() { return pBitLen_; }
+	static inline size_t getBitVecSize() { return pBitLen_; }
+	bool operator==(const FpT& rhs) const { return fp::local::isEqualArray(v_, rhs.v_, op_.N); }
+	bool operator!=(const FpT& rhs) const { return !operator==(rhs); }
+	inline friend FpT operator+(const FpT& x, const FpT& y) { FpT z; add(z, x, y); return z; }
+	inline friend FpT operator-(const FpT& x, const FpT& y) { FpT z; sub(z, x, y); return z; }
+	inline friend FpT operator*(const FpT& x, const FpT& y) { FpT z; mul(z, x, y); return z; }
+	inline friend FpT operator/(const FpT& x, const FpT& y) { FpT z; div(z, x, y); return z; }
+	FpT& operator+=(const FpT& x) { add(*this, *this, x); return *this; }
+	FpT& operator-=(const FpT& x) { sub(*this, *this, x); return *this; }
+	FpT& operator*=(const FpT& x) { mul(*this, *this, x); return *this; }
+	FpT& operator/=(const FpT& x) { div(*this, *this, x); return *this; }
+	FpT operator-() const { FpT x; neg(x, *this); return x; }
+	friend inline std::ostream& operator<<(std::ostream& os, const FpT& self)
+	{
+		const std::ios_base::fmtflags f = os.flags();
+		if (f & std::ios_base::oct) throw cybozu::Exception("fpT:operator<<:oct is not supported");
+		const int base = (f & std::ios_base::hex) ? 16 : 10;
+		const bool showBase = (f & std::ios_base::showbase) != 0;
+		std::string str;
+		self.toStr(str, base, showBase);
+		return os << str;
+	}
+	friend inline std::istream& operator>>(std::istream& is, FpT& self)
+	{
+		const std::ios_base::fmtflags f = is.flags();
+		if (f & std::ios_base::oct) throw cybozu::Exception("fpT:operator>>:oct is not supported");
+		const int base = (f & std::ios_base::hex) ? 16 : 0;
+		std::string str;
+		is >> str;
+		self.fromStr(str, base);
+		return is;
+	}
+	/*
+		not support
+		getBitLen, operator<, >
+	*/
+	/*
+		QQQ : should be removed
+	*/
+	bool operator<(const FpT&) const { return false; }
+	static inline int compare(const FpT& x, const FpT& y)
+	{
+		Block xb, yb;
+		x.getBlock(xb);
+		y.getBlock(yb);
+		return fp::local::compareArray(xb.p, yb.p, xb.n);
+	}
+private:
+	static inline void inFromStr(mpz_class& x, bool *isMinus, const std::string& str, int base)
+	{
+		const char *p = fp::verifyStr(isMinus, &base, str);
+		if (!Gmp::fromStr(x, p, base)) {
+			throw cybozu::Exception("fp:FpT:inFromStr") << str;
+		}
+	}
+};
+
+template<class tag, size_t maxBitN> fp::Op FpT<tag, maxBitN>::op_;
+template<class tag, size_t maxBitN> mcl::SquareRoot FpT<tag, maxBitN>::sq_;
+template<class tag, size_t maxBitN> size_t FpT<tag, maxBitN>::pBitLen_;
+
+namespace power_impl {
+
+template<class G, class tag, size_t bitN, template<class _tag, size_t _bitN>class FpT>
+void power(G& z, const G& x, const FpT<tag, bitN>& y)
+{
+	Block b;
+	y.getBlock(b);
+	mcl::power_impl::powerArray(z, x, b.p, b.n);
+}
+
+} // mcl::power_impl
+} // mcl
+
+namespace std { CYBOZU_NAMESPACE_TR1_BEGIN
+template<class T> struct hash;
+
+template<class tag, size_t maxBitN>
+struct hash<mcl::FpT<tag, maxBitN> > : public std::unary_function<mcl::FpT<tag, maxBitN>, size_t> {
+	size_t operator()(const mcl::FpT<tag, maxBitN>& x, uint64_t v = 0) const
+	{
+		return static_cast<size_t>(cybozu::hash64(x.getUnit(), x.getUnitN(), v));
+	}
+};
+
+CYBOZU_NAMESPACE_TR1_END } // std::tr1
+
+#ifdef _WIN32
+	#pragma warning(pop)
+#endif
diff --git a/include/mcl/fp_base.hpp b/include/mcl/fp_base.hpp
new file mode 100644
index 0000000..0fb174f
--- /dev/null
+++ b/include/mcl/fp_base.hpp
@@ -0,0 +1,527 @@
+#pragma once
+/**
+	@file
+	@brief basic operation
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4616)
+	#pragma warning(disable : 4800)
+	#pragma warning(disable : 4244)
+	#pragma warning(disable : 4127)
+	#pragma warning(disable : 4512)
+	#pragma warning(disable : 4146)
+#endif
+#include <stdint.h>
+#include <assert.h>
+#include <mcl/gmp_util.hpp>
+#ifdef _MSC_VER
+	#pragma warning(pop)
+#endif
+#include <cybozu/inttype.hpp>
+#ifdef USE_MONT_FP
+#include <mcl/fp_generator.hpp>
+#endif
+
+namespace mcl { namespace fp {
+
+#if defined(CYBOZU_OS_BIT) && (CYBOZU_OS_BIT == 32)
+typedef uint32_t Unit;
+#else
+typedef uint64_t Unit;
+#endif
+
+typedef void (*void1op)(Unit*);
+typedef void (*void2op)(Unit*, const Unit*);
+typedef void (*void3op)(Unit*, const Unit*, const Unit*);
+typedef void (*void4op)(Unit*, const Unit*, const Unit*, const Unit*);
+typedef int (*int2op)(Unit*, const Unit*);
+typedef void (*void4Iop)(Unit*, const Unit*, const Unit*, const Unit*, Unit);
+
+} } // mcl::fp
+
+#ifdef MCL_USE_LLVM
+
+extern "C" {
+
+#define MCL_FP_DEF_FUNC(len) \
+void mcl_fp_add ## len ## S(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_add ## len ## L(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_sub ## len ## S(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_sub ## len ## L(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_mul ## len ## pre(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_mont ## len(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, mcl::fp::Unit);
+
+MCL_FP_DEF_FUNC(128)
+MCL_FP_DEF_FUNC(192)
+MCL_FP_DEF_FUNC(256)
+MCL_FP_DEF_FUNC(320)
+MCL_FP_DEF_FUNC(384)
+MCL_FP_DEF_FUNC(448)
+MCL_FP_DEF_FUNC(512)
+#if CYBOZU_OS_BIT == 32
+MCL_FP_DEF_FUNC(160)
+MCL_FP_DEF_FUNC(224)
+MCL_FP_DEF_FUNC(288)
+MCL_FP_DEF_FUNC(352)
+MCL_FP_DEF_FUNC(416)
+MCL_FP_DEF_FUNC(480)
+MCL_FP_DEF_FUNC(544)
+#else
+MCL_FP_DEF_FUNC(576)
+#endif
+
+void mcl_fp_mul_NIST_P192(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*);
+
+}
+
+#endif
+
+namespace mcl { namespace fp {
+
+namespace local {
+
+inline int compareArray(const Unit* x, const Unit* y, size_t n)
+{
+	for (size_t i = n - 1; i != size_t(-1); i--) {
+		if (x[i] < y[i]) return -1;
+		if (x[i] > y[i]) return 1;
+	}
+	return 0;
+}
+
+inline bool isEqualArray(const Unit* x, const Unit* y, size_t n)
+{
+	for (size_t i = 0; i < n; i++) {
+		if (x[i] != y[i]) return false;
+	}
+	return true;
+}
+
+inline bool isZeroArray(const Unit *x, size_t n)
+{
+	for (size_t i = 0; i < n; i++) {
+		if (x[i]) return false;
+	}
+	return true;
+}
+
+inline void clearArray(Unit *x, size_t begin, size_t end)
+{
+	for (size_t i = begin; i < end; i++) x[i] = 0;
+}
+
+inline void copyArray(Unit *y, const Unit *x, size_t n)
+{
+	for (size_t i = 0; i < n; i++) y[i] = x[i];
+}
+
+inline void toArray(Unit *y, size_t yn, const mpz_srcptr x)
+{
+	const int xn = x->_mp_size;
+	assert(xn >= 0);
+	const Unit* xp = (const Unit*)x->_mp_d;
+	assert(xn <= (int)yn);
+	copyArray(y, xp, xn);
+	clearArray(y, xn, yn);
+}
+
+} // mcl::fp
+struct TagDefault;
+
+struct Op {
+	mpz_class mp;
+	const Unit* p;
+	size_t N;
+	bool (*isZero)(const Unit*);
+	void1op clear;
+	void2op neg;
+	void2op inv;
+	void2op square;
+	void2op copy;
+	void3op add;
+	void3op sub;
+	void3op mul;
+	// for Montgomery
+	void2op toMont;
+	void2op fromMont;
+	Op()
+		: p(0), N(0), isZero(0), clear(0), neg(0), inv(0)
+		, square(0), copy(0),add(0), sub(0), mul(0), toMont(0), fromMont(0)
+	{
+	}
+};
+
+template<class tag, size_t bitN>
+struct FixedFp {
+	typedef fp::Unit Unit;
+	static const size_t N = (bitN + sizeof(Unit) * 8 - 1) / (sizeof(Unit) * 8);
+	static mpz_class mp_;
+	static Unit p_[N];
+	static inline void setModulo(const Unit* p)
+	{
+		assert(N >= 2);
+		assert(sizeof(mp_limb_t) == sizeof(Unit));
+		copy(p_, p);
+		Gmp::setRaw(mp_, p, N);
+	}
+	static inline void set_mpz_t(mpz_t& z, const Unit* p, int n = (int)N)
+	{
+		z->_mp_alloc = n;
+		int i = n;
+		while (i > 0 && p[i - 1] == 0) {
+			i--;
+		}
+		z->_mp_size = i;
+		z->_mp_d = (mp_limb_t*)const_cast<Unit*>(p);
+	}
+	static inline void set_zero(mpz_t& z, Unit *p, size_t n)
+	{
+		z->_mp_alloc = (int)n;
+		z->_mp_size = 0;
+		z->_mp_d = (mp_limb_t*)p;
+	}
+	static inline void clear(Unit *x)
+	{
+		local::clearArray(x, 0, N);
+	}
+	static inline void copy(Unit *y, const Unit *x)
+	{
+		local::copyArray(y, x, N);
+	}
+	static inline void add(Unit *z, const Unit *x, const Unit *y)
+	{
+		Unit ret[N + 2]; // not N + 1
+		mpz_t mz, mx, my;
+		set_zero(mz, ret, N + 2);
+		set_mpz_t(mx, x);
+		set_mpz_t(my, y);
+		mpz_add(mz, mx, my);
+		if (mpz_cmp(mz, mp_.get_mpz_t()) >= 0) {
+			mpz_sub(mz, mz, mp_.get_mpz_t());
+		}
+		local::toArray(z, N, mz);
+	}
+#ifdef MCL_USE_LLVM
+#if CYBOZU_OS_BIT == 64
+	static inline void add128(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add128S(z, x, y, p_); }
+	static inline void sub128(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub128S(z, x, y, p_); }
+	static inline void add192(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add192S(z, x, y, p_); }
+	static inline void sub192(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub192S(z, x, y, p_); }
+	static inline void add256(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add256S(z, x, y, p_); }
+	static inline void sub256(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub256S(z, x, y, p_); }
+	static inline void add384(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add384L(z, x, y, p_); }
+	static inline void sub384(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub384L(z, x, y, p_); }
+
+	static inline void add576(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add576L(z, x, y, p_); }
+	static inline void sub576(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub576L(z, x, y, p_); }
+#else
+	static inline void add128(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add128S(z, x, y, p_); }
+	static inline void sub128(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub128S(z, x, y, p_); }
+	static inline void add192(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add192L(z, x, y, p_); }
+	static inline void sub192(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub192L(z, x, y, p_); }
+	static inline void add256(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add256L(z, x, y, p_); }
+	static inline void sub256(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub256L(z, x, y, p_); }
+	static inline void add384(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add384L(z, x, y, p_); }
+	static inline void sub384(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub384L(z, x, y, p_); }
+
+	static inline void add160(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add160L(z, x, y, p_); }
+	static inline void sub160(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub160L(z, x, y, p_); }
+	static inline void add224(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add224L(z, x, y, p_); }
+	static inline void sub224(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub224L(z, x, y, p_); }
+	static inline void add544(Unit *z, const Unit *x, const Unit *y) { mcl_fp_add544L(z, x, y, p_); }
+	static inline void sub544(Unit *z, const Unit *x, const Unit *y) { mcl_fp_sub544L(z, x, y, p_); }
+#endif
+#endif
+	static inline void sub(Unit *z, const Unit *x, const Unit *y)
+	{
+		Unit ret[N + 1];
+		mpz_t mz, mx, my;
+		set_zero(mz, ret, N + 1);
+		set_mpz_t(mx, x);
+		set_mpz_t(my, y);
+		mpz_sub(mz, mx, my);
+		if (mpz_sgn(mz) < 0) {
+			mpz_add(mz, mz, mp_.get_mpz_t());
+		}
+		local::toArray(z, N, mz);
+	}
+	static inline void mul(Unit *z, const Unit *x, const Unit *y)
+	{
+		Unit ret[N * 2];
+#ifdef MCL_USE_LLVM
+#if CYBOZU_OS_BIT == 64
+		if (bitN <= 128) { mcl_fp_mul128pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 192) { mcl_fp_mul192pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 256) { mcl_fp_mul256pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 384) { mcl_fp_mul384pre(ret, x, y); mod(z, ret); return; }
+//		if (bitN <= 576) { mcl_fp_mul576pre(ret, x, y); mod(z, ret); return; }
+#else
+		if (bitN <= 128) { mcl_fp_mul128pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 160) { mcl_fp_mul160pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 192) { mcl_fp_mul192pre(ret, x, y); mod(z, ret); return; }
+		if (bitN <= 224) { mcl_fp_mul224pre(ret, x, y); mod(z, ret); return; }
+//		if (bitN <= 256) { mcl_fp_mul256pre(ret, x, y); mod(z, ret); return; }
+//		if (bitN <= 384) { mcl_fp_mul384pre(ret, x, y); mod(z, ret); return; }
+//		if (bitN <= 544) { mcl_fp_mul544pre(ret, x, y); mod(z, ret); return; }
+#endif
+#endif
+#if 0
+		pre_mul(ret, x, y);
+		mod(z, ret);
+#else
+		mpz_t mx, my, mz;
+		set_zero(mz, ret, N * 2);
+		set_mpz_t(mx, x);
+		set_mpz_t(my, y);
+		mpz_mul(mz, mx, my);
+		mpz_mod(mz, mz, mp_.get_mpz_t());
+		local::toArray(z, N, mz);
+#endif
+	}
+	static inline void pre_mul(Unit *z, const Unit *x, const Unit *y)
+	{
+		mpz_t mx, my, mz;
+		set_zero(mz, z, N * 2);
+		set_mpz_t(mx, x);
+		set_mpz_t(my, y);
+		mpz_mul(mz, mx, my);
+		local::toArray(z, N * 2, mz);
+	}
+	// x[N * 2] -> y[N]
+	static inline void mod(Unit *y, const Unit *x)
+	{
+		mpz_t mx, my;
+		set_mpz_t(mx, x, N * 2);
+		set_mpz_t(my, y, N);
+		mpz_mod(my, mx, mp_.get_mpz_t());
+		local::clearArray(y, my->_mp_size, N);
+	}
+	static inline void square(Unit *z, const Unit *x)
+	{
+		mul(z, x, x); // QQQ : use powMod with 2?
+	}
+	static inline void inv(Unit *y, const Unit *x)
+	{
+		mpz_class my;
+		mpz_t mx;
+		set_mpz_t(mx, x);
+		mpz_invert(my.get_mpz_t(), mx, mp_.get_mpz_t());
+		local::toArray(y, N, my.get_mpz_t());
+	}
+	static inline bool isZero(const Unit *x)
+	{
+		return local::isZeroArray(x, N);
+	}
+	static inline void neg(Unit *y, const Unit *x)
+	{
+		if (isZero(x)) {
+			if (x != y) clear(y);
+			return;
+		}
+		sub(y, p_, x);
+	}
+	static inline Op init(const Unit *p)
+	{
+		setModulo(p);
+		Op op;
+		op.N = N;
+		op.isZero = &isZero;
+		op.clear = &clear;
+		op.neg = &neg;
+		op.inv = &inv;
+		op.square = &square;
+		op.copy = &copy;
+#ifdef MCL_USE_LLVM
+		printf("fp2 use llvm bitN=%zd\n", bitN);
+		if (bitN <= 128) {
+			op.add = &add128;
+			op.sub = &sub128;
+		} else
+#if CYBOZU_OS_BIT == 32
+		if (bitN <= 160) {
+			op.add = &add160;
+			op.sub = &sub160;
+		} else
+#endif
+		if (bitN <= 192) {
+			op.add = &add192;
+			op.sub = &sub192;
+		} else
+#if CYBOZU_OS_BIT == 32
+		if (bitN <= 224) {
+			op.add = &add224;
+			op.sub = &sub224;
+		} else
+#endif
+		if (bitN <= 256) {
+			op.add = &add256;
+			op.sub = &sub256;
+		} else
+		if (bitN <= 384) {
+			op.add = &add384;
+			op.sub = &sub384;
+		} else
+#if CYBOZU_OS_BIT == 64
+		if (bitN <= 576) {
+			op.add = &add576;
+			op.sub = &sub576;
+		} else
+#else
+		if (bitN <= 544) {
+			op.add = &add544;
+			op.sub = &sub544;
+		} else
+#endif
+#endif
+		{
+			op.add = &add;
+			op.sub = &sub;
+		}
+#ifdef MCL_USE_LLVM
+		if (mp_ == mpz_class("0xfffffffffffffffffffffffffffffffeffffffffffffffff")) {
+			op.mul = &mcl_fp_mul_NIST_P192; // slower than MontFp192
+		} else
+#endif
+		{
+			op.mul = &mul;
+		}
+		op.mp = mp_;
+		op.p = &p_[0];
+		return op;
+	}
+};
+
+template<class tag, size_t bitN> mpz_class FixedFp<tag, bitN>::mp_;
+template<class tag, size_t bitN> fp::Unit FixedFp<tag, bitN>::p_[FixedFp<tag, bitN>::N];
+
+#ifdef USE_MONT_FP
+template<class tag, size_t bitN>
+struct MontFp {
+	typedef fp::Unit Unit;
+	static const size_t N = (bitN + sizeof(Unit) * 8 - 1) / (sizeof(Unit) * 8);
+	static const size_t invTblN = N * sizeof(Unit) * 8 * 2;
+	static mpz_class mp_;
+//	static mcl::SquareRoot sq_;
+	static Unit p_[N];
+	static Unit one_[N];
+	static Unit R_[N]; // (1 << (N * 64)) % p
+	static Unit RR_[N]; // (R * R) % p
+	static Unit invTbl_[invTblN][N];
+	static size_t modBitLen_;
+	static FpGenerator fg_;
+	static void3op add_;
+	static void3op mul_;
+
+	static inline void fromRawGmp(Unit *y, const mpz_class& x)
+	{
+		local::toArray(y, N, x.get_mpz_t());
+	}
+	static inline void setModulo(const Unit *p)
+	{
+		copy(p_, p);
+		Gmp::setRaw(mp_, p, N);
+//		sq_.set(pOrg_);
+
+		mpz_class t = 1;
+		fromRawGmp(one_, t);
+		t = (t << (N * 64)) % mp_;
+		fromRawGmp(R_, t);
+		t = (t * t) % mp_;
+		fromRawGmp(RR_, t);
+		fg_.init(p_, N);
+
+		add_ = Xbyak::CastTo<void3op>(fg_.add_);
+		mul_ = Xbyak::CastTo<void3op>(fg_.mul_);
+	}
+	static void initInvTbl(Unit invTbl[invTblN][N])
+	{
+		Unit t[N];
+		clear(t);
+		t[0] = 2;
+		toMont(t, t);
+		for (int i = 0; i < invTblN; i++) {
+			copy(invTbl[invTblN - 1 - i], t);
+			add_(t, t, t);
+		}
+	}
+	static inline void clear(Unit *x)
+	{
+		local::clearArray(x, 0, N);
+	}
+	static inline void copy(Unit *y, const Unit *x)
+	{
+		local::copyArray(y, x, N);
+	}
+	static inline bool isZero(const Unit *x)
+	{
+		return local::isZeroArray(x, N);
+	}
+	static inline void invC(Unit *y, const Unit *x)
+	{
+		const int2op preInv = Xbyak::CastTo<int2op>(fg_.preInv_);
+		Unit r[N];
+		int k = preInv(r, x);
+		/*
+			xr = 2^k
+			R = 2^(N * 64)
+			get r2^(-k)R^2 = r 2^(N * 64 * 2 - k)
+		*/
+		mul_(y, r, invTbl_[k]);
+	}
+	static inline void squareC(Unit *y, const Unit *x)
+	{
+		mul_(y, x, x);
+	}
+	static inline void toMont(Unit *y, const Unit *x)
+	{
+		mul_(y, x, RR_);
+	}
+	static inline void fromMont(Unit *y, const Unit *x)
+	{
+		mul_(y, x, one_);
+	}
+	static inline Op init(const Unit *p)
+	{
+puts("use MontFp2");
+		setModulo(p);
+		Op op;
+		op.N = N;
+		op.isZero = &isZero;
+		op.clear = &clear;
+		op.neg = Xbyak::CastTo<void2op>(fg_.neg_);
+		op.inv = &invC;
+		op.square = Xbyak::CastTo<void2op>(fg_.sqr_);
+		if (op.square == 0) op.square = &squareC;
+		op.copy = &copy;
+		op.add = add_;
+		op.sub = Xbyak::CastTo<void3op>(fg_.sub_);
+		op.mul = mul_;
+		op.mp = mp_;
+		op.p = &p_[0];
+		op.toMont = &toMont;
+		op.fromMont = &fromMont;
+
+//		shr1 = Xbyak::CastTo<void2op>(fg_.shr1_);
+//		addNc = Xbyak::CastTo<bool3op>(fg_.addNc_);
+//		subNc = Xbyak::CastTo<bool3op>(fg_.subNc_);
+		initInvTbl(invTbl_);
+		return op;
+	}
+};
+template<class tag, size_t bitN> mpz_class MontFp<tag, bitN>::mp_;
+template<class tag, size_t bitN> fp::Unit MontFp<tag, bitN>::p_[MontFp<tag, bitN>::N];
+template<class tag, size_t bitN> fp::Unit MontFp<tag, bitN>::one_[MontFp<tag, bitN>::N];
+template<class tag, size_t bitN> fp::Unit MontFp<tag, bitN>::R_[MontFp<tag, bitN>::N];
+template<class tag, size_t bitN> fp::Unit MontFp<tag, bitN>::RR_[MontFp<tag, bitN>::N];
+template<class tag, size_t bitN> fp::Unit MontFp<tag, bitN>::invTbl_[MontFp<tag, bitN>::invTblN][MontFp<tag, bitN>::N];
+template<class tag, size_t bitN> size_t MontFp<tag, bitN>::modBitLen_;
+template<class tag, size_t bitN> FpGenerator MontFp<tag, bitN>::fg_;
+template<class tag, size_t bitN> void3op MontFp<tag, bitN>::add_;
+template<class tag, size_t bitN> void3op MontFp<tag, bitN>::mul_;
+#endif
+
+} } // mcl::fp
diff --git a/include/mcl/fp_generator.hpp b/include/mcl/fp_generator.hpp
new file mode 100644
index 0000000..9820ca9
--- /dev/null
+++ b/include/mcl/fp_generator.hpp
@@ -0,0 +1,1675 @@
+#pragma once
+/**
+	@file
+	@brief Fp generator
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <stdio.h>
+#include <assert.h>
+#include <cybozu/exception.hpp>
+
+namespace mcl {
+
+namespace montgomery {
+
+/*
+	get pp such that p * pp = -1 mod M,
+	where p is prime and M = 1 << 64(or 32).
+	@param pLow [in] p mod M
+	T is uint32_t or uint64_t
+*/
+template<class T>
+T getCoff(T pLow)
+{
+	T ret = 0;
+	T t = 0;
+	T x = 1;
+
+	for (size_t i = 0; i < sizeof(T) * 8; i++) {
+		if ((t & 1) == 0) {
+			t += pLow;
+			ret += x;
+		}
+		t >>= 1;
+		x <<= 1;
+	}
+	return ret;
+}
+
+} } // mcl::montgomery
+
+#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && (CYBOZU_OS_BIT == 64)
+
+#ifndef XBYAK_NO_OP_NAMES
+	#define XBYAK_NO_OP_NAMES
+#endif
+#include <xbyak/xbyak.h>
+#include <xbyak/xbyak_util.h>
+
+namespace mcl {
+
+namespace fp_gen_local {
+
+class MemReg {
+	const Xbyak::Reg64 *r_;
+	const Xbyak::RegExp *m_;
+	size_t offset_;
+public:
+	MemReg(const Xbyak::Reg64 *r, const Xbyak::RegExp *m, size_t offset) : r_(r), m_(m), offset_(offset) {}
+	bool isReg() const { return r_ != 0; }
+	const Xbyak::Reg64& getReg() const { return *r_; }
+	Xbyak::RegExp getMem() const { return *m_ + offset_ * sizeof(size_t); }
+};
+
+struct MixPack {
+	static const size_t useAll = 100;
+	Xbyak::util::Pack p;
+	Xbyak::RegExp m;
+	size_t mn;
+	MixPack() : mn(0) {}
+	MixPack(Xbyak::util::Pack& remain, size_t& rspPos, size_t n, size_t useRegNum = useAll)
+	{
+		init(remain, rspPos, n, useRegNum);
+	}
+	void init(Xbyak::util::Pack& remain, size_t& rspPos, size_t n, size_t useRegNum = useAll)
+	{
+		size_t pn = std::min(remain.size(), n);
+		if (useRegNum != useAll && useRegNum < pn) pn = useRegNum;
+		this->mn = n - pn;
+		this->m = Xbyak::util::rsp + rspPos;
+		this->p = remain.sub(0, pn);
+		remain = remain.sub(pn);
+		rspPos += mn * 8;
+	}
+	size_t size() const { return p.size() + mn; }
+	bool isReg(size_t n) const { return n < p.size(); }
+	const Xbyak::Reg64& getReg(size_t n) const
+	{
+		assert(n < p.size());
+		return p[n];
+	}
+	Xbyak::RegExp getMem(size_t n) const
+	{
+		const size_t pn = p.size();
+		assert(pn <= n && n < size());
+		return m + (int)((n - pn) * sizeof(size_t));
+	}
+	MemReg operator[](size_t n) const
+	{
+		const size_t pn = p.size();
+		return MemReg((n < pn) ? &p[n] : 0, (n < pn) ? 0 : &m, n - pn);
+	}
+	void removeLast()
+	{
+		if (!size()) throw cybozu::Exception("MixPack:removeLast:empty");
+		if (mn > 0) {
+			mn--;
+		} else {
+			p = p.sub(0, p.size() - 1);
+		}
+	}
+	/*
+		replace Mem with r if possible
+	*/
+	bool replaceMemWith(Xbyak::CodeGenerator *code, const Xbyak::Reg64& r)
+	{
+		if (mn == 0) return false;
+		p.append(r);
+		code->mov(r, code->ptr [m]);
+		m = m + 8;
+		mn--;
+		return true;
+	}
+};
+
+} // fp_gen_local
+
+/*
+	op(r, rm);
+	r  : reg
+	rm : Reg/Mem
+*/
+#define MCL_FP_GEN_OP_RM(op, r, rm) \
+if (rm.isReg()) { \
+	op(r, rm.getReg()); \
+} else { \
+	op(r, qword [rm.getMem()]); \
+}
+
+/*
+	op(rm, r);
+	rm : Reg/Mem
+	r  : reg
+*/
+#define MCL_FP_GEN_OP_MR(op, rm, r) \
+if (rm.isReg()) { \
+	op(rm.getReg(), r); \
+} else { \
+	op(qword [rm.getMem()], r); \
+}
+
+struct FpGenerator : Xbyak::CodeGenerator {
+	typedef Xbyak::RegExp RegExp;
+	typedef Xbyak::Reg64 Reg64;
+	typedef Xbyak::Xmm Xmm;
+	typedef Xbyak::Operand Operand;
+	typedef Xbyak::util::StackFrame StackFrame;
+	typedef Xbyak::util::Pack Pack;
+	typedef fp_gen_local::MixPack MixPack;
+	typedef fp_gen_local::MemReg MemReg;
+	static const int UseRDX = Xbyak::util::UseRDX;
+	static const int UseRCX = Xbyak::util::UseRCX;
+	Xbyak::util::Cpu cpu_;
+	bool useMulx_;
+	const uint64_t *p_;
+	uint64_t pp_;
+	int pn_;
+	bool isFullBit_;
+	// add/sub without carry. return true if overflow
+	typedef bool (*bool3op)(uint64_t*, const uint64_t*, const uint64_t*);
+
+	// add/sub with mod
+	typedef void (*void3op)(uint64_t*, const uint64_t*, const uint64_t*);
+
+	// mul without carry. return top of z
+	typedef uint64_t (*uint3opI)(uint64_t*, const uint64_t*, uint64_t);
+
+	// neg
+	typedef void (*void2op)(uint64_t*, const uint64_t*);
+
+	// preInv
+	typedef int (*int2op)(uint64_t*, const uint64_t*);
+	bool3op addNc_;
+	bool3op subNc_;
+	void3op add_;
+	void3op sub_;
+	void3op mul_;
+	uint3opI mulI_;
+	void2op sqr_;
+	void2op neg_;
+	void2op shr1_;
+	int2op preInv_;
+	FpGenerator()
+		: CodeGenerator(4096 * 8)
+		, p_(0)
+		, pp_(0)
+		, pn_(0)
+		, isFullBit_(0)
+		, addNc_(0)
+		, subNc_(0)
+		, add_(0)
+		, sub_(0)
+		, mul_(0)
+		, mulI_(0)
+		, neg_(0)
+		, shr1_(0)
+		, preInv_(0)
+	{
+		useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2);
+	}
+	/*
+		@param p [in] pointer to prime
+		@param pn [in] length of prime
+	*/
+	void init(const uint64_t *p, int pn)
+	{
+		if (pn < 2) throw cybozu::Exception("mcl:FpGenerator:small pn") << pn;
+		p_ = p;
+		pp_ = montgomery::getCoff(p[0]);
+		pn_ = pn;
+		isFullBit_ = (p_[pn_ - 1] >> 63) != 0;
+//		printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
+
+		setSize(0); // reset code
+		align(16);
+		addNc_ = getCurr<bool3op>();
+		gen_addSubNc(true);
+		align(16);
+		subNc_ = getCurr<bool3op>();
+		gen_addSubNc(false);
+		align(16);
+		add_ = getCurr<void3op>();
+		gen_addMod();
+		align(16);
+		sub_ = getCurr<void3op>();
+		gen_sub();
+		align(16);
+		neg_ = getCurr<void2op>();
+		gen_neg();
+		align(16);
+		mulI_ = getCurr<uint3opI>();
+		gen_mulI();
+		align(16);
+		mul_ = getCurr<void3op>();
+		gen_mul();
+		align(16);
+		sqr_ = getCurr<void2op>();
+		if (!gen_sqr()) {
+			sqr_ = 0;
+		}
+		align(16);
+		shr1_ = getCurr<void2op>();
+		gen_shr1();
+		preInv_ = getCurr<int2op>();
+		gen_preInv();
+	}
+	void gen_addSubNc(bool isAdd)
+	{
+		StackFrame sf(this, 3);
+		if (isAdd) {
+			gen_raw_add(sf.p[0], sf.p[1], sf.p[2], rax);
+		} else {
+			gen_raw_sub(sf.p[0], sf.p[1], sf.p[2], rax);
+		}
+		setc(al);
+		movzx(eax, al);
+	}
+	/*
+		pz[] = px[] + py[]
+	*/
+	void gen_raw_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Reg64& t)
+	{
+		mov(t, ptr [px]);
+		add(t, ptr [py]);
+		mov(ptr [pz], t);
+		for (int i = 1; i < pn_; i++) {
+			mov(t, ptr [px + i * 8]);
+			adc(t, ptr [py + i * 8]);
+			mov(ptr [pz + i * 8], t);
+		}
+	}
+	/*
+		pz[] = px[] - py[]
+	*/
+	void gen_raw_sub(const RegExp& pz, const RegExp& px, const RegExp& py, const Reg64& t)
+	{
+		mov(t, ptr [px]);
+		sub(t, ptr [py]);
+		mov(ptr [pz], t);
+		for (int i = 1; i < pn_; i++) {
+			mov(t, ptr [px + i * 8]);
+			sbb(t, ptr [py + i * 8]);
+			mov(ptr [pz + i * 8], t);
+		}
+	}
+	/*
+		pz[] = -px[]
+	*/
+	void gen_raw_neg(const RegExp& pz, const RegExp& px, const Reg64& t0, const Reg64& t1)
+	{
+		inLocalLabel();
+		mov(t0, ptr [px]);
+		test(t0, t0);
+		jnz(".neg");
+		if (pn_ > 1) {
+			for (int i = 1; i < pn_; i++) {
+				or_(t0, ptr [px + i * 8]);
+			}
+			jnz(".neg");
+		}
+		// zero
+		for (int i = 0; i < pn_; i++) {
+			mov(ptr [pz + i * 8], t0);
+		}
+		jmp(".exit");
+	L(".neg");
+		mov(t1, (size_t)p_);
+		gen_raw_sub(pz, t1, px, t0);
+	L(".exit");
+		outLocalLabel();
+	}
+	/*
+		(rdx:pz[0..n-1]) = px[0..n-1] * y
+		use t, rax, rdx
+		if n > 2
+		use
+		wk[0] if useMulx_
+		wk[0..n-2] otherwise
+	*/
+	void gen_raw_mulI(const RegExp& pz, const RegExp& px, const Reg64& y, const MixPack& wk, const Reg64& t, size_t n)
+	{
+		assert(n >= 2);
+		if (n == 2) {
+			mov(rax, ptr [px]);
+			mul(y);
+			mov(ptr [pz], rax);
+			mov(t, rdx);
+			mov(rax, ptr [px + 8]);
+			mul(y);
+			add(rax, t);
+			adc(rdx, 0);
+			mov(ptr [pz + 8], rax);
+			return;
+		}
+		if (useMulx_) {
+			assert(wk.size() > 0 && wk.isReg(0));
+			const Reg64& t1 = wk.getReg(0);
+			// mulx(H, L, x) = [H:L] = x * rdx
+			mov(rdx, y);
+			mulx(t1, rax, ptr [px]); // [y:rax] = px * y
+			mov(ptr [pz], rax);
+			const Reg64 *pt0 = &t;
+			const Reg64 *pt1 = &t1;
+			for (size_t i = 1; i < n - 1; i++) {
+				mulx(*pt0, rax, ptr [px + i * 8]);
+				if (i == 1) {
+					add(rax, *pt1);
+				} else {
+					adc(rax, *pt1);
+				}
+				mov(ptr [pz + i * 8], rax);
+				std::swap(pt0, pt1);
+			}
+			mulx(rdx, rax, ptr [px + (n - 1) * 8]);
+			adc(rax, *pt1);
+			mov(ptr [pz + (n - 1) * 8], rax);
+			adc(rdx, 0);
+			return;
+		}
+		assert(wk.size() >= n - 1);
+		for (size_t i = 0; i < n; i++) {
+			mov(rax, ptr [px + i * 8]);
+			mul(y);
+			if (i < n - 1) {
+				mov(ptr [pz + i * 8], rax);
+				g_mov(wk[i], rdx);
+			}
+		}
+		for (size_t i = 1; i < n - 1; i++) {
+			mov(t, ptr [pz + i * 8]);
+			if (i == 1) {
+				g_add(t, wk[i - 1]);
+			} else {
+				g_adc(t, wk[i - 1]);
+			}
+			mov(ptr [pz + i * 8], t);
+		}
+		g_adc(rax, wk[n - 2]);
+		mov(ptr [pz + (n - 1) * 8], rax);
+		adc(rdx, 0);
+	}
+	void gen_mulI()
+	{
+		assert(pn_ >= 2);
+		const int regNum = useMulx_ ? 2 : (1 + std::min(pn_ - 1, 8));
+		const int stackSize = useMulx_ ? 0 : (pn_ - 1) * 8;
+		StackFrame sf(this, 3, regNum | UseRDX, stackSize);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& y = sf.p[2];
+		size_t rspPos = 0;
+		Pack remain = sf.t.sub(1);
+		MixPack wk(remain, rspPos, pn_ - 1);
+		gen_raw_mulI(pz, px, y, wk, sf.t[0], pn_);
+		mov(rax, rdx);
+	}
+	/*
+		pz[] = px[]
+	*/
+	void gen_mov(const RegExp& pz, const RegExp& px, const Reg64& t, int n)
+	{
+		for (int i = 0; i < n; i++) {
+			mov(t, ptr [px + i * 8]);
+			mov(ptr [pz + i * 8], t);
+		}
+	}
+	void gen_addMod3()
+	{
+		StackFrame sf(this, 3, 7);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& py = sf.p[2];
+
+		const Reg64& t0 = sf.t[0];
+		const Reg64& t1 = sf.t[1];
+		const Reg64& t2 = sf.t[2];
+		const Reg64& t3 = sf.t[3];
+		const Reg64& t4 = sf.t[4];
+		const Reg64& t5 = sf.t[5];
+		const Reg64& t6 = sf.t[6];
+
+		xor_(t6, t6);
+		load_rm(Pack(t2, t1, t0), px);
+		add_rm(Pack(t2, t1, t0), py);
+		mov_rr(Pack(t5, t4, t3), Pack(t2, t1, t0));
+		adc(t6, 0);
+		mov(rax, (size_t)p_);
+		sub_rm(Pack(t5, t4, t3), rax);
+		sbb(t6, 0);
+		cmovc(t5, t2);
+		cmovc(t4, t1);
+		cmovc(t3, t0);
+		store_mr(pz, Pack(t5, t4, t3));
+	}
+	void gen_subMod_le4(int n)
+	{
+		assert(2 <= n && n <= 4);
+		StackFrame sf(this, 3, (n - 1) * 2);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& py = sf.p[2];
+
+		Pack rx = sf.t.sub(0, n - 1);
+		rx.append(px); // rx = [px, t1, t0]
+		Pack ry = sf.t.sub(n - 1, n - 1);
+		ry.append(rax); // ry = [rax, t3, t2]
+
+		load_rm(rx, px); // destroy px
+		sub_rm(rx, py);
+#if 0
+		sbb(ry[0], ry[0]); // rx[0] = (x > y) ? 0 : -1
+		for (int i = 1; i < n; i++) mov(ry[i], ry[0]);
+		mov(py, (size_t)p_);
+		for (int i = 0; i < n; i++) and_(ry[i], qword [py + 8 * i]);
+		add_rr(rx, ry);
+#else
+		// a little faster
+		sbb(py, py); // py = (x > y) ? 0 : -1
+		mov(rax, (size_t)p_);
+		load_rm(ry, rax); // destroy rax
+		for (size_t i = 0; i < ry.size(); i++) {
+			and_(ry[i], py);
+		}
+		add_rr(rx, ry);
+#endif
+		store_mr(pz, rx);
+	}
+	void gen_addMod()
+	{
+		if (pn_ == 3) {
+			gen_addMod3();
+			return;
+		}
+		StackFrame sf(this, 3, 0, pn_ * 8);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& py = sf.p[2];
+		const Xbyak::CodeGenerator::LabelType jmpMode = pn_ < 5 ? T_AUTO : T_NEAR;
+
+		inLocalLabel();
+		gen_raw_add(pz, px, py, rax);
+		mov(px, (size_t)p_); // destroy px
+		if (isFullBit_) {
+			jc(".over", jmpMode);
+		}
+#ifdef MCL_USE_JMP
+		for (int i = 0; i < pn_; i++) {
+			mov(py, ptr [pz + (pn_ - 1 - i) * 8]); // destroy py
+			cmp(py, ptr [px + (pn_ - 1 - i) * 8]);
+			jc(".exit", jmpMode);
+			jnz(".over", jmpMode);
+		}
+		L(".over");
+			gen_raw_sub(pz, pz, px, rax);
+		L(".exit");
+#else
+		gen_raw_sub(rsp, pz, px, rax);
+		jc(".exit", jmpMode);
+		gen_mov(pz, rsp, rax, pn_);
+		if (isFullBit_) {
+			jmp(".exit", jmpMode);
+			L(".over");
+			gen_raw_sub(pz, pz, px, rax);
+		}
+		L(".exit");
+#endif
+		outLocalLabel();
+	}
+	void gen_sub()
+	{
+		if (pn_ <= 4) {
+			gen_subMod_le4(pn_);
+			return;
+		}
+		StackFrame sf(this, 3);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& py = sf.p[2];
+		const Xbyak::CodeGenerator::LabelType jmpMode = pn_ < 5 ? T_AUTO : T_NEAR;
+
+		inLocalLabel();
+		gen_raw_sub(pz, px, py, rax);
+		jnc(".exit", jmpMode);
+		mov(px, (size_t)p_);
+		gen_raw_add(pz, pz, px, rax);
+	L(".exit");
+		outLocalLabel();
+	}
+	void gen_neg()
+	{
+		StackFrame sf(this, 2, 2);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		gen_raw_neg(pz, px, sf.t[0], sf.t[1]);
+	}
+	void gen_shr1()
+	{
+		const int c = 1;
+		StackFrame sf(this, 2, 1);
+		const Reg64 *t0 = &rax;
+		const Reg64 *t1 = &sf.t[0];
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		mov(*t0, ptr [px]);
+		for (int i = 0; i < pn_ - 1; i++) {
+			mov(*t1, ptr [px + 8 * (i + 1)]);
+			shrd(*t0, *t1, c);
+			mov(ptr [pz + i * 8], *t0);
+			std::swap(t0, t1);
+		}
+		shr(*t0, c);
+		mov(ptr [pz + (pn_ - 1) * 8], *t0);
+	}
+	void gen_mul()
+	{
+		if (pn_ == 3) {
+			gen_montMul3(p_, pp_);
+		} else if (pn_ == 4) {
+			gen_montMul4(p_, pp_);
+		} else if (pn_ <= 9) {
+			gen_montMulN(p_, pp_, pn_);
+		} else {
+			throw cybozu::Exception("mcl:FpGenerator:gen_mul:not implemented for") << pn_;
+		}
+	}
+	bool gen_sqr()
+	{
+		if (pn_ == 3) {
+			gen_montSqr3(p_, pp_);
+			return true;
+		}
+		return false;
+	}
+	/*
+		input (pz[], px[], py[])
+		z[] <- montgomery(x[], y[])
+	*/
+	void gen_montMulN(const uint64_t *p, uint64_t pp, int n)
+	{
+		assert(2 <= pn_ && pn_ <= 9);
+		const int regNum = useMulx_ ? 4 : 3 + std::min(n - 1, 7);
+		const int stackSize = (n * 3 + (isFullBit_ ? 2 : 1)) * 8;
+		StackFrame sf(this, 3, regNum | UseRDX, stackSize);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& py = sf.p[2];
+		const Reg64& y = sf.t[0];
+		const Reg64& pAddr = sf.t[1];
+		const Reg64& t = sf.t[2];
+		Pack remain = sf.t.sub(3);
+		size_t rspPos = 0;
+
+		MixPack pw1(remain, rspPos, n - 1);
+		const RegExp pw2 = rsp + rspPos; // pw2[0..n-1]
+		const RegExp pc = pw2 + n * 8; // pc[0..n+1]
+		mov(pAddr, (size_t)p);
+
+		for (int i = 0; i < n; i++) {
+			mov(y, ptr [py + i * 8]);
+			montgomeryN_1(pp, n, pc, px, y, pAddr, t, pw1, pw2, i == 0);
+		}
+		// pz[] = pc[] - p[]
+		gen_raw_sub(pz, pc, pAddr, t);
+		if (isFullBit_) sbb(qword[pc + n * 8], 0);
+		jnc("@f");
+		for (int i = 0; i < n; i++) {
+			mov(t, ptr [pc + i * 8]);
+			mov(ptr [pz + i * 8], t);
+		}
+	L("@@");
+	}
+	/*
+		input (z, x, y) = (p0, p1, p2)
+		z[0..3] <- montgomery(x[0..3], y[0..3])
+		destroy gt0, ..., gt9, xm0, xm1, p2
+	*/
+	void gen_montMul4(const uint64_t *p, uint64_t pp)
+	{
+		StackFrame sf(this, 3, 10 | UseRDX);
+		const Reg64& p0 = sf.p[0];
+		const Reg64& p1 = sf.p[1];
+		const Reg64& p2 = sf.p[2];
+
+		const Reg64& t0 = sf.t[0];
+		const Reg64& t1 = sf.t[1];
+		const Reg64& t2 = sf.t[2];
+		const Reg64& t3 = sf.t[3];
+		const Reg64& t4 = sf.t[4];
+		const Reg64& t5 = sf.t[5];
+		const Reg64& t6 = sf.t[6];
+		const Reg64& t7 = sf.t[7];
+		const Reg64& t8 = sf.t[8];
+		const Reg64& t9 = sf.t[9];
+
+		movq(xm0, p0); // save p0
+		mov(p0, (uint64_t)p);
+		movq(xm1, p2);
+		mov(p2, ptr [p2]);
+		montgomery4_1(pp, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2);
+
+		movq(p2, xm1);
+		mov(p2, ptr [p2 + 8]);
+		montgomery4_1(pp, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+
+		movq(p2, xm1);
+		mov(p2, ptr [p2 + 16]);
+		montgomery4_1(pp, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+
+		movq(p2, xm1);
+		mov(p2, ptr [p2 + 24]);
+		montgomery4_1(pp, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+		// [t7:t3:t2:t1:t0]
+
+		mov(t4, t0);
+		mov(t5, t1);
+		mov(t6, t2);
+		mov(rdx, t3);
+		sub_rm(Pack(t3, t2, t1, t0), p0);
+		if (isFullBit_) sbb(t7, 0);
+		cmovc(t0, t4);
+		cmovc(t1, t5);
+		cmovc(t2, t6);
+		cmovc(t3, rdx);
+
+		movq(p0, xm0); // load p0
+		store_mr(p0, Pack(t3, t2, t1, t0));
+	}
+	/*
+		input (z, x, y) = (p0, p1, p2)
+		z[0..2] <- montgomery(x[0..2], y[0..2])
+		destroy gt0, ..., gt9, xm0, xm1, p2
+	*/
+	void gen_montMul3(const uint64_t *p, uint64_t pp)
+	{
+		StackFrame sf(this, 3, 10 | UseRDX);
+		const Reg64& p0 = sf.p[0];
+		const Reg64& p1 = sf.p[1];
+		const Reg64& p2 = sf.p[2];
+
+		const Reg64& t0 = sf.t[0];
+		const Reg64& t1 = sf.t[1];
+		const Reg64& t2 = sf.t[2];
+		const Reg64& t3 = sf.t[3];
+		const Reg64& t4 = sf.t[4];
+		const Reg64& t5 = sf.t[5];
+		const Reg64& t6 = sf.t[6];
+		const Reg64& t7 = sf.t[7];
+		const Reg64& t8 = sf.t[8];
+		const Reg64& t9 = sf.t[9];
+
+		movq(xm0, p0); // save p0
+		mov(t7, (uint64_t)p);
+		mov(t9, ptr [p2]);
+		//                c3, c2, c1, c0, px, y,  p,
+		montgomery3_1(pp, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true);
+		mov(t9, ptr [p2 + 8]);
+		montgomery3_1(pp, t1, t0, t3, t2, p1, t9, t7, t4, t5, t6, t8, p0, false);
+
+		mov(t9, ptr [p2 + 16]);
+		montgomery3_1(pp, t2, t1, t0, t3, p1, t9, t7, t4, t5, t6, t8, p0, false);
+
+		// [(t3):t2:t1:t0]
+		mov(t4, t0);
+		mov(t5, t1);
+		mov(t6, t2);
+		sub_rm(Pack(t2, t1, t0), t7);
+		if (isFullBit_) sbb(t3, 0);
+		cmovc(t0, t4);
+		cmovc(t1, t5);
+		cmovc(t2, t6);
+		movq(p0, xm0);
+		store_mr(p0, Pack(t2, t1, t0));
+	}
+	/*
+		input (pz, px)
+		z[0..2] <- montgomery(px[0..2], px[0..2])
+		destroy gt0, ..., gt9, xm0, xm1, p2
+	*/
+	void gen_montSqr3(const uint64_t *p, uint64_t pp)
+	{
+		StackFrame sf(this, 3, 10 | UseRDX, 16 * 3);
+		const Reg64& pz = sf.p[0];
+		const Reg64& px = sf.p[1];
+//		const Reg64& py = sf.p[2]; // not used
+
+		const Reg64& t0 = sf.t[0];
+		const Reg64& t1 = sf.t[1];
+		const Reg64& t2 = sf.t[2];
+		const Reg64& t3 = sf.t[3];
+		const Reg64& t4 = sf.t[4];
+		const Reg64& t5 = sf.t[5];
+		const Reg64& t6 = sf.t[6];
+		const Reg64& t7 = sf.t[7];
+		const Reg64& t8 = sf.t[8];
+		const Reg64& t9 = sf.t[9];
+
+		movq(xm0, pz); // save pz
+		mov(t7, (uint64_t)p);
+		mov(t9, ptr [px]);
+		mul3x1_sqr1(px, t9, t3, t2, t1, t0);
+		mov(t0, rdx);
+		montgomery3_sub(pp, t0, t9, t2, t1, px, t3, t7, t4, t5, t6, t8, pz, true);
+
+		mov(t3, ptr [px + 8]);
+		mul3x1_sqr2(px, t3, t6, t5, t4);
+		add_rr(Pack(t1, t0, t9, t2), Pack(rdx, rax, t5, t4));
+		if (isFullBit_) setc(pz.cvt8());
+		montgomery3_sub(pp, t1, t3, t9, t2, px, t0, t7, t4, t5, t6, t8, pz, false);
+
+		mov(t0, ptr [px + 16]);
+		mul3x1_sqr3(t0, t5, t4);
+		add_rr(Pack(t2, t1, t3, t9), Pack(rdx, rax, t5, t4));
+		if (isFullBit_) setc(pz.cvt8());
+		montgomery3_sub(pp, t2, t0, t3, t9, px, t1, t7, t4, t5, t6, t8, pz, false);
+
+		// [t9:t2:t0:t3]
+		mov(t4, t3);
+		mov(t5, t0);
+		mov(t6, t2);
+		sub_rm(Pack(t2, t0, t3), t7);
+		if (isFullBit_) sbb(t9, 0);
+		cmovc(t3, t4);
+		cmovc(t0, t5);
+		cmovc(t2, t6);
+		movq(pz, xm0);
+		store_mr(pz, Pack(t2, t0, t3));
+	}
+	static inline void debug_put_inner(const uint64_t *ptr, int n)
+	{
+		printf("debug ");
+		for (int i = 0; i < n; i++) {
+			printf("%016llx", (long long)ptr[n - 1 - i]);
+		}
+		printf("\n");
+	}
+#ifdef _MSC_VER
+	void debug_put(const RegExp& m, int n)
+	{
+		assert(n <= 8);
+		static uint64_t regBuf[7];
+
+		push(rax);
+		mov(rax, (size_t)regBuf);
+		mov(ptr [rax + 8 * 0], rcx);
+		mov(ptr [rax + 8 * 1], rdx);
+		mov(ptr [rax + 8 * 2], r8);
+		mov(ptr [rax + 8 * 3], r9);
+		mov(ptr [rax + 8 * 4], r10);
+		mov(ptr [rax + 8 * 5], r11);
+		mov(rcx, ptr [rsp + 8]); // org rax
+		mov(ptr [rax + 8 * 6], rcx); // save
+		mov(rcx, ptr [rax + 8 * 0]); // org rcx
+		pop(rax);
+
+		lea(rcx, ptr [m]);
+		mov(rdx, n);
+		mov(rax, (size_t)debug_put_inner);
+		sub(rsp, 32);
+		call(rax);
+		add(rsp, 32);
+
+		push(rax);
+		mov(rax, (size_t)regBuf);
+		mov(rcx, ptr [rax + 8 * 0]);
+		mov(rdx, ptr [rax + 8 * 1]);
+		mov(r8, ptr [rax + 8 * 2]);
+		mov(r9, ptr [rax + 8 * 3]);
+		mov(r10, ptr [rax + 8 * 4]);
+		mov(r11, ptr [rax + 8 * 5]);
+		mov(rax, ptr [rax + 8 * 6]);
+		add(rsp, 8);
+	}
+#endif
+	/*
+		z >>= c
+		@note shrd(r/m, r, imm)
+	*/
+	void shr_mp(const MixPack& z, uint8_t c, const Reg64& t)
+	{
+		const size_t n = z.size();
+		for (size_t i = 0; i < n - 1; i++) {
+			const Reg64 *p;
+			if (z.isReg(i + 1)) {
+				p = &z.getReg(i + 1);
+			} else {
+				mov(t, ptr [z.getMem(i + 1)]);
+				p = &t;
+			}
+			if (z.isReg(i)) {
+				shrd(z.getReg(i), *p, c);
+			} else {
+				shrd(qword [z.getMem(i)], *p, c);
+			}
+		}
+		if (z.isReg(n - 1)) {
+			shr(z.getReg(n - 1), c);
+		} else {
+			shr(qword [z.getMem(n - 1)], c);
+		}
+	}
+	/*
+		z *= 2
+	*/
+	void twice_mp(const MixPack& z, const Reg64& t)
+	{
+		g_add(z[0], z[0], t);
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			g_adc(z[i], z[i], t);
+		}
+	}
+	/*
+		z += x
+	*/
+	void add_mp(const MixPack& z, const MixPack& x, const Reg64& t)
+	{
+		assert(z.size() == x.size());
+		g_add(z[0], x[0], t);
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			g_adc(z[i], x[i], t);
+		}
+	}
+	void add_m_m(const RegExp& mz, const RegExp& mx, const Reg64& t, int n)
+	{
+		for (int i = 0; i < n; i++) {
+			mov(t, ptr [mx + i * 8]);
+			if (i == 0) {
+				add(ptr [mz + i * 8], t);
+			} else {
+				adc(ptr [mz + i * 8], t);
+			}
+		}
+	}
+	/*
+		mz[] = mx[] - y
+	*/
+	void sub_m_mp_m(const RegExp& mz, const RegExp& mx, const MixPack& y, const Reg64& t)
+	{
+		for (size_t i = 0; i < y.size(); i++) {
+			mov(t, ptr [mx + i * 8]);
+			if (i == 0) {
+				if (y.isReg(i)) {
+					sub(t, y.getReg(i));
+				} else {
+					sub(t, ptr [y.getMem(i)]);
+				}
+			} else {
+				if (y.isReg(i)) {
+					sbb(t, y.getReg(i));
+				} else {
+					sbb(t, ptr [y.getMem(i)]);
+				}
+			}
+			mov(ptr [mz + i * 8], t);
+		}
+	}
+	/*
+		z -= x
+	*/
+	void sub_mp(const MixPack& z, const MixPack& x, const Reg64& t)
+	{
+		assert(z.size() == x.size());
+		g_sub(z[0], x[0], t);
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			g_sbb(z[i], x[i], t);
+		}
+	}
+	/*
+		z -= px[]
+	*/
+	void sub_mp_m(const MixPack& z, const RegExp& px, const Reg64& t)
+	{
+		if (z.isReg(0)) {
+			sub(z.getReg(0), ptr [px]);
+		} else {
+			mov(t, ptr [px]);
+			sub(ptr [z.getMem(0)], t);
+		}
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			if (z.isReg(i)) {
+				sbb(z.getReg(i), ptr [px + i * 8]);
+			} else {
+				mov(t, ptr [px + i * 8]);
+				sbb(ptr [z.getMem(i)], t);
+			}
+		}
+	}
+	void store_mp(const RegExp& m, const MixPack& z, const Reg64& t)
+	{
+		for (size_t i = 0, n = z.size(); i < n; i++) {
+			if (z.isReg(i)) {
+				mov(ptr [m + i * 8], z.getReg(i));
+			} else {
+				mov(t, ptr [z.getMem(i)]);
+				mov(ptr [m + i * 8], t);
+			}
+		}
+	}
+	void load_mp(const MixPack& z, const RegExp& m, const Reg64& t)
+	{
+		for (size_t i = 0, n = z.size(); i < n; i++) {
+			if (z.isReg(i)) {
+				mov(z.getReg(i), ptr [m + i * 8]);
+			} else {
+				mov(t, ptr [m + i * 8]);
+				mov(ptr [z.getMem(i)], t);
+			}
+		}
+	}
+	void set_mp(const MixPack& z, const Reg64& t)
+	{
+		for (size_t i = 0, n = z.size(); i < n; i++) {
+			MCL_FP_GEN_OP_MR(mov, z[i], t)
+		}
+	}
+	void mov_mp(const MixPack& z, const MixPack& x, const Reg64& t)
+	{
+		for (size_t i = 0, n = z.size(); i < n; i++) {
+			const MemReg zi = z[i], xi = x[i];
+			if (z.isReg(i)) {
+				MCL_FP_GEN_OP_RM(mov, zi.getReg(), xi)
+			} else {
+				if (x.isReg(i)) {
+					mov(ptr [z.getMem(i)], x.getReg(i));
+				} else {
+					mov(t, ptr [x.getMem(i)]);
+					mov(ptr [z.getMem(i)], t);
+				}
+			}
+		}
+	}
+#ifdef _MSC_VER
+	void debug_put_mp(const MixPack& mp, int n, const Reg64& t)
+	{
+		if (n >= 10) exit(1);
+		static uint64_t buf[10];
+		movq(xm0, rax);
+		mov(rax, (size_t)buf);
+		store_mp(rax, mp, t);
+		movq(rax, xm0);
+		push(rax);
+		mov(rax, (size_t)buf);
+		debug_put(rax, n);
+		pop(rax);
+	}
+#endif
+
+	std::string mkLabel(const char *label, int n) const
+	{
+		return std::string(label) + Xbyak::Label::toStr(n);
+	}
+	/*
+		int k = preInvC(pr, px)
+	*/
+	void gen_preInv()
+	{
+		assert(pn_ >= 2);
+		const int freeRegNum = 13;
+		if (pn_ > 9) {
+			throw cybozu::Exception("mcl:FpGenerator:gen_preInv:large pn_") << pn_;
+		}
+		StackFrame sf(this, 2, 10 | UseRDX | UseRCX, (std::max<int>(0, pn_ * 5 - freeRegNum) + 1 + (isFullBit_ ? 1 : 0)) * 8);
+		const Reg64& pr = sf.p[0];
+		const Reg64& px = sf.p[1];
+		const Reg64& t = rcx;
+		/*
+			k = rax, t = rcx : temporary
+			use rdx, pr, px in main loop, so we can use 13 registers
+			v = t[0, pn_) : all registers
+		*/
+		size_t rspPos = 0;
+
+		assert(sf.t.size() >= (size_t)pn_);
+		Pack remain = sf.t;
+
+		const MixPack rr(remain, rspPos, pn_);
+		remain.append(rdx);
+		const MixPack ss(remain, rspPos, pn_);
+		remain.append(px);
+		const int rSize = (int)remain.size();
+		MixPack vv(remain, rspPos, pn_, rSize > 0 ? rSize / 2 : -1);
+		remain.append(pr);
+		MixPack uu(remain, rspPos, pn_);
+
+		const RegExp keep_pr = rsp + rspPos;
+		rspPos += 8;
+		const RegExp rTop = rsp + rspPos; // used if isFullBit_
+
+		inLocalLabel();
+		mov(ptr [keep_pr], pr);
+		mov(rax, px);
+		// px is free frome here
+		load_mp(vv, rax, t); // v = x
+		mov(rax, (size_t)p_);
+		load_mp(uu, rax, t); // u = p_
+		// k = 0
+		xor_(rax, rax);
+		// rTop = 0
+		if (isFullBit_) {
+			mov(ptr [rTop], rax);
+		}
+		// r = 0;
+		set_mp(rr, rax);
+		// s = 1
+		set_mp(ss, rax);
+		if (ss.isReg(0)) {
+			mov(ss.getReg(0), 1);
+		} else {
+			mov(qword [ss.getMem(0)], 1);
+		}
+#if 0
+	L(".lp");
+		or_mp(vv, t);
+		jz(".exit", T_NEAR);
+
+		g_test(uu[0], 1);
+		jz(".u_even", T_NEAR);
+		g_test(vv[0], 1);
+		jz(".v_even", T_NEAR);
+		for (int i = pn_ - 1; i >= 0; i--) {
+			g_cmp(vv[i], uu[i], t);
+			jc(".v_lt_u", T_NEAR);
+			if (i > 0) jnz(".v_ge_u", T_NEAR);
+		}
+
+	L(".v_ge_u");
+		sub_mp(vv, uu, t);
+		add_mp(ss, rr, t);
+	L(".v_even");
+		shr_mp(vv, 1, t);
+		twice_mp(rr, t);
+		if (isFullBit_) {
+			sbb(t, t);
+			mov(ptr [rTop], t);
+		}
+		inc(rax);
+		jmp(".lp", T_NEAR);
+	L(".v_lt_u");
+		sub_mp(uu, vv, t);
+		add_mp(rr, ss, t);
+		if (isFullBit_) {
+			sbb(t, t);
+			mov(ptr [rTop], t);
+		}
+	L(".u_even");
+		shr_mp(uu, 1, t);
+		twice_mp(ss, t);
+		inc(rax);
+		jmp(".lp", T_NEAR);
+#else
+		for (int cn = pn_; cn > 0; cn--) {
+			const std::string _lp = mkLabel(".lp", cn);
+			const std::string _u_v_odd = mkLabel(".u_v_odd", cn);
+			const std::string _u_even = mkLabel(".u_even", cn);
+			const std::string _v_even = mkLabel(".v_even", cn);
+			const std::string _v_ge_u = mkLabel(".v_ge_u", cn);
+			const std::string _v_lt_u = mkLabel(".v_lt_u", cn);
+		L(_lp);
+			or_mp(vv, t);
+			jz(".exit", T_NEAR);
+
+			g_test(uu[0], 1);
+			jz(_u_even, T_NEAR);
+			g_test(vv[0], 1);
+			jz(_v_even, T_NEAR);
+		L(_u_v_odd);
+			if (cn > 1) {
+				isBothZero(vv[cn - 1], uu[cn - 1], t);
+				jz(mkLabel(".u_v_odd", cn - 1), T_NEAR);
+			}
+			for (int i = cn - 1; i >= 0; i--) {
+				g_cmp(vv[i], uu[i], t);
+				jc(_v_lt_u, T_NEAR);
+				if (i > 0) jnz(_v_ge_u, T_NEAR);
+			}
+
+		L(_v_ge_u);
+			sub_mp(vv, uu, t);
+			add_mp(ss, rr, t);
+		L(_v_even);
+			shr_mp(vv, 1, t);
+			twice_mp(rr, t);
+			if (isFullBit_) {
+				sbb(t, t);
+				mov(ptr [rTop], t);
+			}
+			inc(rax);
+			jmp(_lp, T_NEAR);
+		L(_v_lt_u);
+			sub_mp(uu, vv, t);
+			add_mp(rr, ss, t);
+			if (isFullBit_) {
+				sbb(t, t);
+				mov(ptr [rTop], t);
+			}
+		L(_u_even);
+			shr_mp(uu, 1, t);
+			twice_mp(ss, t);
+			inc(rax);
+			jmp(_lp, T_NEAR);
+
+			if (cn > 0) {
+				vv.removeLast();
+				uu.removeLast();
+			}
+		}
+#endif
+	L(".exit");
+		assert(ss.isReg(0) && ss.isReg(1));
+		const Reg64& t2 = ss.getReg(0);
+		const Reg64& t3 = ss.getReg(1);
+
+		mov(t2, (size_t)p_);
+		if (isFullBit_) {
+			mov(t, ptr [rTop]);
+			test(t, t);
+			jz("@f");
+			sub_mp_m(rr, t2, t);
+		L("@@");
+		}
+		mov(t3, ptr [keep_pr]);
+		// pr[] = p[] - rr
+		sub_m_mp_m(t3, t2, rr, t);
+		jnc("@f");
+		// pr[] += p[]
+		add_m_m(t3, t2, t, pn_);
+	L("@@");
+		outLocalLabel();
+	}
+	void mov32c(const Reg64& r, uint64_t c)
+	{
+		if (c & 0xffffffff00000000ULL) {
+			mov(r, c);
+		} else {
+			mov(Xbyak::Reg32(r.getIdx()), (uint32_t)c);
+		}
+	}
+private:
+	FpGenerator(const FpGenerator&);
+	void operator=(const FpGenerator&);
+	void make_op_rm(void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), const Reg64& op1, const MemReg& op2)
+	{
+		if (op2.isReg()) {
+			(this->*op)(op1, op2.getReg());
+		} else {
+			(this->*op)(op1, qword [op2.getMem()]);
+		}
+	}
+	void make_op_mr(void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), const MemReg& op1, const Reg64& op2)
+	{
+		if (op1.isReg()) {
+			(this->*op)(op1.getReg(), op2);
+		} else {
+			(this->*op)(qword [op1.getMem()], op2);
+		}
+	}
+	void make_op(void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), const MemReg& op1, const MemReg& op2, const Reg64& t)
+	{
+		if (op1.isReg()) {
+			make_op_rm(op, op1.getReg(), op2);
+		} else if (op2.isReg()) {
+			(this->*op)(ptr [op1.getMem()], op2.getReg());
+		} else {
+			mov(t, ptr [op2.getMem()]);
+			(this->*op)(ptr [op1.getMem()], t);
+		}
+	}
+	void g_add(const MemReg& op1, const MemReg& op2, const Reg64& t) { make_op(&Xbyak::CodeGenerator::add, op1, op2, t); }
+	void g_adc(const MemReg& op1, const MemReg& op2, const Reg64& t) { make_op(&Xbyak::CodeGenerator::adc, op1, op2, t); }
+	void g_sub(const MemReg& op1, const MemReg& op2, const Reg64& t) { make_op(&Xbyak::CodeGenerator::sub, op1, op2, t); }
+	void g_sbb(const MemReg& op1, const MemReg& op2, const Reg64& t) { make_op(&Xbyak::CodeGenerator::sbb, op1, op2, t); }
+	void g_cmp(const MemReg& op1, const MemReg& op2, const Reg64& t) { make_op(&Xbyak::CodeGenerator::cmp, op1, op2, t); }
+	void g_or(const Reg64& r, const MemReg& op) { make_op_rm(&Xbyak::CodeGenerator::or_, r, op); }
+	void g_test(const MemReg& op1, const MemReg& op2, const Reg64& t)
+	{
+		const MemReg *pop1 = &op1;
+		const MemReg *pop2 = &op2;
+		if (!pop1->isReg()) {
+			std::swap(pop1, pop2);
+		}
+		// (M, M), (R, M), (R, R)
+		if (pop1->isReg()) {
+			MCL_FP_GEN_OP_MR(test, (*pop2), pop1->getReg())
+		} else {
+			mov(t, ptr [pop1->getMem()]);
+			test(ptr [pop2->getMem()], t);
+		}
+	}
+	void g_mov(const MemReg& op, const Reg64& r)
+	{
+		make_op_mr(&Xbyak::CodeGenerator::mov, op, r);
+	}
+	void g_mov(const Reg64& r, const MemReg& op)
+	{
+		make_op_rm(&Xbyak::CodeGenerator::mov, r, op);
+	}
+	void g_add(const Reg64& r, const MemReg& mr) { MCL_FP_GEN_OP_RM(add, r, mr) }
+	void g_adc(const Reg64& r, const MemReg& mr) { MCL_FP_GEN_OP_RM(adc, r, mr) }
+	void isBothZero(const MemReg& op1, const MemReg& op2, const Reg64& t)
+	{
+		g_mov(t, op1);
+		g_or(t, op2);
+	}
+	void g_test(const MemReg& op, int imm)
+	{
+		MCL_FP_GEN_OP_MR(test, op, imm)
+	}
+	/*
+		z[] = x[]
+	*/
+	void mov_rr(const Pack& z, const Pack& x)
+	{
+		assert(z.size() == x.size());
+		for (int i = 0, n = (int)x.size(); i < n; i++) {
+			mov(z[i], x[i]);
+		}
+	}
+	/*
+		m[] = x[]
+	*/
+	void store_mr(const RegExp& m, const Pack& x)
+	{
+		for (int i = 0, n = (int)x.size(); i < n; i++) {
+			mov(ptr [m + 8 * i], x[i]);
+		}
+	}
+	/*
+		x[] = m[]
+	*/
+	void load_rm(const Pack& z, const RegExp& m)
+	{
+		for (int i = 0, n = (int)z.size(); i < n; i++) {
+			mov(z[i], ptr [m + 8 * i]);
+		}
+	}
+	/*
+		z[] += x[]
+	*/
+	void add_rr(const Pack& z, const Pack& x)
+	{
+		add(z[0], x[0]);
+		assert(z.size() == x.size());
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			adc(z[i], x[i]);
+		}
+	}
+	/*
+		z[] -= x[]
+	*/
+	void sub_rr(const Pack& z, const Pack& x)
+	{
+		sub(z[0], x[0]);
+		assert(z.size() == x.size());
+		for (size_t i = 1, n = z.size(); i < n; i++) {
+			sbb(z[i], x[i]);
+		}
+	}
+	/*
+		z[] += m[]
+	*/
+	void add_rm(const Pack& z, const RegExp& m)
+	{
+		add(z[0], ptr [m + 8 * 0]);
+		for (int i = 1, n = (int)z.size(); i < n; i++) {
+			adc(z[i], ptr [m + 8 * i]);
+		}
+	}
+	/*
+		z[] -= m[]
+	*/
+	void sub_rm(const Pack& z, const RegExp& m)
+	{
+		sub(z[0], ptr [m + 8 * 0]);
+		for (int i = 1, n = (int)z.size(); i < n; i++) {
+			sbb(z[i], ptr [m + 8 * i]);
+		}
+	}
+	/*
+		t = all or z[i]
+		ZF = z is zero
+	*/
+	void or_mp(const MixPack& z, const Reg64& t)
+	{
+		const size_t n = z.size();
+		if (n == 1) {
+			if (z.isReg(0)) {
+				test(z.getReg(0), z.getReg(0));
+			} else {
+				mov(t, ptr [z.getMem(0)]);
+				test(t, t);
+			}
+		} else {
+			g_mov(t, z[0]);
+			for (size_t i = 1; i < n; i++) {
+				g_or(t, z[i]);
+			}
+		}
+	}
+	/*
+		[rdx:x:t1:t0] <- py[2:1:0] * x
+		destroy x, t
+	*/
+	void mul3x1(const RegExp& py, const Reg64& x, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t)
+	{
+		if (useMulx_) {
+			// mulx(H, L, x) = [H:L] = x * rdx
+			/*
+				rdx:x
+				    t:t1
+				      rax:t0
+			*/
+			mov(rdx, x);
+			mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
+			mulx(t, t1, ptr [py + 8]); // [t:t1] = py[1] * x
+			add(t1, rax);
+			mulx(rdx, x, ptr [py + 8 * 2]);
+			adc(x, t);
+			adc(rdx, 0);
+		} else {
+			mov(rax, ptr [py]);
+			mul(x);
+			mov(t0, rax);
+			mov(t1, rdx);
+			mov(rax, ptr [py + 8]);
+			mul(x);
+			mov(t, rax);
+			mov(t2, rdx);
+			mov(rax, ptr [py + 8 * 2]);
+			mul(x);
+			/*
+				rdx:rax
+				     t2:t
+				        t1:t0
+			*/
+			add(t1, t);
+			adc(rax, t2);
+			adc(rdx, 0);
+			mov(x, rax);
+		}
+	}
+	/*
+		[x2:x1:x0] * x0
+	*/
+	void mul3x1_sqr1(const RegExp& px, const Reg64& x0, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t)
+	{
+		mov(rax, x0);
+		mul(x0);
+		mov(t0, rax);
+		mov(t1, rdx);
+		mov(rax, ptr [px + 8]);
+		mul(x0);
+		mov(ptr [rsp + 0 * 8], rax); // (x0 * x1)_L
+		mov(ptr [rsp + 1 * 8], rdx); // (x0 * x1)_H
+		mov(t, rax);
+		mov(t2, rdx);
+		mov(rax, ptr [px + 8 * 2]);
+		mul(x0);
+		mov(ptr [rsp + 2 * 8], rax); // (x0 * x2)_L
+		mov(ptr [rsp + 3 * 8], rdx); // (x0 * x2)_H
+		/*
+			rdx:rax
+			     t2:t
+			        t1:t0
+		*/
+		add(t1, t);
+		adc(t2, rax);
+		adc(rdx, 0);
+	}
+	/*
+		[x2:x1:x0] * x1
+	*/
+	void mul3x1_sqr2(const RegExp& px, const Reg64& x1, const Reg64& t2, const Reg64& t1, const Reg64& t0)
+	{
+		mov(t0, ptr [rsp + 0 * 8]);// (x0 * x1)_L
+		mov(rax, x1);
+		mul(x1);
+		mov(t1, rax);
+		mov(t2, rdx);
+		mov(rax, ptr [px + 8 * 2]);
+		mul(x1);
+		mov(ptr [rsp + 4 * 8], rax); // (x1 * x2)_L
+		mov(ptr [rsp + 5 * 8], rdx); // (x1 * x2)_H
+		/*
+			rdx:rax
+			     t2:t1
+			         t:t0
+		*/
+		add(t1, ptr [rsp + 1 * 8]); // (x0 * x1)_H
+		adc(rax, t2);
+		adc(rdx, 0);
+	}
+	/*
+		[rdx:rax:t1:t0] = [x2:x1:x0] * x2
+	*/
+	void mul3x1_sqr3(const Reg64& x2, const Reg64& t1, const Reg64& t0)
+	{
+		mov(rax, x2);
+		mul(x2);
+		/*
+			rdx:rax
+			     t2:t
+			        t1:t0
+		*/
+		mov(t0, ptr [rsp + 2 * 8]); // (x0 * x2)_L
+		mov(t1, ptr [rsp + 3 * 8]); // (x0 * x2)_H
+		add(t1, ptr [rsp + 4 * 8]); // (x1 * x2)_L
+		adc(rax, ptr [rsp + 5 * 8]); // (x1 * x2)_H
+		adc(rdx, 0);
+	}
+
+	/*
+		c = [c3:y:c1:c0] = c + x[2..0] * y
+		q = uint64_t(c0 * pp)
+		c = (c + q * p) >> 64
+		input  [c3:c2:c1:c0], px, y, p
+		output [c0:c3:c2:c1] ; c0 == 0 unless isFullBit_
+
+		@note use rax, rdx, destroy y
+	*/
+	void montgomery3_sub(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
+		const Reg64& /*px*/, const Reg64& y, const Reg64& p,
+		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst)
+	{
+		// input [c3:y:c1:0]
+		// [t4:c3:y:c1:c0]
+		// t4 = 0 or 1 if isFullBit_, = 0 otherwise
+		mov(rax, pp);
+		mul(c0); // q = rax
+		mov(c2, rax);
+		mul3x1(p, c2, t2, t1, t0, t3);
+		// [rdx:c2:t1:t0] = p * q
+		add(c0, t0); // always c0 is zero
+		adc(c1, t1);
+		adc(c2, y);
+		adc(c3, rdx);
+		if (isFullBit_) {
+			if (isFirst) {
+				setc(c0.cvt8());
+			} else {
+				adc(c0.cvt8(), t4.cvt8());
+			}
+		}
+	}
+	/*
+		c = [c3:c2:c1:c0]
+		c += x[2..0] * y
+		q = uint64_t(c0 * pp)
+		c = (c + q * p) >> 64
+		input  [c3:c2:c1:c0], px, y, p
+		output [c0:c3:c2:c1] ; c0 == 0 unless isFullBit_
+
+		@note use rax, rdx, destroy y
+	*/
+	void montgomery3_1(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
+		const Reg64& px, const Reg64& y, const Reg64& p,
+		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst)
+	{
+		if (isFirst) {
+			mul3x1(px, y, c2, c1, c0, c3);
+			mov(c3, rdx);
+			// [c3:y:c1:c0] = px[2..0] * y
+		} else {
+			mul3x1(px, y, t2, t1, t0, t3);
+			// [rdx:y:t1:t0] = px[2..0] * y
+			add_rr(Pack(c3, y, c1, c0), Pack(rdx, c2, t1, t0));
+			if (isFullBit_) setc(t4.cvt8());
+		}
+		montgomery3_sub(pp, c3, c2, c1, c0, px, y, p, t0, t1, t2, t3, t4, isFirst);
+	}
+	/*
+		pc[0..n] += x[0..n-1] * y ; pc[] = 0 if isFirst
+		pc[n + 1] is temporary used if isFullBit_
+		q = uint64_t(pc[0] * pp)
+		pc[] = (pc[] + q * p) >> 64
+		input : pc[], px[], y, p[], pw1[], pw2[]
+		output : pc[0..n]   ; if isFullBit_
+		         pc[0..n-1] ; if !isFullBit_
+		destroy y
+		use
+		pw1[0] if useMulx_
+		pw1[0..n-2] otherwise
+		pw2[0..n-1]
+	*/
+	void montgomeryN_1(uint64_t pp, int n, const RegExp& pc, const RegExp& px, const Reg64& y, const Reg64& p, const Reg64& t, const MixPack& pw1, const RegExp& pw2, bool isFirst)
+	{
+		// pc[] += x[] * y
+		if (isFirst) {
+			gen_raw_mulI(pc, px, y, pw1, t, n);
+			mov(ptr [pc + n * 8], rdx);
+		} else {
+			gen_raw_mulI(pw2, px, y, pw1, t, n);
+			mov(t, ptr [pw2 + 0 * 8]);
+			add(ptr [pc + 0 * 8], t);
+			for (int i = 1; i < n; i++) {
+				mov(t, ptr [pw2 + i * 8]);
+				adc(ptr [pc + i * 8], t);
+			}
+			adc(ptr [pc + n * 8], rdx);
+			if (isFullBit_) {
+				mov(t, 0);
+				adc(t, 0);
+				mov(qword [pc + (n + 1) * 8], t);
+			}
+		}
+		mov(rax, pp);
+		mul(qword [pc]);
+		mov(y, rax); // y = q
+		gen_raw_mulI(pw2, p, y, pw1, t, n);
+		// c[] = (c[] + pw2[]) >> 64
+		mov(t, ptr [pw2 + 0 * 8]);
+		add(t, ptr [pc + 0 * 8]);
+		for (int i = 1; i < n; i++) {
+			mov(t, ptr [pw2 + i * 8]);
+			adc(t, ptr [pc + i * 8]);
+			mov(ptr [pc + (i - 1) * 8], t);
+		}
+		adc(rdx, ptr [pc + n * 8]);
+		mov(ptr [pc + (n - 1) * 8], rdx);
+		if (isFullBit_) {
+			if (isFirst) {
+				mov(t, 0);
+			} else {
+				mov(t, ptr [pc + (n + 1) * 8]);
+			}
+			adc(t, 0);
+			mov(qword [pc + n * 8], t);
+		} else {
+			xor_(eax, eax);
+			mov(ptr [pc + n * 8], rax);
+		}
+	}
+	/*
+		[rdx:x:t2:t1:t0] <- py[3:2:1:0] * x
+		destroy x, t
+	*/
+	void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t)
+	{
+		if (useMulx_) {
+			mov(rdx, x);
+			mulx(t1, t0, ptr [py + 8 * 0]);
+			mulx(t2, rax, ptr [py + 8 * 1]);
+			add(t1, rax);
+			mulx(x, rax, ptr [py + 8 * 2]);
+			adc(t2, rax);
+			mulx(rdx, rax, ptr [py + 8 * 3]);
+			adc(x, rax);
+			adc(rdx, 0);
+		} else {
+			mov(rax, ptr [py]);
+			mul(x);
+			mov(t0, rax);
+			mov(t1, rdx);
+			mov(rax, ptr [py + 8]);
+			mul(x);
+			mov(t, rax);
+			mov(t2, rdx);
+			mov(rax, ptr [py + 8 * 2]);
+			mul(x);
+			mov(t3, rax);
+			mov(rax, x);
+			mov(x, rdx);
+			mul(qword [py + 8 * 3]);
+			add(t1, t);
+			adc(t2, t3);
+			adc(x, rax);
+			adc(rdx, 0);
+		}
+	}
+
+	/*
+		c = [c4:c3:c2:c1:c0]
+		c += x[3..0] * y
+		q = uint64_t(c0 * pp)
+		c = (c + q * p) >> 64
+		input  [c4:c3:c2:c1:c0], px, y, p
+		output [c0:c4:c3:c2:c1]
+
+		@note use rax, rdx, destroy y
+		use xt if isFullBit_
+	*/
+	void montgomery4_1(uint64_t pp, const Reg64& c4, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
+		const Reg64& px, const Reg64& y, const Reg64& p,
+		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst, const Xmm& xt)
+	{
+		if (isFirst) {
+			mul4x1(px, y, c3, c2, c1, c0, c4);
+			mov(c4, rdx);
+			// [c4:y:c2:c1:c0] = px[3..0] * y
+		} else {
+			mul4x1(px, y, t3, t2, t1, t0, t4);
+			// [rdx:y:t2:t1:t0] = px[3..0] * y
+			if (isFullBit_) {
+				movq(xt, px);
+				xor_(px, px);
+			}
+			add_rr(Pack(c4, y, c2, c1, c0), Pack(rdx, c3, t2, t1, t0));
+			if (isFullBit_) {
+				adc(px, 0);
+			}
+		}
+		// [px:c4:y:c2:c1:c0]
+		// px = 0 or 1 if isFullBit_, = 0 otherwise
+		mov(rax, pp);
+		mul(c0); // q = rax
+		mov(c3, rax);
+		mul4x1(p, c3, t3, t2, t1, t0, t4);
+		add(c0, t0); // always c0 is zero
+		adc(c1, t1);
+		adc(c2, t2);
+		adc(c3, y);
+		adc(c4, rdx);
+		if (isFullBit_) {
+			if (isFirst) {
+				adc(c0, 0);
+			} else {
+				adc(c0, px);
+				movq(px, xt);
+			}
+		}
+	}
+};
+
+} // mcl
+
+#endif
diff --git a/include/mcl/fp_util.hpp b/include/mcl/fp_util.hpp
new file mode 100644
index 0000000..7419672
--- /dev/null
+++ b/include/mcl/fp_util.hpp
@@ -0,0 +1,294 @@
+#pragma once
+#include <vector>
+#include <cybozu/itoa.hpp>
+#include <cybozu/atoi.hpp>
+#include <cybozu/bitvector.hpp>
+/**
+	@file
+	@brief utility of Fp
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+
+namespace mcl { namespace fp {
+
+#if defined(CYBOZU_OS_BIT) && (CYBOZU_OS_BIT == 32)
+	typedef uint32_t BlockType;
+#else
+	typedef uint64_t BlockType;
+#endif
+
+template<class S>
+void setBlockBit(S *buf, size_t bitLen, bool b)
+{
+	const size_t unitSize = sizeof(S) * 8;
+	const size_t q = bitLen / unitSize;
+	const size_t r = bitLen % unitSize;
+	if (b) {
+		buf[q] |= S(1) << r;
+	} else {
+		buf[q] &= ~(S(1) << r);
+	}
+}
+template<class S>
+bool getBlockBit(const S *buf, size_t bitLen)
+{
+	const size_t unitSize = sizeof(S) * 8;
+	const size_t q = bitLen / unitSize;
+	const size_t r = bitLen % unitSize;
+	return (buf[q] & (S(1) << r)) != 0;
+}
+/*
+	convert x[0..n) to hex string
+	start "0x" if withPrefix
+*/
+template<class T>
+void toStr16(std::string& str, const T *x, size_t n, bool withPrefix = false)
+{
+	size_t fullN = 0;
+	if (n > 1) {
+		size_t pos = n - 1;
+		while (pos > 0) {
+			if (x[pos]) break;
+			pos--;
+		}
+		if (pos > 0) fullN = pos;
+	}
+	const T v = n == 0 ? 0 : x[fullN];
+	const size_t topLen = cybozu::getHexLength(v);
+	const size_t startPos = withPrefix ? 2 : 0;
+	const size_t lenT = sizeof(T) * 2;
+	str.resize(startPos + fullN * lenT + topLen);
+	if (withPrefix) {
+		str[0] = '0';
+		str[1] = 'x';
+	}
+	cybozu::itohex(&str[startPos], topLen, v, false);
+	for (size_t i = 0; i < fullN; i++) {
+		cybozu::itohex(&str[startPos + topLen + i * lenT], lenT, x[fullN - 1 - i], false);
+	}
+}
+
+/*
+	convert x[0..n) to bin string
+	start "0b" if withPrefix
+*/
+template<class T>
+void toStr2(std::string& str, const T *x, size_t n, bool withPrefix)
+{
+	size_t fullN = 0;
+	if (n > 1) {
+		size_t pos = n - 1;
+		while (pos > 0) {
+			if (x[pos]) break;
+			pos--;
+		}
+		if (pos > 0) fullN = pos;
+	}
+	const T v = n == 0 ? 0 : x[fullN];
+	const size_t topLen = cybozu::getBinLength(v);
+	const size_t startPos = withPrefix ? 2 : 0;
+	const size_t lenT = sizeof(T) * 8;
+	str.resize(startPos + fullN * lenT + topLen);
+	if (withPrefix) {
+		str[0] = '0';
+		str[1] = 'b';
+	}
+	cybozu::itobin(&str[startPos], topLen, v);
+	for (size_t i = 0; i < fullN; i++) {
+		cybozu::itobin(&str[startPos + topLen + i * lenT], lenT, x[fullN - 1 - i]);
+	}
+}
+
+/*
+	convert hex string to x[0..xn)
+	hex string = [0-9a-fA-F]+
+*/
+template<class T>
+void fromStr16(T *x, size_t xn, const char *str, size_t strLen)
+{
+	if (strLen == 0) throw cybozu::Exception("fp:fromStr16:strLen is zero");
+	const size_t unitLen = sizeof(T) * 2;
+	const size_t q = strLen / unitLen;
+	const size_t r = strLen % unitLen;
+	const size_t requireSize = q + (r ? 1 : 0);
+	if (xn < requireSize) throw cybozu::Exception("fp:fromStr16:short size") << xn << requireSize;
+	for (size_t i = 0; i < q; i++) {
+		bool b;
+		x[i] = cybozu::hextoi(&b, &str[r + (q - 1 - i) * unitLen], unitLen);
+		if (!b) throw cybozu::Exception("fp:fromStr16:bad char") << cybozu::exception::makeString(str, strLen);
+	}
+	if (r) {
+		bool b;
+		x[q] = cybozu::hextoi(&b, str, r);
+		if (!b) throw cybozu::Exception("fp:fromStr16:bad char") << cybozu::exception::makeString(str, strLen);
+	}
+	for (size_t i = requireSize; i < xn; i++) x[i] = 0;
+}
+
+/*
+	@param base [inout]
+*/
+inline const char *verifyStr(bool *isMinus, int *base, const std::string& str)
+{
+	const char *p = str.c_str();
+	if (*p == '-') {
+		*isMinus = true;
+		p++;
+	} else {
+		*isMinus = false;
+	}
+	if (p[0] == '0') {
+		if (p[1] == 'x') {
+			if (*base != 0 && *base != 16) {
+				throw cybozu::Exception("fp:verifyStr:bad base") << *base << str;
+			}
+			*base = 16;
+			p += 2;
+		} else if (p[1] == 'b') {
+			if (*base != 0 && *base != 2) {
+				throw cybozu::Exception("fp:verifyStr:bad base") << *base << str;
+			}
+			*base = 2;
+			p += 2;
+		}
+	}
+	if (*base == 0) *base = 10;
+	if (*p == '\0') throw cybozu::Exception("fp:verifyStr:str is empty");
+	return p;
+}
+
+template<class S>
+size_t getRoundNum(size_t x)
+{
+	const size_t size = sizeof(S) * 8;
+	return (x + size - 1) / size;
+}
+
+/*
+	compare x[0, n) with y[0, n)
+*/
+template<class S>
+int compareArray(const S* x, const S* y, size_t n)
+{
+	for (size_t i = 0; i < n; i++) {
+		const S a = x[n - 1 - i];
+		const S b = y[n - 1 - i];
+		if (a > b) return 1;
+		if (a < b) return -1;
+	}
+	return 0;
+}
+
+/*
+	get random value less than in[]
+	n = (bitLen + sizeof(S) * 8) / (sizeof(S) * 8)
+	input  in[0..n)
+	output out[n..n)
+	0 <= out < in
+*/
+template<class RG, class S>
+inline void getRandVal(S *out, RG& rg, const S *in, size_t bitLen)
+{
+	const size_t unitBitSize = sizeof(S) * 8;
+	const size_t n = getRoundNum<S>(bitLen);
+	const size_t rem = bitLen & (unitBitSize - 1);
+	for (;;) {
+		rg.read(out, n);
+		if (rem > 0) out[n - 1] &= (S(1) << rem) - 1;
+		if (compareArray(out, in, n) < 0) return;
+	}
+}
+
+/*
+	z[] = (x[] << shift) | y
+	@param z [out] z[0..n)
+	@param x [in] x[0..n)
+	@param n [in] length of x, z
+	@param shift [in] 0 <= shift < (sizeof(S) * 8)
+	@param y [in]
+	@return (x[] << shift)[n]
+*/
+template<class S>
+S shiftLeftOr(S* z, const S* x, size_t n, size_t shift, S y = 0)
+{
+	if (n == 0) {
+		throw cybozu::Exception("fp:shiftLeftOr:bad n");
+	}
+	if (shift == 0) {
+		for (size_t i = n - 1; i > 0; i--) {
+			z[i] = x[i];
+		}
+		z[0] = x[0] | y;
+		return 0;
+	}
+	const size_t unitSize = sizeof(S) * 8;
+	if (shift >= unitSize) {
+		throw cybozu::Exception("fp:shiftLeftOr:large shift") << shift;
+	}
+	const size_t rev = unitSize - shift;
+	S ret = x[n - 1] >> rev;
+	for (size_t i = n - 1; i > 0; i--) {
+		z[i] = (x[i] << shift) | (x[i - 1] >> rev);
+	}
+	z[0] = (x[0] << shift) | y;
+	return ret;
+}
+template<class S>
+void shiftRight(S* z, const S* x, size_t n, size_t shift)
+{
+	if (n == 0) return;
+	if (shift == 0) {
+		for (size_t i = 0; i < n; i++) {
+			z[i] = x[i];
+		}
+		return;
+	}
+	const size_t unitSize = sizeof(S) * 8;
+	if (shift >= unitSize) {
+		throw cybozu::Exception("fp:shiftRight:large shift") << shift;
+	}
+	const size_t rev = unitSize - shift;
+	S prev = x[0];
+	for (size_t i = 0; i < n - 1; i++) {
+		S t = x[i + 1];
+		z[i] = (prev >> shift) | (t << rev);
+		prev = t;
+	}
+	z[n - 1] = prev >> shift;
+}
+
+template<class Vec, class T>
+size_t splitBitVec(Vec& v, const cybozu::BitVectorT<T>& bv, size_t width)
+{
+	if (width > sizeof(typename Vec::value_type) * 8) {
+		throw cybozu::Exception("fp:splitBitVec:bad width") << width;
+	}
+	const size_t q = bv.size() / width;
+	const size_t r = bv.size() % width;
+	for (size_t i = 0; i < q; i++) {
+		v.push_back(bv.extract(i * width, width));
+	}
+	if (r > 0) {
+		v.push_back(bv.extract(q * width, r));
+	}
+	return r ? r : width;
+}
+
+template<class Vec, class T>
+void concatBitVec(cybozu::BitVectorT<T>& bv, const Vec& v, size_t width, size_t lastWidth)
+{
+	if (width > sizeof(typename Vec::value_type) * 8) {
+		throw cybozu::Exception("fp:splitBitVec:bad width") << width;
+	}
+	bv.clear();
+	for (size_t i = 0; i < v.size() - 1; i++) {
+		bv.append(v[i], width);
+	}
+	bv.append(v[v.size() - 1], lastWidth);
+}
+
+} // mcl::fp
+
+} // fp
diff --git a/include/mcl/gmp_util.hpp b/include/mcl/gmp_util.hpp
new file mode 100644
index 0000000..c29c870
--- /dev/null
+++ b/include/mcl/gmp_util.hpp
@@ -0,0 +1,378 @@
+#pragma once
+/**
+	@file
+	@brief util function for gmp
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <assert.h>
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4616)
+	#pragma warning(disable : 4800)
+	#pragma warning(disable : 4244)
+	#pragma warning(disable : 4127)
+	#pragma warning(disable : 4512)
+	#pragma warning(disable : 4146)
+#endif
+#include <gmpxx.h>
+#include <stdint.h>
+#ifdef _MSC_VER
+	#pragma warning(pop)
+#endif
+#ifdef _MSC_VER
+#if _MSC_VER == 1900
+#ifdef _DEBUG
+#pragma comment(lib, "14/mpird.lib")
+#pragma comment(lib, "14/mpirxxd.lib")
+#else
+#pragma comment(lib, "14/mpir.lib")
+#pragma comment(lib, "14/mpirxx.lib")
+#endif
+#elif _MSC_VER == 1800
+#ifdef _DEBUG
+#pragma comment(lib, "12/mpird.lib")
+#pragma comment(lib, "12/mpirxxd.lib")
+#else
+#pragma comment(lib, "12/mpir.lib")
+#pragma comment(lib, "12/mpirxx.lib")
+#endif
+#else
+#ifdef _DEBUG
+#pragma comment(lib, "mpird.lib")
+#pragma comment(lib, "mpirxxd.lib")
+#else
+#pragma comment(lib, "mpir.lib")
+#pragma comment(lib, "mpirxx.lib")
+#endif
+#endif
+#endif
+#include <mcl/operator.hpp>
+
+namespace mcl {
+
+struct Gmp {
+	typedef mpz_class ImplType;
+#if CYBOZU_OS_BIT == 64
+	typedef uint64_t BlockType;
+#else
+	typedef uint32_t BlockType;
+#endif
+	// z = [buf[n-1]:..:buf[1]:buf[0]]
+	// eg. buf[] = {0x12345678, 0xaabbccdd}; => z = 0xaabbccdd12345678;
+	template<class T>
+	static void setRaw(mpz_class& z, const T *buf, size_t n)
+	{
+		mpz_import(z.get_mpz_t(), n, -1, sizeof(*buf), 0, 0, buf);
+	}
+	/*
+		return positive written size
+		return 0 if failure
+	*/
+	template<class T>
+	static size_t getRaw(T *buf, size_t maxSize, const mpz_class& x)
+	{
+		const size_t totalSize = sizeof(T) * maxSize;
+		if (getBitLen(x) > totalSize * 8) return 0;
+		memset(buf, 0, sizeof(*buf) * maxSize);
+		size_t size;
+		mpz_export(buf, &size, -1, sizeof(T), 0, 0, x.get_mpz_t());
+		// if x == 0, then size = 0 for gmp, size = 1 for mpir
+		return size == 0 ? 1 : size;
+	}
+	static inline void set(mpz_class& z, uint64_t x)
+	{
+		setRaw(z, &x, 1);
+	}
+	static inline bool fromStr(mpz_class& z, const std::string& str, int base = 0)
+	{
+		return z.set_str(str, base) == 0;
+	}
+	static inline void toStr(std::string& str, const mpz_class& z, int base = 10)
+	{
+		str = z.get_str(base);
+	}
+	static inline void add(mpz_class& z, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_add(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline void add(mpz_class& z, const mpz_class& x, unsigned int y)
+	{
+		mpz_add_ui(z.get_mpz_t(), x.get_mpz_t(), y);
+	}
+	static inline void sub(mpz_class& z, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_sub(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline void sub(mpz_class& z, const mpz_class& x, unsigned int y)
+	{
+		mpz_sub_ui(z.get_mpz_t(), x.get_mpz_t(), y);
+	}
+	static inline void mul(mpz_class& z, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_mul(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline void square(mpz_class& z, const mpz_class& x)
+	{
+		mpz_mul(z.get_mpz_t(), x.get_mpz_t(), x.get_mpz_t());
+	}
+	static inline void mul(mpz_class& z, const mpz_class& x, unsigned int y)
+	{
+		mpz_mul_ui(z.get_mpz_t(), x.get_mpz_t(), y);
+	}
+	static inline void divmod(mpz_class& q, mpz_class& r, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_divmod(q.get_mpz_t(), r.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline void div(mpz_class& q, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_div(q.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline void div(mpz_class& q, const mpz_class& x, unsigned int y)
+	{
+		mpz_div_ui(q.get_mpz_t(), x.get_mpz_t(), y);
+	}
+	static inline void mod(mpz_class& r, const mpz_class& x, const mpz_class& m)
+	{
+		mpz_mod(r.get_mpz_t(), x.get_mpz_t(), m.get_mpz_t());
+	}
+	static inline void mod(mpz_class& r, const mpz_class& x, unsigned int m)
+	{
+		mpz_mod_ui(r.get_mpz_t(), x.get_mpz_t(), m);
+	}
+	static inline void clear(mpz_class& z)
+	{
+		mpz_set_ui(z.get_mpz_t(), 0);
+	}
+	static inline bool isZero(const mpz_class& z)
+	{
+		return mpz_sgn(z.get_mpz_t()) == 0;
+	}
+	static inline bool isNegative(const mpz_class& z)
+	{
+		return mpz_sgn(z.get_mpz_t()) < 0;
+	}
+	static inline void neg(mpz_class& z, const mpz_class& x)
+	{
+		mpz_neg(z.get_mpz_t(), x.get_mpz_t());
+	}
+	static inline int compare(const mpz_class& x, const mpz_class & y)
+	{
+		return mpz_cmp(x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline int compare(const mpz_class& x, int y)
+	{
+		return mpz_cmp_si(x.get_mpz_t(), y);
+	}
+	template<class T>
+	static inline void addMod(mpz_class& z, const mpz_class& x, const T& y, const mpz_class& m)
+	{
+		add(z, x, y);
+		if (compare(z, m) >= 0) {
+			sub(z, z, m);
+		}
+	}
+	template<class T>
+	static inline void subMod(mpz_class& z, const mpz_class& x, const T& y, const mpz_class& m)
+	{
+		sub(z, x, y);
+		if (!isNegative(z)) return;
+		add(z, z, m);
+	}
+	template<class T>
+	static inline void mulMod(mpz_class& z, const mpz_class& x, const T& y, const mpz_class& m)
+	{
+		mul(z, x, y);
+		mod(z, z, m);
+	}
+	static inline void squareMod(mpz_class& z, const mpz_class& x, const mpz_class& m)
+	{
+		square(z, x);
+		mod(z, z, m);
+	}
+	// z = x^y (y >= 0)
+	static inline void pow(mpz_class& z, const mpz_class& x, unsigned int y)
+	{
+		mpz_pow_ui(z.get_mpz_t(), x.get_mpz_t(), y);
+	}
+	// z = x^y mod m (y >=0)
+	static inline void powMod(mpz_class& z, const mpz_class& x, const mpz_class& y, const mpz_class& m)
+	{
+		mpz_powm(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t(), m.get_mpz_t());
+	}
+	// z = 1/x mod m
+	static inline void invMod(mpz_class& z, const mpz_class& x, const mpz_class& m)
+	{
+		mpz_invert(z.get_mpz_t(), x.get_mpz_t(), m.get_mpz_t());
+	}
+	// z = lcm(x, y)
+	static inline void lcm(mpz_class& z, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_lcm(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline mpz_class lcm(const mpz_class& x, const mpz_class& y)
+	{
+		mpz_class z;
+		lcm(z, x, y);
+		return z;
+	}
+	// z = gcd(x, y)
+	static inline void gcd(mpz_class& z, const mpz_class& x, const mpz_class& y)
+	{
+		mpz_gcd(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t());
+	}
+	static inline mpz_class gcd(const mpz_class& x, const mpz_class& y)
+	{
+		mpz_class z;
+		gcd(z, x, y);
+		return z;
+	}
+	/*
+		assume p : odd prime
+		return  1 if x^2 = a mod p for some x
+		return -1 if x^2 != a mod p for any x
+	*/
+	static inline int legendre(const mpz_class& a, const mpz_class& p)
+	{
+		return mpz_legendre(a.get_mpz_t(), p.get_mpz_t());
+	}
+	static inline bool isPrime(const mpz_class& x)
+	{
+		return mpz_probab_prime_p(x.get_mpz_t(), 25) != 0;
+	}
+	static inline size_t getBitLen(const mpz_class& x)
+	{
+		return mpz_sizeinbase(x.get_mpz_t(), 2);
+	}
+	static inline BlockType getBlock(const mpz_class& x, size_t i)
+	{
+		return x.get_mpz_t()->_mp_d[i];
+	}
+	static inline const BlockType *getBlock(const mpz_class& x)
+	{
+		return reinterpret_cast<const BlockType*>(x.get_mpz_t()->_mp_d);
+	}
+	static inline size_t getBlockSize(const mpz_class& x)
+	{
+		assert(x.get_mpz_t()->_mp_size >= 0);
+		return x.get_mpz_t()->_mp_size;
+	}
+	template<class RG>
+	static inline void getRand(mpz_class& z, size_t bitLen, RG& rg)
+	{
+		assert(bitLen > 1);
+		const size_t rem = bitLen & 31;
+		const size_t n = (bitLen + 31) / 32;
+		std::vector<uint32_t> buf(n);
+		rg.read(buf.data(), n);
+		uint32_t v = buf[n - 1];
+		if (rem == 0) {
+			v |= 1U << 31;
+		} else {
+			v &= (1U << rem) - 1;
+			v |= 1U << (rem - 1);
+		}
+		buf[n - 1] = v;
+		Gmp::setRaw(z, &buf[0], n);
+	}
+	template<class RG>
+	static void getRandPrime(mpz_class& z, size_t bitLen, RG& rg, bool setSecondBit = false, bool mustBe3mod4 = false)
+	{
+		assert(bitLen > 2);
+		do {
+			getRand(z, bitLen, rg);
+			if (setSecondBit) {
+				z |= mpz_class(1) << (bitLen - 2);
+			}
+			if (mustBe3mod4) {
+				z |= 3;
+			}
+		} while (!(isPrime(z)));
+	}
+};
+
+/*
+	Tonelli-Shanks
+*/
+class SquareRoot {
+	bool isPrime;
+	mpz_class p;
+	mpz_class g;
+	int r;
+	mpz_class q; // p - 1 = 2^r q
+	mpz_class s; // s = g^q
+public:
+	SquareRoot() : isPrime(false) {}
+	void set(const mpz_class& p)
+	{
+		if (p <= 2) throw cybozu::Exception("SquareRoot:bad p") << p;
+		isPrime = Gmp::isPrime(p);
+		if (!isPrime) return; // don't throw until get() is called
+		this->p = p;
+		// g is quadratic nonresidue
+		g = 2;
+		while (Gmp::legendre(g, p) > 0) {
+			g++;
+		}
+		// p - 1 = 2^r q, q is odd
+		r = 0;
+		q = p - 1;
+		while ((q & 1) == 0) {
+			r++;
+			q /= 2;
+		}
+		Gmp::powMod(s, g, q, p);
+	}
+	/*
+		solve x^2 = a mod p
+	*/
+	bool get(mpz_class& x, const mpz_class& a) const
+	{
+		if (!isPrime) throw cybozu::Exception("SquareRoot:get:not prime") << p;
+		if (Gmp::legendre(a, p) < 0) return false;
+		if (r == 1) {
+			Gmp::powMod(x, a, (p + 1) / 4, p);
+			return true;
+		}
+		mpz_class c = s, d;
+		int e = r;
+		Gmp::powMod(d, a, q, p);
+		Gmp::powMod(x, a, (q + 1) / 2, p); // destroy a if &x == &a
+		while (d != 1) {
+			int i = 1;
+			mpz_class dd = (d * d) % p;
+			while (dd != 1) {
+				dd = (dd * dd) % p;
+				i++;
+			}
+			mpz_class b = 1;
+			b <<= e - i - 1;
+			Gmp::powMod(b, c, b, p);
+			x = (x * b) % p;
+			c = (b * b) % p;
+			d = (d * c) % p;
+			e = i;
+		}
+		return true;
+	}
+};
+
+namespace ope {
+
+template<>
+struct Optimized<mpz_class> {
+	void init(const mpz_class&) {}
+	bool hasPowMod() const { return true; }
+	static void powMod(mpz_class& z, const mpz_class& x, const mpz_class& y, const mpz_class& m)
+	{
+		mpz_powm(z.get_mpz_t(), x.get_mpz_t(), y.get_mpz_t(), m.get_mpz_t());
+	}
+};
+
+} // mcl::ope
+
+} // mcl
diff --git a/include/mcl/mont_fp.hpp b/include/mcl/mont_fp.hpp
new file mode 100644
index 0000000..1b539bf
--- /dev/null
+++ b/include/mcl/mont_fp.hpp
@@ -0,0 +1,463 @@
+#pragma once
+/**
+	@file
+	@brief Fp with montgomery(EXPERIMENTAL IMPLEMENTAION)
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+
+	@note this class should be merged to FpT
+*/
+#include <sstream>
+#include <vector>
+#include <mcl/gmp_util.hpp>
+#include <mcl/fp.hpp>
+#include <mcl/fp_generator.hpp>
+
+namespace mcl {
+
+template<size_t N, class tag = fp_local::TagDefault>
+class MontFpT : public ope::addsub<MontFpT<N, tag>,
+	ope::mulable<MontFpT<N, tag>,
+	ope::invertible<MontFpT<N, tag>,
+	ope::hasNegative<MontFpT<N, tag>,
+	ope::hasIO<MontFpT<N, tag> > > > > > {
+
+	static mpz_class pOrg_;
+	static mcl::SquareRoot sq_;
+	static MontFpT p_;
+	static MontFpT one_;
+	static MontFpT R_; // (1 << (N * 64)) % p
+	static MontFpT RR_; // (R * R) % p
+	static MontFpT invTbl_[N * 64 * 2];
+	static size_t modBitLen_;
+public:
+	static FpGenerator fg_;
+private:
+	uint64_t v_[N];
+	void fromRawGmp(const mpz_class& x)
+	{
+		if (Gmp::getRaw(v_, N, x) == 0) {
+			throw cybozu::Exception("MontFpT:fromRawGmp") << x;
+		}
+	}
+	template<class S>
+	void setMaskMod(std::vector<S>& buf)
+	{
+		assert(buf.size() * sizeof(S) * 8 <= modBitLen_);
+		assert(!buf.empty());
+		fp::maskBuffer(&buf[0], buf.size(), modBitLen_);
+		memcpy(v_, &buf[0], buf.size() * sizeof(S));
+		if (compare(*this, p_) >= 0) {
+			subNc(v_, v_, p_.v_);
+		}
+		assert(compare(*this, p_) < 0);
+	}
+	static void initInvTbl(MontFpT *invTbl)
+	{
+		MontFpT t(2);
+		const int n = N * 64 * 2;
+		for (int i = 0; i < n; i++) {
+			invTbl[n - 1 - i] = t;
+			t += t;
+		}
+	}
+	typedef void (*void3op)(MontFpT&, const MontFpT&, const MontFpT&);
+	typedef bool (*bool3op)(MontFpT&, const MontFpT&, const MontFpT&);
+	typedef void (*void2op)(MontFpT&, const MontFpT&);
+	typedef int (*int2op)(MontFpT&, const MontFpT&);
+public:
+	static const size_t BlockSize = N;
+	typedef uint64_t BlockType;
+	MontFpT() {}
+	MontFpT(int x) { operator=(x); }
+	MontFpT(uint64_t x) { operator=(x); }
+	explicit MontFpT(const std::string& str, int base = 0)
+	{
+		fromStr(str, base);
+	}
+	MontFpT& operator=(int x)
+	{
+		if (x == 0) {
+			clear();
+		} else {
+			v_[0] = abs(x);
+			for (size_t i = 1; i < N; i++) v_[i] = 0;
+			mul(*this, *this, RR_);
+			if (x < 0) {
+				neg(*this, *this);
+			}
+		}
+		return *this;
+	}
+	MontFpT& operator=(uint64_t x)
+	{
+		v_[0] = x;
+		for (size_t i = 1; i < N; i++) v_[i] = 0;
+		mul(*this, *this, RR_);
+		return *this;
+	}
+	void fromStr(const std::string& str, int base = 0)
+	{
+		bool isMinus;
+		const char *p = fp::verifyStr(&isMinus, &base, str);
+
+		if (base == 16) {
+			MontFpT t;
+			mcl::fp::fromStr16(t.v_, N, p, str.size() - (p - str.c_str()));
+			if (compare(t, p_) >= 0) throw cybozu::Exception("fp:MontFpT:str is too large") << str;
+			mul(*this, t, RR_);
+		} else {
+			mpz_class t;
+			if (!Gmp::fromStr(t, p, base)) {
+				throw cybozu::Exception("fp:MontFpT:fromStr") << str;
+			}
+			toMont(*this, t);
+		}
+		if (isMinus) {
+			neg(*this, *this);
+		}
+	}
+	void put() const
+	{
+		for (int i = N - 1; i >= 0; i--) {
+			printf("%016llx ", v_[i]);
+		}
+		printf("\n");
+	}
+	void set(const std::string& str, int base = 0) { fromStr(str, base); }
+	void toStr(std::string& str, int base = 10, bool withPrefix = false) const
+	{
+		if (isZero()) {
+			str = "0";
+			return;
+		}
+		if (base == 16 || base == 2) {
+			MontFpT t;
+			mul(t, *this, one_);
+			if (base == 16) {
+				mcl::fp::toStr16(str, t.v_, N, withPrefix);
+			} else {
+				mcl::fp::toStr2(str, t.v_, N, withPrefix);
+			}
+			return;
+		}
+		if (base != 10) throw cybozu::Exception("fp:MontFpT:toStr:bad base") << base;
+		// QQQ : remove conversion to gmp
+		mpz_class t;
+		fromMont(t, *this);
+		Gmp::toStr(str, t, base);
+	}
+	std::string toStr(int base = 10, bool withPrefix = false) const
+	{
+		std::string str;
+		toStr(str, base, withPrefix);
+		return str;
+	}
+	void clear()
+	{
+		for (size_t i = 0; i < N; i++) v_[i] = 0;
+	}
+	template<class RG>
+	void setRand(RG& rg)
+	{
+		fp::getRandVal(v_, rg, p_.v_, modBitLen_);
+	}
+	template<class S>
+	void setRaw(const S *inBuf, size_t n)
+	{
+		n = std::min(n, fp::getRoundNum<S>(modBitLen_));
+		if (n == 0) {
+			clear();
+			return;
+		}
+		std::vector<S> buf(inBuf, inBuf + n);
+		setMaskMod(buf);
+	}
+	static inline void setModulo(const std::string& pstr, int base = 0)
+	{
+		bool isMinus;
+		const char *p = fp::verifyStr(&isMinus, &base, pstr);
+		if (isMinus) throw cybozu::Exception("MontFp:setModulo:mstr is not pinus") << pstr;
+		if (!Gmp::fromStr(pOrg_, p, base)) {
+			throw cybozu::Exception("fp:MontFpT:setModulo") << pstr << base;
+		}
+		modBitLen_ = Gmp::getBitLen(pOrg_);
+		if (fp::getRoundNum<uint64_t>(modBitLen_) > N) {
+			throw cybozu::Exception("MontFp:setModulo:bad prime length") << pstr;
+		}
+		p_.fromRawGmp(pOrg_);
+		sq_.set(pOrg_);
+
+		mpz_class t = 1;
+		one_.fromRawGmp(t);
+		t = (t << (N * 64)) % pOrg_;
+		R_.fromRawGmp(t);
+		t = (t * t) % pOrg_;
+		RR_.fromRawGmp(t);
+		fg_.init(p_.v_, N);
+		add = Xbyak::CastTo<void3op>(fg_.add_);
+		sub = Xbyak::CastTo<void3op>(fg_.sub_);
+		mul = Xbyak::CastTo<void3op>(fg_.mul_);
+		square = Xbyak::CastTo<void2op>(fg_.sqr_);
+		if (square == 0) square = squareC;
+		neg = Xbyak::CastTo<void2op>(fg_.neg_);
+		shr1 = Xbyak::CastTo<void2op>(fg_.shr1_);
+		addNc = Xbyak::CastTo<bool3op>(fg_.addNc_);
+		subNc = Xbyak::CastTo<bool3op>(fg_.subNc_);
+		preInv = Xbyak::CastTo<int2op>(fg_.preInv_);
+		initInvTbl(invTbl_);
+	}
+	static inline void getModulo(std::string& pstr)
+	{
+		Gmp::toStr(pstr, pOrg_);
+	}
+	static inline bool isYodd(const MontFpT& y)
+	{
+#if 0
+		return (y.v_[0] & 1) == 1;
+#else
+		MontFpT t; // QQQ : is necessary?
+		mul(t, y, one_);
+		return (t.v_[0] & 1) == 1;
+#endif
+	}
+	static inline bool squareRoot(MontFpT& y, const MontFpT& x)
+	{
+		mpz_class t;
+		fromMont(t, x);
+		if (!sq_.get(t, t)) return false;
+		toMont(y, t);
+		return true;
+	}
+	static inline void fromMont(mpz_class& z, const MontFpT& x)
+	{
+		MontFpT t;
+		mul(t, x, one_);
+		Gmp::setRaw(z, t.v_, N);
+	}
+	static inline void toMont(MontFpT& z, const mpz_class& x)
+	{
+		if (x >= pOrg_) throw cybozu::Exception("fp:MontFpT:toMont:large x") << x;
+		MontFpT t;
+		t.fromRawGmp(x);
+		mul(z, t, RR_);
+	}
+	static void3op add;
+	static void3op sub;
+	static void3op mul;
+	static void2op square;
+	static void2op neg;
+	static void2op shr1;
+	static bool3op addNc;
+	static bool3op subNc;
+	static int2op preInv;
+	static inline void squareC(MontFpT& z, const MontFpT& x)
+	{
+		mul(z, x, x);
+	}
+	static inline int preInvC(MontFpT& r, const MontFpT& x)
+	{
+		MontFpT u, v, s;
+		u = p_;
+		v = x;
+		r.clear();
+		s.clear(); s.v_[0] = 1; // s is real 1
+		int k = 0;
+		// u, v : Pack, r, s : mem
+		bool rTop = false;
+	LP:
+		if (v.isZero()) goto EXIT;
+		if ((u.v_[0] & 1) == 0) {
+			goto U_EVEN;
+		}
+		if ((v.v_[0] & 1) == 0) {
+			goto V_EVEN;
+		}
+		if (compare(v, u) < 0) {
+			goto V_LT_U;
+		}
+		subNc(v, v, u); // sub_rr
+		addNc(s, s, r); // add_mm
+	V_EVEN:
+		shr1(v, v); // shr1_r
+		rTop = addNc(r, r, r); // twice_m
+		k++;
+		goto LP;
+	V_LT_U:
+		subNc(u, u, v); // sub_rr
+		rTop = addNc(r, r, s); // add_mm
+	U_EVEN:
+		shr1(u, u); // shr1_r
+		addNc(s, s, s); // twice_m
+		k++;
+		goto LP;
+	EXIT:;
+		if (rTop) subNc(r, r, p_);
+		if (subNc(r, p_, r)) {
+			addNc(r, r, p_);
+		}
+		return k;
+	}
+	static inline void inv(MontFpT& z, const MontFpT& x)
+	{
+#if 1
+		MontFpT r;
+#if 1
+		int k = preInv(r, x);
+#else
+		MontFpT s;
+		int h = preInvC(s, x);
+		int k = preInv(r, x);
+		if (r != s || k != h) {
+			std::cout << std::hex;
+			PUT(x);
+			PUT(r);
+			PUT(s);
+			printf("k=%d, h=%d\n", k, h);
+			exit(1);
+		}
+#endif
+		/*
+			xr = 2^k
+			R = 2^(N * 64)
+			get r2^(-k)R^2 = r 2^(N * 64 * 2 - k)
+		*/
+		mul(z, r, invTbl_[k]);
+#else
+		mpz_class t;
+		fromMont(t, x);
+		Gmp::invMod(t, t, pOrg_);
+		toMont(z, t);
+#endif
+	}
+	static inline void div(MontFpT& z, const MontFpT& x, const MontFpT& y)
+	{
+		MontFpT ry;
+		inv(ry, y);
+		mul(z, x, ry);
+	}
+#if 0
+	static inline BlockType getBlock(const MontFpT& x, size_t i)
+	{
+		return Gmp::getBlock(x.v, i);
+	}
+	static inline const BlockType *getBlock(const MontFpT& x)
+	{
+		return Gmp::getBlock(x.v);
+	}
+	static inline size_t getBlockSize(const MontFpT& x)
+	{
+		return Gmp::getBlockSize(x.v);
+	}
+	static inline void shr(MontFpT& z, const MontFpT& x, size_t n)
+	{
+		z.v = x.v >> n;
+	}
+#endif
+	/*
+		append to bv(not clear bv)
+	*/
+	void appendToBitVec(cybozu::BitVector& bv) const
+	{
+		MontFpT t;
+		MontFpT::mul(t, *this, MontFpT::one_);
+		bv.append(t.v_, modBitLen_);
+	}
+	void fromBitVec(const cybozu::BitVector& bv)
+	{
+		const size_t bitLen = bv.size();
+		if (bitLen != modBitLen_) throw cybozu::Exception("MontFp:fromBitVec:bad size") << bitLen << modBitLen_;
+		const size_t blockN = cybozu::RoundupBit<BlockType>(bitLen);
+		const MontFpT* src;
+		MontFpT t;
+		if (blockN == N) {
+			src = (const MontFpT*)bv.getBlock();
+		} else {
+			cybozu::CopyBit(t.v_, bv.getBlock(), bitLen);
+			for (size_t i = blockN; i < N; i++) t.v_[i] = 0;
+			src = &t;
+		}
+		mul(*this, *src, RR_);
+		if (compare(*this, p_) >= 0) {
+			throw cybozu::Exception("MontFpT:fromBitVec:large x") << *this << p_;
+		}
+	}
+	static inline size_t getBitVecSize() { return modBitLen_; }
+	static inline int compare(const MontFpT& x, const MontFpT& y)
+	{
+		return fp::compareArray(x.v_, y.v_, N);
+	}
+	static inline bool isZero(const MontFpT& x)
+	{
+		if (x.v_[0]) return false;
+		uint64_t r = 0;
+		for (size_t i = 1; i < N; i++) {
+			r |= x.v_[i];
+		}
+		return r == 0;
+	}
+	bool isZero() const { return isZero(*this); }
+	template<class Z>
+	static void power(MontFpT& z, const MontFpT& x, const Z& y)
+	{
+		power_impl::power(z, x, y);
+	}
+	const uint64_t* getInnerValue() const { return v_; }
+	bool operator==(const MontFpT& rhs) const { return compare(*this, rhs) == 0; }
+	bool operator!=(const MontFpT& rhs) const { return compare(*this, rhs) != 0; }
+	static inline size_t getModBitLen() { return modBitLen_; }
+	static inline uint64_t cvtInt(const MontFpT& x, bool *err = 0)
+	{
+		MontFpT t;
+		mul(t, x, one_);
+		for (size_t i = 1; i < N; i++) {
+			if (t.v_[i]) {
+				if (err) {
+					*err = true;
+					return 0;
+				} else {
+					throw cybozu::Exception("MontFp:cvtInt:too large") << x;
+				}
+			}
+		}
+		if (err) *err = false;
+		return t.v_[0];
+	}
+	uint64_t cvtInt(bool *err = 0) const { return cvtInt(*this, err); }
+};
+
+template<size_t N, class tag>mpz_class MontFpT<N, tag>::pOrg_;
+template<size_t N, class tag>mcl::SquareRoot MontFpT<N, tag>::sq_;
+template<size_t N, class tag>MontFpT<N, tag> MontFpT<N, tag>::p_;
+template<size_t N, class tag>MontFpT<N, tag> MontFpT<N, tag>::one_;
+template<size_t N, class tag>MontFpT<N, tag> MontFpT<N, tag>::R_;
+template<size_t N, class tag>MontFpT<N, tag> MontFpT<N, tag>::RR_;
+template<size_t N, class tag>MontFpT<N, tag> MontFpT<N, tag>::invTbl_[N * 64 * 2];
+template<size_t N, class tag>FpGenerator MontFpT<N, tag>::fg_;
+template<size_t N, class tag>size_t MontFpT<N, tag>::modBitLen_;
+
+template<size_t N, class tag>typename MontFpT<N, tag>::void3op MontFpT<N, tag>::add;
+template<size_t N, class tag>typename MontFpT<N, tag>::void3op MontFpT<N, tag>::sub;
+template<size_t N, class tag>typename MontFpT<N, tag>::void3op MontFpT<N, tag>::mul;
+template<size_t N, class tag>typename MontFpT<N, tag>::void2op MontFpT<N, tag>::square;
+template<size_t N, class tag>typename MontFpT<N, tag>::void2op MontFpT<N, tag>::neg;
+template<size_t N, class tag>typename MontFpT<N, tag>::void2op MontFpT<N, tag>::shr1;
+template<size_t N, class tag>typename MontFpT<N, tag>::bool3op MontFpT<N, tag>::addNc;
+template<size_t N, class tag>typename MontFpT<N, tag>::bool3op MontFpT<N, tag>::subNc;
+template<size_t N, class tag>typename MontFpT<N, tag>::int2op MontFpT<N, tag>::preInv;
+
+} // mcl
+
+namespace std { CYBOZU_NAMESPACE_TR1_BEGIN
+template<class T> struct hash;
+
+template<size_t N, class tag>
+struct hash<mcl::MontFpT<N, tag> > {
+	size_t operator()(const mcl::MontFpT<N, tag>& x, uint64_t v = 0) const
+	{
+		return static_cast<size_t>(cybozu::hash64(x.getInnerValue(), N, v));
+	}
+};
+
+CYBOZU_NAMESPACE_TR1_END } // std::tr1
diff --git a/include/mcl/operator.hpp b/include/mcl/operator.hpp
new file mode 100644
index 0000000..f5d0df3
--- /dev/null
+++ b/include/mcl/operator.hpp
@@ -0,0 +1,118 @@
+#pragma once
+/**
+	@file
+	@brief operator
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <ios>
+#include <cybozu/exception.hpp>
+
+#ifdef _WIN32
+	#ifndef MCL_FORCE_INLINE
+		#define MCL_FORCE_INLINE __forceinline
+	#endif
+	#pragma warning(push)
+	#pragma warning(disable : 4714)
+#else
+	#ifndef MCL_FORCE_INLINE
+		#define MCL_FORCE_INLINE __attribute__((always_inline))
+	#endif
+#endif
+
+namespace mcl { namespace ope {
+
+template<class T>
+struct Empty {};
+
+/*
+	T must have compare
+*/
+template<class T, class E = Empty<T> >
+struct comparable : E {
+	friend MCL_FORCE_INLINE bool operator<(const T& x, const T& y) { return T::compare(x, y) < 0; }
+	friend MCL_FORCE_INLINE bool operator>=(const T& x, const T& y) { return !operator<(x, y); }
+
+	friend MCL_FORCE_INLINE bool operator>(const T& x, const T& y) { return T::compare(x, y) > 0; }
+	friend MCL_FORCE_INLINE bool operator<=(const T& x, const T& y) { return !operator>(x, y); }
+	friend MCL_FORCE_INLINE bool operator==(const T& x, const T& y) { return T::compare(x, y) == 0; }
+	friend MCL_FORCE_INLINE bool operator!=(const T& x, const T& y) { return !operator==(x, y); }
+};
+
+/*
+	T must have add, sub
+*/
+template<class T, class E = Empty<T> >
+struct addsub : E {
+	template<class S> MCL_FORCE_INLINE T& operator+=(const S& rhs) { T::add(static_cast<T&>(*this), static_cast<const T&>(*this), rhs); return static_cast<T&>(*this); }
+	template<class S> MCL_FORCE_INLINE T& operator-=(const S& rhs) { T::sub(static_cast<T&>(*this), static_cast<const T&>(*this), rhs); return static_cast<T&>(*this); }
+	template<class S> friend MCL_FORCE_INLINE T operator+(const T& a, const S& b) { T c; T::add(c, a, b); return c; }
+	template<class S> friend MCL_FORCE_INLINE T operator-(const T& a, const S& b) { T c; T::sub(c, a, b); return c; }
+};
+
+/*
+	T must have mul
+*/
+template<class T, class E = Empty<T> >
+struct mulable : E {
+	template<class S> MCL_FORCE_INLINE T& operator*=(const S& rhs) { T::mul(static_cast<T&>(*this), static_cast<const T&>(*this), rhs); return static_cast<T&>(*this); }
+	template<class S> friend MCL_FORCE_INLINE T operator*(const T& a, const S& b) { T c; T::mul(c, a, b); return c; }
+};
+
+/*
+	T must have inv, mul
+*/
+template<class T, class E = Empty<T> >
+struct invertible : E {
+	MCL_FORCE_INLINE T& operator/=(const T& rhs) { T c; T::inv(c, rhs); T::mul(static_cast<T&>(*this), static_cast<const T&>(*this), c); return static_cast<T&>(*this); }
+	friend MCL_FORCE_INLINE T operator/(const T& a, const T& b) { T c; T::inv(c, b); T::mul(c, c, a); return c; }
+};
+
+/*
+	T must have neg
+*/
+template<class T, class E = Empty<T> >
+struct hasNegative : E {
+	MCL_FORCE_INLINE T operator-() const { T c; T::neg(c, static_cast<const T&>(*this)); return c; }
+};
+
+template<class T, class E = Empty<T> >
+struct hasIO : E {
+	friend inline std::ostream& operator<<(std::ostream& os, const T& self)
+	{
+		const std::ios_base::fmtflags f = os.flags();
+		if (f & std::ios_base::oct) throw cybozu::Exception("fpT:operator<<:oct is not supported");
+		const int base = (f & std::ios_base::hex) ? 16 : 10;
+		const bool showBase = (f & std::ios_base::showbase) != 0;
+		std::string str;
+		self.toStr(str, base, showBase);
+		return os << str;
+	}
+	friend inline std::istream& operator>>(std::istream& is, T& self)
+	{
+		const std::ios_base::fmtflags f = is.flags();
+		if (f & std::ios_base::oct) throw cybozu::Exception("fpT:operator>>:oct is not supported");
+		const int base = (f & std::ios_base::hex) ? 16 : 0;
+		std::string str;
+		is >> str;
+		self.fromStr(str, base);
+		return is;
+	}
+};
+
+template<class T>
+struct Optimized {
+	bool hasMulMod() const { return false; }
+	void init(const T&) {}
+	static void mulMod(T&, const T&, const T&) {}
+	static void mulMod(T&, const T&, unsigned int) {}
+	bool hasPowMod() const { return false; }
+	static void powMod(T&, const T&, const T&, const T&) {}
+};
+
+} } // mcl::ope
+
+#ifdef _WIN32
+//	#pragma warning(pop)
+#endif
diff --git a/include/mcl/power.hpp b/include/mcl/power.hpp
new file mode 100644
index 0000000..27fd15e
--- /dev/null
+++ b/include/mcl/power.hpp
@@ -0,0 +1,181 @@
+#pragma once
+/**
+	@file
+	@brief power
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <assert.h>
+#include <cybozu/bit_operation.hpp>
+#include <mcl/tagmultigr.hpp>
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4616)
+	#pragma warning(disable : 4800)
+	#pragma warning(disable : 4244)
+	#pragma warning(disable : 4127)
+	#pragma warning(disable : 4512)
+	#pragma warning(disable : 4146)
+#endif
+#include <gmpxx.h>
+#ifdef _MSC_VER
+	#pragma warning(pop)
+#endif
+
+namespace mcl {
+
+namespace power_impl {
+
+template<class F>
+struct TagInt {
+	typedef typename F::BlockType BlockType;
+	static size_t getBlockSize(const F& x)
+	{
+		return F::getBlockSize(x);
+	}
+	static BlockType getBlock(const F& x, size_t i)
+	{
+		return F::getBlock(x, i);
+	}
+	static const BlockType* getBlock(const F& x)
+	{
+		return F::getBlock(x);
+	}
+	static size_t getBitLen(const F& x)
+	{
+		return F::getBitLen(x);
+	}
+	static void shr(F& x, size_t n)
+	{
+		F::shr(x, x, n);
+	}
+};
+
+template<>
+struct TagInt<int> {
+	typedef int BlockType;
+	static int getBlockSize(int)
+	{
+		return 1;
+	}
+	static BlockType getBlock(int x, size_t i)
+	{
+		assert(i == 0);
+		cybozu::disable_warning_unused_variable(i);
+		return x;
+	}
+	static const BlockType* getBlock(const int& x)
+	{
+		return &x;
+	}
+	static size_t getBitLen(int x)
+	{
+		return x == 0 ? 1 : cybozu::bsr(x) + 1;
+	}
+	static void shr(int& x, size_t n)
+	{
+		x >>= n;
+	}
+};
+
+template<>
+struct TagInt<size_t> {
+	typedef size_t BlockType;
+	static size_t getBlockSize(size_t)
+	{
+		return 1;
+	}
+	static BlockType getBlock(size_t x, size_t i)
+	{
+		assert(i == 0);
+		cybozu::disable_warning_unused_variable(i);
+		return x;
+	}
+	static const BlockType* getBlock(const size_t& x)
+	{
+		return &x;
+	}
+	static size_t getBitLen(size_t x)
+	{
+		return x == 0 ? 1 : cybozu::bsr<size_t>(x) + 1;
+	}
+	static void shr(size_t& x, size_t n)
+	{
+		x >>= n;
+	}
+};
+
+template<>
+struct TagInt<mpz_class> {
+	typedef mp_limb_t BlockType;
+	static size_t getBlockSize(const mpz_class& x)
+	{
+		return x.get_mpz_t()->_mp_size;
+	}
+	static BlockType getBlock(const mpz_class& x, size_t i)
+	{
+		return x.get_mpz_t()->_mp_d[i];
+	}
+	static const BlockType* getBlock(const mpz_class& x)
+	{
+		return x.get_mpz_t()->_mp_d;
+	}
+	static size_t getBitLen(const mpz_class& x)
+	{
+		return mpz_sizeinbase(x.get_mpz_t(), 2);
+	}
+	static void shr(mpz_class& x, size_t n)
+	{
+		x >>= n;
+	}
+};
+
+template<class G, class BlockType>
+void powerArray(G& z, const G& x, const BlockType *y, size_t n)
+{
+	typedef TagMultiGr<G> TagG;
+	G out;
+	TagG::init(out);
+	G t(x);
+	for (size_t i = 0; i < n; i++) {
+		BlockType v = y[i];
+		int m = (int)sizeof(BlockType) * 8;
+		if (i == n - 1) {
+			// avoid unused multiplication
+			while (m > 0 && (v & (BlockType(1) << (m - 1))) == 0) {
+				m--;
+			}
+		}
+		for (int j = 0; j < m; j++) {
+			if (v & (BlockType(1) << j)) {
+				TagG::mul(out, out, t);
+			}
+			TagG::square(t, t);
+		}
+	}
+	z = out;
+}
+
+template<class G, class F>
+void power(G& z, const G& x, const F& _y)
+{
+	typedef TagMultiGr<G> TagG;
+	typedef power_impl::TagInt<F> TagI;
+	if (_y == 0) {
+		TagG::init(z);
+		return;
+	}
+	if (_y == 1) {
+		z = x;
+		return;
+	}
+	bool isNegative = _y < 0;
+	const F& y = isNegative ? -_y : _y;
+	powerArray(z, x, TagI::getBlock(y), TagI::getBlockSize(y));
+	if (isNegative) {
+		TagG::inv(z, z);
+	}
+}
+
+} } // mcl::power_impl
diff --git a/include/mcl/tagmultigr.hpp b/include/mcl/tagmultigr.hpp
new file mode 100644
index 0000000..51add36
--- /dev/null
+++ b/include/mcl/tagmultigr.hpp
@@ -0,0 +1,39 @@
+#pragma once
+/**
+	@file
+	@brief TagMultiGr
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#include <assert.h>
+
+namespace mcl {
+
+// default tag is for multiplicative group
+template<class G>
+struct TagMultiGr {
+	static void square(G& z, const G& x)
+	{
+		G::mul(z, x, x);
+	}
+	static void mul(G& z, const G& x, const G& y)
+	{
+		G::mul(z, x, y);
+	}
+	static void inv(G& z, const G& x)
+	{
+		G::inv(z, x);
+	}
+	static void div(G& z, const G& x, const G& y)
+	{
+		G::div(z, x, y);
+	}
+	static void init(G& x)
+	{
+		x = 1;
+	}
+};
+
+} // mcl
+
diff --git a/mcl.sln b/mcl.sln
new file mode 100644
index 0000000..e0edbbb
--- /dev/null
+++ b/mcl.sln
@@ -0,0 +1,25 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Express 2012 for Windows Desktop
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fp_test", "test\proj\fp_test\fp_test.vcxproj", "{51266DE6-B57B-4AE3-B85C-282F170E1728}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ec_test", "test\proj\ec_test\ec_test.vcxproj", "{46B6E88E-739A-406B-9F68-BC46C5950FA3}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{51266DE6-B57B-4AE3-B85C-282F170E1728}.Debug|x64.ActiveCfg = Debug|x64
+		{51266DE6-B57B-4AE3-B85C-282F170E1728}.Debug|x64.Build.0 = Debug|x64
+		{51266DE6-B57B-4AE3-B85C-282F170E1728}.Release|x64.ActiveCfg = Release|x64
+		{51266DE6-B57B-4AE3-B85C-282F170E1728}.Release|x64.Build.0 = Release|x64
+		{46B6E88E-739A-406B-9F68-BC46C5950FA3}.Debug|x64.ActiveCfg = Debug|x64
+		{46B6E88E-739A-406B-9F68-BC46C5950FA3}.Debug|x64.Build.0 = Debug|x64
+		{46B6E88E-739A-406B-9F68-BC46C5950FA3}.Release|x64.ActiveCfg = Release|x64
+		{46B6E88E-739A-406B-9F68-BC46C5950FA3}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/release.props b/release.props
new file mode 100644
index 0000000..88b0830
--- /dev/null
+++ b/release.props
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ImportGroup Label="PropertySheets" />
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup />
+</Project>
+\ No newline at end of file
diff --git a/sample/Makefile b/sample/Makefile
new file mode 100644
index 0000000..96678a6
--- /dev/null
+++ b/sample/Makefile
@@ -0,0 +1,23 @@
+include ../common.mk
+
+TARGET=$(TEST_FILE)
+LIBS=
+
+SRC=$(wildcard *.cpp)
+
+all: $(TARGET)
+
+test: $(TARGET)
+	@$(SAMPLE_TEST)
+
+$(OBJDIR):
+	@$(MKDIR) $(OBJDIR)
+
+clean:
+	$(CLEAN)
+
+$(LIBS):
+	$(MAKE) -C ../src
+
+-include $(DEPEND_FILE)
+
diff --git a/sample/ecdh_smpl.cpp b/sample/ecdh_smpl.cpp
new file mode 100644
index 0000000..91bb9f6
--- /dev/null
+++ b/sample/ecdh_smpl.cpp
@@ -0,0 +1,69 @@
+/*
+	sample of Elliptic Curve Diffie-Hellman key sharing
+*/
+#include <iostream>
+#include <fstream>
+#include <cybozu/random_generator.hpp>
+#include <mcl/fp.hpp>
+#include <mcl/gmp_util.hpp>
+#include <mcl/ecparam.hpp>
+#include <mcl/ec.hpp>
+#include <mcl/fp.hpp>
+typedef mcl::FpT<> Fp;
+
+struct ZnTag;
+
+typedef mcl::EcT<Fp> Ec;
+typedef mcl::FpT<ZnTag> Zn;
+
+int main()
+{
+	cybozu::RandomGenerator rg;
+	/*
+		system setup with a parameter secp192k1 recommended by SECG
+		Ec is an elliptic curve over Fp
+		the cyclic group of <P> is isomorphic to Zn
+	*/
+	const mcl::EcParam& para = mcl::ecparam::secp192k1;
+	Zn::setModulo(para.n);
+	Fp::setModulo(para.p);
+	Ec::setParam(para.a, para.b);
+	const Ec P(Fp(para.gx), Fp(para.gy));
+
+	/*
+		Alice setups a private key a and public key aP
+	*/
+	Zn a;
+	Ec aP;
+
+	a.setRand(rg);
+	Ec::power(aP, P, a); // aP = a * P;
+
+	std::cout << "aP=" << aP << std::endl;
+
+	/*
+		Bob setups a private key b and public key bP
+	*/
+	Zn b;
+	Ec bP;
+
+	b.setRand(rg);
+	Ec::power(bP, P, b); // bP = b * P;
+
+	std::cout << "bP=" << bP << std::endl;
+
+	Ec abP, baP;
+
+	// Alice uses bP(B's public key) and a(A's priavte key)
+	Ec::power(abP, bP, a); // abP = a * (bP)
+
+	// Bob uses aP(A's public key) and b(B's private key)
+	Ec::power(baP, aP, b); // baP = b * (aP)
+
+	if (abP == baP) {
+		std::cout << "key sharing succeed:" << abP << std::endl;
+	} else {
+		std::cout << "ERR(not here)" << std::endl;
+	}
+}
+
diff --git a/sample/random_smpl.cpp b/sample/random_smpl.cpp
new file mode 100644
index 0000000..19944de
--- /dev/null
+++ b/sample/random_smpl.cpp
@@ -0,0 +1,29 @@
+#include <mcl/fp.hpp>
+#include <mcl/gmp_util.hpp>
+#include <mcl/ecparam.hpp>
+#include <cybozu/random_generator.hpp>
+#include <map>
+#include <mcl/fp.hpp>
+typedef mcl::FpT<> Fp;
+
+typedef std::map<std::string, int> Map;
+
+int main(int argc, char *argv[])
+{
+	cybozu::RandomGenerator rg;
+	const char *p = mcl::ecparam::secp192k1.p;
+	if (argc == 2) {
+		p = argv[1];
+	}
+	Fp::setModulo(p);
+	Fp x;
+	printf("p=%s\n", p);
+	Map m;
+	for (int i = 0; i < 10000; i++) {
+		x.setRand(rg);
+		m[x.toStr(16)]++;
+	}
+	for (Map::const_iterator i = m.begin(), ie = m.end(); i != ie; ++i) {
+		printf("%s %d\n", i->first.c_str(), i->second);
+	}
+}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..37c0873
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,42 @@
+VER=-3.5
+LLC=llc$(VER)
+OPT=opt$(VER)
+DIS=llvm-dis$(VER)
+ASM=llvm-as$(VER)
+OPT_LLC= $(OPT) -O3 -o - | $(LLC) -O3 -o -
+
+SRC = once.txt all.txt long.txt short.txt mul.txt
+TARGET=x64.s x86.s arm.s arm64.s
+AFLAGS=-mattr=bmi2
+all: $(TARGET)
+
+base64.ll: gen.py $(SRC)
+	python gen.py 64
+
+base32.ll: gen.py $(SRC)
+	python gen.py 32
+
+x64: base64.ll
+	$(LLC) base64.ll -o - -x86-asm-syntax=intel
+x86: base32.ll
+	$(LLC) base32.ll -o - -x86-asm-syntax=intel -march=x86
+arm64: base64.ll
+	$(LLC) base64.ll -o - -march=aarch64
+
+arm: base32.ll
+	$(LLC) base32.ll -o - -march=arm
+
+opt: base64.ll
+	cat base64.ll|$(OPT_LLC) -x86-asm-syntax=intel $(AFLAGS)
+
+x64.s: base64.ll
+	cat base64.ll|$(OPT_LLC) $(AFLAGS) > x64.s
+x86.s: base32.ll
+	cat base32.ll|$(OPT_LLC) $(AFLAGS) -march=x86 > x86.s
+arm.s: base32.ll
+	cat base32.ll|$(OPT_LLC) -march=arm > arm.s
+arm64.s: base32.ll
+	cat base64.ll|$(OPT_LLC) -march=aarch64 > arm64.s
+clean:
+	rm -rf base*.ll *.s
+
diff --git a/src/all.txt b/src/all.txt
new file mode 100644
index 0000000..0dbd4f9
--- /dev/null
+++ b/src/all.txt
@@ -0,0 +1,7 @@
+declare { i$(bit), i1 } @llvm.usub.with.overflow.i$(bit)(i$(bit) %x, i$(bit) %y)
+
+define i$(unit) @extract$(bit+unit)(i$(bit+unit) %x, i$(bit+unit) %shift) {
+	%t0 = lshr i$(bit+unit) %x, %shift
+	%t1 = trunc i$(bit+unit) %t0 to i$(unit)
+	ret i$(unit) %t1
+}
diff --git a/src/gen.py b/src/gen.py
new file mode 100644
index 0000000..acdd1ab
--- /dev/null
+++ b/src/gen.py
@@ -0,0 +1,187 @@
+import sys, re
+
+# @for <var>, <begin>, <end>
+RE_FOR = re.compile(r'@for\s+(\w+)\s*,\s*([^ ]+)\s*,\s*([^ ]+)')
+# $(<exp>)
+RE_VAL = re.compile(r'\$\(([^)]+)\)')
+# @define <var>=<exp>
+RE_DEFINE = re.compile(r'@define\s+(\w+)\s*=(.*)')
+# @if <exp>
+RE_IF = re.compile(r'@if\s+(.*)')
+# @elif <exp>
+RE_ELIF = re.compile(r'@elif\s+(.*)')
+
+def evalStr(s, envG, envL={}):
+	def eval2str(x):
+		s = x.group(1)
+		v = eval(s, envG, envL)
+		return str(v)
+	s = RE_VAL.sub(eval2str, s)
+	return s
+
+def parseDefine(s, envG, envL):
+	"""
+	if s is @define statement, then update envL and return True
+	otherwise return False
+	"""
+	p = RE_DEFINE.match(s)
+	if not p:
+		return False
+	lhs = p.group(1).strip()
+	rhs = p.group(2).strip()
+	envL[lhs] = eval(rhs, envG, envL)
+	return True
+
+def parseFor(s, envG):
+	"""
+	@for i, 0, 3
+	<exp>
+	@endif
+
+	|
+	v
+	@define i = 0
+	<exp>
+	exp
+	@define i = 1
+	<exp>
+	@define i = 2
+	<exp>
+
+	"""
+	out = ""
+	inFor = False
+	envL = {}
+	for line in s.split('\n'):
+		stripped = line.strip()
+		# save @define for parseIf
+		parseDefine(stripped, envG, envL)
+		if inFor:
+			if line.strip() == '@endfor':
+				inFor = False
+				for i in xrange(b, e):
+					out += "@define %s = %d\n" % (v, i)
+					out += sub
+			else:
+				sub += line + '\n'
+		else:
+			p = RE_FOR.search(stripped)
+			if p:
+				v = p.group(1).strip()
+				b = eval(p.group(2), envG)
+				e = eval(p.group(3), envG)
+				sub = ""
+				inFor = True
+			else:
+				out += line + '\n'
+	return out
+
+def parseIf(s, envG):
+	out = ""
+	IF_INIT = 0
+	IF_IF = 1
+	IF_ELSE = 2
+	ifState = IF_INIT
+	ifVar = False
+	# available variables in @(<expr>)
+	envL = {}
+	def evalIntLoc(s):
+		return eval(s, envG, envL)
+	for line in s.split('\n'):
+		stripped = line.strip()
+		# remove @define
+		if parseDefine(stripped, envG, envL):
+			continue
+		if ifState == IF_INIT:
+			p = RE_IF.match(stripped)
+			if p:
+				ifState = IF_IF
+				ifVar = evalIntLoc(p.group(1))
+				continue
+		elif ifState == IF_IF:
+			if stripped == '@endif':
+				ifState = IF_INIT
+				continue
+			elif stripped == '@else':
+				ifState = IF_ELSE
+				ifVar = not ifVar
+				continue
+			p = RE_ELIF.match(stripped)
+			if p:
+				ifVar = evalIntLoc(p.group(1))
+				continue
+			if not ifVar:
+				continue
+		elif ifState == IF_ELSE:
+			if stripped == '@endif':
+				ifState = IF_INIT
+				continue
+			if not ifVar:
+				continue
+		else:
+			raise Exception('bad state', ifState)
+		out += evalStr(line, envG, envL) + '\n'
+	return out
+
+def parse(s, unitL, bitL):
+	"""
+		eval "@(<expr>)" to integer
+
+		@for <var>, <begin>, <end>
+		...
+		@endfor
+
+		REMARK : @for is not nestable
+
+		@define <var> = <exp>
+		REMARK : var is global
+
+		@if <exp>
+		@elif <exp>
+		@endif
+
+		REMARK : @if is not nestable
+	"""
+	# available variables in @(<expr>)
+	envG = {
+		'unit' : unitL,
+		'bit' : bitL,
+		'N' : bitL / unitL,
+	}
+	s = parseFor(s, envG)
+	s = parseIf(s, envG)
+	return s
+
+def gen(fo, inLame, unitL, bitLL):
+	fi = open(inLame, 'r')
+	s = fi.read()
+	fi.close()
+	for bitL in bitLL:
+		t = parse(s, unitL, bitL)
+		fo.write(t)
+
+def main():
+	argv = sys.argv
+	args = len(argv)
+	unitL = 64
+	if args == 2:
+		unitL = int(argv[1])
+	if unitL not in [32, 64]:
+		print "bad unitL", unitL
+		exit(1)
+
+	outLame = 'base%d.ll' % unitL
+	fo = open(outLame, 'w')
+#	gen(fo, 't.txt', unitL, [unitL * 4])
+#	exit(1)
+	gen(fo, 'once.txt', unitL, [unitL * 2])
+
+	bitLL = range(unitL, 576 + 1, unitL)
+	gen(fo, 'all.txt', unitL, bitLL)
+	gen(fo, 'short.txt', unitL, bitLL)
+	gen(fo, 'long.txt', unitL, bitLL)
+	gen(fo, 'mul.txt', unitL, bitLL[1:])
+	fo.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/long.txt b/src/long.txt
new file mode 100644
index 0000000..31082a7
--- /dev/null
+++ b/src/long.txt
@@ -0,0 +1,54 @@
+define void @mcl_fp_add$(bit)L(i$(bit)* %pz, i$(bit)* %px, i$(bit)* %py, i$(bit)* %pp) {
+	%x = load i$(bit)* %px
+	%y = load i$(bit)* %py
+	%p = load i$(bit)* %pp
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%p1 = zext i$(bit) %p to i$(bit+unit)
+	%t0 = add i$(bit+unit) %x1, %y1 ; x + y
+	%t1 = trunc i$(bit+unit) %t0 to i$(bit)
+	store i$(bit) %t1, i$(bit)* %pz
+	%vc = sub i$(bit+unit) %t0, %p1
+	%c = lshr i$(bit+unit) %vc, $(bit+unit-1)
+	%c1 = trunc i$(bit+unit) %c to i1
+	br i1 %c1, label %carry, label %nocarry
+nocarry:
+	%v = trunc i$(bit+unit) %vc to i$(bit)
+	store i$(bit) %v, i$(bit)* %pz
+	ret void
+carry:
+	ret void
+}
+
+define internal { i$(bit), i$(unit) } @local_sbb$(bit)(i$(bit) %x, i$(bit) %y) {
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%v1 = sub i$(bit+unit) %x1, %y1
+	%v = trunc i$(bit+unit) %v1 to i$(bit)
+	%c = lshr i$(bit+unit) %v1, $(bit)
+	%c1 = trunc i$(bit+unit) %c to i$(unit)
+	%r1 = insertvalue { i$(bit), i$(unit) } undef, i$(bit) %v, 0
+	%r2 = insertvalue { i$(bit), i$(unit) } %r1, i$(unit) %c1, 1
+	ret { i$(bit), i$(unit) } %r2
+}
+
+define void @mcl_fp_sub$(bit)L(i$(bit)* %pz, i$(bit)* %px, i$(bit)* %py, i$(bit)* %pp) {
+	%x = load i$(bit)* %px
+	%y = load i$(bit)* %py
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%vc = sub i$(bit+unit) %x1, %y1
+	%v = trunc i$(bit+unit) %vc to i$(bit)
+	%c = lshr i$(bit+unit) %vc, $(bit+unit-1)
+	%c1 = trunc i$(bit+unit) %c to i1
+	store i$(bit) %v, i$(bit)* %pz
+	br i1 %c1, label %carry, label %nocarry
+nocarry:
+	ret void
+carry:
+	%p = load i$(bit)* %pp
+	%t = add i$(bit) %v, %p ; x - y + p
+	store i$(bit) %t, i$(bit)* %pz
+	ret void
+}
+
diff --git a/src/mul.txt b/src/mul.txt
new file mode 100644
index 0000000..4621c7a
--- /dev/null
+++ b/src/mul.txt
@@ -0,0 +1,81 @@
+@define bu = bit + unit
+define private i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %y)
+@if N > 4
+noinline
+@endif
+{
+@for i, 0, N
+  %x$(i) = call i$(unit) @extract$(bit)(i$(bit) %x, i$(bit) $(unit*i))
+  %x$(i)y = call i$(unit*2) @mul$(unit)x$(unit)(i$(unit) %x$(i), i$(unit) %y)
+  %x$(i)y0 = zext i$(unit*2) %x$(i)y to i$(bu)
+@endfor
+@for i, 1, N
+  %x$(i)y1 = shl i$(bu) %x$(i)y0, $(unit*i)
+@endfor
+  %t0 = add i$(bu) %x0y0, %x1y1
+@for i, 1, N-1
+  %t$(i) = add i$(bu) %t$(i-1), %x$(i+1)y1
+@endfor
+  ret i$(bu) %t$(N-2)
+}
+define void @mcl_fp_mul$(bit)pre(i$(unit)* %pz, i$(bit)* %px, i$(bit)* %py) {
+  %x = load i$(bit)* %px
+  %y = load i$(bit)* %py
+@for i, 0, N
+  %y$(i) = call i$(unit) @extract$(bit)(i$(bit) %y, i$(bit) $(unit*i))
+@endfor
+  %sum0 = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %y0)
+  %t0 = trunc i$(bu) %sum0 to i$(unit)
+  store i$(unit) %t0, i$(unit)* %pz
+@for i, 1, N
+
+  %s$(i-1) = lshr i$(bu) %sum$(i-1), $(unit)
+  %xy$(i) = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %y$(i))
+  %sum$(i) = add i$(bu) %s$(i-1), %xy$(i)
+  %z$(i) = getelementptr i$(unit)* %pz, i32 $(i)
+  @if i < N - 1
+  %ts$(i) = trunc i$(bu) %sum$(i) to i$(unit)
+  store i$(unit) %ts$(i), i$(unit)* %z$(i)
+  @endif
+@endfor
+  %p = bitcast i$(unit)* %z$(N-1) to i$(bu)*
+  store i$(bu) %sum$(N-1), i$(bu)* %p
+  ret void
+}
+
+@define bu = bit + unit
+@define bu2 = bit + unit * 2
+define void @mcl_fp_mont$(bit)(i$(bit)* %pz, i$(bit)* %px, i$(unit)* %py, i$(bit)* %pp, i$(unit) %r) {
+	%p = load i$(bit)* %pp
+	%x = load i$(bit)* %px
+
+@for i, 0, N
+	%py$(i) = getelementptr i$(unit)* %py, i$(unit) $(i)
+	%y$(i) = load i$(unit)* %py$(i)
+	%xy$(i) = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %y$(i))
+@if i == 0
+	%a0 = zext i$(bu) %xy0 to i$(bu2)
+
+	%at$(i) = trunc i$(bu) %xy$(i) to i$(unit)
+@else
+	%xye$(i) = zext i$(bu) %xy$(i) to i$(bu2)
+	%a$(i) = add i$(bu2) %s$(i-1), %xye$(i)
+	%at$(i) = trunc i$(bu2) %a$(i) to i$(unit)
+@endif
+	%q$(i) = mul i$(unit) %at$(i), %r
+	%pq$(i) = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %p, i$(unit) %q$(i))
+	%pqe$(i) = zext i$(bu) %pq$(i) to i$(bu2)
+	%t$(i) = add i$(bu2) %a$(i), %pqe$(i)
+	%s$(i) = lshr i$(bu2) %t$(i), $(unit)
+@endfor
+	%v = trunc i$(bu2) %s$(N-1) to i$(bu)
+	%pe = zext i$(bit) %p to i$(bu)
+	%vc = sub i$(bu) %v, %pe
+	%c = lshr i$(bu) %vc, $(bit)
+	%c1 = trunc i$(bu) %c to i1
+	%z = select i1 %c1, i$(bu) %v, i$(bu) %vc
+	%zt = trunc i$(bu) %z to i$(bit)
+	store i$(bit) %zt, i$(bit)* %pz
+	ret void
+}
+
diff --git a/src/once.txt b/src/once.txt
new file mode 100644
index 0000000..501fb2b
--- /dev/null
+++ b/src/once.txt
@@ -0,0 +1,74 @@
+
+define i$(unit*2) @mul$(unit)x$(unit)(i$(unit) %x, i$(unit) %y) {
+	%x0 = zext i$(unit) %x to i$(unit*2)
+	%y0 = zext i$(unit) %y to i$(unit*2)
+	%z = mul i$(unit*2) %x0, %y0
+	ret i$(unit*2) %z
+}
+
+; NIST_P192
+; 0xfffffffffffffffffffffffffffffffeffffffffffffffff
+;
+;       0                1                2
+; ffffffffffffffff fffffffffffffffe ffffffffffffffff
+;
+; p = (1 << 192) - (1 << 64) - 1
+; (1 << 192) % p = (1 << 64) + 1
+;
+; L : 192bit
+; Hi: 64bit
+; x = [H:L] = [H2:H1:H0:L]
+; mod p
+;    x = L + H + (H << 64)
+;      = L + H + [H1:H0:0] + H2 + (H2 << 64)
+;[e:t] = L + H + [H1:H0:H2] + [H2:0] ; 2bit(e) over
+;      = t + e + (e << 64)
+
+define internal i64 @extract192to64(i192 %x, i192 %shift) {
+	%t0 = lshr i192 %x, %shift
+	%t1 = trunc i192 %t0 to i64
+	ret i64 %t1
+}
+
+define internal void @modNIST_P192(i192* %out, i192* %px) {
+	%L192 = load i192* %px
+	%L = zext i192 %L192 to i256
+
+	%pH = getelementptr i192* %px, i32 1
+	%H192 = load i192* %pH
+	%H = zext i192 %H192 to i256
+
+	%H10_ = shl i192 %H192, 64
+	%H10 = zext i192 %H10_ to i256
+
+	%H2_ = call i64 @extract192to64(i192 %H192, i192 128)
+	%H2 = zext i64 %H2_ to i256
+	%H102 = or i256 %H10, %H2
+
+	%H2s = shl i256 %H2, 64
+
+	%t0 = add i256 %L, %H
+	%t1 = add i256 %t0, %H102
+	%t2 = add i256 %t1, %H2s
+
+	%e = lshr i256 %t2, 192
+	%t3 = trunc i256 %t2 to i192
+	%e1 = trunc i256 %e to i192
+
+
+	%t4 = add i192 %t3, %e1
+	%e2 = shl i192 %e1, 64
+	%t5 = add i192 %t4, %e2
+
+	store i192 %t5, i192* %out
+
+	ret void
+}
+
+define void @mcl_fp_mul_NIST_P192(i192* %pz, i192* %px, i192* %py) {
+	%buf = alloca i192, i32 2
+	%p = bitcast i192* %buf to i$(unit)*
+	call void @mcl_fp_mul192pre(i$(unit)* %p, i192* %px, i192* %py)
+	call void @modNIST_P192(i192* %pz, i192* %buf)
+	ret void
+}
diff --git a/src/short.txt b/src/short.txt
new file mode 100644
index 0000000..931a63f
--- /dev/null
+++ b/src/short.txt
@@ -0,0 +1,46 @@
+define void @mcl_fp_add$(bit)S(i$(bit)* %pz, i$(bit)* %px, i$(bit)* %py, i$(bit)* %pp) {
+entry:
+	%x = load i$(bit)* %px
+	%y = load i$(bit)* %py
+	%p = load i$(bit)* %pp
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%p1 = zext i$(bit) %p to i$(bit+unit)
+	%t0 = add i$(bit+unit) %x1, %y1 ; x + y
+	%t1 = sub i$(bit+unit) %t0, %p1 ; x + y - p
+	%t2 = lshr i$(bit+unit) %t1, $(bit)
+	%t3 = trunc i$(bit+unit) %t2 to i1
+	%t4 = select i1 %t3, i$(bit+unit) %t0, i$(bit+unit) %t1
+	%t5 = trunc i$(bit+unit) %t4 to i$(bit)
+	store i$(bit) %t5, i$(bit)* %pz
+	ret void
+}
+
+define internal { i$(bit), i$(unit) } @mcl_local_sbb$(bit)(i$(bit) %x, i$(bit) %y) {
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%v1 = sub i$(bit+unit) %x1, %y1
+	%v = trunc i$(bit+unit) %v1 to i$(bit)
+	%c = lshr i$(bit+unit) %v1, $(bit)
+	%c1 = trunc i$(bit+unit) %c to i$(unit)
+	%r1 = insertvalue { i$(bit), i$(unit) } undef, i$(bit) %v, 0
+	%r2 = insertvalue { i$(bit), i$(unit) } %r1, i$(unit) %c1, 1
+	ret { i$(bit), i$(unit) } %r2
+}
+
+define void @mcl_fp_sub$(bit)S(i$(bit)* %pz, i$(bit)* %px, i$(bit)* %py, i$(bit)* %pp) {
+	%x = load i$(bit)* %px
+	%y = load i$(bit)* %py
+	%x1 = zext i$(bit) %x to i$(bit+unit)
+	%y1 = zext i$(bit) %y to i$(bit+unit)
+	%vc = sub i$(bit+unit) %x1, %y1
+	%v = trunc i$(bit+unit) %vc to i$(bit)
+	%c = lshr i$(bit+unit) %vc, $(bit+unit-1)
+	%c1 = trunc i$(bit+unit) %c to i1
+	%p = load i$(bit)* %pp
+	%a = select i1 %c1, i$(bit) %p, i$(bit) 0
+	%v1 = add i$(bit) %v, %a
+	store i$(bit) %v1, i$(bit)* %pz
+	ret void
+}
+
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..b1e01dd
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,42 @@
+include ../common.mk
+
+ifeq ($(USE_MONT_FP),1)
+  CFLAGS += -DUSE_MONT_FP
+endif
+
+ifeq ($(USE_LLVM),1)
+  CFLAGS += -DMIE_USE_LLVM
+  ASM_SRC=../src/$(CPU).s
+  ASM_OBJ=$(ASM_SRC:.s=.o)
+  SRC+=$(ASM_SRC)
+  LDFLAGS+=$(ASM_OBJ)
+endif
+
+TARGET=$(TEST_FILE)
+LIBS=
+
+SRC=fp_test.cpp ec_test.cpp fp_util_test.cpp
+ifeq ($(CPU),x64)
+  SRC+=fp_generator_test.cpp mont_fp_test.cpp
+endif
+
+all: $(TARGET)
+
+test: $(TARGET) $(ASM_OBJ)
+	@$(UNIT_TEST)
+
+$(OBJDIR):
+	@$(MKDIR) $(OBJDIR)
+
+clean:
+	$(CLEAN)
+
+$(LIBS):
+	$(MAKE) -C ../src
+
+-include $(DEPEND_FILE)
+
+ifeq ($(USE_LLVM),1)
+$(ASM_OBJ): $(ASM_SRC)
+	$(CXX) $< -o $@ -c
+endif
diff --git a/test/base_test.cpp b/test/base_test.cpp
new file mode 100644
index 0000000..29a177f
--- /dev/null
+++ b/test/base_test.cpp
@@ -0,0 +1,392 @@
+#include <map>
+#define MCL_USE_LLVM
+#include <mcl/fp_base.hpp>
+#include <cybozu/test.hpp>
+#include <cybozu/benchmark.hpp>
+#include <cybozu/xorshift.hpp>
+#include <cybozu/bit_operation.hpp>
+#include <mcl/fp_util.hpp>
+#include <mcl/fp.hpp>
+
+#include <mcl/fp_generator.hpp>
+#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && (CYBOZU_OS_BIT == 64)
+	#define USE_XBYAK
+	static mcl::FpGenerator fg;
+#endif
+#define PUT(x) std::cout << #x "=" << (x) << std::endl
+
+const size_t MAX_N = 32;
+typedef mcl::fp::Unit Unit;
+
+size_t getUnitN(size_t bitLen)
+{
+	return (bitLen + sizeof(Unit) * 8 - 1) / (sizeof(Unit) * 8);
+}
+
+void setMpz(mpz_class& mx, const Unit *x, size_t n)
+{
+	mcl::Gmp::setRaw(mx, x, n);
+}
+void getMpz(Unit *x, size_t n, const mpz_class& mx)
+{
+	mcl::fp::local::toArray(x,  n, mx.get_mpz_t());
+}
+
+struct Montgomery {
+	mpz_class p_;
+	mpz_class R_; // (1 << (n_ * 64)) % p
+	mpz_class RR_; // (R * R) % p
+	Unit r_; // p * r = -1 mod M = 1 << 64
+	size_t n_;
+	Montgomery() {}
+	explicit Montgomery(const mpz_class& p)
+	{
+		p_ = p;
+		r_ = mcl::montgomery::getCoff(mcl::Gmp::getBlock(p, 0));
+		n_ = mcl::Gmp::getBlockSize(p);
+		R_ = 1;
+		R_ = (R_ << (n_ * 64)) % p_;
+		RR_ = (R_ * R_) % p_;
+	}
+
+	void toMont(mpz_class& x) const { mul(x, x, RR_); }
+	void fromMont(mpz_class& x) const { mul(x, x, 1); }
+
+	void mont(Unit *z, const Unit *x, const Unit *y) const
+	{
+		mpz_class mx, my;
+		setMpz(mx, x, n_);
+		setMpz(my, y, n_);
+		mul(mx, mx, my);
+		getMpz(z, n_, mx);
+	}
+	void mul(mpz_class& z, const mpz_class& x, const mpz_class& y) const
+	{
+#if 1
+		const size_t ySize = mcl::Gmp::getBlockSize(y);
+		mpz_class c = y == 0 ? mpz_class(0) : x * mcl::Gmp::getBlock(y, 0);
+		Unit q = c == 0 ? 0 : mcl::Gmp::getBlock(c, 0) * r_;
+		c += p_ * q;
+		c >>= sizeof(Unit) * 8;
+		for (size_t i = 1; i < n_; i++) {
+			if (i < ySize) {
+				c += x * mcl::Gmp::getBlock(y, i);
+			}
+			Unit q = c == 0 ? 0 : mcl::Gmp::getBlock(c, 0) * r_;
+			c += p_ * q;
+			c >>= sizeof(Unit) * 8;
+		}
+		if (c >= p_) {
+			c -= p_;
+		}
+		z = c;
+#else
+		z = x * y;
+		const size_t zSize = mcl::Gmp::getBlockSize(z);
+		for (size_t i = 0; i < n_; i++) {
+			if (i < zSize) {
+				Unit q = mcl::Gmp::getBlock(z, 0) * r_;
+				z += p_ * (mp_limb_t)q;
+			}
+			z >>= sizeof(Unit) * 8;
+		}
+		if (z >= p_) {
+			z -= p_;
+		}
+#endif
+	}
+};
+
+void put(const char *msg, const Unit *x, size_t n)
+{
+	printf("%s ", msg);
+	for (size_t i = 0; i < n; i++) printf("%016llx ", (long long)x[n - 1 - i]);
+	printf("\n");
+}
+void verifyEqual(const Unit *x, const Unit *y, size_t n, const char *file, int line)
+{
+	bool ok = mcl::fp::local::isEqualArray(x, y, n);
+	CYBOZU_TEST_ASSERT(ok);
+	if (ok) return;
+	printf("%s:%d\n", file, line);
+	put("L", x, n);
+	put("R", y, n);
+	exit(1);
+}
+#define VERIFY_EQUAL(x, y, n) verifyEqual(x, y, n, __FILE__, __LINE__)
+
+void addC(Unit *z, const Unit *x, const Unit *y, const Unit *p, size_t n)
+{
+	mpz_class mx, my, mp;
+	setMpz(mx, x, n);
+	setMpz(my, y, n);
+	setMpz(mp, p, n);
+	mx += my;
+	if (mx >= mp) mx -= mp;
+	getMpz(z, n, mx);
+}
+void subC(Unit *z, const Unit *x, const Unit *y, const Unit *p, size_t n)
+{
+	mpz_class mx, my, mp;
+	setMpz(mx, x, n);
+	setMpz(my, y, n);
+	setMpz(mp, p, n);
+	mx -= my;
+	if (mx < 0) mx += mp;
+	getMpz(z, n, mx);
+}
+static inline void set_zero(mpz_t& z, Unit *p, size_t n)
+{
+	z->_mp_alloc = (int)n;
+	z->_mp_size = 0;
+	z->_mp_d = (mp_limb_t*)p;
+}
+static inline void set_mpz_t(mpz_t& z, const Unit* p, int n)
+{
+	z->_mp_alloc = n;
+	int i = n;
+	while (i > 0 && p[i - 1] == 0) {
+		i--;
+	}
+	z->_mp_size = i;
+	z->_mp_d = (mp_limb_t*)p;
+}
+
+// z[2n] <- x[n] * y[n]
+void mulPreC(Unit *z, const Unit *x, const Unit *y, size_t n)
+{
+#if 1
+	mpz_t mx, my, mz;
+	set_zero(mz, z, n * 2);
+	set_mpz_t(mx, x, n);
+	set_mpz_t(my, y, n);
+	mpz_mul(mz, mx, my);
+	mcl::fp::local::toArray(z, n * 2, mz);
+#else
+	mpz_class mx, my;
+	setMpz(mx, x, n);
+	setMpz(my, y, n);
+	mx *= my;
+	getMpz(z, n * 2, mx);
+#endif
+}
+
+void modC(Unit *y, const Unit *x, const Unit *p, size_t n)
+{
+	mpz_t mx, my, mp;
+	set_mpz_t(mx, x, n * 2);
+	set_mpz_t(my, y, n);
+	set_mpz_t(mp, p, n);
+	mpz_mod(my, mx, mp);
+	mcl::fp::local::clearArray(y, my->_mp_size, n);
+}
+
+void mul(Unit *z, const Unit *x, const Unit *y, const Unit *p, size_t n)
+{
+	Unit ret[MAX_N * 2];
+	mpz_t mx, my, mz, mp;
+	set_zero(mz, ret, MAX_N * 2);
+	set_mpz_t(mx, x, n);
+	set_mpz_t(my, y, n);
+	set_mpz_t(mp, p, n);
+	mpz_mul(mz, mx, my);
+	mpz_mod(mz, mz, mp);
+	mcl::fp::local::toArray(z, n, mz);
+}
+
+typedef mcl::fp::void3op void3op;
+typedef mcl::fp::void4op void4op;
+typedef mcl::fp::void4Iop void4Iop;
+
+const struct FuncOp {
+	size_t bitLen;
+	void4op addS;
+	void4op addL;
+	void4op subS;
+	void4op subL;
+	void3op mulPre;
+	void4Iop mont;
+} gFuncOpTbl[] = {
+	{ 128, mcl_fp_add128S, mcl_fp_add128L, mcl_fp_sub128S, mcl_fp_sub128L, mcl_fp_mul128pre, mcl_fp_mont128 },
+	{ 192, mcl_fp_add192S, mcl_fp_add192L, mcl_fp_sub192S, mcl_fp_sub192L, mcl_fp_mul192pre, mcl_fp_mont192 },
+	{ 256, mcl_fp_add256S, mcl_fp_add256L, mcl_fp_sub256S, mcl_fp_sub256L, mcl_fp_mul256pre, mcl_fp_mont256 },
+	{ 320, mcl_fp_add320S, mcl_fp_add320L, mcl_fp_sub320S, mcl_fp_sub320L, mcl_fp_mul320pre, mcl_fp_mont320 },
+	{ 384, mcl_fp_add384S, mcl_fp_add384L, mcl_fp_sub384S, mcl_fp_sub384L, mcl_fp_mul384pre, mcl_fp_mont384 },
+	{ 448, mcl_fp_add448S, mcl_fp_add448L, mcl_fp_sub448S, mcl_fp_sub448L, mcl_fp_mul448pre, mcl_fp_mont448 },
+	{ 512, mcl_fp_add512S, mcl_fp_add512L, mcl_fp_sub512S, mcl_fp_sub512L, mcl_fp_mul512pre, mcl_fp_mont512 },
+#if CYBOZU_OS_BIT == 32
+	{ 160, mcl_fp_add160S, mcl_fp_add160L, mcl_fp_sub160S, mcl_fp_sub160L, mcl_fp_mul160pre, mcl_fp_mont160 },
+	{ 224, mcl_fp_add224S, mcl_fp_add224L, mcl_fp_sub224S, mcl_fp_sub224L, mcl_fp_mul224pre, mcl_fp_mont224 },
+	{ 288, mcl_fp_add288S, mcl_fp_add288L, mcl_fp_sub288S, mcl_fp_sub288L, mcl_fp_mul288pre, mcl_fp_mont288 },
+	{ 352, mcl_fp_add352S, mcl_fp_add352L, mcl_fp_sub352S, mcl_fp_sub352L, mcl_fp_mul352pre, mcl_fp_mont352 },
+	{ 416, mcl_fp_add416S, mcl_fp_add416L, mcl_fp_sub416S, mcl_fp_sub416L, mcl_fp_mul416pre, mcl_fp_mont416 },
+	{ 480, mcl_fp_add480S, mcl_fp_add480L, mcl_fp_sub480S, mcl_fp_sub480L, mcl_fp_mul480pre, mcl_fp_mont480 },
+	{ 544, mcl_fp_add544S, mcl_fp_add544L, mcl_fp_sub544S, mcl_fp_sub544L, mcl_fp_mul544pre, mcl_fp_mont544 },
+#else
+	{ 576, mcl_fp_add576S, mcl_fp_add576L, mcl_fp_sub576S, mcl_fp_sub576L, mcl_fp_mul576pre, mcl_fp_mont576 },
+#endif
+};
+
+FuncOp getFuncOp(size_t bitLen)
+{
+	typedef std::map<size_t, FuncOp> Map;
+	static Map map;
+	static bool init = false;
+	if (!init) {
+		init = true;
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(gFuncOpTbl); i++) {
+			map[gFuncOpTbl[i].bitLen] = gFuncOpTbl[i];
+		}
+	}
+	for (Map::const_iterator i = map.begin(), ie = map.end(); i != ie; ++i) {
+		if (bitLen <= i->second.bitLen) {
+			return i->second;
+		}
+	}
+	printf("ERR bitLen=%d\n", (int)bitLen);
+	exit(1);
+}
+
+void test(const Unit *p, size_t bitLen)
+{
+	printf("bitLen %d\n", (int)bitLen);
+	const size_t n = getUnitN(bitLen);
+#ifdef NDEBUG
+	bool doBench = true;
+#else
+	bool doBench = false;
+#endif
+	const FuncOp funcOp = getFuncOp(bitLen);
+	const void4op addS = funcOp.addS;
+	const void4op addL = funcOp.addL;
+	const void4op subS = funcOp.subS;
+	const void4op subL = funcOp.subL;
+	const void3op mulPre = funcOp.mulPre;
+	const void4Iop mont = funcOp.mont;
+
+	mcl::fp::Unit x[MAX_N], y[MAX_N];
+	mcl::fp::Unit z[MAX_N], w[MAX_N];
+	mcl::fp::Unit z2[MAX_N * 2];
+	mcl::fp::Unit w2[MAX_N * 2];
+	cybozu::XorShift rg;
+	mcl::fp::getRandVal(x, rg, p, bitLen);
+	mcl::fp::getRandVal(y, rg, p, bitLen);
+	const size_t C = 10;
+
+	addC(z, x, y, p, n);
+	addS(w, x, y, p);
+	VERIFY_EQUAL(z, w, n);
+	for (size_t i = 0; i < C; i++) {
+		addC(z, y, z, p, n);
+		addS(w, y, w, p);
+		VERIFY_EQUAL(z, w, n);
+		addC(z, y, z, p, n);
+		addL(w, y, w, p);
+		VERIFY_EQUAL(z, w, n);
+		subC(z, x, z, p, n);
+		subS(w, x, w, p);
+		VERIFY_EQUAL(z, w, n);
+		subC(z, x, z, p, n);
+		subL(w, x, w, p);
+		VERIFY_EQUAL(z, w, n);
+		mulPreC(z2, x, z, n);
+		mulPre(w2, x, z);
+		VERIFY_EQUAL(z2, w2, n * 2);
+	}
+	{
+		mpz_class mp;
+		setMpz(mp, p, n);
+		Montgomery m(mp);
+#ifdef USE_XBYAK
+		if (bitLen > 128) fg.init(p, n);
+#endif
+		/*
+			real mont
+			   0    0
+			   1    R^-1
+			   R    1
+			  -1    -R^-1
+			  -R    -1
+		*/
+		mpz_class t = 1;
+		const mpz_class R = (t << (n * 64)) % mp;
+		const mpz_class tbl[] = {
+			0, 1, R, mp - 1, mp - R
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			const mpz_class& mx = tbl[i];
+			for (size_t j = i; j < CYBOZU_NUM_OF_ARRAY(tbl); j++) {
+				const mpz_class& my = tbl[j];
+				getMpz(x, n, mx);
+				getMpz(y, n, my);
+				m.mont(z, x, y);
+				mont(w, x, y, p, m.r_);
+				VERIFY_EQUAL(z, w, n);
+#ifdef USE_XBYAK
+				if (bitLen > 128) {
+					fg.mul_(w, x, y);
+					VERIFY_EQUAL(z, w, n);
+				}
+#endif
+			}
+		}
+		if (doBench) {
+//			CYBOZU_BENCH("montC", m.mont, x, y, x);
+			CYBOZU_BENCH("montA  ", mont, x, y, x, p, m.r_);
+		}
+	}
+	if (doBench) {
+//		CYBOZU_BENCH("addS", addS, x, y, x, p); // slow
+//		CYBOZU_BENCH("subS", subS, x, y, x, p);
+//		CYBOZU_BENCH("addL", addL, x, y, x, p);
+//		CYBOZU_BENCH("subL", subL, x, y, x, p);
+		CYBOZU_BENCH("mulPreA", mulPre, w2, y, x);
+		CYBOZU_BENCH("mulPreC", mulPreC, w2, y, x, n);
+		CYBOZU_BENCH("modC   ", modC, x, w2, p, n);
+	}
+#ifdef USE_XBYAK
+	if (bitLen <= 128) return;
+	if (doBench) {
+		fg.init(p, n);
+		CYBOZU_BENCH("addA   ", fg.add_, x, y, x);
+		CYBOZU_BENCH("subA   ", fg.sub_, x, y, x);
+//		CYBOZU_BENCH("mulA", fg.mul_, x, y, x);
+	}
+#endif
+	printf("mont test %d\n", (int)bitLen);
+}
+
+CYBOZU_TEST_AUTO(all)
+{
+	const struct {
+		size_t n;
+		const uint64_t p[9];
+	} tbl[] = {
+//		{ 2, { 0xf000000000000001, 1, } },
+		{ 2, { 0x000000000000001d, 0x8000000000000000, } },
+		{ 3, { 0x000000000000012b, 0x0000000000000000, 0x0000000080000000, } },
+//		{ 3, { 0x0f69466a74defd8d, 0xfffffffe26f2fc17, 0x07ffffffffffffff, } },
+//		{ 3, { 0x7900342423332197, 0x1234567890123456, 0x1480948109481904, } },
+		{ 3, { 0x0f69466a74defd8d, 0xfffffffe26f2fc17, 0xffffffffffffffff, } },
+//		{ 4, { 0x7900342423332197, 0x4242342420123456, 0x1234567892342342, 0x1480948109481904, } },
+//		{ 4, { 0x0f69466a74defd8d, 0xfffffffe26f2fc17, 0x17ffffffffffffff, 0x1513423423423415, } },
+		{ 4, { 0xa700000000000013, 0x6121000000000013, 0xba344d8000000008, 0x2523648240000001, } },
+//		{ 5, { 0x0000000000000009, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, } },
+		{ 5, { 0xfffffffffffffc97, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } },
+//		{ 6, { 0x4720422423332197, 0x0034230847204720, 0x3456789012345679, 0x4820984290482212, 0x9482094820948209, 0x0194810841094810, } },
+//		{ 6, { 0x7204224233321972, 0x0342308472047204, 0x4567890123456790, 0x0948204204243123, 0x2098420984209482, 0x2093482094810948, } },
+		{ 6, { 0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } },
+//		{ 7, { 0x0000000000000063, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, } },
+		{ 7, { 0x000000000fffcff1, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } },
+		{ 8, { 0xffffffffffffd0c9, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } },
+		{ 9, { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x00000000000001ff, } },
+//		{ 9, { 0x4720422423332197, 0x0034230847204720, 0x3456789012345679, 0x2498540975555312, 0x9482904924029424, 0x0948209842098402, 0x1098410948109482, 0x0820958209582094, 0x0000000000000029, } },
+//		{ 9, { 0x0f69466a74defd8d, 0xfffffffe26f2fc17, 0x7fffffffffffffff, 0x8572938572398583, 0x5732057823857293, 0x9820948205872380, 0x3409238420492034, 0x9483842098340298, 0x0000000000000003, } },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		const size_t n = tbl[i].n;
+		const size_t bitLen = (n - 1) * 64 + cybozu::bsr<uint64_t>(tbl[i].p[n - 1]) + 1;
+		test((const Unit*)tbl[i].p, bitLen);
+	}
+}
+
diff --git a/test/ec_test.cpp b/test/ec_test.cpp
new file mode 100644
index 0000000..1255a1d
--- /dev/null
+++ b/test/ec_test.cpp
@@ -0,0 +1,397 @@
+#define PUT(x) std::cout << #x "=" << (x) << std::endl
+#define CYBOZU_TEST_DISABLE_AUTO_RUN
+#include <cybozu/test.hpp>
+#include <cybozu/benchmark.hpp>
+#include <mcl/gmp_util.hpp>
+
+#include <mcl/fp.hpp>
+typedef mcl::FpT<> Fp_3;
+typedef mcl::FpT<> Fp_4;
+typedef mcl::FpT<> Fp_6;
+typedef mcl::FpT<> Fp_9;
+#include <mcl/ec.hpp>
+#include <mcl/ecparam.hpp>
+#include <time.h>
+
+struct tagZn;
+typedef mcl::FpT<tagZn> Zn;
+
+template<class Fp>
+struct Test {
+	typedef mcl::EcT<Fp> Ec;
+	const mcl::EcParam& para;
+	Test(const mcl::EcParam& para)
+		: para(para)
+	{
+		Fp::setModulo(para.p);
+		Zn::setModulo(para.n);
+		Ec::setParam(para.a, para.b);
+//		CYBOZU_TEST_EQUAL(para.bitLen, Fp(-1).getBitLen());
+	}
+	void cstr() const
+	{
+		Ec O;
+		CYBOZU_TEST_ASSERT(O.isZero());
+		Ec P;
+		Ec::neg(P, O);
+		CYBOZU_TEST_EQUAL(P, O);
+	}
+	void ope() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Zn n = 0;
+		CYBOZU_TEST_ASSERT(Ec::isValid(x, y));
+		Ec P(x, y), Q, R, O;
+		{
+			Ec::neg(Q, P);
+			CYBOZU_TEST_EQUAL(Q.x, P.x);
+			CYBOZU_TEST_EQUAL(Q.y, -P.y);
+
+			R = P + Q;
+			CYBOZU_TEST_ASSERT(R.isZero());
+
+			R = P + O;
+			CYBOZU_TEST_EQUAL(R, P);
+			R = O + P;
+			CYBOZU_TEST_EQUAL(R, P);
+		}
+
+		{
+			Ec::dbl(R, P);
+			Ec R2 = P + P;
+			CYBOZU_TEST_EQUAL(R, R2);
+			{
+				Ec P2 = P;
+				Ec::dbl(P2, P2);
+				CYBOZU_TEST_EQUAL(P2, R2);
+			}
+			Ec R3L = R2 + P;
+			Ec R3R = P + R2;
+			CYBOZU_TEST_EQUAL(R3L, R3R);
+			{
+				Ec RR = R2;
+				RR = RR + P;
+				CYBOZU_TEST_EQUAL(RR, R3L);
+				RR = R2;
+				RR = P + RR;
+				CYBOZU_TEST_EQUAL(RR, R3L);
+				RR = P;
+				RR = RR + RR;
+				CYBOZU_TEST_EQUAL(RR, R2);
+			}
+			Ec::power(R, P, 2);
+			CYBOZU_TEST_EQUAL(R, R2);
+			Ec R4L = R3L + R2;
+			Ec R4R = R2 + R3L;
+			CYBOZU_TEST_EQUAL(R4L, R4R);
+			Ec::power(R, P, 5);
+			CYBOZU_TEST_EQUAL(R, R4L);
+		}
+		{
+			R = P;
+			for (int i = 0; i < 10; i++) {
+				R += P;
+			}
+			Ec R2;
+			Ec::power(R2, P, 11);
+			CYBOZU_TEST_EQUAL(R, R2);
+		}
+		Ec::power(R, P, n - 1);
+		CYBOZU_TEST_EQUAL(R, -P);
+		R += P; // Ec::power(R, P, n);
+		CYBOZU_TEST_ASSERT(R.isZero());
+	}
+
+	void power() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q;
+		Ec R;
+		for (int i = 0; i < 100; i++) {
+			Ec::power(Q, P, i);
+			CYBOZU_TEST_EQUAL(Q, R);
+			R += P;
+		}
+	}
+
+	void neg_power() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q;
+		Ec R;
+		for (int i = 0; i < 100; i++) {
+			Ec::power(Q, P, -i);
+			CYBOZU_TEST_EQUAL(Q, R);
+			R -= P;
+		}
+	}
+	void squareRoot() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		bool odd = Fp::isYodd(y);
+		Fp yy;
+		Ec::getYfromX(yy, x, odd);
+		CYBOZU_TEST_EQUAL(yy, y);
+		Fp::neg(y, y);
+		odd = Fp::isYodd(y);
+		yy.clear();
+		Ec::getYfromX(yy, x, odd);
+		CYBOZU_TEST_EQUAL(yy, y);
+	}
+	void power_fp() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q;
+		Ec R;
+		for (int i = 0; i < 100; i++) {
+			Ec::power(Q, P, Zn(i));
+			CYBOZU_TEST_EQUAL(Q, R);
+			R += P;
+		}
+	}
+	void binaryExpression() const
+	{
+		puts("test binaryExpression");
+		const Fp x(para.gx);
+		const Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q;
+		// not compressed
+		Ec::setCompressedExpression(false);
+		{
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		{
+			P = -P;
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		P.clear();
+		{
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		// compressed
+		Ec::setCompressedExpression(true);
+		P.set(x, y);
+		{
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		{
+			P = -P;
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		P.clear();
+		{
+			cybozu::BitVector bv;
+			P.appendToBitVec(bv);
+			Q.fromBitVec(bv);
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+	}
+	void str() const
+	{
+		puts("test str");
+		const Fp x(para.gx);
+		const Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q;
+		// not compressed
+		Ec::setCompressedExpression(false);
+		{
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		{
+			P = -P;
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		P.clear();
+		{
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		// compressed
+		Ec::setCompressedExpression(true);
+		P.set(x, y);
+		{
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		{
+			P = -P;
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+		P.clear();
+		{
+			std::stringstream ss;
+			ss << P;
+			ss >> Q;
+			CYBOZU_TEST_EQUAL(P, Q);
+		}
+	}
+
+	template<class F>
+	void test(F f, const char *msg) const
+	{
+		const int N = 300000;
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q = P + P + P;
+		clock_t begin = clock();
+		for (int i = 0; i < N; i++) {
+			f(Q, P, Q);
+		}
+		clock_t end = clock();
+		printf("%s %.2fusec\n", msg, (end - begin) / double(CLOCKS_PER_SEC) / N * 1e6);
+	}
+	/*
+		add 8.71usec -> 6.94
+		sub 6.80usec -> 4.84
+		dbl 9.59usec -> 7.75
+		pos 2730usec -> 2153
+	*/
+	void bench() const
+	{
+		Fp x(para.gx);
+		Fp y(para.gy);
+		Ec P(x, y);
+		Ec Q = P + P + P;
+		CYBOZU_BENCH("add", Ec::add, Q, P, Q);
+		CYBOZU_BENCH("sub", Ec::sub, Q, P, Q);
+		CYBOZU_BENCH("dbl", Ec::dbl, P, P);
+		Zn z("-3");
+		CYBOZU_BENCH("pow", Ec::power, P, P, z);
+	}
+/*
+Affine : sandy-bridge
+add 3.17usec
+sub 2.43usec
+dbl 3.32usec
+pow 905.00usec
+Jacobi
+add 2.34usec
+sub 2.65usec
+dbl 1.56usec
+pow 499.00usec
+*/
+	void run() const
+	{
+		cstr();
+		ope();
+		power();
+		neg_power();
+		power_fp();
+		binaryExpression();
+		squareRoot();
+		str();
+#ifdef NDEBUG
+		bench();
+#endif
+	}
+private:
+	Test(const Test&);
+	void operator=(const Test&);
+};
+
+template<class Fp>
+void test_sub(const mcl::EcParam *para, size_t paraNum)
+{
+	for (size_t i = 0; i < paraNum; i++) {
+		puts(para[i].name);
+		Test<Fp>(para[i]).run();
+	}
+}
+
+int g_partial = -1;
+
+CYBOZU_TEST_AUTO(all)
+{
+#ifdef USE_MONT_FP
+	puts("use MontFp");
+#else
+	puts("use GMP");
+#endif
+	if (g_partial & (1 << 3)) {
+		const struct mcl::EcParam para3[] = {
+	//		mcl::ecparam::p160_1,
+			mcl::ecparam::secp160k1,
+			mcl::ecparam::secp192k1,
+			mcl::ecparam::NIST_P192,
+		};
+		test_sub<Fp_3>(para3, CYBOZU_NUM_OF_ARRAY(para3));
+	}
+
+	if (g_partial & (1 << 4)) {
+		const struct mcl::EcParam para4[] = {
+			mcl::ecparam::secp224k1,
+			mcl::ecparam::secp256k1,
+			mcl::ecparam::NIST_P224,
+			mcl::ecparam::NIST_P256,
+		};
+		test_sub<Fp_4>(para4, CYBOZU_NUM_OF_ARRAY(para4));
+	}
+
+	if (g_partial & (1 << 6)) {
+		const struct mcl::EcParam para6[] = {
+	//		mcl::ecparam::secp384r1,
+			mcl::ecparam::NIST_P384,
+		};
+		test_sub<Fp_6>(para6, CYBOZU_NUM_OF_ARRAY(para6));
+	}
+
+	if (g_partial & (1 << 9)) {
+		const struct mcl::EcParam para9[] = {
+	//		mcl::ecparam::secp521r1,
+			mcl::ecparam::NIST_P521,
+		};
+		test_sub<Fp_9>(para9, CYBOZU_NUM_OF_ARRAY(para9));
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc == 1) {
+		g_partial = -1;
+	} else {
+		g_partial = 0;
+		for (int i = 1; i < argc; i++) {
+			g_partial |= 1 << atoi(argv[i]);
+		}
+	}
+	return cybozu::test::autoRun.run(argc, argv);
+}
diff --git a/test/fp_generator_test.cpp b/test/fp_generator_test.cpp
new file mode 100644
index 0000000..9a61ab2
--- /dev/null
+++ b/test/fp_generator_test.cpp
@@ -0,0 +1,222 @@
+#include <cybozu/test.hpp>
+#if CYBOZU_OS_BIT == 32
+// not support
+#else
+#include <mcl/gmp_util.hpp>
+#include <stdint.h>
+#include <string>
+#include <cybozu/itoa.hpp>
+#include <mcl/fp_generator.hpp>
+#include <mcl/fp.hpp>
+#include <iostream>
+#include <cybozu/xorshift.hpp>
+#include <cybozu/benchmark.hpp>
+
+typedef mcl::FpT<> Fp;
+
+const int MAX_N = 4;
+
+const char *primeTable[] = {
+	"7fffffffffffffffffffffffffffffff", // 127bit(not full)
+	"ffffffffffffffffffffffffffffff61", // 128bit(full)
+	"fffffffffffffffffffffffffffffffffffffffeffffee37", // 192bit(full)
+	"2523648240000001ba344d80000000086121000000000013a700000000000013", // 254bit(not full)
+};
+
+/*
+	p is output buffer
+	pStr is hex
+	return the size of p
+*/
+int convertToArray(uint64_t *p, const mpz_class& x)
+{
+	const int pn = int(sizeof(mp_limb_t) * x.get_mpz_t()->_mp_size / sizeof(*p));
+	if (pn > MAX_N) {
+		printf("pn(%d) is too large\n", pn);
+		exit(1);
+	}
+	const uint64_t *q = (const uint64_t*)x.get_mpz_t()->_mp_d;
+	std::copy(q, q + pn, p);
+	std::fill(p + pn, p + MAX_N, 0);
+	return pn;
+}
+int convertToArray(uint64_t *p, const char *pStr)
+{
+	mpz_class x;
+	x.set_str(pStr, 16);
+	return convertToArray(p, x);
+}
+
+struct Int {
+	int vn;
+	uint64_t v[MAX_N];
+	Int()
+		: vn(0)
+	{
+	}
+	explicit Int(int vn)
+	{
+		if (vn > MAX_N) {
+			printf("vn(%d) is too large\n", vn);
+			exit(1);
+		}
+		this->vn = vn;
+	}
+	void set(const char *str) { fromStr(str); }
+	void set(const Fp& rhs)
+	{
+		convertToArray(v, rhs.toGmp());
+	}
+	void set(const uint64_t* x)
+	{
+		for (int i = 0; i < vn; i++) v[i] = x[i];
+	}
+	void fromStr(const char *str)
+	{
+		convertToArray(v, str);
+	}
+	std::string toStr() const
+	{
+		std::string ret;
+		for (int i = 0; i < vn; i++) {
+			ret += cybozu::itohex(v[vn - 1 - i], false);
+		}
+		return ret;
+	}
+	void put(const char *msg = "") const
+	{
+		if (msg) printf("%s=", msg);
+		printf("%s\n", toStr().c_str());
+	}
+	bool operator==(const Int& rhs) const
+	{
+		if (vn != rhs.vn) return false;
+		for (int i = 0; i < vn; i++) {
+			if (v[i] != rhs.v[i]) return false;
+		}
+		return true;
+	}
+	bool operator!=(const Int& rhs) const { return !operator==(rhs); }
+	bool operator==(const Fp& rhs) const
+	{
+		Int t(vn);
+		t.set(rhs);
+		return operator==(t);
+	}
+	bool operator!=(const Fp& rhs) const { return !operator==(rhs); }
+};
+static inline std::ostream& operator<<(std::ostream& os, const Int& x)
+{
+	return os << x.toStr();
+}
+
+void testAddSub(const mcl::FpGenerator& fg, int pn)
+{
+	Fp x, y;
+	Int mx(pn), my(pn);
+	x.fromStr("0x8811aabb23427cc");
+	y.fromStr("0x8811aabb23427cc11");
+	mx.set(x);
+	my.set(y);
+	for (int i = 0; i < 30; i++) {
+		CYBOZU_TEST_EQUAL(mx, x);
+		x += x;
+		fg.add_(mx.v, mx.v, mx.v);
+	}
+	for (int i = 0; i < 30; i++) {
+		CYBOZU_TEST_EQUAL(mx, x);
+		x += y;
+		fg.add_(mx.v, mx.v, my.v);
+	}
+	for (int i = 0; i < 30; i++) {
+		CYBOZU_TEST_EQUAL(my, y);
+		y -= x;
+		fg.sub_(my.v, my.v, mx.v);
+	}
+}
+
+void testNeg(const mcl::FpGenerator& fg, int pn)
+{
+	Fp x;
+	Int mx(pn), my(pn);
+	const char *tbl[] = {
+		"0",
+		"0x12346",
+		"0x11223344556677881122334455667788",
+		"0x0abbccddeeffaabb0000000000000000",
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		x.fromStr(tbl[i]);
+		mx.set(x);
+		x = -x;
+		fg.neg_(mx.v, mx.v);
+		CYBOZU_TEST_EQUAL(mx, x);
+	}
+}
+
+void testMulI(const mcl::FpGenerator& fg, int pn)
+{
+	cybozu::XorShift rg;
+	for (int i = 0; i < 100; i++) {
+		uint64_t x[MAX_N];
+		uint64_t z[MAX_N + 1];
+		rg.read(x, pn);
+		uint64_t y = rg.get64();
+		mpz_class mx;
+		mcl::Gmp::setRaw(mx, x, pn);
+		mpz_class my;
+		mcl::Gmp::set(my, y);
+		mx *= my;
+		uint64_t d = fg.mulI_(z, x, y);
+		z[pn] = d;
+		mcl::Gmp::setRaw(my, z, pn + 1);
+		CYBOZU_TEST_EQUAL(mx, my);
+	}
+	{
+		uint64_t x[MAX_N];
+		uint64_t z[MAX_N + 1];
+		rg.read(x, pn);
+		uint64_t y = rg.get64();
+		CYBOZU_BENCH_C("mulI", 10000000, fg.mulI_, z, x, y);
+	}
+}
+
+void testShr1(const mcl::FpGenerator& fg, int pn)
+{
+	cybozu::XorShift rg;
+	for (int i = 0; i < 100; i++) {
+		uint64_t x[MAX_N];
+		uint64_t z[MAX_N];
+		rg.read(x, pn);
+		mpz_class mx;
+		mcl::Gmp::setRaw(mx, x, pn);
+		mx >>= 1;
+		fg.shr1_(z, x);
+		mpz_class my;
+		mcl::Gmp::setRaw(my, z, pn);
+		CYBOZU_TEST_EQUAL(mx, my);
+	}
+}
+
+void test(const char *pStr)
+{
+	Fp::setModulo(pStr, 16);
+	uint64_t p[MAX_N];
+	const int pn = convertToArray(p, pStr);
+	printf("pn=%d\n", pn);
+	mcl::FpGenerator fg;
+	fg.init(p, pn);
+	testAddSub(fg, pn);
+	testNeg(fg, pn);
+	testMulI(fg, pn);
+	testShr1(fg, pn);
+}
+
+CYBOZU_TEST_AUTO(all)
+{
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(primeTable); i++) {
+		printf("test prime i=%d\n", (int)i);
+		test(primeTable[i]);
+	}
+}
+#endif
diff --git a/test/fp_test.cpp b/test/fp_test.cpp
new file mode 100644
index 0000000..aac80a1
--- /dev/null
+++ b/test/fp_test.cpp
@@ -0,0 +1,465 @@
+#define PUT(x) std::cout << #x "=" << (x) << std::endl
+#include <cybozu/test.hpp>
+#include <mcl/fp.hpp>
+#include <cybozu/benchmark.hpp>
+#include <time.h>
+
+#ifdef _MSC_VER
+	#pragma warning(disable: 4127) // const condition
+#endif
+
+typedef mcl::FpT<> Fp;
+
+const int m = 65537;
+struct Init {
+	Init()
+	{
+		std::ostringstream ms;
+		ms << m;
+		Fp::setModulo(ms.str());
+	}
+};
+
+CYBOZU_TEST_SETUP_FIXTURE(Init);
+
+#ifndef MCL_ONLY_BENCH
+CYBOZU_TEST_AUTO(cstr)
+{
+	const struct {
+		const char *str;
+		int val;
+	} tbl[] = {
+		{ "0", 0 },
+		{ "1", 1 },
+		{ "123", 123 },
+		{ "0x123", 0x123 },
+		{ "0b10101", 21 },
+		{ "-123", m - 123 },
+		{ "-0x123", m - 0x123 },
+		{ "-0b10101", m - 21 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		// string cstr
+		Fp x(tbl[i].str);
+		CYBOZU_TEST_EQUAL(x, tbl[i].val);
+
+		// int cstr
+		Fp y(tbl[i].val);
+		CYBOZU_TEST_EQUAL(y, x);
+
+		// copy cstr
+		Fp z(x);
+		CYBOZU_TEST_EQUAL(z, x);
+
+		// assign int
+		Fp w;
+		w = tbl[i].val;
+		CYBOZU_TEST_EQUAL(w, x);
+
+		// assign self
+		Fp u;
+		u = w;
+		CYBOZU_TEST_EQUAL(u, x);
+
+		// conv
+		std::ostringstream os;
+		os << tbl[i].val;
+
+		std::string str;
+		x.toStr(str);
+		CYBOZU_TEST_EQUAL(str, os.str());
+	}
+}
+
+CYBOZU_TEST_AUTO(fromStr)
+{
+	const struct {
+		const char *in;
+		int out;
+		int base;
+	} tbl[] = {
+		{ "100", 100, 0 }, // set base = 10 if base = 0
+		{ "100", 4, 2 },
+		{ "100", 256, 16 },
+		{ "0b100", 4, 0 },
+		{ "0b100", 4, 2 },
+		{ "0x100", 256, 0 },
+		{ "0x100", 256, 16 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		Fp x;
+		x.fromStr(tbl[i].in, tbl[i].base);
+		CYBOZU_TEST_EQUAL(x, tbl[i].out);
+	}
+	// conflict prefix with base
+	Fp x;
+	CYBOZU_TEST_EXCEPTION(x.fromStr("0b100", 16), cybozu::Exception);
+	CYBOZU_TEST_EXCEPTION(x.fromStr("0x100", 2), cybozu::Exception);
+}
+
+CYBOZU_TEST_AUTO(stream)
+{
+	const struct {
+		const char *in;
+		int out10;
+		int out16;
+	} tbl[] = {
+		{ "100", 100, 256 }, // set base = 10 if base = 0
+		{ "0x100", 256, 256 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		{
+			std::istringstream is(tbl[i].in);
+			Fp x;
+			is >> x;
+			CYBOZU_TEST_EQUAL(x, tbl[i].out10);
+		}
+		{
+			std::istringstream is(tbl[i].in);
+			Fp x;
+			is >> std::hex >> x;
+			CYBOZU_TEST_EQUAL(x, tbl[i].out16);
+		}
+	}
+	std::istringstream is("0b100");
+	Fp x;
+	CYBOZU_TEST_EXCEPTION(is >> std::hex >> x, cybozu::Exception);
+}
+
+CYBOZU_TEST_AUTO(conv)
+{
+	const char *bin = "0b1001000110100";
+	const char *hex = "0x1234";
+	const char *dec = "4660";
+	Fp b(bin);
+	Fp h(hex);
+	Fp d(dec);
+	CYBOZU_TEST_EQUAL(b, h);
+	CYBOZU_TEST_EQUAL(b, d);
+
+	std::string str;
+	b.toStr(str, 2, true);
+	CYBOZU_TEST_EQUAL(str, bin);
+	b.toStr(str);
+	CYBOZU_TEST_EQUAL(str, dec);
+	b.toStr(str, 16, true);
+	CYBOZU_TEST_EQUAL(str, hex);
+}
+
+CYBOZU_TEST_AUTO(compare)
+{
+	const struct {
+		int lhs;
+		int rhs;
+		int cmp;
+	} tbl[] = {
+		{ 0, 0, 0 },
+		{ 1, 0, 1 },
+		{ 0, 1, -1 },
+		{ -1, 0, 1 }, // m-1, 0
+		{ 0, -1, -1 }, // 0, m-1
+		{ 123, 456, -1 },
+		{ 456, 123, 1 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		const Fp x(tbl[i].lhs);
+		const Fp y(tbl[i].rhs);
+		const int cmp = tbl[i].cmp;
+		if (cmp == 0) {
+			CYBOZU_TEST_EQUAL(x, y);
+		} else {
+			CYBOZU_TEST_ASSERT(x != y);
+		}
+	}
+	{
+		Fp x(5);
+		CYBOZU_TEST_ASSERT(x == 5);
+	}
+}
+
+CYBOZU_TEST_AUTO(modulo)
+{
+	std::ostringstream ms;
+	ms << m;
+
+	std::string str;
+	Fp::getModulo(str);
+	CYBOZU_TEST_EQUAL(str, ms.str());
+}
+
+CYBOZU_TEST_AUTO(ope)
+{
+	const struct {
+		int x;
+		int y;
+		int add; // x + y
+		int sub; // x - y
+		int mul; // x * y
+		int sqr; // x^2
+	} tbl[] = {
+		{ 0, 1, 1, m - 1, 0, 0 },
+		{ 9, 5, 14, 4, 45, 81 },
+		{ 10, 13, 23, m - 3, 130, 100 },
+		{ 2000, 1000, 3000, 1000, (2000 * 1000) % m, (2000 * 2000) % m },
+		{ 12345, 9999, 12345 + 9999, 12345 - 9999, (12345 * 9999) % m, (12345 * 12345) % m },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		const Fp x(tbl[i].x);
+		const Fp y(tbl[i].y);
+		Fp z;
+		Fp::add(z, x, y);
+		CYBOZU_TEST_EQUAL(z, tbl[i].add);
+		Fp::sub(z, x, y);
+		CYBOZU_TEST_EQUAL(z, tbl[i].sub);
+		Fp::mul(z, x, y);
+		CYBOZU_TEST_EQUAL(z, tbl[i].mul);
+
+		Fp r;
+		Fp::inv(r, y);
+		Fp::mul(z, z, r);
+		CYBOZU_TEST_EQUAL(z, tbl[i].x);
+		z = x + y;
+		CYBOZU_TEST_EQUAL(z, tbl[i].add);
+		z = x - y;
+		CYBOZU_TEST_EQUAL(z, tbl[i].sub);
+		z = x * y;
+		CYBOZU_TEST_EQUAL(z, tbl[i].mul);
+
+		Fp::square(z, x);
+		CYBOZU_TEST_EQUAL(z, tbl[i].sqr);
+
+		z = x / y;
+		z *= y;
+		CYBOZU_TEST_EQUAL(z, tbl[i].x);
+	}
+}
+
+struct tag2;
+
+CYBOZU_TEST_AUTO(power)
+{
+	Fp x, y, z;
+	x = 12345;
+	z = 1;
+	for (int i = 0; i < 100; i++) {
+		Fp::power(y, x, i);
+		CYBOZU_TEST_EQUAL(y, z);
+		z *= x;
+	}
+	typedef mcl::FpT<tag2, 128> Fp2;
+	Fp2::setModulo("1009");
+	x = 5;
+	Fp2 n = 3;
+	z = 3;
+	Fp::power(x, x, z);
+	CYBOZU_TEST_EQUAL(x, 125);
+	x = 5;
+	Fp::power(x, x, n);
+	CYBOZU_TEST_EQUAL(x, 125);
+}
+
+CYBOZU_TEST_AUTO(power_fp)
+{
+	Fp x, y, z;
+	x = 12345;
+	z = 1;
+	for (int i = 0; i < 100; i++) {
+		Fp::power(y, x, Fp(i));
+		CYBOZU_TEST_EQUAL(y, z);
+		z *= x;
+	}
+}
+
+struct TagAnother;
+
+CYBOZU_TEST_AUTO(another)
+{
+	typedef mcl::FpT<TagAnother, 128> G;
+	G::setModulo("13");
+	G a = 3;
+	G b = 9;
+	a *= b;
+	CYBOZU_TEST_EQUAL(a, 1);
+}
+
+
+CYBOZU_TEST_AUTO(setRaw)
+{
+	Fp::setModulo("1000000000000000000117");
+	char b1[] = { 0x56, 0x34, 0x12 };
+	Fp x;
+	x.setRaw(b1, 3);
+	CYBOZU_TEST_EQUAL(x, 0x123456);
+	int b2[] = { 0x12, 0x34 };
+	x.setRaw(b2, 2);
+	CYBOZU_TEST_EQUAL(x, Fp("0x3400000012"));
+	x.fromStr("0xffffffffffff");
+
+	Fp::setModulo("0x10000000000001234567a5");
+	const struct {
+		uint32_t buf[3];
+		size_t bufN;
+		const char *expected;
+	} tbl[] = {
+		{ { 0x234567a4, 0x00000001, 0x00100000}, 1, "0x234567a4" },
+		{ { 0x234567a4, 0x00000001, 0x00100000}, 2, "0x1234567a4" },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		x.setRaw(tbl[i].buf, tbl[i].bufN);
+		CYBOZU_TEST_EQUAL(x, Fp(tbl[i].expected));
+	}
+	uint32_t large[3] = { 0x234567a5, 0x00000001, 0x00100000};
+	CYBOZU_TEST_EXCEPTION(x.setRaw(large, 3), cybozu::Exception);
+}
+
+
+CYBOZU_TEST_AUTO(set64bit)
+{
+	Fp::setModulo("0x1000000000000000000f");
+	const struct {
+		const char *p;
+		int64_t i;
+	} tbl[] = {
+		{ "0x1234567812345678", int64_t(0x1234567812345678ull) },
+		{ "0xfffedcba987edcba997", -int64_t(0x1234567812345678ull) },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		Fp x(tbl[i].p);
+		Fp y(tbl[i].i);
+		CYBOZU_TEST_EQUAL(x, y);
+	}
+}
+
+CYBOZU_TEST_AUTO(getRaw)
+{
+	const struct {
+		const char *s;
+		uint32_t v[4];
+		size_t vn;
+	} tbl[] = {
+		{ "0", { 0, 0, 0, 0 }, 1 },
+		{ "1234", { 1234, 0, 0, 0 }, 1 },
+		{ "0xaabbccdd12345678", { 0x12345678, 0xaabbccdd, 0, 0 }, 2 },
+		{ "0x11112222333344445555666677778888", { 0x77778888, 0x55556666, 0x33334444, 0x11112222 }, 4 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		mpz_class x(tbl[i].s);
+		const size_t bufN = 8;
+		uint32_t buf[bufN];
+		size_t n = mcl::Gmp::getRaw(buf, bufN, x);
+		CYBOZU_TEST_EQUAL(n, tbl[i].vn);
+		CYBOZU_TEST_EQUAL_ARRAY(buf, tbl[i].v, n);
+	}
+}
+
+CYBOZU_TEST_AUTO(toStr)
+{
+	const char *tbl[] = {
+		"0x0",
+		"0x5",
+		"0x123",
+		"0x123456789012345679adbc",
+		"0xffffffff26f2fc170f69466a74defd8d",
+		"0x100000000000000000000000000000033",
+		"0x11ee12312312940000000000000000000000000002342343"
+	};
+	Fp::setModulo("0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d");
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		mpz_class x(tbl[i]);
+		Fp y(tbl[i]);
+		std::string xs, ys;
+		mcl::Gmp::toStr(xs, x, 16);
+		y.toStr(ys, 16);
+		CYBOZU_TEST_EQUAL(xs, ys);
+	}
+}
+
+CYBOZU_TEST_AUTO(binaryRepl)
+{
+	const struct {
+		const char *s;
+		size_t n;
+		uint32_t v[6];
+	} tbl[] = {
+		{ "0", 0, { 0, 0, 0, 0, 0 } },
+		{ "1234", 1, { 1234, 0, 0, 0, 0 } },
+		{ "0xaabbccdd12345678", 2, { 0x12345678, 0xaabbccdd, 0, 0, 0 } },
+		{ "0x11112222333344445555666677778888", 4, { 0x77778888, 0x55556666, 0x33334444, 0x11112222, 0 } },
+		{ "0x9911112222333344445555666677778888", 5, { 0x77778888, 0x55556666, 0x33334444, 0x11112222, 0x99, 0 } },
+	};
+	Fp::setModulo("0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d");
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		Fp x(tbl[i].s);
+		cybozu::BitVector bv;
+		x.appendToBitVec(bv);
+		CYBOZU_TEST_EQUAL(bv.size(), Fp::getModBitLen());
+		CYBOZU_TEST_EQUAL(bv.size(), Fp::getBitVecSize());
+		const Fp::BlockType *block = bv.getBlock();
+		if (sizeof(Fp::BlockType) == 4) {
+			CYBOZU_TEST_EQUAL_ARRAY(block, tbl[i].v, tbl[i].n);
+		} else {
+			const size_t n = (tbl[i].n + 1) / 2;
+			for (size_t j = 0; j < n; j++) {
+				uint64_t v = (uint64_t(tbl[i].v[j * 2 + 1]) << 32) | tbl[i].v[j * 2];
+				CYBOZU_TEST_EQUAL(block[j], v);
+			}
+		}
+		Fp y;
+		y.fromBitVec(bv);
+		CYBOZU_TEST_EQUAL(x, y);
+	}
+}
+#endif
+
+#ifdef NDEBUG
+void benchSub(const char *pStr, const char *xStr, const char *yStr)
+	try
+{
+	Fp::setModulo(pStr);
+	Fp x(xStr);
+	Fp y(yStr);
+
+	CYBOZU_BENCH("add", Fp::add, x, x, x);
+	CYBOZU_BENCH("sub", Fp::sub, x, x, y);
+	CYBOZU_BENCH("mul", Fp::mul, x, x, x);
+	CYBOZU_BENCH("square", Fp::square, x, x);
+	CYBOZU_BENCH("inv", x += y;Fp::inv, x, x); // avoid same jmp
+	CYBOZU_BENCH("div", x += y;Fp::div, x, y, x);
+	puts("");
+} catch (std::exception& e) {
+	printf("ERR %s\n", e.what());
+}
+
+// square 76clk@sandy
+CYBOZU_TEST_AUTO(bench3)
+{
+	const char *pStr = "0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d";
+	const char *xStr = "0x148094810948190412345678901234567900342423332197";
+	const char *yStr = "0x7fffffffffffffffffffffe26f2fc170f69466a74defd8d";
+	benchSub(pStr, xStr, yStr);
+}
+
+CYBOZU_TEST_AUTO(bench4)
+{
+	const char *pStr = "0x2523648240000001ba344d80000000086121000000000013a700000000000013";
+	const char *xStr = "0x1480948109481904123456789234234242423424201234567900342423332197";
+	const char *yStr = "0x151342342342341517fffffffffffffffffffffe26f2fc170f69466a74defd8d";
+	benchSub(pStr, xStr, yStr);
+}
+
+CYBOZU_TEST_AUTO(bench6)
+{
+	const char *pStr = "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff";
+	const char *xStr = "0x19481084109481094820948209482094820984290482212345678901234567900342308472047204720422423332197";
+	const char *yStr = "0x209348209481094820984209842094820948204204243123456789012345679003423084720472047204224233321972";
+	benchSub(pStr, xStr, yStr);
+}
+
+CYBOZU_TEST_AUTO(bench9)
+{
+	const char *pStr = "0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff";
+	const char *xStr = "0x2908209582095820941098410948109482094820984209840294829049240294242498540975555312345678901234567900342308472047204720422423332197";
+	const char *yStr = "0x3948384209834029834092384204920349820948205872380573205782385729385729385723985837ffffffffffffffffffffffe26f2fc170f69466a74defd8d";
+	benchSub(pStr, xStr, yStr);
+}
+#endif
diff --git a/test/fp_util_test.cpp b/test/fp_util_test.cpp
new file mode 100644
index 0000000..28d94ed
--- /dev/null
+++ b/test/fp_util_test.cpp
@@ -0,0 +1,191 @@
+#define PUT(x) std::cout << #x "=" << (x) << std::endl
+#include <mcl/fp_util.hpp>
+#include <cybozu/test.hpp>
+
+CYBOZU_TEST_AUTO(toStr16)
+{
+	const struct {
+		uint32_t x[4];
+		size_t n;
+		const char *str;
+	} tbl[] = {
+		{ { 0, 0, 0, 0 }, 0, "0" },
+		{ { 0x123, 0, 0, 0 }, 1, "123" },
+		{ { 0x12345678, 0xaabbcc, 0, 0 }, 2, "aabbcc12345678" },
+		{ { 0, 0x12, 0x234a, 0 }, 3, "234a0000001200000000" },
+		{ { 1, 2, 0xffffffff, 0x123abc }, 4, "123abcffffffff0000000200000001" },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		std::string str;
+		mcl::fp::toStr16(str, tbl[i].x, tbl[i].n, false);
+		CYBOZU_TEST_EQUAL(str, tbl[i].str);
+		mcl::fp::toStr16(str, tbl[i].x, tbl[i].n, true);
+		CYBOZU_TEST_EQUAL(str, std::string("0x") + tbl[i].str);
+	}
+}
+
+// CYBOZU_TEST_AUTO(toStr2) // QQQ
+// CYBOZU_TEST_AUTO(verifyStr) // QQQ
+
+CYBOZU_TEST_AUTO(fromStr16)
+{
+	const struct {
+		const char *str;
+		uint64_t x[4];
+	} tbl[] = {
+		{ "0", { 0, 0, 0, 0 } },
+		{ "5", { 5, 0, 0, 0 } },
+		{ "123", { 0x123, 0, 0, 0 } },
+		{ "123456789012345679adbc", { uint64_t(0x789012345679adbcull), 0x123456, 0, 0 } },
+		{ "ffffffff26f2fc170f69466a74defd8d", { uint64_t(0x0f69466a74defd8dull), uint64_t(0xffffffff26f2fc17ull), 0, 0 } },
+		{ "100000000000000000000000000000033", { uint64_t(0x0000000000000033ull), 0, 1, 0 } },
+		{ "11ee12312312940000000000000000000000000002342343", { uint64_t(0x0000000002342343ull), uint64_t(0x0000000000000000ull), uint64_t(0x11ee123123129400ull), 0 } },
+		{ "1234567890abcdefABCDEF123456789aba32134723424242424", { uint64_t(0x2134723424242424ull), uint64_t(0xDEF123456789aba3ull), uint64_t(0x4567890abcdefABCull), 0x123 } },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		const size_t xN = 4;
+		uint64_t x[xN];
+		mcl::fp::fromStr16(x, xN, tbl[i].str, strlen(tbl[i].str));
+		for (size_t j = 0; j < xN; j++) {
+			CYBOZU_TEST_EQUAL(x[j], tbl[i].x[j]);
+		}
+	}
+}
+
+CYBOZU_TEST_AUTO(compareArray)
+{
+	const struct {
+		uint32_t a[4];
+		uint32_t b[4];
+		size_t n;
+		int expect;
+	} tbl[] = {
+		{ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0 },
+		{ { 1, 0, 0, 0 }, { 0, 0, 0, 0 }, 1, 1 },
+		{ { 0, 0, 0, 0 }, { 1, 0, 0, 0 }, 1, -1 },
+		{ { 1, 0, 0, 0 }, { 1, 0, 0, 0 }, 1, 0 },
+		{ { 3, 1, 1, 0 }, { 2, 1, 1, 0 }, 4, 1 },
+		{ { 9, 2, 1, 1 }, { 1, 3, 1, 1 }, 4, -1 },
+		{ { 1, 7, 8, 4 }, { 1, 7, 8, 9 }, 3, 0 },
+		{ { 1, 7, 8, 4 }, { 1, 7, 8, 9 }, 4, -1 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		int e = mcl::fp::compareArray(tbl[i].a, tbl[i].b, tbl[i].n);
+		CYBOZU_TEST_EQUAL(e, tbl[i].expect);
+	}
+}
+
+struct Rand {
+	std::vector<uint32_t> v;
+	size_t pos;
+	int count;
+	void read(uint32_t *x, size_t n)
+	{
+		if (v.size() < pos + n) throw cybozu::Exception("Rand:get:bad n") << v.size() << pos << n;
+		std::copy(v.begin() + pos, v.begin() + pos + n, x);
+		pos += n;
+		count++;
+	}
+	Rand(const uint32_t *x, size_t n)
+		: pos(0)
+		, count(0)
+	{
+		for (size_t i = 0; i < n; i++) {
+			v.push_back(x[i]);
+		}
+	}
+};
+
+CYBOZU_TEST_AUTO(getRandVal)
+{
+	const size_t rn = 8;
+	const struct {
+		uint32_t r[rn];
+		uint32_t mod[2];
+		size_t bitLen;
+		int count;
+		uint32_t expect[2];
+	} tbl[] = {
+		{ { 1, 2, 3, 4, 5, 6, 7, 8 }, { 5, 6 }, 64, 1, { 1, 2 } },
+		{ { 0xfffffffc, 0x7, 3, 4, 5, 6, 7, 8 }, { 0xfffffffe, 0x3 }, 34, 1, { 0xfffffffc, 0x3 } },
+		{ { 0xfffffffc, 0x7, 3, 4, 5, 6, 7, 8 }, { 0xfffffffb, 0x3 }, 34, 2, { 3, 0 } },
+		{ { 2, 3, 5, 7, 4, 3, 0, 3 }, { 1, 0x3 }, 34, 4, { 0, 3 } },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		Rand rg(tbl[i].r, rn);
+		uint32_t out[2];
+		mcl::fp::getRandVal(out, rg, tbl[i].mod, tbl[i].bitLen);
+		CYBOZU_TEST_EQUAL(out[0], tbl[i].expect[0]);
+		CYBOZU_TEST_EQUAL(out[1], tbl[i].expect[1]);
+		CYBOZU_TEST_EQUAL(rg.count, tbl[i].count);
+	}
+}
+
+CYBOZU_TEST_AUTO(shiftLeftOr)
+{
+	const struct {
+		uint32_t x[4];
+		size_t n;
+		size_t shift;
+		uint32_t y;
+		uint32_t z[4];
+		uint32_t ret;
+	} tbl[] = {
+		{ { 0x12345678, 0, 0, 0 }, 1, 0, 0, { 0x12345678, 0, 0, 0 }, 0 },
+		{ { 0x12345678, 0, 0, 0 }, 1, 1, 0, { 0x2468acf0, 0, 0, 0 }, 0 },
+		{ { 0xf2345678, 0, 0, 0 }, 1, 1, 5, { 0xe468acf5, 0, 0, 0 }, 1 },
+		{ { 0x12345678, 0x9abcdef0, 0x11112222, 0xffccaaee }, 4, 19, 0x1234, { 0xb3c01234, 0xf78091a2, 0x1114d5e6, 0x57708889 }, 0x7fe65 },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		uint32_t z[4];
+		uint32_t ret = mcl::fp::shiftLeftOr(z, tbl[i].x, tbl[i].n, tbl[i].shift, tbl[i].y);
+		CYBOZU_TEST_EQUAL_ARRAY(z, tbl[i].z, tbl[i].n);
+		CYBOZU_TEST_EQUAL(ret, tbl[i].ret);
+	}
+}
+
+CYBOZU_TEST_AUTO(shiftRight)
+{
+	const struct {
+		uint32_t x[4];
+		size_t n;
+		size_t shift;
+		uint32_t z[4];
+	} tbl[] = {
+		{ { 0x12345678, 0, 0, 0 }, 4, 0, { 0x12345678, 0, 0, 0 } },
+		{ { 0x12345678, 0xaaaabbbb, 0xffeebbcc, 0xfeba9874 }, 4, 1, { 0x891a2b3c, 0x55555ddd, 0x7ff75de6, 0x7f5d4c3a } },
+		{ { 0x12345678, 0xaaaabbbb, 0xffeebbcc, 0xfeba9874 }, 4, 18, { 0xaeeec48d, 0xaef32aaa, 0xa61d3ffb, 0x3fae } },
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		uint32_t z[4];
+		mcl::fp::shiftRight(z, tbl[i].x, tbl[i].n, tbl[i].shift);
+		CYBOZU_TEST_EQUAL_ARRAY(z, tbl[i].z, tbl[i].n);
+	}
+}
+
+CYBOZU_TEST_AUTO(splitBitVec)
+{
+	uint32_t tbl[] = { 0x12345678, 0xaaaabbbb, 0xffeebbcc };
+	typedef cybozu::BitVectorT<uint32_t> BitVec;
+	typedef std::vector<int> IntVec;
+	BitVec bv;
+	bv.append(tbl, sizeof(tbl) * 8);
+	for (size_t len = bv.size(); len > 0; len--) {
+		bv.resize(len);
+		for (size_t w = 1; w < 18; w++) {
+			IntVec iv;
+			size_t last = mcl::fp::splitBitVec(iv, bv, w);
+			size_t q = len / w;
+			size_t r = len % w;
+			if (r == 0) {
+				r = w;
+			} else {
+				q++;
+			}
+			CYBOZU_TEST_EQUAL(iv.size(), q);
+			BitVec bv2;
+			mcl::fp::concatBitVec(bv2, iv, w, last);
+			CYBOZU_TEST_ASSERT(bv == bv2);
+		}
+	}
+}
diff --git a/test/mk32.sh b/test/mk32.sh
new file mode 100644
index 0000000..4d5f607
--- /dev/null
+++ b/test/mk32.sh
@@ -0,0 +1 @@
+g++ -O3 -march=native base_test.cpp ../src/x86.s -m32 -I ~/32/include/ -I ../include/ -I ../../xbyak/ -I ../../cybozulib/include ~/32/lib/libgmp.a ~/32/lib/libgmpxx.a -I ~/32/lib -DNDEBUG
diff --git a/test/mont_fp_test.cpp b/test/mont_fp_test.cpp
new file mode 100644
index 0000000..6e13e4d
--- /dev/null
+++ b/test/mont_fp_test.cpp
@@ -0,0 +1,809 @@
+#define PUT(x) std::cout << #x "=" << (x) << std::endl
+#include <cybozu/test.hpp>
+#include <cybozu/benchmark.hpp>
+#include <time.h>
+
+#define USE_MONT_FP
+#include <mcl/fp.hpp>
+typedef mcl::FpT<> Zn;
+typedef mcl::FpT<> MontFp3;
+typedef mcl::FpT<> MontFp4;
+typedef mcl::FpT<> MontFp6;
+typedef mcl::FpT<> MontFp9;
+
+struct Montgomery {
+	typedef mcl::Gmp::BlockType BlockType;
+	mpz_class p_;
+	mpz_class R_; // (1 << (pn_ * 64)) % p
+	mpz_class RR_; // (R * R) % p
+	BlockType pp_; // p * pp = -1 mod M = 1 << 64
+	size_t pn_;
+	Montgomery() {}
+	explicit Montgomery(const mpz_class& p)
+	{
+		p_ = p;
+		pp_ = mcl::montgomery::getCoff(mcl::Gmp::getBlock(p, 0));
+		pn_ = mcl::Gmp::getBlockSize(p);
+		R_ = 1;
+		R_ = (R_ << (pn_ * 64)) % p_;
+		RR_ = (R_ * R_) % p_;
+	}
+
+	void toMont(mpz_class& x) const { mul(x, x, RR_); }
+	void fromMont(mpz_class& x) const { mul(x, x, 1); }
+
+	void mul(mpz_class& z, const mpz_class& x, const mpz_class& y) const
+	{
+#if 0
+		const size_t ySize = mcl::Gmp::getBlockSize(y);
+		mpz_class c = x * mcl::Gmp::getBlock(y, 0);
+		BlockType q = mcl::Gmp::getBlock(c, 0) * pp_;
+		c += p_ * q;
+		c >>= sizeof(BlockType) * 8;
+		for (size_t i = 1; i < pn_; i++) {
+			if (i < ySize) {
+				c += x * mcl::Gmp::getBlock(y, i);
+			}
+			BlockType q = mcl::Gmp::getBlock(c, 0) * pp_;
+			c += p_ * q;
+			c >>= sizeof(BlockType) * 8;
+		}
+		if (c >= p_) {
+			c -= p_;
+		}
+		z = c;
+#else
+		z = x * y;
+		for (size_t i = 0; i < pn_; i++) {
+			BlockType q = mcl::Gmp::getBlock(z, 0) * pp_;
+			z += p_ * (mp_limb_t)q;
+			z >>= sizeof(BlockType) * 8;
+		}
+		if (z >= p_) {
+			z -= p_;
+		}
+#endif
+	}
+};
+
+template<class T>
+mpz_class toGmp(const T& x)
+{
+	std::string str = x.toStr();
+	mpz_class t;
+	mcl::Gmp::fromStr(t, str);
+	return t;
+}
+
+template<class T>
+std::string toStr(const T& x)
+{
+	std::ostringstream os;
+	os << x;
+	return os.str();
+}
+
+template<class T, class U>
+T castTo(const U& x)
+{
+	T t;
+	t.fromStr(toStr(x));
+	return t;
+}
+
+template<class T>
+void putRaw(const T& x)
+{
+	const uint64_t *p = x.getInnerValue();
+	for (size_t i = 0, n = T::BlockSize; i < n; i++) {
+		printf("%016llx", p[n - 1 - i]);
+	}
+	printf("\n");
+}
+
+template<size_t N>
+void put(const uint64_t (&x)[N])
+{
+	for (size_t i = 0; i < N; i++) {
+		printf("%016llx", x[N - 1 - i]);
+	}
+	printf("\n");
+}
+
+template<size_t N>
+struct Test {
+	typedef mcl::FpT<> Fp;
+	mpz_class m;
+	void run(const char *p)
+	{
+		Fp::setModulo(p);
+		m = p;
+		Zn::setModulo(p);
+		edge();
+		cstr();
+		toStr();
+		fromStr();
+		stream();
+		conv();
+		compare();
+		modulo();
+		ope();
+		cvtInt();
+		power();
+		neg_power();
+		power_Zn();
+		setRaw();
+		set64bit();
+		getRaw();
+		binaryExp();
+		bench();
+	}
+	void cstr()
+	{
+		const struct {
+			const char *str;
+			int val;
+		} tbl[] = {
+			{ "0", 0 },
+			{ "1", 1 },
+			{ "123", 123 },
+			{ "0x123", 0x123 },
+			{ "0b10101", 21 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			// string cstr
+			Fp x(tbl[i].str);
+			CYBOZU_TEST_EQUAL(x, tbl[i].val);
+
+			// int cstr
+			Fp y(tbl[i].val);
+			CYBOZU_TEST_EQUAL(y, x);
+
+			// copy cstr
+			Fp z(x);
+			CYBOZU_TEST_EQUAL(z, x);
+
+			// assign int
+			Fp w;
+			w = tbl[i].val;
+			CYBOZU_TEST_EQUAL(w, x);
+
+			// assign self
+			Fp u;
+			u = w;
+			CYBOZU_TEST_EQUAL(u, x);
+
+			// conv
+			std::ostringstream os;
+			os << tbl[i].val;
+
+			std::string str;
+			x.toStr(str);
+			CYBOZU_TEST_EQUAL(str, os.str());
+		}
+		const struct {
+			const char *str;
+			int val;
+		} tbl2[] = {
+			{ "-123", 123 },
+			{ "-0x123", 0x123 },
+			{ "-0b10101", 21 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl2); i++) {
+			Fp x(tbl2[i].str);
+			x = -x;
+			CYBOZU_TEST_EQUAL(x, tbl2[i].val);
+		}
+	}
+	void toStr()
+	{
+		Fp x(0);
+		std::string str;
+		str = x.toStr();
+		CYBOZU_TEST_EQUAL(str, "0");
+		str = x.toStr(2, true);
+		CYBOZU_TEST_EQUAL(str, "0");
+		str = x.toStr(2, false);
+		CYBOZU_TEST_EQUAL(str, "0");
+		str = x.toStr(16, true);
+		CYBOZU_TEST_EQUAL(str, "0");
+		str = x.toStr(16, false);
+		CYBOZU_TEST_EQUAL(str, "0");
+
+		x = 123;
+		str = x.toStr();
+		CYBOZU_TEST_EQUAL(str, "123");
+		str = x.toStr(2, true);
+		CYBOZU_TEST_EQUAL(str, "0b1111011");
+		str = x.toStr(2, false);
+		CYBOZU_TEST_EQUAL(str, "1111011");
+		str = x.toStr(16, true);
+		CYBOZU_TEST_EQUAL(str, "0x7b");
+		str = x.toStr(16, false);
+		CYBOZU_TEST_EQUAL(str, "7b");
+
+		{
+			std::ostringstream os;
+			os << x;
+			CYBOZU_TEST_EQUAL(os.str(), "123");
+		}
+		{
+			std::ostringstream os;
+			os << std::hex << std::showbase << x;
+			CYBOZU_TEST_EQUAL(os.str(), "0x7b");
+		}
+		{
+			std::ostringstream os;
+			os << std::hex << x;
+			CYBOZU_TEST_EQUAL(os.str(), "7b");
+		}
+	}
+
+	void fromStr()
+	{
+		const struct {
+			const char *in;
+			int out;
+			int base;
+		} tbl[] = {
+			{ "100", 100, 0 }, // set base = 10 if base = 0
+			{ "100", 4, 2 },
+			{ "100", 256, 16 },
+			{ "0b100", 4, 0 },
+			{ "0b100", 4, 2 },
+			{ "0x100", 256, 0 },
+			{ "0x100", 256, 16 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			Fp x;
+			x.fromStr(tbl[i].in, tbl[i].base);
+			CYBOZU_TEST_EQUAL(x, tbl[i].out);
+		}
+		// conflict prefix with base
+		Fp x;
+		CYBOZU_TEST_EXCEPTION(x.fromStr("0b100", 16), cybozu::Exception);
+		CYBOZU_TEST_EXCEPTION(x.fromStr("0x100", 2), cybozu::Exception);
+	}
+
+	void stream()
+	{
+		const struct {
+			const char *in;
+			int out10;
+			int out16;
+		} tbl[] = {
+			{ "100", 100, 256 }, // set base = 10 if base = 0
+			{ "0x100", 256, 256 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			{
+				std::istringstream is(tbl[i].in);
+				Fp x;
+				is >> x;
+				CYBOZU_TEST_EQUAL(x, tbl[i].out10);
+			}
+			{
+				std::istringstream is(tbl[i].in);
+				Fp x;
+				is >> std::hex >> x;
+				CYBOZU_TEST_EQUAL(x, tbl[i].out16);
+			}
+		}
+		std::istringstream is("0b100");
+		Fp x;
+		CYBOZU_TEST_EXCEPTION(is >> std::hex >> x, cybozu::Exception);
+	}
+	void edge()
+	{
+#if 0
+		std::cout << std::hex;
+		/*
+			real mont
+			   0    0
+			   1    R^-1
+			   R    1
+			  -1    -R^-1
+			  -R    -1
+		*/
+		mpz_class t = 1;
+		const mpz_class R = (t << (N * 64)) % m;
+		const mpz_class tbl[] = {
+			0, 1, R, m - 1, m - R
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			const mpz_class& x = tbl[i];
+			for (size_t j = i; j < CYBOZU_NUM_OF_ARRAY(tbl); j++) {
+				const mpz_class& y = tbl[j];
+				mpz_class z = (x * y) % m;
+				Fp xx, yy;
+				Fp::toMont(xx, x);
+				Fp::toMont(yy, y);
+				Fp zz = xx * yy;
+				mpz_class t;
+				Fp::fromMont(t, zz);
+				CYBOZU_TEST_EQUAL(z, t);
+			}
+		}
+		std::cout << std::dec;
+#endif
+	}
+
+	void conv()
+	{
+		const char *bin = "0b100100011010001010110011110001001000000010010001101000101011001111000100100000001001000110100010101100111100010010000";
+		const char *hex = "0x123456789012345678901234567890";
+		const char *dec = "94522879687365475552814062743484560";
+		Fp b(bin);
+		Fp h(hex);
+		Fp d(dec);
+		CYBOZU_TEST_EQUAL(b, h);
+		CYBOZU_TEST_EQUAL(b, d);
+
+		std::string str;
+		b.toStr(str, 2, true);
+		CYBOZU_TEST_EQUAL(str, bin);
+		b.toStr(str);
+		CYBOZU_TEST_EQUAL(str, dec);
+		b.toStr(str, 16, true);
+		CYBOZU_TEST_EQUAL(str, hex);
+	}
+
+	void compare()
+	{
+		const struct {
+			int lhs;
+			int rhs;
+			int cmp;
+		} tbl[] = {
+			{ 0, 0, 0 },
+			{ 1, 0, 1 },
+			{ 0, 1, -1 },
+			{ -1, 0, 1 }, // m-1, 0
+			{ 0, -1, -1 }, // 0, m-1
+			{ 123, 456, -1 },
+			{ 456, 123, 1 },
+			{ 5, 5, 0 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			const Fp x(tbl[i].lhs);
+			const Fp y(tbl[i].rhs);
+			const int cmp = tbl[i].cmp;
+			if (cmp == 0) {
+				CYBOZU_TEST_EQUAL(x, y);
+			} else {
+				CYBOZU_TEST_ASSERT(x != y);
+			}
+		}
+	}
+
+	void modulo()
+	{
+		std::ostringstream ms;
+		ms << m;
+
+		std::string str;
+		Fp::getModulo(str);
+		CYBOZU_TEST_EQUAL(str, ms.str());
+	}
+
+	void ope()
+	{
+		const struct {
+			Zn x;
+			Zn y;
+			Zn add; // x + y
+			Zn sub; // x - y
+			Zn mul; // x * y
+			Zn sqr; // x * x
+		} tbl[] = {
+			{ 0, 1, 1, -1, 0, 0 },
+			{ 9, 7, 16, 2, 63, 81 },
+			{ 10, 13, 23, -3, 130, 100 },
+			{ 2000, -1000, 1000, 3000, -2000000, 4000000 },
+			{ -12345, -9999, -(12345 + 9999), - 12345 + 9999, 12345 * 9999, 12345 * 12345 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			const Fp x(castTo<Fp>(tbl[i].x));
+			const Fp y(castTo<Fp>(tbl[i].y));
+			Fp z;
+			Fp::add(z, x, y);
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].add));
+			Fp::sub(z, x, y);
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].sub));
+			Fp::mul(z, x, y);
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].mul));
+
+			Fp r;
+			Fp::inv(r, y);
+			Zn rr = 1 / tbl[i].y;
+			CYBOZU_TEST_EQUAL(r, castTo<Fp>(rr));
+			Fp::mul(z, z, r);
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].x));
+
+			z = x + y;
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].add));
+			z = x - y;
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].sub));
+			z = x * y;
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].mul));
+			Fp::square(z, x);
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].sqr));
+
+			z = x / y;
+			z *= y;
+			CYBOZU_TEST_EQUAL(z, castTo<Fp>(tbl[i].x));
+		}
+	}
+	void cvtInt()
+	{
+#if 0
+		Fp x;
+		x = 12345;
+		uint64_t y = x.cvtInt();
+		CYBOZU_TEST_EQUAL(y, 12345u);
+		x.fromStr("123456789012342342342342342");
+		CYBOZU_TEST_EXCEPTION(x.cvtInt(), cybozu::Exception);
+		bool err = false;
+		CYBOZU_TEST_NO_EXCEPTION(x.cvtInt(&err));
+		CYBOZU_TEST_ASSERT(err);
+#endif
+	}
+
+	void power()
+	{
+		Fp x, y, z;
+		x = 12345;
+		z = 1;
+		for (int i = 0; i < 100; i++) {
+			Fp::power(y, x, i);
+			CYBOZU_TEST_EQUAL(y, z);
+			z *= x;
+		}
+	}
+
+	void neg_power()
+	{
+		Fp x, y, z;
+		x = 12345;
+		z = 1;
+		Fp rx = 1 / x;
+		for (int i = 0; i < 100; i++) {
+			Fp::power(y, x, -i);
+			CYBOZU_TEST_EQUAL(y, z);
+			z *= rx;
+		}
+	}
+
+	void power_Zn()
+	{
+		Fp x, y, z;
+		x = 12345;
+		z = 1;
+		for (int i = 0; i < 100; i++) {
+			Fp::power(y, x, Zn(i));
+			CYBOZU_TEST_EQUAL(y, z);
+			z *= x;
+		}
+	}
+
+	void setRaw()
+	{
+		// QQQ
+#if 0
+		char b1[] = { 0x56, 0x34, 0x12 };
+		Fp x;
+		x.setRaw(b1, 3);
+		CYBOZU_TEST_EQUAL(x, 0x123456);
+		int b2[] = { 0x12, 0x34 };
+		x.setRaw(b2, 2);
+		CYBOZU_TEST_EQUAL(x, Fp("0x3400000012"));
+#endif
+	}
+	void binaryExp()
+	{
+		puts("binaryExp");
+		for (int i = 2; i < 7; i++) {
+			mpz_class g = m / i;
+			Fp x, y;
+//			Fp::toMont(x, g);
+			x.fromGmp(g);
+			cybozu::BitVector bv;
+			x.appendToBitVec(bv);
+			uint64_t buf[N];
+			mcl::Gmp::getRaw(buf, N, g);
+			CYBOZU_TEST_EQUAL(bv.getBlockSize(), N);
+			CYBOZU_TEST_EQUAL(bv.size(), Fp::getModBitLen());
+			CYBOZU_TEST_EQUAL(bv.size(), Fp::getBitVecSize());
+			const uint64_t *p = bv.getBlock();
+			CYBOZU_TEST_EQUAL_ARRAY(p, buf, N);
+		}
+		const mpz_class yy("0x1255556666777788881111222233334444");
+		if (yy > m) {
+			return;
+		}
+		Fp y;
+//		Fp::toMont(y, yy);
+		y.fromGmp(yy);
+		uint64_t b1[N] = { uint64_t(0x1111222233334444ull), uint64_t(0x5555666677778888ull), 0x12 };
+		Fp x;
+		cybozu::BitVector bv;
+		bv.append(b1, Fp::getModBitLen());
+		x.fromBitVec(bv);
+		CYBOZU_TEST_EQUAL(x, y);
+		bv.clear();
+		x.appendToBitVec(bv);
+		const uint64_t *b2 = bv.getBlock();
+		CYBOZU_TEST_EQUAL_ARRAY(b1, b2, N);
+	}
+
+	void set64bit()
+	{
+		const struct {
+			const char *p;
+			uint64_t i;
+		} tbl[] = {
+			{ "0x1234567812345678", uint64_t(0x1234567812345678ull) },
+			{ "0xaaaaaaaaaaaaaaaa", uint64_t(0xaaaaaaaaaaaaaaaaull) },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			Fp x(tbl[i].p);
+			Fp y(tbl[i].i);
+			CYBOZU_TEST_EQUAL(x, y);
+		}
+	}
+
+	void getRaw()
+	{
+		const struct {
+			const char *s;
+			uint32_t v[4];
+			size_t vn;
+		} tbl[] = {
+			{ "0", { 0, 0, 0, 0 }, 1 },
+			{ "1234", { 1234, 0, 0, 0 }, 1 },
+			{ "0xaabbccdd12345678", { 0x12345678, 0xaabbccdd, 0, 0 }, 2 },
+			{ "0x11112222333344445555666677778888", { 0x77778888, 0x55556666, 0x33334444, 0x11112222 }, 4 },
+		};
+		for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+			mpz_class x(tbl[i].s);
+			const size_t bufN = 8;
+			uint32_t buf[bufN];
+			size_t n = mcl::Gmp::getRaw(buf, bufN, x);
+			CYBOZU_TEST_EQUAL(n, tbl[i].vn);
+			for (size_t j = 0; j < n; j++) {
+				CYBOZU_TEST_EQUAL(buf[j], tbl[i].v[j]);
+			}
+		}
+	}
+	void bench()
+	{
+		Fp x("-123456789");
+		Fp y("-0x7ffffffff");
+		CYBOZU_BENCH("add", operator+, x, x);
+		CYBOZU_BENCH("sub", operator-, x, y);
+		CYBOZU_BENCH("mul", operator*, x, x);
+		CYBOZU_BENCH("sqr", Fp::square, x, x);
+		CYBOZU_BENCH("div", y += x; operator/, x, y);
+	}
+};
+
+void customTest(const char *pStr, const char *xStr, const char *yStr)
+{
+#if 0
+	{
+		pStr = "0xfffffffffffffffffffffffffffffffffffffffeffffee37",
+		MontFp3::setModulo(pStr);
+		static uint64_t x[3] = { 1, 0, 0 };
+		uint64_t z[3];
+std::cout<<std::hex;
+		MontFp3::inv(*(MontFp3*)z, *(const MontFp3*)x);
+put(z);
+		exit(1);
+	}
+#endif
+#if 0
+	std::cout << std::hex;
+	uint64_t x[9] = { 0xff7fffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1ff };
+	uint64_t y[9] = { 0xff7fffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1ff };
+	uint64_t z1[9], z2[9];
+	MontFp9::setModulo(pStr);
+	MontFp9::fg_.mul_(z2, x, y);
+	put(z2);
+	{
+		puts("C");
+		mpz_class p(pStr);
+		Montgomery mont(p);
+		mpz_class xx, yy;
+		mcl::Gmp::setRaw(xx, x, CYBOZU_NUM_OF_ARRAY(x));
+		mcl::Gmp::setRaw(yy, y, CYBOZU_NUM_OF_ARRAY(y));
+		mpz_class z;
+		mont.mul(z, xx, yy);
+		std::cout << std::hex << z << std::endl;
+	}
+	exit(1);
+#else
+	std::string rOrg, rC, rAsm;
+	Zn::setModulo(pStr);
+	Zn s(xStr), t(yStr);
+	s *= t;
+	rOrg = toStr(s);
+	{
+		puts("C");
+		mpz_class p(pStr);
+		Montgomery mont(p);
+		mpz_class x(xStr), y(yStr);
+		mont.toMont(x);
+		mont.toMont(y);
+		mpz_class z;
+		mont.mul(z, x, y);
+		mont.fromMont(z);
+		rC = toStr(z);
+	}
+
+	puts("asm");
+	MontFp9::setModulo(pStr);
+	MontFp9 x(xStr), y(yStr);
+	x *= y;
+	rAsm = toStr(x);
+	CYBOZU_TEST_EQUAL(rOrg, rC);
+	CYBOZU_TEST_EQUAL(rOrg, rAsm);
+#endif
+}
+
+CYBOZU_TEST_AUTO(customTest)
+{
+	const struct {
+		const char *p;
+		const char *x;
+		const char *y;
+	} tbl[] = {
+		{
+			"0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+//			"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff",
+//			"0xfffffffffffffffffffffffffffffffffffffffeffffee37",
+			"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe",
+			"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe"
+		},
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		customTest(tbl[i].p, tbl[i].x, tbl[i].y);
+	}
+}
+
+CYBOZU_TEST_AUTO(test3)
+{
+	Test<3> test;
+	const char *tbl[] = {
+		"0x000000000000000100000000000000000000000000000033", // min prime
+		"0x00000000fffffffffffffffffffffffffffffffeffffac73",
+		"0x0000000100000000000000000001b8fa16dfab9aca16b6b3",
+		"0x000000010000000000000000000000000000000000000007",
+		"0x30000000000000000000000000000000000000000000002b",
+		"0x70000000000000000000000000000000000000000000001f",
+		"0x800000000000000000000000000000000000000000000005",
+		"0xfffffffffffffffffffffffffffffffffffffffeffffee37",
+		"0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d",
+		"0xffffffffffffffffffffffffffffffffffffffffffffff13", // max prime
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		printf("prime=%s\n", tbl[i]);
+		test.run(tbl[i]);
+	}
+}
+
+CYBOZU_TEST_AUTO(test4)
+{
+	Test<4> test;
+	const char *tbl[] = {
+		"0x0000000000000001000000000000000000000000000000000000000000000085", // min prime
+		"0x2523648240000001ba344d80000000086121000000000013a700000000000013",
+		"0x7523648240000001ba344d80000000086121000000000013a700000000000017",
+		"0x800000000000000000000000000000000000000000000000000000000000005f",
+		"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff43", // max prime
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		printf("prime=%s\n", tbl[i]);
+		test.run(tbl[i]);
+	}
+}
+
+CYBOZU_TEST_AUTO(test6)
+{
+	Test<6> test;
+	const char *tbl[] = {
+		"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff",
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		printf("prime=%s\n", tbl[i]);
+		test.run(tbl[i]);
+	}
+}
+
+CYBOZU_TEST_AUTO(test9)
+{
+	Test<9> test;
+	const char *tbl[] = {
+		"0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		printf("prime=%s\n", tbl[i]);
+		test.run(tbl[i]);
+	}
+}
+
+CYBOZU_TEST_AUTO(toStr16)
+{
+	const char *tbl[] = {
+		"0x0",
+		"0x5",
+		"0x123",
+		"0x123456789012345679adbc",
+		"0xffffffff26f2fc170f69466a74defd8d",
+		"0x100000000000000000000000000000033",
+		"0x11ee12312312940000000000000000000000000002342343"
+	};
+	MontFp3::setModulo("0xffffffffffffffffffffffffffffffffffffffffffffff13");
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		std::string str, str2;
+		MontFp3 x(tbl[i]);
+		x.toStr(str, 16);
+		mpz_class y(tbl[i]);
+		mcl::Gmp::toStr(str2, y, 16);
+		CYBOZU_TEST_EQUAL(str, str2);
+	}
+}
+
+#if 0
+CYBOZU_TEST_AUTO(toStr16bench)
+{
+	const char *tbl[] = {
+		"0x0",
+		"0x5",
+		"0x123",
+		"0x123456789012345679adbc",
+		"0xffffffff26f2fc170f69466a74defd8d",
+		"0x100000000000000000000000000000033",
+		"0x11ee12312312940000000000000000000000000002342343"
+	};
+	const int C = 500000;
+	MontFp3::setModulo("0xffffffffffffffffffffffffffffffffffffffffffffff13");
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		std::string str, str2;
+		MontFp3 x(tbl[i]);
+		CYBOZU_BENCH_C("Mont:toStr", C, x.toStr, str, 16);
+		mpz_class y(tbl[i]);
+		CYBOZU_BENCH_C("Gmp:toStr ", C, mcl::Gmp::toStr, str2, y, 16);
+		str2.insert(0, "0x");
+		CYBOZU_TEST_EQUAL(str, str2);
+	}
+}
+
+CYBOZU_TEST_AUTO(fromStr16bench)
+{
+	const char *tbl[] = {
+		"0x0",
+		"0x5",
+		"0x123",
+		"0x123456789012345679adbc",
+		"0xffffffff26f2fc170f69466a74defd8d",
+		"0x100000000000000000000000000000033",
+		"0x11ee12312312940000000000000000000000000002342343"
+	};
+	const int C = 500000;
+	MontFp3::setModulo("0xffffffffffffffffffffffffffffffffffffffffffffff13");
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		std::string str = tbl[i];
+		MontFp3 x;
+		CYBOZU_BENCH_C("Mont:fromStr", C, x.fromStr, str);
+
+		mpz_class y;
+		str.erase(0, 2);
+		CYBOZU_BENCH_C("Gmp:fromStr ", C, mcl::Gmp::fromStr, y, str, 16);
+		x.toStr(str, 16);
+		std::string str2;
+		mcl::Gmp::toStr(str2, y, 16);
+		str2.insert(0, "0x");
+		CYBOZU_TEST_EQUAL(str, str2);
+	}
+}
+#endif
diff --git a/test/proj/ec_test/ec_test.vcxproj b/test/proj/ec_test/ec_test.vcxproj
new file mode 100644
index 0000000..b141754
--- /dev/null
+++ b/test/proj/ec_test/ec_test.vcxproj
@@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{46B6E88E-739A-406B-9F68-BC46C5950FA3}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>ec_test</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(SolutionDir)common.props" />
+    <Import Project="$(SolutionDir)debug.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(SolutionDir)common.props" />
+    <Import Project="$(SolutionDir)release.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="$(SolutionDir)test\ec_test.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
+\ No newline at end of file
diff --git a/test/proj/fp_test/fp_test.vcxproj b/test/proj/fp_test/fp_test.vcxproj
new file mode 100644
index 0000000..a77dc21
--- /dev/null
+++ b/test/proj/fp_test/fp_test.vcxproj
@@ -0,0 +1,91 @@
+����<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{51266DE6-B57B-4AE3-B85C-282F170E1728}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>fp_test</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(SolutionDir)common.props" />
+    <Import Project="$(SolutionDir)debug.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(SolutionDir)common.props" />
+    <Import Project="$(SolutionDir)release.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)../xbyak/;$(SolutionDir)../cybozulib/include;$(SolutionDir)../cybozulib_ext/mpir/include;$(SolutionDir)include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)../xbyak/;$(SolutionDir)../cybozulib/include;$(SolutionDir)../cybozulib_ext/mpir/include;$(SolutionDir)include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="$(SolutionDir)test\\fp_test.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
+\ No newline at end of file
diff --git a/test/sq_test.cpp b/test/sq_test.cpp
new file mode 100644
index 0000000..6174be6
--- /dev/null
+++ b/test/sq_test.cpp
@@ -0,0 +1,21 @@
+#include <mcl/gmp_util.hpp>
+#include <cybozu/test.hpp>
+#include <iostream>
+
+CYBOZU_TEST_AUTO(sqrt)
+{
+	const int tbl[] = { 3, 5, 7, 11, 13, 17, 19, 257, 997, 1031 };
+	mcl::SquareRoot sq;
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		const mpz_class p = tbl[i];
+		sq.set(p);
+		for (mpz_class a = 1; a < p; a++) {
+			mpz_class x;
+			if (sq.get(x, a)) {
+				mpz_class y;
+				y = (x * x) % p;
+				CYBOZU_TEST_EQUAL(a, y);
+			}
+		}
+	}
+}
author	MITSUNARI Shigeo <herumi@nifty.com>	2015-05-05 09:52:44 +0800
committer	MITSUNARI Shigeo <herumi@nifty.com>	2015-05-05 09:52:44 +0800
commit	14fd1d125d21f47c2539a0e820e2a417e0c715f8 (patch)
tree	4fccb395f51a3f9c1202a783665ce396b649eeb9
download	tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.tar.gz tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.tar.zst tangerine-mcl-14fd1d125d21f47c2539a0e820e2a417e0c715f8.zip