Support builds with and without BMI2

author: MITSUNARI Shigeo <herumi@nifty.com> 2016-12-28 15:46:50 +0800
committer: MITSUNARI Shigeo <herumi@nifty.com> 2016-12-28 15:46:50 +0800
commit: 374acb1577a1c85f2d440354e481671bab1e1bff (patch)
tree: 33d3af189c55e3240caf60d13b591c3f75705d55
parent: ce7dd1e830e4560deb99fa656524abb00447c346 (diff)
download: tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.gz
tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.zst
tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.zip
6 files changed, 53 insertions, 28 deletions
diff --git a/Makefile b/Makefile
index aceecd1..05aacfe 100644
--- a/Makefile
+++ b/Makefile
@@ -50,13 +50,17 @@ LLVM_FLAGS+=-pre-RA-sched=list-ilp -max-sched-reorder=128
 
 HAS_BMI2=$(shell cat "/proc/cpuinfo" | grep bmi2 >/dev/null && echo "1")
 ifeq ($(HAS_BMI2),1)
-  LLVM_FLAGS+=-mattr=bmi2
+#  LLVM_FLAGS+=-mattr=bmi2
 endif
 
 ifeq ($(USE_LOW_ASM),1)
   LOW_ASM_OBJ=$(LOW_ASM_SRC:.asm=.o)
   LIB_OBJ+=$(LOW_ASM_OBJ)
 endif
+# special case for intel with bmi2
+ifeq ($(INTEL),1)
+  LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o
+endif
 
 $(MCL_LIB): $(LIB_OBJ)
 	$(AR) $@ $(LIB_OBJ)
@@ -70,6 +74,15 @@ $(ASM_SRC): $(LLVM_SRC)
 $(LLVM_SRC): $(GEN_EXE) $(FUNC_LIST)
 	$(GEN_EXE) -f $(FUNC_LIST) > $@
 
+$(OBJ_DIR)/$(CPU).bmi2.o: src/$(CPU).bmi2.s
+	$(PRE)$(CXX) -c $< -o $@ $(CFLAGS)
+
+src/$(CPU).bmi2.s: src/base$(BIT).bmi2.ll
+	$(LLVM_OPT) -O3 -o - $< -march=$(CPU) | $(LLVM_LLC) -O3 -o $@ $(LLVM_FLAGS) -mattr=bmi2
+
+src/base$(BIT).bmi2.ll: $(GEN_EXE)
+	$(GEN_EXE) -f $(FUNC_LIST) -s bmi2 > $@
+
 $(FUNC_LIST): $(LOW_ASM_SRC)
 ifeq ($(USE_LOW_ASM),1)
 	$(shell awk '/global/ { print $$2}' $(LOW_ASM_SRC) > $(FUNC_LIST))
@@ -109,7 +122,7 @@ test: $(TEST_EXE)
 	@grep -v "ng=0, exception=0" result.txt || echo "all unit tests are ok"
 
 clean:
-	$(RM) $(MCL_LIB) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_SRC) $(ASM_OBJ) $(LIB_OBJ) $(LLVM_SRC) $(FUNC_LIST)
+	$(RM) $(MCL_LIB) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_SRC) $(ASM_OBJ) $(LIB_OBJ) $(LLVM_SRC) $(FUNC_LIST) src/*.ll src/*.s
 
 ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC)
 DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(ALL_SRC:.cpp=.d))
diff --git a/src/fp.cpp b/src/fp.cpp
index aab1945..068abaa 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -231,7 +231,7 @@ void setOp(Op& op, Mode mode)
 	setOp2<N, Gtag, true>(op);
 #ifdef MCL_USE_LLVM
 	if (mode != fp::FP_GMP && mode != fp::FP_GMP_MONT) {
-		setOp2<N, Ltag, (N * UnitBitSize <= 256)>(op);
+		setOp2<N, LBMI2tag, (N * UnitBitSize <= 256)>(op);
 	}
 #else
 	(void)mode;
diff --git a/src/gen.cpp b/src/gen.cpp
index 61edc15..189860f 100644
--- a/src/gen.cpp
+++ b/src/gen.cpp
@@ -149,7 +149,7 @@ struct Code : public mcl::Generator {
 		Operand _0 = makeImm(64, 0);
 		Operand _1 = makeImm(64, 1);
 		Operand _2 = makeImm(64, 2);
-		makeNIST_P192 = Function("makeNIST_P192L", p);
+		makeNIST_P192 = Function("makeNIST_P192L" + suf, p);
 		verifyAndSetPrivate(makeNIST_P192);
 		beginFunc(makeNIST_P192);
 		p0 = sub(_0, _1);
@@ -188,7 +188,7 @@ struct Code : public mcl::Generator {
 		resetGlobalIdx();
 		Operand out(IntPtr, unit);
 		Operand px(IntPtr, unit);
-		mcl_fpDbl_mod_NIST_P192 = Function("mcl_fpDbl_mod_NIST_P192L", Void, out, px);
+		mcl_fpDbl_mod_NIST_P192 = Function("mcl_fpDbl_mod_NIST_P192L" + suf, Void, out, px);
 		verifyAndSetPrivate(mcl_fpDbl_mod_NIST_P192);
 		beginFunc(mcl_fpDbl_mod_NIST_P192);
 
@@ -247,7 +247,7 @@ struct Code : public mcl::Generator {
 		const size_t mask = -(1 << rem);
 		const Operand py(IntPtr, unit);
 		const Operand px(IntPtr, unit);
-		Function f("mcl_fpDbl_mod_NIST_P521L", Void, py, px);
+		Function f("mcl_fpDbl_mod_NIST_P521L" + suf, Void, py, px);
 		verifyAndSetPrivate(f);
 		beginFunc(f);
 		Operand x = loadN(px, n * 2 + 1);
@@ -286,7 +286,7 @@ struct Code : public mcl::Generator {
 		resetGlobalIdx();
 		Operand py(IntPtr, unit);
 		Operand px(IntPtr, unit);
-		mcl_fp_sqr_NIST_P192 = Function("mcl_fp_sqr_NIST_P192L", Void, py, px);
+		mcl_fp_sqr_NIST_P192 = Function("mcl_fp_sqr_NIST_P192L" + suf, Void, py, px);
 		verifyAndSetPrivate(mcl_fp_sqr_NIST_P192);
 		beginFunc(mcl_fp_sqr_NIST_P192);
 		Operand buf = _alloca(unit, 192 * 2 / unit);
@@ -303,7 +303,7 @@ struct Code : public mcl::Generator {
 		Operand pz(IntPtr, unit);
 		Operand px(IntPtr, unit);
 		Operand py(IntPtr, unit);
-		Function f("mcl_fp_mulNIST_P192L", Void, pz, px, py);
+		Function f("mcl_fp_mulNIST_P192L" + suf, Void, pz, px, py);
 		verifyAndSetPrivate(f);
 		beginFunc(f);
 		Operand buf = _alloca(unit, 192 * 2 / unit);
@@ -603,6 +603,7 @@ struct Code : public mcl::Generator {
 		Operand y(Int, unit);
 		std::string name = "mulPv" + cybozu::itoa(bit) + "x" + cybozu::itoa(unit);
 		mulPvM[bit] = Function(name, z, px, y);
+		mulPvM[bit].setPrivate();
 		verifyAndSetPrivate(mulPvM[bit]);
 		beginFunc(mulPvM[bit]);
 		OperandVec L(N), H(N);
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 1cab20e..8684131 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -19,11 +19,13 @@ namespace mcl { namespace fp {
 
 struct Gtag; // GMP
 struct Ltag; // LLVM
+struct LBMI2tag; // LLVM with Intel BMI2 instruction
 struct Atag; // asm
 
 template<class Tag> struct TagToStr { };
 template<> struct TagToStr<Gtag> { static const char *f() { return "Gtag"; } };
 template<> struct TagToStr<Ltag> { static const char *f() { return "Ltag"; } };
+template<> struct TagToStr<LBMI2tag> { static const char *f() { return "LBMI2tag"; } };
 template<> struct TagToStr<Atag> { static const char *f() { return "Atag"; } };
 
 template<size_t N>
diff --git a/src/low_func_llvm.hpp b/src/low_func_llvm.hpp
index 98f4700..a02001a 100644
--- a/src/low_func_llvm.hpp
+++ b/src/low_func_llvm.hpp
@@ -18,28 +18,37 @@ struct EnableKaratsuba<Ltag> {
 #endif
 
 #ifdef MCL_GMP_IS_FASTER_THAN_LLVM
-#define MCL_DEF_MUL(n)
+#define MCL_DEF_MUL(n, tag, suf)
 #else
-#define MCL_DEF_MUL(n) \
-template<>const void3u MulPreCore<n, Ltag>::f = &mcl_fpDbl_mulPre ## n ## L; \
-template<>const void2u SqrPreCore<n, Ltag>::f = &mcl_fpDbl_sqrPre ## n ## L;
+#define MCL_DEF_MUL(n, tag, suf) \
+template<>const void3u MulPreCore<n, tag>::f = &mcl_fpDbl_mulPre ## n ## suf; \
+template<>const void2u SqrPreCore<n, tag>::f = &mcl_fpDbl_sqrPre ## n ## suf;
 #endif
 
+#define MCL_DEF_LLVM_FUNC2(n, tag, suf) \
+template<>const u3u AddPre<n, tag>::f = &mcl_fp_addPre ## n ## suf; \
+template<>const u3u SubPre<n, tag>::f = &mcl_fp_subPre ## n ## suf; \
+template<>const void2u Shr1<n, tag>::f = &mcl_fp_shr1_ ## n ## suf; \
+MCL_DEF_MUL(n, tag, suf) \
+template<>const void2uI MulUnitPre<n, tag>::f = &mcl_fp_mulUnitPre ## n ## suf; \
+template<>const void4u Add<n, true, tag>::f = &mcl_fp_add ## n ## suf; \
+template<>const void4u Add<n, false, tag>::f = &mcl_fp_addNF ## n ## suf; \
+template<>const void4u Sub<n, true, tag>::f = &mcl_fp_sub ## n ## suf; \
+template<>const void4u Sub<n, false, tag>::f = &mcl_fp_subNF ## n ## suf; \
+template<>const void4u Mont<n, true, tag>::f = &mcl_fp_mont ## n ## suf; \
+template<>const void4u Mont<n, false, tag>::f = &mcl_fp_montNF ## n ## suf; \
+template<>const void3u MontRed<n, tag>::f = &mcl_fp_montRed ## n ## suf; \
+template<>const void4u DblAdd<n, tag>::f = &mcl_fpDbl_add ## n ## suf; \
+template<>const void4u DblSub<n, tag>::f = &mcl_fpDbl_sub ## n ## suf; \
+
+#if CYBOZU_HOST == CYBOZU_HOST_INTEL
+#define MCL_DEF_LLVM_FUNC(n) \
+	MCL_DEF_LLVM_FUNC2(n, Ltag, L) \
+	MCL_DEF_LLVM_FUNC2(n, LBMI2tag, Lbmi2)
+#else
 #define MCL_DEF_LLVM_FUNC(n) \
-template<>const u3u AddPre<n, Ltag>::f = &mcl_fp_addPre ## n ## L; \
-template<>const u3u SubPre<n, Ltag>::f = &mcl_fp_subPre ## n ## L; \
-template<>const void2u Shr1<n, Ltag>::f = &mcl_fp_shr1_ ## n ## L; \
-MCL_DEF_MUL(n) \
-template<>const void2uI MulUnitPre<n, Ltag>::f = &mcl_fp_mulUnitPre ## n ## L; \
-template<>const void4u Add<n, true, Ltag>::f = &mcl_fp_add ## n ## L; \
-template<>const void4u Add<n, false, Ltag>::f = &mcl_fp_addNF ## n ## L; \
-template<>const void4u Sub<n, true, Ltag>::f = &mcl_fp_sub ## n ## L; \
-template<>const void4u Sub<n, false, Ltag>::f = &mcl_fp_subNF ## n ## L; \
-template<>const void4u Mont<n, true, Ltag>::f = &mcl_fp_mont ## n ## L; \
-template<>const void4u Mont<n, false, Ltag>::f = &mcl_fp_montNF ## n ## L; \
-template<>const void3u MontRed<n, Ltag>::f = &mcl_fp_montRed ## n ## L; \
-template<>const void4u DblAdd<n, Ltag>::f = &mcl_fpDbl_add ## n ## L; \
-template<>const void4u DblSub<n, Ltag>::f = &mcl_fpDbl_sub ## n ## L; \
+	MCL_DEF_LLVM_FUNC2(n, Ltag, L)
+#endif
 
 MCL_DEF_LLVM_FUNC(1)
 MCL_DEF_LLVM_FUNC(2)
diff --git a/src/proto.hpp b/src/proto.hpp
index 18e8567..0085f23 100644
--- a/src/proto.hpp
+++ b/src/proto.hpp
@@ -27,7 +27,7 @@ void mcl_fpDbl_sub ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const m
 
 #define MCL_FP_DEF_FUNC(n) \
 	MCL_FP_DEF_FUNC_SUB(n, L) \
-	MCL_FP_DEF_FUNC_SUB(n, A)
+	MCL_FP_DEF_FUNC_SUB(n, Lbmi2)
 
 #define MCL_FP_DEF_FUNC_SPECIAL(suf) \
 void mcl_fpDbl_mod_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */); \
@@ -60,7 +60,7 @@ MCL_FP_DEF_FUNC(17)
 #endif
 
 MCL_FP_DEF_FUNC_SPECIAL(L)
-MCL_FP_DEF_FUNC_SPECIAL(A)
+MCL_FP_DEF_FUNC_SPECIAL(Lbmi2)
 
 }
author	MITSUNARI Shigeo <herumi@nifty.com>	2016-12-28 15:46:50 +0800
committer	MITSUNARI Shigeo <herumi@nifty.com>	2016-12-28 15:46:50 +0800
commit	374acb1577a1c85f2d440354e481671bab1e1bff (patch)
tree	33d3af189c55e3240caf60d13b591c3f75705d55
parent	ce7dd1e830e4560deb99fa656524abb00447c346 (diff)
download	tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.gz tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.zst tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.zip