aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author MITSUNARI Shigeo <herumi@nifty.com> 2016-12-28 15:46:50 +0800
committer MITSUNARI Shigeo <herumi@nifty.com> 2016-12-28 15:46:50 +0800
commit 374acb1577a1c85f2d440354e481671bab1e1bff (patch)
tree 33d3af189c55e3240caf60d13b591c3f75705d55
parent ce7dd1e830e4560deb99fa656524abb00447c346 (diff)
download tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.gz
tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.tar.zst
tangerine-mcl-374acb1577a1c85f2d440354e481671bab1e1bff.zip
support building with and without BMI2
-rw-r--r-- Makefile 17
-rw-r--r-- src/fp.cpp 2
-rw-r--r-- src/gen.cpp 11
-rw-r--r-- src/low_func.hpp 2
-rw-r--r-- src/low_func_llvm.hpp 45
-rw-r--r-- src/proto.hpp 4
6 files changed, 53 insertions, 28 deletions
diff --git a/Makefile b/Makefile
index aceecd1..05aacfe 100644
--- a/Makefile
+++ b/Makefile
@@ -50,13 +50,17 @@ LLVM_FLAGS+=-pre-RA-sched=list-ilp -max-sched-reorder=128
HAS_BMI2=$(shell cat "/proc/cpuinfo" | grep bmi2 >/dev/null && echo "1")
ifeq ($(HAS_BMI2),1)
- LLVM_FLAGS+=-mattr=bmi2
+# LLVM_FLAGS+=-mattr=bmi2
endif
ifeq ($(USE_LOW_ASM),1)
LOW_ASM_OBJ=$(LOW_ASM_SRC:.asm=.o)
LIB_OBJ+=$(LOW_ASM_OBJ)
endif
+# special case for intel with bmi2
+ifeq ($(INTEL),1)
+ LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o
+endif
$(MCL_LIB): $(LIB_OBJ)
$(AR) $@ $(LIB_OBJ)
@@ -70,6 +74,15 @@ $(ASM_SRC): $(LLVM_SRC)
$(LLVM_SRC): $(GEN_EXE) $(FUNC_LIST)
$(GEN_EXE) -f $(FUNC_LIST) > $@
+$(OBJ_DIR)/$(CPU).bmi2.o: src/$(CPU).bmi2.s
+ $(PRE)$(CXX) -c $< -o $@ $(CFLAGS)
+
+src/$(CPU).bmi2.s: src/base$(BIT).bmi2.ll
+ $(LLVM_OPT) -O3 -o - $< -march=$(CPU) | $(LLVM_LLC) -O3 -o $@ $(LLVM_FLAGS) -mattr=bmi2
+
+src/base$(BIT).bmi2.ll: $(GEN_EXE)
+ $(GEN_EXE) -f $(FUNC_LIST) -s bmi2 > $@
+
$(FUNC_LIST): $(LOW_ASM_SRC)
ifeq ($(USE_LOW_ASM),1)
$(shell awk '/global/ { print $$2}' $(LOW_ASM_SRC) > $(FUNC_LIST))
@@ -109,7 +122,7 @@ test: $(TEST_EXE)
@grep -v "ng=0, exception=0" result.txt || echo "all unit tests are ok"
clean:
- $(RM) $(MCL_LIB) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_SRC) $(ASM_OBJ) $(LIB_OBJ) $(LLVM_SRC) $(FUNC_LIST)
+ $(RM) $(MCL_LIB) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_SRC) $(ASM_OBJ) $(LIB_OBJ) $(LLVM_SRC) $(FUNC_LIST) src/*.ll src/*.s
ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC)
DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(ALL_SRC:.cpp=.d))
diff --git a/src/fp.cpp b/src/fp.cpp
index aab1945..068abaa 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -231,7 +231,7 @@ void setOp(Op& op, Mode mode)
setOp2<N, Gtag, true>(op);
#ifdef MCL_USE_LLVM
if (mode != fp::FP_GMP && mode != fp::FP_GMP_MONT) {
- setOp2<N, Ltag, (N * UnitBitSize <= 256)>(op);
+ setOp2<N, LBMI2tag, (N * UnitBitSize <= 256)>(op);
}
#else
(void)mode;
diff --git a/src/gen.cpp b/src/gen.cpp
index 61edc15..189860f 100644
--- a/src/gen.cpp
+++ b/src/gen.cpp
@@ -149,7 +149,7 @@ struct Code : public mcl::Generator {
Operand _0 = makeImm(64, 0);
Operand _1 = makeImm(64, 1);
Operand _2 = makeImm(64, 2);
- makeNIST_P192 = Function("makeNIST_P192L", p);
+ makeNIST_P192 = Function("makeNIST_P192L" + suf, p);
verifyAndSetPrivate(makeNIST_P192);
beginFunc(makeNIST_P192);
p0 = sub(_0, _1);
@@ -188,7 +188,7 @@ struct Code : public mcl::Generator {
resetGlobalIdx();
Operand out(IntPtr, unit);
Operand px(IntPtr, unit);
- mcl_fpDbl_mod_NIST_P192 = Function("mcl_fpDbl_mod_NIST_P192L", Void, out, px);
+ mcl_fpDbl_mod_NIST_P192 = Function("mcl_fpDbl_mod_NIST_P192L" + suf, Void, out, px);
verifyAndSetPrivate(mcl_fpDbl_mod_NIST_P192);
beginFunc(mcl_fpDbl_mod_NIST_P192);
@@ -247,7 +247,7 @@ struct Code : public mcl::Generator {
const size_t mask = -(1 << rem);
const Operand py(IntPtr, unit);
const Operand px(IntPtr, unit);
- Function f("mcl_fpDbl_mod_NIST_P521L", Void, py, px);
+ Function f("mcl_fpDbl_mod_NIST_P521L" + suf, Void, py, px);
verifyAndSetPrivate(f);
beginFunc(f);
Operand x = loadN(px, n * 2 + 1);
@@ -286,7 +286,7 @@ struct Code : public mcl::Generator {
resetGlobalIdx();
Operand py(IntPtr, unit);
Operand px(IntPtr, unit);
- mcl_fp_sqr_NIST_P192 = Function("mcl_fp_sqr_NIST_P192L", Void, py, px);
+ mcl_fp_sqr_NIST_P192 = Function("mcl_fp_sqr_NIST_P192L" + suf, Void, py, px);
verifyAndSetPrivate(mcl_fp_sqr_NIST_P192);
beginFunc(mcl_fp_sqr_NIST_P192);
Operand buf = _alloca(unit, 192 * 2 / unit);
@@ -303,7 +303,7 @@ struct Code : public mcl::Generator {
Operand pz(IntPtr, unit);
Operand px(IntPtr, unit);
Operand py(IntPtr, unit);
- Function f("mcl_fp_mulNIST_P192L", Void, pz, px, py);
+ Function f("mcl_fp_mulNIST_P192L" + suf, Void, pz, px, py);
verifyAndSetPrivate(f);
beginFunc(f);
Operand buf = _alloca(unit, 192 * 2 / unit);
@@ -603,6 +603,7 @@ struct Code : public mcl::Generator {
Operand y(Int, unit);
std::string name = "mulPv" + cybozu::itoa(bit) + "x" + cybozu::itoa(unit);
mulPvM[bit] = Function(name, z, px, y);
+ mulPvM[bit].setPrivate();
verifyAndSetPrivate(mulPvM[bit]);
beginFunc(mulPvM[bit]);
OperandVec L(N), H(N);
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 1cab20e..8684131 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -19,11 +19,13 @@ namespace mcl { namespace fp {
struct Gtag; // GMP
struct Ltag; // LLVM
+struct LBMI2tag; // LLVM with Intel BMI2 instruction
struct Atag; // asm
template<class Tag> struct TagToStr { };
template<> struct TagToStr<Gtag> { static const char *f() { return "Gtag"; } };
template<> struct TagToStr<Ltag> { static const char *f() { return "Ltag"; } };
+template<> struct TagToStr<LBMI2tag> { static const char *f() { return "LBMI2tag"; } };
template<> struct TagToStr<Atag> { static const char *f() { return "Atag"; } };
template<size_t N>
diff --git a/src/low_func_llvm.hpp b/src/low_func_llvm.hpp
index 98f4700..a02001a 100644
--- a/src/low_func_llvm.hpp
+++ b/src/low_func_llvm.hpp
@@ -18,28 +18,37 @@ struct EnableKaratsuba<Ltag> {
#endif
#ifdef MCL_GMP_IS_FASTER_THAN_LLVM
-#define MCL_DEF_MUL(n)
+#define MCL_DEF_MUL(n, tag, suf)
#else
-#define MCL_DEF_MUL(n) \
-template<>const void3u MulPreCore<n, Ltag>::f = &mcl_fpDbl_mulPre ## n ## L; \
-template<>const void2u SqrPreCore<n, Ltag>::f = &mcl_fpDbl_sqrPre ## n ## L;
+#define MCL_DEF_MUL(n, tag, suf) \
+template<>const void3u MulPreCore<n, tag>::f = &mcl_fpDbl_mulPre ## n ## suf; \
+template<>const void2u SqrPreCore<n, tag>::f = &mcl_fpDbl_sqrPre ## n ## suf;
#endif
+#define MCL_DEF_LLVM_FUNC2(n, tag, suf) \
+template<>const u3u AddPre<n, tag>::f = &mcl_fp_addPre ## n ## suf; \
+template<>const u3u SubPre<n, tag>::f = &mcl_fp_subPre ## n ## suf; \
+template<>const void2u Shr1<n, tag>::f = &mcl_fp_shr1_ ## n ## suf; \
+MCL_DEF_MUL(n, tag, suf) \
+template<>const void2uI MulUnitPre<n, tag>::f = &mcl_fp_mulUnitPre ## n ## suf; \
+template<>const void4u Add<n, true, tag>::f = &mcl_fp_add ## n ## suf; \
+template<>const void4u Add<n, false, tag>::f = &mcl_fp_addNF ## n ## suf; \
+template<>const void4u Sub<n, true, tag>::f = &mcl_fp_sub ## n ## suf; \
+template<>const void4u Sub<n, false, tag>::f = &mcl_fp_subNF ## n ## suf; \
+template<>const void4u Mont<n, true, tag>::f = &mcl_fp_mont ## n ## suf; \
+template<>const void4u Mont<n, false, tag>::f = &mcl_fp_montNF ## n ## suf; \
+template<>const void3u MontRed<n, tag>::f = &mcl_fp_montRed ## n ## suf; \
+template<>const void4u DblAdd<n, tag>::f = &mcl_fpDbl_add ## n ## suf; \
+template<>const void4u DblSub<n, tag>::f = &mcl_fpDbl_sub ## n ## suf; \
+
+#if CYBOZU_HOST == CYBOZU_HOST_INTEL
+#define MCL_DEF_LLVM_FUNC(n) \
+ MCL_DEF_LLVM_FUNC2(n, Ltag, L) \
+ MCL_DEF_LLVM_FUNC2(n, LBMI2tag, Lbmi2)
+#else
#define MCL_DEF_LLVM_FUNC(n) \
-template<>const u3u AddPre<n, Ltag>::f = &mcl_fp_addPre ## n ## L; \
-template<>const u3u SubPre<n, Ltag>::f = &mcl_fp_subPre ## n ## L; \
-template<>const void2u Shr1<n, Ltag>::f = &mcl_fp_shr1_ ## n ## L; \
-MCL_DEF_MUL(n) \
-template<>const void2uI MulUnitPre<n, Ltag>::f = &mcl_fp_mulUnitPre ## n ## L; \
-template<>const void4u Add<n, true, Ltag>::f = &mcl_fp_add ## n ## L; \
-template<>const void4u Add<n, false, Ltag>::f = &mcl_fp_addNF ## n ## L; \
-template<>const void4u Sub<n, true, Ltag>::f = &mcl_fp_sub ## n ## L; \
-template<>const void4u Sub<n, false, Ltag>::f = &mcl_fp_subNF ## n ## L; \
-template<>const void4u Mont<n, true, Ltag>::f = &mcl_fp_mont ## n ## L; \
-template<>const void4u Mont<n, false, Ltag>::f = &mcl_fp_montNF ## n ## L; \
-template<>const void3u MontRed<n, Ltag>::f = &mcl_fp_montRed ## n ## L; \
-template<>const void4u DblAdd<n, Ltag>::f = &mcl_fpDbl_add ## n ## L; \
-template<>const void4u DblSub<n, Ltag>::f = &mcl_fpDbl_sub ## n ## L; \
+ MCL_DEF_LLVM_FUNC2(n, Ltag, L)
+#endif
MCL_DEF_LLVM_FUNC(1)
MCL_DEF_LLVM_FUNC(2)
diff --git a/src/proto.hpp b/src/proto.hpp
index 18e8567..0085f23 100644
--- a/src/proto.hpp
+++ b/src/proto.hpp
@@ -27,7 +27,7 @@ void mcl_fpDbl_sub ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const m
#define MCL_FP_DEF_FUNC(n) \
MCL_FP_DEF_FUNC_SUB(n, L) \
- MCL_FP_DEF_FUNC_SUB(n, A)
+ MCL_FP_DEF_FUNC_SUB(n, Lbmi2)
#define MCL_FP_DEF_FUNC_SPECIAL(suf) \
void mcl_fpDbl_mod_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */); \
@@ -60,7 +60,7 @@ MCL_FP_DEF_FUNC(17)
#endif
MCL_FP_DEF_FUNC_SPECIAL(L)
-MCL_FP_DEF_FUNC_SPECIAL(A)
+MCL_FP_DEF_FUNC_SPECIAL(Lbmi2)
}