aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <herumi@nifty.com>2018-11-13 16:03:46 +0800
committerMITSUNARI Shigeo <herumi@nifty.com>2018-11-13 16:03:46 +0800
commit3a6b1c2124b9d4c2ad4cdd2c193a3a8b8e02afb9 (patch)
tree61dddf9bd26fc9acdfe0afa12eabefdd035a31fe
parent543e5fc19dc5115e1e7fedcb142980959ce8d1bd (diff)
downloadtangerine-mcl-3a6b1c2124b9d4c2ad4cdd2c193a3a8b8e02afb9.tar.gz
tangerine-mcl-3a6b1c2124b9d4c2ad4cdd2c193a3a8b8e02afb9.tar.zst
tangerine-mcl-3a6b1c2124b9d4c2ad4cdd2c193a3a8b8e02afb9.zip
support perf if MCL_PERF=1
-rw-r--r--src/fp_generator.hpp111
1 files changed, 110 insertions, 1 deletions
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index df70ac7..7ecd977 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -127,6 +127,71 @@ if (rm.isReg()) { \
namespace fp {
+struct Profiler {
+ FILE *fp_;
+ const char *suf_;
+ const uint8_t *prev_ = 0;
+ Profiler()
+ : fp_(0)
+ , suf_(0)
+ , prev_(0)
+ {
+ }
+ void init(const char *suf, const uint8_t *prev)
+ {
+#ifdef __linux__
+ close();
+ const char *s = getenv("MCL_PERF");
+ if (s == 0 || strcmp(s, "1") != 0) return;
+ fprintf(stderr, "use perf suf=%s\n", suf);
+ suf_ = suf;
+ const int pid = getpid();
+ char name[128];
+ snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
+ fp_ = fopen(name, "wb");
+ if (fp_ == 0) throw cybozu::Exception("PerMap") << name;
+ prev_ = prev;
+#else
+ (void)suf;
+ (void)prev;
+#endif
+ }
+ ~Profiler()
+ {
+ close();
+ }
+ void close()
+ {
+#ifdef __linux__
+ if (fp_ == 0) return;
+ fclose(fp_);
+ fp_ = 0;
+ prev_ = 0;
+#endif
+ }
+ void set(const uint8_t *p, size_t n, const char *name) const
+ {
+#ifdef __linux__
+ if (fp_ == 0) return;
+ fprintf(fp_, "%llx %zx %s%s\n", (long long)p, n, name, suf_);
+#else
+ (void)p;
+ (void)n;
+ (void)name;
+#endif
+ }
+ void set(const char *name, const uint8_t *cur)
+ {
+#ifdef __linux__
+ set(prev_, cur - prev_, name);
+ prev_ = cur;
+#else
+ (void)name;
+ (void)cur;
+#endif
+ }
+};
+
struct FpGenerator : Xbyak::CodeGenerator {
typedef Xbyak::RegExp RegExp;
typedef Xbyak::Reg64 Reg64;
@@ -203,6 +268,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
int pn_;
int FpByte_;
bool isFullBit_;
+ Profiler prof_;
/*
@param op [in] ; use op.p, op.N, op.isFullBit
@@ -264,45 +330,88 @@ private:
FpByte_ = int(op.maxN * sizeof(uint64_t));
isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
+ static char suf[] = "_0";
+ prof_.init(suf, getCurr());
+ suf[1]++;
op.fp_addPre = gen_addSubPre(true, pn_);
+ prof_.set("Fp_addPre", getCurr());
+
op.fp_subPre = gen_addSubPre(false, pn_);
- op.fp_subA_ = gen_fp_sub();
+ prof_.set("Fp_subPre", getCurr());
+
op.fp_addA_ = gen_fp_add();
+ prof_.set("Fp_add", getCurr());
+
+ op.fp_subA_ = gen_fp_sub();
+ prof_.set("Fp_sub", getCurr());
op.fp_shr1 = gen_shr1();
+ prof_.set("Fp_shr1", getCurr());
op.fp_negA_ = gen_fp_neg();
+ prof_.set("Fp_neg", getCurr());
op.fpDbl_addA_ = gen_fpDbl_add();
+ prof_.set("FpDbl_add", getCurr());
+
op.fpDbl_subA_ = gen_fpDbl_sub();
+ prof_.set("FpDbl_sub", getCurr());
+
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
+ prof_.set("FpDbl_addPre", getCurr());
+
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
+ prof_.set("FpDbl_subPre", getCurr());
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
+ prof_.set("FpDbl_mulPre", getCurr());
+
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
+ prof_.set("FpDbl_sqrPre", getCurr());
+
op.fpDbl_modA_ = gen_fpDbl_mod(op);
+ prof_.set("FpDbl_mod", getCurr());
op.fp_mulA_ = gen_mul();
+ prof_.set("Fp_mul", getCurr());
if (op.fp_mulA_) {
op.fp_mul = reinterpret_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
}
op.fp_sqrA_ = gen_sqr();
+ prof_.set("Fp_sqr", getCurr());
+
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16);
op.fp_preInv = getCurr<int2u>();
gen_preInv();
+ prof_.set("preInv", getCurr());
}
if (op.xi_a == 0) return; // Fp2 is not used
op.fp2_addA_ = gen_fp2_add();
+ prof_.set("Fp2_add", getCurr());
+
op.fp2_subA_ = gen_fp2_sub();
+ prof_.set("Fp2_sub", getCurr());
+
op.fp2_negA_ = gen_fp2_neg();
+ prof_.set("Fp2_neg", getCurr());
+
op.fp2_mulNF = 0;
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre();
+ prof_.set("Fp2Dbl_mulPre", getCurr());
+
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre();
+ prof_.set("Fp2Dbl_sqrPre", getCurr());
+
op.fp2_mulA_ = gen_fp2_mul();
+ prof_.set("Fp2_mul", getCurr());
+
op.fp2_sqrA_ = gen_fp2_sqr();
+ prof_.set("Fp2_sqr", getCurr());
+
op.fp2_mul_xiA_ = gen_fp2_mul_xi();
+ prof_.set("Fp2_mul_xi", getCurr());
}
u3u gen_addSubPre(bool isAdd, int n)
{