def PowModPython(bs, e, m): return [pow(b, e, m) for b in bs] def PowModGmp(bs, e, m): import gmpy2, numpy as np return [gmpy2.powmod(b, e, m) for b in bs] #return list(np.vectorize(lambda x: gmpy2.powmod(x, e, m))(np.asarray(bs, dtype = object))) def PowModParallel(func, bs, e, m, *, processes = None, extra = ()): import multiprocessing as mp, functools if processes is None: processes = mp.cpu_count() with mp.Pool(processes) as pool: try: block = (len(bs) + processes * 8 - 1) // (processes * 8) return functools.reduce( lambda a, b: a + b, pool.starmap(func, [(bs[i : i + block], e, m) + extra for i in range(0, len(bs), block)]), [] ) finally: pool.close() pool.join() def PowModRedc(bs, e, m, bits, redc_type = None, w = None): import gmpy2, numpy as np, math def Int(x): return gmpy2.mpz(x) def Mask(x, s): #return x & ((1 << s) - 1) return np.vectorize(lambda x: gmpy2.t_mod_2exp(x, s))(x) e = Int(e) m = Int(m) def CreateMod(): if redc_type == 'div': def Mod(x): return x % m def To(x): return x def From(x): return x def Adjust(x): return x else: if redc_type is None and m & 1 != 0 or redc_type == 'montgomery': # https://www.nayuki.io/page/montgomery-reduction-algorithm assert m & 1 != 0, 'Montgomery can not handle even modulus!' def MontgomeryPrec(n, s): n, s = Int(n), Int(s) r = 1 << s r2 = (r * r) % n ri = pow(r, -1, n) k = (r * ri - 1) // n return n, s, Int(k), Int(r2), Int(r - 1) Mg = MontgomeryPrec(m, bits + 2) def Mod(x): n, s, k, _, mask = Mg t = Mask(Mask(x, s) * k, s) u = (x + t * n) >> s return u def To(x): _, _, _, r2, _ = Mg return Mod(x * r2) def From(x): return Mod(x) elif redc_type == 'barrett': # https://www.nayuki.io/page/barrett-reduction-algorithm assert m > 0 and m & (m - 1) != 0, 'Modulus is a power of two! Not possible for Barrett.' def BarrettPrec(n, s): n, s = Int(n), Int(s) return n, s, (1 << s) // n Br = BarrettPrec(m, bits * 2 + 2) def Mod(x): n, s, r = Br q = (x * r) >> s u = x - q * n return u def To(x): return x def From(x): return x else: assert False, f'Unknown reduction type {redc_type}!' def Adjust(x): gm = x >= m x[gm] -= m return x return Mod, To, From, Adjust def Arr(x): return np.array([Int(e) for e in x], dtype = object) Mod, To, From, Adjust = CreateMod() bs = To(Mod(To(Arr(bs)))) if w is None: w = OptW(e)[1] def PreComputePowers(d, w, bs): d = Int(d) digits = [] mask = 2 ** w - 1 while d != 0: digits.append(d & mask) d >>= w digits = digits[::-1] powers = [None, np.copy(bs)] for i in range(2, 2 ** w): powers.append(Mod(powers[-1] * bs)) return digits, powers digits, powers = PreComputePowers(e, w, bs) res = powers[digits[0]] for digit in digits[1:]: for i in range(w): res = Mod(res * res) if digit != 0: res = Mod(res * powers[digit]) return list(Adjust(From(res))) def OptW(e): n = max(1, e.bit_length()) minv = None for w in range(1, 11): # Number of multiplication-reductions c = n + 1 + 2 ** w - 2 + n // w # Regular Windowed if minv is None or c < minv[0]: minv = (c, w) return minv def OptWS(e): n = max(1, e.bit_length()) minv = None for w in range(1, 11): # Number of multiplication-reductions c = w - 1 + n + 2 ** (w - 1) - 1 + n // w # Sliding Window if minv is None or c < minv[0]: minv = (c, w) return minv def Test(): import time, random, numpy as np random.seed(0) cym = cython_import() num_bits = 496 pow_bits = 252 cnt_nums = 1 << 17 cnt_pows = 1 << 0 def Rand(bits): return random.randrange(1 << bits) bs = [Rand(num_bits) for i in range(cnt_nums)] em = [(Rand(pow_bits), Rand(num_bits) | 1) for i in range(cnt_pows)] opt_w, opt_ws = OptW(em[0][0]), OptWS(em[0][0]) print(f'reg_win = {opt_w[1]} ({opt_w[0]} redcs), slide_win = {opt_ws[1]} ({opt_ws[0]} redcs)') ref_tim, res = None, None for fname, f in [ ('Python', PowModPython), ('Python_Parallel', lambda bs, e, m: PowModParallel(PowModPython, bs, e, m)), ('Gmp', PowModGmp), ('Gmp_Parallel', lambda bs, e, m: PowModParallel(PowModGmp, bs, e, m)), ('Redc_Div_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'div'))), ('Redc_Barrett_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'barrett'))), ('Redc_Montgomery_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'montgomery'))), #('Cython_Redc_Div', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'div')), ('Cython_Redc_Barrett_Parallel', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'barrett')), ('Cython_Redc_Montgomery_Parallel', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'montgomery')), ('Cython_Gmp_Parallel', lambda bs, e, m: cym.pow_mod_gmp_cy(bs, e, m)), ]: tim = time.time() for e, m in em: resc = f(bs, e, m) tim = time.time() - tim if ref_tim is None: ref_tim = tim print(f'{fname:<33}: time {tim:>6.02f} sec, boost {ref_tim / tim:>5.02f}', end = '', flush = True) if res is not None: assert np.all(np.equal(res, resc)) print(', Correct') else: res = resc print() def cython_compile(srcs, *, clang_dir = '', boost_dir = None, debug = False): import json, hashlib, os, glob, importlib, sys, shutil, tempfile, platform srch = hashlib.sha256(json.dumps(srcs, sort_keys = True, ensure_ascii = True).encode('utf-8')).hexdigest().upper()[:12] pdir = 'cyimp' debug = bool(debug) if len(glob.glob(f'{pdir}/cy{srch}*')) == 0: class ChDir: def __init__(self, newd): self.newd = newd def __enter__(self): self.curd = os.getcwd() os.chdir(self.newd) return self def __exit__(self, ext, exv, tb): os.chdir(self.curd) os.makedirs(pdir, exist_ok = True) with tempfile.TemporaryDirectory(dir = pdir) as td, ChDir(str(td)) as chd: os.makedirs(pdir, exist_ok = True) for k, v in srcs.items(): with open(f'cys{srch}_{k}', 'wb') as f: f.write(v.replace('{srch}', srch).encode('utf-8')) import numpy as np, setuptools, distutils from setuptools.command.build_ext import build_ext from setuptools import setup, Extension from Cython.Build import cythonize from distutils import sysconfig if platform.system() == 'Windows' and not boost_dir: boost_dir = 'C:/local/boost_1_74_0/' compiler = f'{clang_dir}clang++' os.environ['AR'] = 'ar' os.environ['ARFLAGS'] = '' sysconfig.get_config_vars()['CFLAGS'] = '' sysconfig.get_config_vars()['CCSHARED'] = '' sysconfig.get_config_vars()['CXX'] = compiler sysconfig.get_config_vars()['CC'] = compiler sysconfig.get_config_vars()['LDSHARED'] = compiler + ' -shared' sys.argv += ['build_ext', '--inplace', '--compiler=unix'] setup( ext_modules = cythonize( Extension( f'{pdir}.cy{srch}', [f'cys{srch}_{k}' for k in filter(lambda e: any(e.endswith(e2) for e2 in ['.pyx', '.c', '.cpp']), srcs.keys())], depends = [f'cys{srch}_{k}' for k in filter(lambda e: not any(e.endswith(e2) for e2 in ['.pyx', '.c', '.cpp']), srcs.keys())], extra_compile_args = [ '-g', '-m64', f'-O{0 if debug else 3}', '-std=c++20', '-DNPY_NO_DEPRECATED_API=0x00000007', '-Wno-deprecated-volatile', '-fopenmp', # '-fsanitize=address', '-Ic:/bin/msys/clang64/include/', *{'Windows': ['-D_WIN32'], 'Linux': []}[platform.system()], f'-I{boost_dir}' if boost_dir else '', ], extra_link_args = [ '-g', '-fopenmp=libiomp5', *[ 'c:/bin/msys/' + e for e in [ 'clang64/lib/libgmp.a', 'clang64/lib/libgmpxx.a', 'mingw64/lib/gcc/x86_64-w64-mingw32/10.3.0/libgcc.a', #'clang64/x86_64-w64-mingw32/lib/libmingwex.a', ] ], 'c:/dev/lib/libmingwex_renamed.a', ], ), compiler_directives = {'language_level': 3, 'embedsignature': True}, annotate = True, ), include_dirs = [np.get_include()], ) del sys.argv[-2:] for f in glob.glob(f'{pdir}/cy{srch}*'): shutil.copy(f, f'./../') print('Cython module:', f'cy{srch}', flush = True) return importlib.import_module(f'{pdir}.cy{srch}') def cython_import(): return cython_compile({ 'lib.h': ( r''' #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ASSERT_MSG(cond, msg) \ { if (!(cond)) { \ std::string fmsg = "Assertion failed (" #cond ") at " + std::to_string(__LINE__) + "! Msg: \"" + std::string(msg) + "\""; \ std::cout << fmsg << std::endl << std::flush; \ throw std::runtime_error(fmsg); \ } } #define ASSERT(cond) ASSERT_MSG(cond, "") #define LN { std::cout << "LN " << __LINE__ << std::endl; } #define DUMP(x) { std::cout << #x << " = " << (x) << std::endl; } #define bit_sizeof(x) (sizeof(x) * 8) using u8 = uint8_t; using u32 = uint32_t; using u64 = uint64_t; using i64 = int64_t; using mpu128 = boost::multiprecision::uint128_t; using mpi128 = boost::multiprecision::int128_t; using mpu256 = boost::multiprecision::uint256_t; using mpi256 = boost::multiprecision::int256_t; using mpu512 = boost::multiprecision::uint512_t; using mpi512 = boost::multiprecision::int512_t; using mpu1024 = boost::multiprecision::uint1024_t; using mpi1024 = boost::multiprecision::int1024_t; using mpu2048 = boost::multiprecision::number >; using mpi2048 = boost::multiprecision::number >; using u128 = unsigned _ExtInt(128); using i128 = signed _ExtInt(128); using u256 = unsigned _ExtInt(256); using i256 = signed _ExtInt(256); using u512 = unsigned _ExtInt(512); using i512 = signed _ExtInt(512); using u1024 = unsigned _ExtInt(1024); using i1024 = signed _ExtInt(1024); using u2048 = unsigned _ExtInt(2048); using i2048 = signed _ExtInt(2048); size_t NThreads() { size_t nthr = std::thread::hardware_concurrency(); ASSERT(nthr > 0); return nthr; } size_t OmpThreads() { //return 1; return NThreads(); } void Init() { omp_set_num_threads(OmpThreads()); } void PowModGmpC( uint64_t * bs0, uint64_t bs_words, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0, uint64_t m_words) { auto ToLSlow = [](void const * x0, size_t c) { // ASSERT(c % 4 == 0); std::span x((u32*)x0, c / 4); mpz_class r; for (ptrdiff_t i = ptrdiff_t(x.size()) - 1; i >= 0; --i) { r <<= 32; r |= mpz_class(x[i]); } return r; }; auto ToWSlow = [](mpz_class x, void * y0, size_t c) { // ASSERT(c % 4 == 0); std::span y((u32*)y0, c / 4); for (size_t i = 0; i < y.size(); ++i) { y[i] = u32(mpz_get_ui(x.get_mpz_t())); x >>= 32; } ASSERT(x == 0); }; auto ToL = [](void const * x, size_t c) { mpz_class r; mpz_import(r.get_mpz_t(), c / 8, -1, 8, 0, 0, x); return r; }; auto ToW = [](mpz_class const & x, void * y, size_t c) { size_t cnt = 0; mpz_export(y, &cnt, -1, 8, 0, 0, x.get_mpz_t()); }; mpz_class const e = ToL(e0, e_words * sizeof(u64)), m = ToL(m0, m_words * sizeof(u64)); std::span bs(bs0, bs_words * cnt); #pragma omp parallel for for (size_t i = 0; i < cnt * bs_words; i += bs_words) { mpz_class b = ToL(&bs[i], bs_words * sizeof(u64)), r; mpz_powm(r.get_mpz_t(), b.get_mpz_t(), e.get_mpz_t(), m.get_mpz_t()); ToW(r, &bs[i], bs_words * sizeof(u64)); } } template void PowModC(uint64_t * bs0, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0) { bool constexpr Check = 0; Init(); using TE = u1024; TE e = 0; ASSERT(sizeof(e) >= e_words * sizeof(u64)); std::memcpy(&e, e0, e_words * sizeof(u64)); static_assert(Bits == 64 || Bits == 128 || Bits == 256 || Bits == 512); using T = std::conditional_t>>; using ST = std::conditional_t>>; using DT = std::conditional_t>>; using SDT = std::conditional_t>>; using QT = std::conditional_t>>; static_assert(bit_sizeof(T) == Bits); T const m = *(T*)m0; ASSERT_MSG(m != 0, "Modulus can't be zero!"); auto const Funcs = [&, m]{ auto ToL = [](auto a){ using LT = mpi2048; bool sign = a < 0; if (sign) a = -a; LT r = 0; for (size_t i = 0; i < bit_sizeof(a); i += 64) r |= LT(u64(a >> i)) << i; return sign ? -r : r; }; auto ToW = [](auto a){ bool sign = a < 0; if (sign) a = -a; DT r = 0; for (size_t i = 0; i < bit_sizeof(a); i += 64) r |= DT(u64(a >> i)) << i; return sign ? -SDT(r) : SDT(r); }; auto LDiv = [ToW, ToL](auto const & a, auto const & b){ return ToW(ToL(a) / ToL(b)); }; auto LMod = [ToW, ToL](auto const & a, auto const & b){ return ToW(ToL(a) % ToL(b)); }; if constexpr(RedcType == 0) { auto Mod = [m, LMod](auto const & x) { return T(LMod(x, m)); }; auto To = [](auto const & x) { return T(x); }; auto From = [](auto const & x) { return T(x); }; auto Adjust = [](auto const & x) { return T(x); }; return std::make_tuple(Mod, To, From, Adjust); } else if constexpr(RedcType == 1) { ASSERT_MSG(m > 0 && (m & (m - 1)) != 0, "Modulus is a power of two! Not possible for Barrett."); size_t constexpr S = Bits * 2; auto BarrettPrec = [&](auto const & n, auto const & s) { return std::make_tuple(n, s, DT(LDiv(QT(1) << s, n))); }; auto const Br = BarrettPrec(m, S); auto Mod = [Br](auto const & x) { auto const & [n, _, r] = Br; // DT const q = DT((QT(x) * r) >> S); T const q = T((QT(x) * r) >> S); T const u = T(x) - T(q) * T(n); return u; }; auto To = [](auto const & x) { return T(x); }; auto From = [](auto const & x) { return T(x); }; auto Adjust = [m](auto const & x) { return x >= m ? T(x - m) : T(x); }; return std::make_tuple(Mod, To, From, Adjust); } else if constexpr(RedcType == 2) { ASSERT_MSG((m & 1) != 0, "Montgomery can not handle even modulus!"); size_t constexpr S = Bits; auto EGCD = [&](auto const & a, auto const & b) { ST ro = a, r = b, so = 1, s = 0, qu = 0; while (r != 0) { std::tie(ro, qu, r) = std::make_tuple(r, ST(LDiv(ro, r)), ST(LMod(ro, r))); std::tie(so, s) = std::make_tuple(s, so - qu * s); } ST to = ST(LDiv(SDT(ro) - SDT(a) * SDT(so), ST(b))); ASSERT(SDT(a) * SDT(so) + SDT(b) * SDT(to) == SDT(ro)); return std::make_tuple(T(ro), so, to); }; auto ModInv = [&](auto const & a, auto const & b) { auto const [g, s, t] = EGCD(a, b); ASSERT_MSG(g == 1, "Number is not mod-invertible when GCD > 1."); T const ai = T(LMod(LMod(s, b) + b, b)); ASSERT(LMod(DT(a) * ai, b) == 1); return ai; }; auto MontgomeryPrec = [&](auto const & n, auto const & s) { DT const r = DT(1) << s; T const r1 = T(LMod(r, n)), r2 = T(LMod(DT(r1) * r1, n)), ri = ModInv(r1, n), k = T(LDiv(r * ri - 1, n)); return std::make_tuple(n, s, k, r2, T(r - 1)); }; auto const Mg = MontgomeryPrec(m, S); auto Mod = [Mg](auto const & x) { auto const & [n, _0, k, _1, mask] = Mg; // T const t = T(((x & mask) * k) & mask); T const t = T(T(x) * k); T const u = T((DT(x) + DT(t) * n) >> S); return u; }; auto To = [Mg, Mod](auto const & x) { auto const & [_0, _1, _2, r2, _3] = Mg; return Mod(DT(x) * r2); }; auto From = [Mod](auto const & x) { return Mod(x); }; auto Adjust = [m](auto const & x) { return x >= m ? T(x - m) : T(x); }; return std::make_tuple(Mod, To, From, Adjust); } else { static_assert([]{ return false; }()); } }(); auto const & Mod = std::get<0>(Funcs); auto const & To = std::get<1>(Funcs); auto const & From = std::get<2>(Funcs); auto const & Adjust = std::get<3>(Funcs); size_t ebits = 0; { auto ec = e; while (ec != 0) { ec >>= 1; ebits += 1; } } ASSERT(ebits >= 1); auto OptWS = [&]{ size_t n = std::max(size_t(1), ebits); std::tuple minv = {size_t(-1), 0}; for (size_t w = 1; w <= 12; ++w) { // Number of multiplication-reductions size_t const c = w - 1 + n + (1 << (w - 1)) - 1 + n / w; // Sliding Window if (std::get<1>(minv) == 0 || c < std::get<0>(minv)) minv = std::make_tuple(c, w); } return minv; }; auto PowSlow = [&](auto const & a, auto const & b) { auto bc = b; size_t bbits = 0; while (bc != 0) { bc >>= 1; ++bbits; } ASSERT(bbits >= 1); T r = a; for (ptrdiff_t i = ptrdiff_t(bbits) - 2; i >= 0; --i) { r = Mod(DT(r) * r); if ((b >> i) & 1) r = Mod(DT(r) * a); } return r; }; std::span bs((T*)bs0, cnt); size_t const w = std::get<1>(OptWS()), nthr = OmpThreads(), block = std::max(size_t(1), std::min(size_t(1 << 12), size_t((bs.size() + nthr - 1) / nthr))), n_blocks = (bs.size() + block - 1) / block, th_blocks = (n_blocks + nthr - 1) / nthr; #pragma omp parallel for for (size_t ith = 0; ith < nthr; ++ith) { std::vector> powers(1 << (w - 1), std::vector(block)); std::vector buf(block); for (size_t iblock = ith * th_blocks; iblock < std::min(n_blocks, size_t((ith + 1) * th_blocks)); ++iblock) { size_t const lo = iblock * block, hi = std::min((iblock + 1) * block, bs.size()); // Precompute Redc form for (size_t j = lo; j < hi; ++j) bs[j] = To(bs[j]); // Precompute powers for Sliding Window method for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0) powers[0][j0] = bs[j]; for (size_t k = 1; k <= w - 1; ++k) for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0) powers[0][j0] = Mod(DT(powers[0][j0]) * powers[0][j0]); for (size_t k = 1; k < (1 << (w - 1)); ++k) for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0) powers[k][j0] = Mod(DT(powers[k - 1][j0]) * bs[j]); if constexpr(Check) { for (size_t k = 0; k < (1 << (w - 1)); ++k) for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0) ASSERT_MSG(powers[k][j0] == PowSlow(bs[j], (1 << (w - 1)) + k), "w " + std::to_string(w) + ", k " + std::to_string(k)); } // Binary exponentiation with Sliding Window // https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Sliding-window_method for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = bs[k]; for (ptrdiff_t j = ptrdiff_t(ebits) - 2; j >= 0;) { size_t nt = 0; u8 const bit = u8((e >> j) & 1); if (bit == 0) { for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = Mod(DT(buf[k0]) * buf[k0]); nt = 1; j -= nt; } else { nt = 1; size_t t = 1; for (size_t k = 1; k < std::min(w, size_t(j + 1)); ++k) { t <<= 1; t |= size_t((e >> (j - k)) & 1); ++nt; } j -= nt; if (nt < w) { // Regular binary exponentiation of tail for (ptrdiff_t p = ptrdiff_t(nt) - 1; p >= 0; --p) { for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = Mod(DT(buf[k0]) * buf[k0]); if ((t >> p) & 1) for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = Mod(DT(buf[k0]) * bs[k]); } } else { for (size_t p = 0; p < w; ++p) for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = Mod(DT(buf[k0]) * buf[k0]); ASSERT(t >= (1 << (w - 1))); t -= 1 << (w - 1); ASSERT(t < powers.size()); auto const & pow_t = powers[t]; for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) buf[k0] = Mod(DT(buf[k0]) * pow_t[k0]); } } if constexpr(Check) { for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) ASSERT_MSG(buf[k0] == PowSlow(bs[k], e >> (j + 1)), "w " + std::to_string(w) + ", ebits " + std::to_string(ebits) + ", j " + std::to_string(j) + ", nt " + std::to_string(nt)); } } // Convert back from Redc form for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0) bs[k] = Adjust(From(buf[k0])); } } } '''), 'main.pyx': ( r''' # distutils: language = c++ import numpy as np cimport numpy as np cimport cython from libc.stdint cimport * from cpython cimport * cdef extern from *: ctypedef uint64_t val0 "0" ctypedef uint64_t val1 "1" ctypedef uint64_t val2 "2" ctypedef uint64_t val64 "64" ctypedef uint64_t val128 "128" ctypedef uint64_t val256 "256" ctypedef uint64_t val512 "512" ctypedef bool valFalse "false" ctypedef bool valTrue "true" cdef extern from "cys{srch}_lib.h" nogil: void PowModGmpC( uint64_t * bs0, uint64_t bs_words, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0, uint64_t m_words) except + void PowModC[Bits, RedcType](uint64_t * bs0, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0) except + #@cython.boundscheck(False) #@cython.wraparound(False) def pow_mod_cy(bs, e, m, *, redc_type): u64m = (1 << 64) - 1 def ToU64(x, nb): assert x >= 0, x return np.array([(x >> (i * 64)) & u64m for i in range(nb // 64)], dtype = np.uint64) def FromU64(x): r = 0 for i, e in enumerate(x): r += int(e) << (i * 64) return r def GetBits(x): xbl = x.bit_length() assert xbl <= 1024, xbl return 64 if xbl <= 64 else 128 if xbl <= 128 else 256 if xbl <= 256 else 512 if xbl <= 512 else 1024 redc_type = {'div': 0, 'barrett': 1, 'montgomery': 2}[redc_type] bits = GetBits(m) assert bits <= 512 bs = np.asarray([ToU64(b % m, bits) for b in bs], dtype = np.uint64) m = ToU64(m, bits) ebits = GetBits(e) e = ToU64(e, ebits) cdef uint64_t[:, :] cbs = bs cdef const uint64_t[:] cm = m cdef const uint64_t[:] ce = e cdef uint64_t cbits = bits cdef uint64_t cewords = ebits // 64 cdef uint64_t credc_type = redc_type cdef uint64_t cnt = len(bs) with nogil: ( PowModC[val64, val0] if cbits == 64 else PowModC[val128, val0] if cbits == 128 else PowModC[val256, val0] if cbits == 256 else PowModC[val512, val0] if credc_type == 0 else PowModC[val64, val1] if cbits == 64 else PowModC[val128, val1] if cbits == 128 else PowModC[val256, val1] if cbits == 256 else PowModC[val512, val1] if credc_type == 1 else PowModC[val64, val2] if cbits == 64 else PowModC[val128, val2] if cbits == 128 else PowModC[val256, val2] if cbits == 256 else PowModC[val512, val2] )( &cbs[0, 0], cnt, &ce[0], cewords, &cm[0] ) return [FromU64(e) for e in bs] #@cython.boundscheck(False) #@cython.wraparound(False) def pow_mod_gmp_cy(bs, e, m): u64m = (1 << 64) - 1 def ToU64(x, nb): assert x >= 0, x return np.array([(x >> (i * 64)) & u64m for i in range(nb // 64)], dtype = np.uint64) def FromU64(x): r = 0 for i, e in enumerate(x): r += int(e) << (i * 64) return r def GetBits(x): xbl = x.bit_length() assert xbl <= 1024, xbl return 64 if xbl <= 64 else 128 if xbl <= 128 else 256 if xbl <= 256 else 512 if xbl <= 512 else 1024 bits = GetBits(m) assert bits <= 512 bs = np.asarray([ToU64(b, bits) for b in bs], dtype = np.uint64) m = ToU64(m, bits) ebits = GetBits(e) e = ToU64(e, ebits) cdef uint64_t[:, :] cbs = bs cdef const uint64_t[:] cm = m cdef const uint64_t[:] ce = e cdef uint64_t cbits = bits cdef uint64_t cwords = bits // 64 cdef uint64_t cewords = ebits // 64 cdef uint64_t ccnt = len(bs) with nogil: PowModGmpC(&cbs[0, 0], cwords, ccnt, &ce[0], cewords, &cm[0], cwords) return [FromU64(e) for e in bs] '''), }, clang_dir = 'c:/bin/llvm/bin/', boost_dir = None, debug = 0) if __name__ == '__main__': Test()