def PowModPython(bs, e, m):
    return [pow(b, e, m) for b in bs]

def PowModGmp(bs, e, m):
    import gmpy2, numpy as np
    return [gmpy2.powmod(b, e, m) for b in bs]
    #return list(np.vectorize(lambda x: gmpy2.powmod(x, e, m))(np.asarray(bs, dtype = object)))

def PowModParallel(func, bs, e, m, *, processes = None, extra = ()):
    import multiprocessing as mp, functools
    if processes is None:
        processes = mp.cpu_count()
    with mp.Pool(processes) as pool:
        try:
            block = (len(bs) + processes * 8 - 1) // (processes * 8)
            return functools.reduce(
                lambda a, b: a + b,
                pool.starmap(func,
                    [(bs[i : i + block], e, m) + extra for i in range(0, len(bs), block)]),
                []
            )
        finally:
            pool.close()
            pool.join()

def PowModRedc(bs, e, m, bits, redc_type = None, w = None):
    import gmpy2, numpy as np, math

    def Int(x):
        return gmpy2.mpz(x)
    def Mask(x, s):
        #return x & ((1 << s) - 1)
        return np.vectorize(lambda x: gmpy2.t_mod_2exp(x, s))(x)

    e = Int(e)
    m = Int(m)
    
    def CreateMod():
        if redc_type == 'div':
            def Mod(x):
                return x % m
            def To(x):
                return x
            def From(x):
                return x
            def Adjust(x):
                return x
        else:
            if redc_type is None and m & 1 != 0 or redc_type == 'montgomery':
                # https://www.nayuki.io/page/montgomery-reduction-algorithm

                assert m & 1 != 0, 'Montgomery can not handle even modulus!'

                def MontgomeryPrec(n, s):
                    n, s = Int(n), Int(s)
                    r = 1 << s
                    r2 = (r * r) % n
                    ri = pow(r, -1, n)
                    k = (r * ri - 1) // n
                    return n, s, Int(k), Int(r2), Int(r - 1)
                
                Mg = MontgomeryPrec(m, bits + 2)
                
                def Mod(x):
                    n, s, k, _, mask = Mg
                    t = Mask(Mask(x, s) * k, s)
                    u = (x + t * n) >> s
                    return u
                    
                def To(x):
                    _, _, _, r2, _ = Mg
                    return Mod(x * r2)

                def From(x):
                    return Mod(x)
            elif redc_type == 'barrett':
                # https://www.nayuki.io/page/barrett-reduction-algorithm

                assert m > 0 and m & (m - 1) != 0, 'Modulus is a power of two! Not possible for Barrett.'

                def BarrettPrec(n, s):
                    n, s = Int(n), Int(s)
                    return n, s, (1 << s) // n

                Br = BarrettPrec(m, bits * 2 + 2)

                def Mod(x):
                    n, s, r = Br
                    q = (x * r) >> s
                    u = x - q * n
                    return u

                def To(x):
                    return x
                
                def From(x):
                    return x
            else:
                assert False, f'Unknown reduction type {redc_type}!'

            def Adjust(x):
                gm = x >= m
                x[gm] -= m
                return x

        return Mod, To, From, Adjust
    
    def Arr(x):
        return np.array([Int(e) for e in  x], dtype = object)
    
    Mod, To, From, Adjust = CreateMod()
    bs = To(Mod(To(Arr(bs))))

    if w is None:
        w = OptW(e)[1]
    
    def PreComputePowers(d, w, bs):
        d = Int(d)
        digits = []
        mask = 2 ** w - 1
        while d != 0:
            digits.append(d & mask)
            d >>= w
        digits = digits[::-1]
        powers = [None, np.copy(bs)]
        for i in range(2, 2 ** w):
            powers.append(Mod(powers[-1] * bs))
        return digits, powers
    
    digits, powers = PreComputePowers(e, w, bs)
    res = powers[digits[0]]
    
    for digit in digits[1:]:
        for i in range(w):
            res = Mod(res * res)
        if digit != 0:
            res = Mod(res * powers[digit])
    
    return list(Adjust(From(res)))

def OptW(e):
    n = max(1, e.bit_length())
    minv = None
    for w in range(1, 11):
        # Number of multiplication-reductions
        c = n + 1 + 2 ** w - 2 + n // w # Regular Windowed
        if minv is None or c < minv[0]:
            minv = (c, w)
    return minv

def OptWS(e):
    n = max(1, e.bit_length())
    minv = None
    for w in range(1, 11):
        # Number of multiplication-reductions
        c = w - 1 + n + 2 ** (w - 1) - 1 + n // w # Sliding Window
        if minv is None or c < minv[0]:
            minv = (c, w)
    return minv

def Test():
    import time, random, numpy as np
    random.seed(0)
    
    cym = cython_import()
    
    num_bits = 496
    pow_bits = 252
    cnt_nums = 1 << 17
    cnt_pows = 1 << 0
    
    def Rand(bits):
        return random.randrange(1 << bits)
    
    bs = [Rand(num_bits) for i in range(cnt_nums)]
    em = [(Rand(pow_bits), Rand(num_bits) | 1) for i in range(cnt_pows)]
    
    opt_w, opt_ws = OptW(em[0][0]), OptWS(em[0][0])
    print(f'reg_win = {opt_w[1]} ({opt_w[0]} redcs), slide_win = {opt_ws[1]} ({opt_ws[0]} redcs)')
    
    ref_tim, res = None, None
    for fname, f in [
        ('Python', PowModPython),
        ('Python_Parallel', lambda bs, e, m: PowModParallel(PowModPython, bs, e, m)),
        ('Gmp', PowModGmp),
        ('Gmp_Parallel', lambda bs, e, m: PowModParallel(PowModGmp, bs, e, m)),
        ('Redc_Div_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'div'))),
        ('Redc_Barrett_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'barrett'))),
        ('Redc_Montgomery_Parallel', lambda bs, e, m: PowModParallel(PowModRedc, bs, e, m, extra = (num_bits, 'montgomery'))),
        #('Cython_Redc_Div', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'div')),
        ('Cython_Redc_Barrett_Parallel', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'barrett')),
        ('Cython_Redc_Montgomery_Parallel', lambda bs, e, m: cym.pow_mod_cy(bs, e, m, redc_type = 'montgomery')),
        ('Cython_Gmp_Parallel', lambda bs, e, m: cym.pow_mod_gmp_cy(bs, e, m)),
    ]:
        tim = time.time()
        for e, m in em:
            resc = f(bs, e, m)
        tim = time.time() - tim
        if ref_tim is None:
            ref_tim = tim
        print(f'{fname:<33}: time {tim:>6.02f} sec, boost {ref_tim / tim:>5.02f}', end = '', flush = True)
        if res is not None:
            assert np.all(np.equal(res, resc))
            print(', Correct')
        else:
            res = resc
            print()
        
def cython_compile(srcs, *, clang_dir = '', boost_dir = None, debug = False):
    import json, hashlib, os, glob, importlib, sys, shutil, tempfile, platform
    srch = hashlib.sha256(json.dumps(srcs, sort_keys = True, ensure_ascii = True).encode('utf-8')).hexdigest().upper()[:12]
    pdir = 'cyimp'
    debug = bool(debug)
    
    if len(glob.glob(f'{pdir}/cy{srch}*')) == 0:
        class ChDir:
            def __init__(self, newd):
                self.newd = newd
            def __enter__(self):
                self.curd = os.getcwd()
                os.chdir(self.newd)
                return self
            def __exit__(self, ext, exv, tb):
                os.chdir(self.curd)
        
        os.makedirs(pdir, exist_ok = True)
        with tempfile.TemporaryDirectory(dir = pdir) as td, ChDir(str(td)) as chd:
            os.makedirs(pdir, exist_ok = True)
            
            for k, v in srcs.items():
                with open(f'cys{srch}_{k}', 'wb') as f:
                    f.write(v.replace('{srch}', srch).encode('utf-8'))
            
            import numpy as np, setuptools, distutils
            from setuptools.command.build_ext import build_ext
            from setuptools import setup, Extension
            from Cython.Build import cythonize
            from distutils import sysconfig
            
            if platform.system() == 'Windows' and not boost_dir:
                boost_dir = 'C:/local/boost_1_74_0/'
            
            compiler = f'{clang_dir}clang++'
            os.environ['AR'] = 'ar'
            os.environ['ARFLAGS'] = ''
            sysconfig.get_config_vars()['CFLAGS'] = ''
            sysconfig.get_config_vars()['CCSHARED'] = ''
            sysconfig.get_config_vars()['CXX'] = compiler
            sysconfig.get_config_vars()['CC'] = compiler
            sysconfig.get_config_vars()['LDSHARED'] = compiler + ' -shared'
            
            sys.argv += ['build_ext', '--inplace', '--compiler=unix']
            setup(
                ext_modules = cythonize(
                    Extension(
                        f'{pdir}.cy{srch}', [f'cys{srch}_{k}' for k in filter(lambda e: any(e.endswith(e2) for e2 in ['.pyx', '.c', '.cpp']), srcs.keys())],
                        depends = [f'cys{srch}_{k}' for k in filter(lambda e: not any(e.endswith(e2) for e2 in ['.pyx', '.c', '.cpp']), srcs.keys())],
                        extra_compile_args = [
                            '-g', '-m64', f'-O{0 if debug else 3}', '-std=c++20', '-DNPY_NO_DEPRECATED_API=0x00000007', '-Wno-deprecated-volatile',
                            '-fopenmp', # '-fsanitize=address',
                            '-Ic:/bin/msys/clang64/include/',
                            *{'Windows': ['-D_WIN32'], 'Linux': []}[platform.system()],
                            f'-I{boost_dir}' if boost_dir else '',
                        ],
                        extra_link_args = [
                            '-g', '-fopenmp=libiomp5',
                            *[
                                'c:/bin/msys/' + e
                                for e in [
                                    'clang64/lib/libgmp.a', 'clang64/lib/libgmpxx.a', 'mingw64/lib/gcc/x86_64-w64-mingw32/10.3.0/libgcc.a',
                                    #'clang64/x86_64-w64-mingw32/lib/libmingwex.a',
                                ]
                            ],
                            'c:/dev/lib/libmingwex_renamed.a',
                        ],
                    ),
                    compiler_directives = {'language_level': 3, 'embedsignature': True},
                    annotate = True,
                ),
                include_dirs = [np.get_include()],
            )
            
            del sys.argv[-2:]
            for f in glob.glob(f'{pdir}/cy{srch}*'):
                shutil.copy(f, f'./../')
            
    print('Cython module:', f'cy{srch}', flush = True)
    return importlib.import_module(f'{pdir}.cy{srch}')
    
def cython_import():
    return cython_compile({
        'lib.h': (
r'''
    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <tuple>
    #include <type_traits>
    #include <span>
    #include <iostream>
    #include <thread>
    #include <algorithm>
    #include <vector>
    #include <string>
    
    #include <omp.h>
    #include <gmpxx.h>
    #include <boost/multiprecision/cpp_int.hpp>
    
    #define ASSERT_MSG(cond, msg) \
        { if (!(cond)) { \
            std::string fmsg = "Assertion failed (" #cond ") at " + std::to_string(__LINE__) + "! Msg: \"" + std::string(msg) + "\""; \
            std::cout << fmsg << std::endl << std::flush; \
            throw std::runtime_error(fmsg); \
        } }
    #define ASSERT(cond) ASSERT_MSG(cond, "")
    #define LN { std::cout << "LN " << __LINE__ << std::endl; }
    #define DUMP(x) { std::cout << #x << " = " << (x) << std::endl; }
    #define bit_sizeof(x) (sizeof(x) * 8)
    
    using u8 = uint8_t;
    using u32 = uint32_t;
    using u64 = uint64_t;
    using i64 = int64_t;
    
    using mpu128 = boost::multiprecision::uint128_t;
    using mpi128 = boost::multiprecision::int128_t;
    using mpu256 = boost::multiprecision::uint256_t;
    using mpi256 = boost::multiprecision::int256_t;
    using mpu512 = boost::multiprecision::uint512_t;
    using mpi512 = boost::multiprecision::int512_t;
    using mpu1024 = boost::multiprecision::uint1024_t;
    using mpi1024 = boost::multiprecision::int1024_t;
    using mpu2048 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<2048, 2048, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void> >;
    using mpi2048 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<2048, 2048, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void> >;
    
    using u128 = unsigned _ExtInt(128);
    using i128 = signed _ExtInt(128);
    using u256 = unsigned _ExtInt(256);
    using i256 = signed _ExtInt(256);
    using u512 = unsigned _ExtInt(512);
    using i512 = signed _ExtInt(512);
    using u1024 = unsigned _ExtInt(1024);
    using i1024 = signed _ExtInt(1024);
    using u2048 = unsigned _ExtInt(2048);
    using i2048 = signed _ExtInt(2048);
    
    size_t NThreads() {
        size_t nthr = std::thread::hardware_concurrency();
        ASSERT(nthr > 0);
        return nthr;
    }
    
    size_t OmpThreads() {
        //return 1;
        return NThreads();
    }
    
    void Init() {
        omp_set_num_threads(OmpThreads());
    }

    void PowModGmpC(
        uint64_t * bs0, uint64_t bs_words, uint64_t cnt, const uint64_t * e0,
        uint64_t e_words, const uint64_t * m0, uint64_t m_words) {
        
        auto ToLSlow = [](void const * x0, size_t c) {
            // ASSERT(c % 4 == 0);
            std::span<u32 const> x((u32*)x0, c / 4);
            mpz_class r;
            for (ptrdiff_t i = ptrdiff_t(x.size()) - 1; i >= 0; --i) {
                r <<= 32;
                r |= mpz_class(x[i]);
            }
            return r;
        };
        auto ToWSlow = [](mpz_class x, void * y0, size_t c) {
            // ASSERT(c % 4 == 0);
            std::span<u32> y((u32*)y0, c / 4);
            for (size_t i = 0; i < y.size(); ++i) {
                y[i] = u32(mpz_get_ui(x.get_mpz_t()));
                x >>= 32;
            }
            ASSERT(x == 0);
        };
        auto ToL = [](void const * x, size_t c) {
            mpz_class r;
            mpz_import(r.get_mpz_t(), c / 8, -1, 8, 0, 0, x);
            return r;
        };
        auto ToW = [](mpz_class const & x, void * y, size_t c) {
            size_t cnt = 0;
            mpz_export(y, &cnt, -1, 8, 0, 0, x.get_mpz_t());
        };
        
        mpz_class const
            e = ToL(e0, e_words * sizeof(u64)),
            m = ToL(m0, m_words * sizeof(u64));
        
        std::span<u64> bs(bs0, bs_words * cnt);
        
        #pragma omp parallel for
        for (size_t i = 0; i < cnt * bs_words; i += bs_words) {
            mpz_class b = ToL(&bs[i], bs_words * sizeof(u64)), r;
            mpz_powm(r.get_mpz_t(), b.get_mpz_t(), e.get_mpz_t(), m.get_mpz_t());
            ToW(r, &bs[i], bs_words * sizeof(u64));
        }
    }
    
    template <size_t Bits, size_t RedcType>
    void PowModC(uint64_t * bs0, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0) {
        bool constexpr Check = 0;
        
        Init();
        
        using TE = u1024;
        TE e = 0;
        ASSERT(sizeof(e) >= e_words * sizeof(u64));
        std::memcpy(&e, e0, e_words * sizeof(u64));
        static_assert(Bits == 64 || Bits == 128 || Bits == 256 || Bits == 512);
        
        using T = std::conditional_t<Bits == 64, u64, std::conditional_t<Bits == 128, u128,
            std::conditional_t<Bits == 256, u256, u512>>>;
        using ST = std::conditional_t<Bits == 64, i64, std::conditional_t<Bits == 128, i128,
            std::conditional_t<Bits == 256, i256, i512>>>;
        using DT = std::conditional_t<Bits == 64, u128, std::conditional_t<Bits == 128, u256,
            std::conditional_t<Bits == 256, u512, u1024>>>;
        using SDT = std::conditional_t<Bits == 64, i128, std::conditional_t<Bits == 128, i256,
            std::conditional_t<Bits == 256, i512, i1024>>>;
        using QT = std::conditional_t<Bits == 64, u256, std::conditional_t<Bits == 128, u512,
            std::conditional_t<Bits == 256, u1024, u2048>>>;
        
        static_assert(bit_sizeof(T) == Bits);
        
        T const m = *(T*)m0;
        ASSERT_MSG(m != 0, "Modulus can't be zero!");
        
        auto const Funcs = [&, m]{
            auto ToL = [](auto a){
                using LT = mpi2048;
                bool sign = a < 0;
                if (sign)
                    a = -a;
                LT r = 0;
                for (size_t i = 0; i < bit_sizeof(a); i += 64)
                    r |= LT(u64(a >> i)) << i;
                return sign ? -r : r;
            };
            auto ToW = [](auto a){
                bool sign = a < 0;
                if (sign)
                    a = -a;
                DT r = 0;
                for (size_t i = 0; i < bit_sizeof(a); i += 64)
                    r |= DT(u64(a >> i)) << i;
                return sign ? -SDT(r) : SDT(r);
            };
            auto LDiv = [ToW, ToL](auto const & a, auto const & b){
                return ToW(ToL(a) / ToL(b));
            };
            auto LMod = [ToW, ToL](auto const & a, auto const & b){
                return ToW(ToL(a) % ToL(b));
            };
            
            if constexpr(RedcType == 0) {
                auto Mod = [m, LMod](auto const & x) {
                    return T(LMod(x, m));
                };
                auto To = [](auto const & x) { return T(x); };
                auto From = [](auto const & x) { return T(x); };
                auto Adjust = [](auto const & x) { return T(x); };
                
                return std::make_tuple(Mod, To, From, Adjust);
            } else if constexpr(RedcType == 1) {
                ASSERT_MSG(m > 0 && (m & (m - 1)) != 0, "Modulus is a power of two! Not possible for Barrett.");
                
                size_t constexpr S = Bits * 2;
                
                auto BarrettPrec = [&](auto const & n, auto const & s) {
                    return std::make_tuple(n, s, DT(LDiv(QT(1) << s, n)));
                };
                
                auto const Br = BarrettPrec(m, S);
                
                auto Mod = [Br](auto const & x) {
                    auto const & [n, _, r] = Br;
                    // DT const q = DT((QT(x) * r) >> S);
                    T const q = T((QT(x) * r) >> S);
                    T const u = T(x) - T(q) * T(n);
                    return u;
                };
                auto To = [](auto const & x) { return T(x); };
                auto From = [](auto const & x) { return T(x); };
                auto Adjust = [m](auto const & x) { return x >= m ? T(x - m) : T(x); };
                
                return std::make_tuple(Mod, To, From, Adjust);
            } else if constexpr(RedcType == 2) {
                ASSERT_MSG((m & 1) != 0, "Montgomery can not handle even modulus!");
                
                size_t constexpr S = Bits;
                
                auto EGCD = [&](auto const & a, auto const & b) {
                    ST ro = a, r = b, so = 1, s = 0, qu = 0;
                    while (r != 0) {
                        std::tie(ro, qu, r) = std::make_tuple(r, ST(LDiv(ro, r)), ST(LMod(ro, r)));
                        std::tie(so, s) = std::make_tuple(s, so - qu * s);
                    }
                    ST to = ST(LDiv(SDT(ro) - SDT(a) * SDT(so), ST(b)));
                    ASSERT(SDT(a) * SDT(so) + SDT(b) * SDT(to) == SDT(ro));
                    return std::make_tuple(T(ro), so, to);
                };
                auto ModInv = [&](auto const & a, auto const & b) {
                    auto const [g, s, t] = EGCD(a, b);
                    ASSERT_MSG(g == 1, "Number is not mod-invertible when GCD > 1.");
                    T const ai = T(LMod(LMod(s, b) + b, b));
                    ASSERT(LMod(DT(a) * ai, b) == 1);
                    return ai;
                };
                auto MontgomeryPrec = [&](auto const & n, auto const & s) {
                    DT const r = DT(1) << s;
                    T const
                      r1 = T(LMod(r, n)),
                      r2 = T(LMod(DT(r1) * r1, n)),
                      ri = ModInv(r1, n),
                      k = T(LDiv(r * ri - 1, n));
                    return std::make_tuple(n, s, k, r2, T(r - 1));
                };
                
                auto const Mg = MontgomeryPrec(m, S);
                
                auto Mod = [Mg](auto const & x) {
                    auto const & [n, _0, k, _1, mask] = Mg;
                    // T const t = T(((x & mask) * k) & mask);
                    T const t = T(T(x) * k);
                    T const u = T((DT(x) + DT(t) * n) >> S);
                    return u;
                };
                auto To = [Mg, Mod](auto const & x) {
                    auto const & [_0, _1, _2, r2, _3] = Mg;
                    return Mod(DT(x) * r2);
                };
                auto From = [Mod](auto const & x) { return Mod(x); };
                auto Adjust = [m](auto const & x) { return x >= m ? T(x - m) : T(x); };
                
                return std::make_tuple(Mod, To, From, Adjust);
            } else {
                static_assert([]{ return false; }());
            }
        }();
        auto const & Mod = std::get<0>(Funcs);
        auto const & To = std::get<1>(Funcs);
        auto const & From = std::get<2>(Funcs);
        auto const & Adjust = std::get<3>(Funcs);
        
        size_t ebits = 0;
        {
            auto ec = e;
            while (ec != 0) {
                ec >>= 1;
                ebits += 1;
            }
        }
        ASSERT(ebits >= 1);
        
        auto OptWS = [&]{
            size_t n = std::max(size_t(1), ebits);
            std::tuple<size_t, size_t> minv = {size_t(-1), 0};
            for (size_t w = 1; w <= 12; ++w) {
                // Number of multiplication-reductions
                size_t const c =
                    w - 1 + n + (1 << (w - 1)) - 1 + n / w; // Sliding Window
                if (std::get<1>(minv) == 0 || c < std::get<0>(minv))
                    minv = std::make_tuple(c, w);
            }
            return minv;
        };
        
        auto PowSlow = [&](auto const & a, auto const & b) {
            auto bc = b;
            size_t bbits = 0;
            while (bc != 0) {
                bc >>= 1;
                ++bbits;
            }
            ASSERT(bbits >= 1);
            T r = a;
            for (ptrdiff_t i = ptrdiff_t(bbits) - 2; i >= 0; --i) {
                r = Mod(DT(r) * r);
                if ((b >> i) & 1)
                    r = Mod(DT(r) * a);
            }
            return r;
        };
        
        std::span<T> bs((T*)bs0, cnt);
        
        size_t const w = std::get<1>(OptWS()), nthr = OmpThreads(),
            block = std::max(size_t(1), std::min(size_t(1 << 12), size_t((bs.size() + nthr - 1) / nthr))),
            n_blocks = (bs.size() + block - 1) / block, th_blocks = (n_blocks + nthr - 1) / nthr;
        
        #pragma omp parallel for
        for (size_t ith = 0; ith < nthr; ++ith) {
            std::vector<std::vector<T>> powers(1 << (w - 1), std::vector<T>(block));
            std::vector<T> buf(block);
            
            for (size_t iblock = ith * th_blocks;
                    iblock < std::min(n_blocks, size_t((ith + 1) * th_blocks)); ++iblock) {
                size_t const lo = iblock * block, hi = std::min((iblock + 1) * block, bs.size());
                
                // Precompute Redc form
                for (size_t j = lo; j < hi; ++j)
                    bs[j] = To(bs[j]);
                
                // Precompute powers for Sliding Window method
                
                for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0)
                    powers[0][j0] = bs[j];
                for (size_t k = 1; k <= w - 1; ++k)
                    for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0)
                        powers[0][j0] = Mod(DT(powers[0][j0]) * powers[0][j0]);
                for (size_t k = 1; k < (1 << (w - 1)); ++k)
                    for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0)
                        powers[k][j0] = Mod(DT(powers[k - 1][j0]) * bs[j]);
                
                if constexpr(Check) {
                    for (size_t k = 0; k < (1 << (w - 1)); ++k)
                        for (size_t j = lo, j0 = 0; j < hi; ++j, ++j0)
                            ASSERT_MSG(powers[k][j0] == PowSlow(bs[j], (1 << (w - 1)) + k),
                                "w " + std::to_string(w) + ", k " + std::to_string(k));
                }
                
                // Binary exponentiation with Sliding Window
                // https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Sliding-window_method
                
                for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                    buf[k0] = bs[k];
                
                for (ptrdiff_t j = ptrdiff_t(ebits) - 2; j >= 0;) {
                    size_t nt = 0;
                    u8 const bit = u8((e >> j) & 1);
                    if (bit == 0) {
                        for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                            buf[k0] = Mod(DT(buf[k0]) * buf[k0]);
                        nt = 1;
                        j -= nt;
                    } else {
                        nt = 1;
                        size_t t = 1;
                        for (size_t k = 1; k < std::min(w, size_t(j + 1)); ++k) {
                            t <<= 1;
                            t |= size_t((e >> (j - k)) & 1);
                            ++nt;
                        }
                        j -= nt;
                        if (nt < w) {
                            // Regular binary exponentiation of tail
                            
                            for (ptrdiff_t p = ptrdiff_t(nt) - 1; p >= 0; --p) {
                                for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                                    buf[k0] = Mod(DT(buf[k0]) * buf[k0]);
                                if ((t >> p) & 1)
                                    for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                                        buf[k0] = Mod(DT(buf[k0]) * bs[k]);
                            }
                        } else {
                            for (size_t p = 0; p < w; ++p)
                                for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                                    buf[k0] = Mod(DT(buf[k0]) * buf[k0]);
                            
                            ASSERT(t >= (1 << (w - 1)));
                            t -= 1 << (w - 1);
                            ASSERT(t < powers.size());
                            auto const & pow_t = powers[t];
                            
                            for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                                buf[k0] = Mod(DT(buf[k0]) * pow_t[k0]);
                        }
                    }
                    if constexpr(Check) {
                        for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                            ASSERT_MSG(buf[k0] == PowSlow(bs[k], e >> (j + 1)),
                                "w " + std::to_string(w) + ", ebits " + std::to_string(ebits) +
                                ", j " + std::to_string(j) + ", nt " + std::to_string(nt));
                    }
                }
                
                // Convert back from Redc form
                for (size_t k = lo, k0 = 0; k < hi; ++k, ++k0)
                    bs[k] = Adjust(From(buf[k0]));
            }
        }
    }
'''),
        'main.pyx': (
r'''
# distutils: language = c++

import numpy as np
cimport numpy as np
cimport cython
from libc.stdint cimport *
from cpython cimport *

cdef extern from *:
    ctypedef uint64_t val0 "0"
    ctypedef uint64_t val1 "1"
    ctypedef uint64_t val2 "2"
    ctypedef uint64_t val64 "64"
    ctypedef uint64_t val128 "128"
    ctypedef uint64_t val256 "256"
    ctypedef uint64_t val512 "512"
    ctypedef bool valFalse "false"
    ctypedef bool valTrue "true"

cdef extern from "cys{srch}_lib.h" nogil:
    void PowModGmpC(
        uint64_t * bs0, uint64_t bs_words, uint64_t cnt, const uint64_t * e0,
        uint64_t e_words, const uint64_t * m0, uint64_t m_words) except +
    void PowModC[Bits, RedcType](uint64_t * bs0, uint64_t cnt, const uint64_t * e0, uint64_t e_words, const uint64_t * m0) except +

#@cython.boundscheck(False)
#@cython.wraparound(False)
def pow_mod_cy(bs, e, m, *, redc_type):
    u64m = (1 << 64) - 1
    def ToU64(x, nb):
        assert x >= 0, x
        return np.array([(x >> (i * 64)) & u64m for i in range(nb // 64)], dtype = np.uint64)
    def FromU64(x):
        r = 0
        for i, e in enumerate(x):
            r += int(e) << (i * 64)
        return r
    def GetBits(x):
        xbl = x.bit_length()
        assert xbl <= 1024, xbl
        return 64 if xbl <= 64 else 128 if xbl <= 128 else 256 if xbl <= 256 else 512 if xbl <= 512 else 1024
        
    redc_type = {'div': 0, 'barrett': 1, 'montgomery': 2}[redc_type]
    
    bits = GetBits(m)
    assert bits <= 512
    bs = np.asarray([ToU64(b % m, bits) for b in bs], dtype = np.uint64)
    m = ToU64(m, bits)
    ebits = GetBits(e)
    e = ToU64(e, ebits)
    
    cdef uint64_t[:, :] cbs = bs
    cdef const uint64_t[:] cm = m
    cdef const uint64_t[:] ce = e
    cdef uint64_t cbits = bits
    cdef uint64_t cewords = ebits // 64
    cdef uint64_t credc_type = redc_type
    cdef uint64_t cnt = len(bs)
    
    with nogil:
        (
            PowModC[val64, val0] if cbits == 64 else PowModC[val128, val0] if cbits == 128 else PowModC[val256, val0] if cbits == 256 else PowModC[val512, val0]
            if credc_type == 0 else
            PowModC[val64, val1] if cbits == 64 else PowModC[val128, val1] if cbits == 128 else PowModC[val256, val1] if cbits == 256 else PowModC[val512, val1]
            if credc_type == 1 else
            PowModC[val64, val2] if cbits == 64 else PowModC[val128, val2] if cbits == 128 else PowModC[val256, val2] if cbits == 256 else PowModC[val512, val2]
        )(
            <uint64_t *>&cbs[0, 0], cnt, <const uint64_t *>&ce[0], cewords, <const uint64_t *>&cm[0]
        )
    
    return [FromU64(e) for e in bs]

#@cython.boundscheck(False)
#@cython.wraparound(False)
def pow_mod_gmp_cy(bs, e, m):
    u64m = (1 << 64) - 1
    def ToU64(x, nb):
        assert x >= 0, x
        return np.array([(x >> (i * 64)) & u64m for i in range(nb // 64)], dtype = np.uint64)
    def FromU64(x):
        r = 0
        for i, e in enumerate(x):
            r += int(e) << (i * 64)
        return r
    def GetBits(x):
        xbl = x.bit_length()
        assert xbl <= 1024, xbl
        return 64 if xbl <= 64 else 128 if xbl <= 128 else 256 if xbl <= 256 else 512 if xbl <= 512 else 1024
    
    bits = GetBits(m)
    assert bits <= 512
    bs = np.asarray([ToU64(b, bits) for b in bs], dtype = np.uint64)
    m = ToU64(m, bits)
    ebits = GetBits(e)
    e = ToU64(e, ebits)
    
    cdef uint64_t[:, :] cbs = bs
    cdef const uint64_t[:] cm = m
    cdef const uint64_t[:] ce = e
    cdef uint64_t cbits = bits
    cdef uint64_t cwords = bits // 64
    cdef uint64_t cewords = ebits // 64
    cdef uint64_t ccnt = len(bs)
    
    with nogil:
        PowModGmpC(<uint64_t *>&cbs[0, 0], cwords, ccnt, <const uint64_t *>&ce[0], cewords, <const uint64_t *>&cm[0], cwords)
    
    return [FromU64(e) for e in bs]
'''),
    }, clang_dir = 'c:/bin/llvm/bin/', boost_dir = None, debug = 0)

if __name__ == '__main__':
    Test()