提交记录 19841


用户 题目 状态 得分 用时 内存 语言 代码长度
whatever 1002. 测测你的多项式乘法 Accepted 100 44.062 ms 39864 KB C++17 53.43 KB
提交时间 评测时间
2023-08-06 18:26:29 2023-08-06 18:26:32
#include <algorithm>
#include <functional>
#include <iostream>

using i32 = int;
using u32 = unsigned;
using i64 = long long;
using u64 = unsigned long long;

#define IL __inline__ __attribute__((always_inline))
#define RC(T, x) reinterpret_cast<T>(x)

/*
一个高性能的多项式实现
作者为 QedDust413 & killer_joke
*/

namespace Setting {
constexpr u32 mod = 998244353;
constexpr size_t sta_l_MB = 64;
constexpr int Detail = 1;
}  // namespace Setting

namespace __qed {
constexpr int bit_up(int x) { return 1 << (32 - __builtin_clz(x)); }
constexpr int crz_32(int x) { return __builtin_ctz(x); }
constexpr int cro_32(int x) { return __builtin_ctz(~x); }
constexpr int lg2u(int x) { return x == 1 ? 0 : (32 - __builtin_clz(x - 1)); }
constexpr int lg2d(int x) { return std::__lg(x); }
constexpr bool ispow2(u64 x) { return (x != 0) && ((x & (x - 1)) == 0); }
constexpr u32 primitive_root(u32 M) {
    u32 qed = 0, n = M - 1, d[11] = {};
    for (int i = 2; i * i <= n; i++) {
        if (n % i == 0) {
            d[qed++] = i;
            do {
                n /= i;
            } while (n % i == 0);
        }
    }
    if (n > 1) {
        d[qed++] = n;
    }
    for (u32 g = 2, r = 0;; ++g) {
        for (u32 i = 0; i < qed; ++i) {
            u32 b = (M - 1) / d[i], a = g;
            for (r = 1; b; b >>= 1, a = u64(a) * a % M) {
                if (b & 1) {
                    r = u64(r) * a % M;
                }
            }
            if (r == 1) {
                break;
            }
        }
        if (r != 1) {
            return g;
        }
    }
}
}  // namespace __qed

namespace Stat_Info {
using Setting::Detail;
i64 ntt_size;
i64 fill0_size, copy_size, rev_size;
size_t max_cost_sta_l;
constexpr const char* Author = "QedDust413 & killer_joke";
constexpr const char* Thanks =
    "negiizhao, chaihf, rogeryoungh, EntropyIncreaser, hly1204, Min_25, Qdedawsd2233, bh1234666, yosupo, "
    "Pulsating_Dust, KKKKa, qdc, and more.";
template <typename Tf = std::ostream>
void report(Tf& outf = std::clog) {
    if constexpr (Detail <= 0) {
        outf << "Statistics are turned off.\n";
    }
    if constexpr (Detail > 0) {
        outf << "ntt_size:" << ntt_size << "\n";
    }
    if constexpr (Detail > 1) {
        outf << "fill0_size:" << fill0_size << " copy_size:" << copy_size << " rev_size:" << rev_size
             << "\nmax_cost_sta:" << (double(max_cost_sta_l) / double(1 << 20)) << "\n";
    }
    outf.flush();
}
}  // namespace Stat_Info

namespace mem_helper {
using namespace __qed;
template <size_t align>
void* to_align(void* mem) {
    static_assert(ispow2(align));
    return RC(void*, (RC(u64, mem) + (align - 1)) & (-align));
}
template <size_t align>
bool is_align(const void* mem) {
    static_assert(ispow2(align));
    return (RC(u64, mem) & (align - 1)) == 0;
}
char _mem[Setting::sta_l_MB << 20], *_now = _mem;
struct pre_aloc {
    char* t;
    pre_aloc() : t(_now) {}
    ~pre_aloc() { _now = t; }
};
template <typename T, size_t align>
T* AlocS(size_t n) {
    T* r = RC(T*, to_align<align>(_now));
    _now = RC(char*, r + n);
    if constexpr (Stat_Info::Detail > 1) {
        Stat_Info::max_cost_sta_l = std::max<size_t>(_now - _mem, Stat_Info::max_cost_sta_l);
    }
    return r;
}
template <size_t align>
void* _alocH(size_t n) {
    constexpr size_t ptr_size = sizeof(void*), Extra = (align + ptr_size);
    void *base = std::malloc(n + Extra), *p = to_align<align>(RC(char*, base) + ptr_size);
    return RC(void**, p)[-1] = base, p;
}
template <typename T, size_t align>
T* AlocH(size_t n) {
    return RC(T*, _alocH<align>(n * sizeof(T)));
}
void Freed(void* p) { std::free(RC(void**, p)[-1]); }
template <typename T, size_t align>
struct Trivial_Aligned_Allocator {
    static_assert(std::is_trivial<T>::value);
    typedef T value_type;
    T* allocate(std::size_t n) { return AlocH<T, align>(n); }
    template <class T1>
    struct rebind {
        using other = Trivial_Aligned_Allocator<T1, align>;
    };
    void deallocate(T* p, std::size_t n) { Freed(p); }
};
}  // namespace mem_helper

namespace Montgo {
// Author : QedDust413
struct Mont32 {
    u32 Mod, Mod2, Inv, NInv, R2;
    constexpr Mont32(u32 n) : Mod(n), Mod2(n << 1), Inv(1), NInv(), R2((-u64(n)) % n) {
        for (int i = 0; i < 5; ++i) {
            Inv *= 2 - n * Inv;
        }
        NInv = -Inv;
    }
    constexpr IL u32 reduce(u64 x) const { return (x + u64(u32(x) * NInv) * Mod) >> 32; }
    constexpr IL u32 reduce_s(u64 x) const {
        u32 r = (x >> 32) - ((u64(u32(x) * Inv) * Mod) >> 32);
        return r >> 31 ? r + Mod : r;
    }
    constexpr IL u32 mul(u32 x, u32 y) const { return reduce(u64(x) * y); }
    constexpr IL u32 mul_s(u32 x, u32 y) const { return reduce_s(u64(x) * y); }
    constexpr IL u32 In(u32 x) const { return mul(x, R2); }
    constexpr IL u32 In_s(u32 x) const { return mul_s(x, R2); }
    constexpr IL u32 Out(u32 x) const {
        u32 r = (x + (u64(x * NInv) * Mod)) >> 32;
        return r < Mod ? r : r - Mod;
    }
};
}  // namespace Montgo

namespace field_Z {
// Author : QedDust413
using Setting::mod;
constexpr Montgo::Mont32 Space(mod);
constexpr u32 mod2 = Space.Mod2;
constexpr IL u32 shrink(u32 x) { return x >= mod ? x - mod : x; }
constexpr IL u32 dilate2(u32 x) { return x >> 31 ? x + mod2 : x; }
using Z = u32;
constexpr bool isgood(Z x) { return x < mod2; }
constexpr IL Z InZ(u32 x) { return Space.In(x); }
constexpr IL Z InZs(u32 x) { return Space.In_s(x); }
constexpr Z zero_Z(0), one_Z(InZs(1)), not_exist_Z(-1);
constexpr IL u32 OutZ(Z x) { return Space.Out(x); }
constexpr bool equalZ(Z x, Z y) { return shrink(x) == shrink(y); }
namespace calc {
constexpr IL Z addZ(Z x, Z y) { return dilate2(x + y - mod2); }
constexpr IL Z subZ(Z x, Z y) { return dilate2(x - y); }
constexpr IL Z mulZ(Z x, Z y) { return Space.mul(x, y); }
constexpr Z powZ(Z a, u32 b, Z r = one_Z) {
    for (; b; a = mulZ(a, a), b >>= 1) {
        if (b & 1) {
            r = mulZ(r, a);
        }
    }
    return r;
}
constexpr IL Z invZ(Z x) { return powZ(x, mod - 2); }
constexpr IL Z divZ(Z x, Z y) { return powZ(y, mod - 2, x); }
template <bool strict = true>
constexpr IL Z mulZs(Z x, Z y) {
    if constexpr (strict) {
        return Space.mul_s(x, y);
    }
    return mulZ(x, y);
}
constexpr IL Z negZ(Z x) { return mod2 - x; }
namespace extend_ {
constexpr Z absZ(Z x) {
    u32 r = OutZ(x);
    return InZs(std::min(r, mod - r));
}
constexpr Z legendre(Z x) { return powZ(x, (mod - 1) >> 1); }
constexpr bool isQR(Z x) { return equalZ(legendre(x), one_Z); }
constexpr Z sqrtZ(Z x) {
    if (!isQR(x)) {
        return not_exist_Z;
    }
    Z a(1), I_mul(0);
    while (isQR(I_mul = subZ(mulZ(a, a), x))) {
        ++a;
    }
    struct dZ {
        Z r, i;
        constexpr void Mul(dZ d, Z I_) {
            *this = { addZ(mulZ(r, d.r), mulZ(mulZ(I_, i), d.i)), addZ(mulZ(r, d.i), mulZ(i, d.r)) };
        }
    };
    dZ s = { a, one_Z }, r = { one_Z, zero_Z };
    for (u32 b = (mod + 1) >> 1; b; b >>= 1, s.Mul(s, I_mul)) {
        if (b & 1) {
            r.Mul(s, I_mul);
        }
    }
    return absZ(r.r);
}
}  // namespace extend_
}  // namespace calc
template <int fixes, bool strict = true>
constexpr Z trans(Z x) {
    constexpr Z o = fixes > 0 ? calc::powZ(Space.R2, fixes) : calc::powZ(1, -fixes);
    return calc::mulZs<strict>(x, o);
}
template <bool strict = true>
constexpr Z trans(Z x, int fixes) {
    return calc::mulZs<strict>(x, fixes > 0 ? calc::powZ(Space.R2, fixes) : calc::powZ(1, -fixes));
}
namespace Const {
constexpr Z _half = InZs((mod + 1) >> 1), _neghalf = InZs((mod - 1) >> 1), neg_one = InZs(mod - 1);
constexpr Z img = calc::extend_::sqrtZ(neg_one), imgI = mod - img, _imghalf = calc::mulZs(_half, img);
}  // namespace Const
}  // namespace field_Z
#pragma GCC target("avx2")
#include <immintrin.h>
#define VEC(sz, T) __attribute((vector_size(sz))) T
namespace SIMD {
using i32x8 = VEC(32, i32);
using u32x8 = VEC(32, u32);
using i64x4 = VEC(32, i64);
using u64x4 = VEC(32, u64);
using I256 = __m256i;
constexpr IL u32x8 setu32x8(u32 x) { return (u32x8){ x, x, x, x, x, x, x, x }; }
template <int typ>
IL u32x8 shuffle(const u32x8& x) {
    return RC(u32x8, _mm256_shuffle_epi32(RC(I256, x), typ));
}
template <int typ>
IL u32x8 blend(const u32x8& x, const u32x8& y) {
    return RC(u32x8, _mm256_blend_epi32(RC(I256, x), RC(I256, y), typ));
}
IL I256 swaplohi128(const I256& x) { return _mm256_permute2x128_si256(x, x, 1); }
IL u32x8& x8(u32* data) { return *RC(u32x8*, data); }
IL const u32x8& x8(const u32* data) { return *RC(const u32x8*, data); }
IL I256 loadu(const void* data) { return _mm256_loadu_si256(RC(const __m256i_u*, data)); }
IL void storeu(const I256& x, void* data) { return _mm256_storeu_si256(RC(__m256i_u*, data), x); }
IL u64x4 fus_mul(const u32x8& x, const u32x8& y) {
    return RC(u64x4, _mm256_mul_epu32(RC(I256, x), RC(I256, y)));
}
}  // namespace SIMD
namespace field_Z {
using SIMD::setu32x8;
using SIMD::u32x8;
using SIMD::x8;
using Zx8 = u32x8;
constexpr u32x8 modx8 = setu32x8(mod), mod2x8 = setu32x8(mod2), NInvx8 = setu32x8(Space.NInv);
constexpr Zx8 one_Zx8 = setu32x8(one_Z), zerox8 = setu32x8(0u);
IL Zx8 dilate2x8(const Zx8& x) {
    return x + (RC(Zx8, RC(SIMD::i32x8, x) < RC(SIMD::i32x8, zerox8)) & mod2x8);
}
IL Zx8 shrinkx8(const Zx8& x) { return x - ((x >= modx8) & modx8); }
namespace calc {
// Author : killer_joke
using namespace SIMD;
IL Zx8 addZx8(const Zx8& x, const Zx8& y) { return dilate2x8(x + y - mod2x8); }
IL Zx8 subZx8(const Zx8& x, const Zx8& y) { return dilate2x8(x - y); }
IL Zx8 mulZx8(const Zx8& x, const Zx8& y) {
    u32x8 z = (NInvx8 * x * y);
    return blend<0xaa>(RC(u32x8, (fus_mul(x, y) + fus_mul(z, modx8)) >> 32),
                       RC(u32x8, (fus_mul(u32x8(u64x4(x) >> 32), u32x8(u64x4(y) >> 32)) +
                                  fus_mul(shuffle<0xf5>(z), modx8))));
}
IL Zx8 powZx8(const Zx8& y, u32 b, const Zx8& _r = one_Zx8) {
    Zx8 x = y, r = _r;
    for (; b; x = mulZx8(x, x), b >>= 1) {
        if (b & 1) {
            r = mulZx8(r, x);
        }
    }
    return r;
}
IL Zx8 invZx8(const Zx8& x) { return powZx8(x, mod - 2); }
IL Zx8 divZx8(const Zx8& x, const Zx8& y) { return powZx8(y, mod - 2, x); }
template <bool strict = true>
IL Zx8 mulZsx8(const Zx8& x, const Zx8& y) {
    if constexpr (strict) {
        u32x8 z = (NInvx8 * x * y);
        z = blend<0xaa>(RC(u32x8, (fus_mul(x, y) + fus_mul(z, modx8)) >> 32),
                        RC(u32x8, (fus_mul(u32x8(u64x4(x) >> 32), u32x8(u64x4(y) >> 32)) +
                                   fus_mul(shuffle<0xf5>(z), modx8)))) -
            modx8;
        return z + (RC(Zx8, RC(i32x8, z) < RC(i32x8, zerox8)) & modx8);
    }
    return mulZx8(x, y);
}
IL Zx8 negZx8(const Zx8& x) { return mod2x8 - x; }
IL Zx8 _AmulZx8(const Zx8& a, const Zx8& b, const Zx8& c) { return mulZx8(a + b, c); }
IL Zx8 _SmulZx8(const Zx8& a, const Zx8& b, const Zx8& c) { return mulZx8(a - b + mod2x8, c); }
template <int typ>
IL u32x8 Neg(const u32x8& x) {
    return blend<typ>(x, mod2x8 - x);
}
}  // namespace calc
}  // namespace field_Z
#undef VEC

#define STATI(ifo, dt, num)                 \
    if constexpr (Stat_Info::Detail > dt) { \
        Stat_Info::ifo += (num);            \
    }
namespace poly_base {
using namespace field_Z;
using __qed::bit_up;
constexpr int mp2 = __qed::crz_32(mod - 1);
void fl0(Z* f, int l) { STATI(fill0_size, 1, l) std::fill_n(f, l, zero_Z); }
void fl0(Z* bg, Z* ed) { STATI(fill0_size, 1, ed - bg) std::fill(bg, ed, zero_Z); }
void Cpy(const Z* f, int l, Z* g) { STATI(copy_size, 1, l) std::copy_n(f, l, g); }
void Cpy(const Z* bg, const Z* ed, Z* g) { STATI(copy_size, 1, ed - bg) std::copy(bg, ed, g); }
void rev(Z* bg, Z* ed) { STATI(rev_size, 1, ed - bg) std::reverse(bg, ed); }
void rev(Z* f, int l) { STATI(rev_size, 1, l) std::reverse(f, f + l); }
void Cpy_fl0(const Z* f, int n, Z* g, int lim) { Cpy(f, n, g), fl0(g + n, g + lim); }
void Cpy_rev(const Z* f, int l, Z* g) {
    STATI(rev_size, 1, l) STATI(copy_size, 1, l) std::reverse_copy(f, f + l, g);
}
#define flx(nam, opt)                                       \
    void nam(const Z* A, int l, Z* B, Z t) {                \
        int i = 0;                                          \
        if (l > 16) {                                       \
            Zx8 tx8 = setu32x8(t);                          \
            for (; i + 7 < l; i += 8) {                     \
                x8(B + i) = calc::opt##Zx8(x8(A + i), tx8); \
            }                                               \
        }                                                   \
        for (; i < l; ++i) {                                \
            B[i] = calc::opt##Z(A[i], t);                   \
        }                                                   \
    }
flx(mul_t_s, mul) flx(add_t_s, add) flx(sub_t_s, sub)
#undef flx
#define flx(nam, opt)                                         \
    void nam(Z* A, int l, const Z* B) {                       \
        int i = 0;                                            \
        for (; i + 7 < l; i += 8) {                           \
            x8(A + i) = calc::opt##Zx8(x8(A + i), x8(B + i)); \
        }                                                     \
        for (; i < l; ++i) {                                  \
            A[i] = calc::opt##Z(A[i], B[i]);                  \
        }                                                     \
    }                                                         \
    void nam(const Z* A, const Z* B, int l, Z* C) {           \
        int i = 0;                                            \
        for (; i + 7 < l; i += 8) {                           \
            x8(C + i) = calc::opt##Zx8(x8(A + i), x8(B + i)); \
        }                                                     \
        for (; i < l; ++i) {                                  \
            C[i] = calc::opt##Z(A[i], B[i]);                  \
        }                                                     \
    }
    flx(dot, mul) flx(addP, add) flx(subP, sub)
#undef flx
        void NegP(const Z* A, int l, int r, Z* B) {
    int i = l;
    if (r - l >= 16) {
        for (; i & 7; ++i) {
            B[i] = calc::negZ(A[i]);
        }
        for (; i + 7 < r; i += 8) {
            x8(B + i) = calc::negZx8(x8(A + i));
        }
    }
    for (; i < r; ++i) {
        B[i] = calc::negZ(A[i]);
    }
}
}  // namespace poly_base

namespace poly_base {
using mem_helper::Freed;
using mem_helper::pre_aloc;
const std::function<Z*(size_t)> alocS = mem_helper::AlocS<Z, 32>;
const std::function<Z*(size_t)> alocH = mem_helper::AlocH<Z, 32>;
struct pre_base {
    const std::function<void(Z*, int, int)> jok;
    Z* p;
    int sz;
    pre_base(std::function<void(Z*, int, int)> f) : jok(f), p(nullptr), sz(0) {}
    ~pre_base() {
        if (p != nullptr) {
            Freed(p);
        }
    }
    void expand(int len) {
        if (len > sz) {
            len = (len + 7) & -8;
            Z* nxtp = alocH(len);
            if (p != nullptr) {
                Cpy(p, sz, nxtp), Freed(p);
            }
            jok(p = nxtp, sz, len), sz = len;
        }
    }
    void clear() {
        if (p != nullptr) {
            Freed(p), sz = 0;
        }
    }
    const Z* operator()(int len) { return expand(len), p; }
    const Z* operator()() { return p; }
};

void multi_iv_Zx8(const Zx8* g, int n, Zx8* f) {
    using namespace calc;
    if (n == 0) {
        return;
    }
    f[0] = g[0];
    for (int i = 1; i < n; ++i) {
        f[i] = mulZx8(f[i - 1], g[i]);
    }
    f[n - 1] = invZx8(f[n - 1]);
    for (int i = n - 1; i; --i) {
        Zx8 ivi = f[i];
        f[i] = mulZx8(ivi, f[i - 1]);
        f[i - 1] = mulZx8(ivi, g[i]);
    }
}

void multi_iv(const Z* g, int n, Z* f) {
    int N = n >> 3, i = N << 3;
    multi_iv_Zx8(RC(const Zx8*, g), N, RC(Zx8*, f));
    for (; i < n; ++i) {
        f[i] = calc::invZ(g[i]);
    }
}

namespace cal_helper {
using namespace calc;
void _cal_i(Z* dus, int l, int r) {
    for (; l < 8; ++l) {
        dus[l] = InZ(l);
    }
    constexpr Zx8 eight_Zx8 = setu32x8(InZs(8));
    for (int i = l; i < r; i += 8) {
        x8(dus + i) = addZx8(x8(dus + i - 8), eight_Zx8);
    }
}
pre_base seqi(_cal_i);
void _cal_iv(Z* kil, int l, int r) {
    const Z* _i = seqi(r);
    if (l < 8) {
        x8(kil) = invZx8(x8(_i)), l = 8;
    }
    multi_iv_Zx8(RC(const Zx8*, _i + l), (r - l) >> 3, RC(Zx8*, kil + l));
}
pre_base seqiv(_cal_iv);
void deriv(const Z* g, int n, Z* f) {
    const Z* _i = seqi(n + 1);
    int i = 0;
    if (n > 7) {
        for (; i < 7; ++i) {
            f[i] = mulZ(g[i + 1], _i[i + 1]);
        }
        for (; i + 7 < n; i += 8) {
            storeu(RC(I256, mulZx8(x8(g + i + 1), x8(_i + i + 1))), f + i);
        }
    }
    for (; i < n; ++i) {
        f[i] = mulZ(g[i + 1], _i[i + 1]);
    }
    f[n] = zero_Z;
}
void integ(const Z* g, int n, Z* f) {
    const Z* _iv = seqiv(n + 1);
    int i = n - 1;
    if (i > 7) {
        for (; (i & 7) != 6; --i) {
            f[i + 1] = mulZ(g[i], _iv[i + 1]);
        }
        for (i -= 7; ~i; i -= 8) {
            x8(f + i + 1) = mulZx8(RC(Zx8, loadu(g + i)), x8(_iv + i + 1));
        }
        i += 7;
    }
    for (; ~i; --i) {
        f[i + 1] = mulZ(g[i], _iv[i + 1]);
    }
    f[0] = zero_Z;
}
}  // namespace cal_helper
using cal_helper::deriv;
using cal_helper::integ;
using cal_helper::seqi;
using cal_helper::seqiv;
}  // namespace poly_base

namespace poly_base {
namespace Tools {
int compP(const Z* f, int l, const Z* g) {
    for (int i = 0; i < l; ++i) {
        if (!equalZ(f[i], g[i])) {
            return i;
        }
    }
    return -1;
}
template <int fixes = 1, typename Tf = std::istream>
void scanP(Z* f, int l, Tf& inf = std::cin) {
    for (int i = 0; i < l; ++i) {
        inf >> f[i], f[i] = trans<fixes, false>(f[i]);
    }
}
template <int fixes = -1, typename Tf = std::ostream>
void printP(const Z* f, int l, Tf& outf = std::cout) {
    if (l) {
        outf << trans<fixes>(f[0]);
        for (int i = 1; i < l; ++i) {
            outf << ' ' << trans<fixes>(f[i]);
        }
        outf << '\n';
    }
}
template <typename T = u32, typename lT = u64>
T stoi_m(const std::string& s, const T m) {
    T r(0);
    for (auto ch : s) {
        r = ((lT)r * 10 + (ch - '0')) % m;
    }
    return r;
}
int clzP(const Z* f, int n) {
    int i = 0;
    while (i < n && equalZ(f[i], zero_Z)) {
        ++i;
    }
    return i;
}
int crzP(const Z* f, int n) {
    int i = n;
    while (i && equalZ(f[i - 1], zero_Z)) {
        --i;
    }
    return n - i;
}
}  // namespace Tools
}  // namespace poly_base

namespace poly_base {
// Author : QedDust413
namespace f_n_t_t_base {
using namespace calc;
using namespace __qed;
constexpr int base4thre = 4;
template <bool strict = false>
void mul_t(Z* A, int l, Z t) {
    for (int j = 0; j < l; ++j) {
        A[j] = mulZs<strict>(A[j], t);
    }
}
constexpr Z _g(InZ(primitive_root(mod)));
struct P_R_Tab {
    Z t[mp2 + 1];
    constexpr P_R_Tab(Z G) : t() {
        t[mp2] = shrink(powZ(G, (mod - 1) >> mp2));
        for (int i = mp2 - 1; ~i; --i) {
            t[i] = mulZs(t[i + 1], t[i + 1]);
        }
    }
    constexpr Z operator[](int i) const { return t[i]; }
} constexpr rt1(_g), rt1_I(invZ(_g));
template <int lim>
struct omega_info_base2 {
    Z o[lim >> 1];
    constexpr omega_info_base2(const P_R_Tab& Tb) : o() {
        o[0] = one_Z;
        for (int i = 1, l = lim >> 1; i < l; ++i) {
            o[i] = mulZ(o[i & (i - 1)], Tb[crz_32(i) + 2]);
        }
    }
    constexpr Z operator[](int i) const { return o[i]; }
};
const omega_info_base2<base4thre> w(rt1), wI(rt1_I);
constexpr Zx8 powXx8(Z X) {
    Z X2 = mulZs(X, X), X3 = mulZs(X2, X), X4 = mulZs(X3, X), X5 = mulZs(X4, X), X6 = mulZs(X5, X),
      X7 = mulZs(X6, X);
    return (Zx8){ one_Z, X, X2, X3, X4, X5, X6, X7 };
}
struct ntt_info_base4x8 {
    Z rt3[mp2 - 2], rt3_I[mp2 - 2];
    Zx8 rt4ix8[mp2 - 3], rt4ix8_I[mp2 - 3];
    constexpr ntt_info_base4x8() : rt3(), rt3_I(), rt4ix8(), rt4ix8_I() {
        Z pr = one_Z, pr_I = one_Z;
        for (int i = 0; i < mp2 - 2; ++i) {
            rt3[i] = mulZs(pr, rt1[i + 3]), rt3_I[i] = mulZs(pr_I, rt1_I[i + 3]);
            pr = mulZs(pr, rt1_I[i + 3]), pr_I = mulZs(pr_I, rt1[i + 3]);
        }
        pr = one_Z, pr_I = one_Z;
        for (int i = 0; i < mp2 - 3; ++i) {
            rt4ix8[i] = powXx8(mulZs(pr, rt1[i + 4])), rt4ix8_I[i] = powXx8(mulZs(pr_I, rt1_I[i + 4]));
            pr = mulZs(pr, rt1_I[i + 4]), pr_I = mulZs(pr_I, rt1[i + 4]);
        }
    }
};
}  // namespace f_n_t_t_base
// fast_number_theoretic_transform
// Author : QedDust413
namespace f_n_t_t {
using namespace f_n_t_t_base;
template <bool strict = false, int fixes = 0>
void dif_base2(Z* A, int lim) {
    STATI(ntt_size, 0, lim)
    for (int L = lim >> 1, R = lim; L; L >>= 1, R >>= 1) {
        for (int i = 0, k = 0; i < lim; i += R, ++k) {
            for (int j = 0; j < L; ++j) {
                Z x = dilate2(A[i + j] - mod2), y = mulZ(w[k], A[i + j + L]);
                A[i + j] = x + y, A[i + j + L] = x - y + mod2;
            }
        }
    }
    if constexpr (fixes) {
        mul_t<strict>(A, lim, trans<fixes>(one_Z));
    } else {
        for (int j = 0; j < lim; ++j) {
            A[j] = dilate2(A[j] - mod2);
            if constexpr (strict) {
                A[j] = shrink(A[j]);
            }
        }
    }
}
template <bool strict = false, int fixes = 0>
void dit_base2(Z* A, int lim) {
    STATI(ntt_size, 0, lim)
    for (int L = 1, R = 2; L < lim; L <<= 1, R <<= 1) {
        for (int i = 0, k = 0; i < lim; i += R, ++k) {
            for (int j = 0; j < L; ++j) {
                Z x = A[i + j], y = A[i + j + L];
                A[i + j] = addZ(x, y), A[i + j + L] = mulZ(x - y + mod2, wI[k]);
            }
        }
    }
    mul_t<strict>(A, lim, trans<fixes + 1>(mod - ((mod - 1) >> crz_32(lim))));
}
constexpr Zx8 w14x8 = setu32x8(rt1[2]), w14Ix8 = setu32x8(rt1_I[2]);
constexpr Z w38 = mod - rt1_I[3], w38I = mod - rt1[3];
constexpr ntt_info_base4x8 iab4;
template <bool strict = false, int fixes = 0>
void dif_base4x8(Z* A, int lim) {
    STATI(ntt_size, 0, lim)
    int n = lim >> 3, L = n >> 1;
    Zx8* f = RC(Zx8*, A);
    if (crz_32(n) & 1) {
        for (int j = 0; j < L; ++j) {
            Zx8 x = f[j], y = f[j + L];
            f[j] = x + y, f[j + L] = x - y + mod2x8;
        }
        L >>= 1;
    }
    L >>= 1;
    for (int R = L << 2; L; L >>= 2, R >>= 2) {
        Z _r = one_Z, _r2 = one_Z, _r3 = one_Z;
        for (int i = 0, k = 0; i < n; i += R, ++k) {
            Zx8 r = setu32x8(_r), r2 = setu32x8(_r2), r3 = setu32x8(_r3);
            for (int j = 0; j < L; ++j) {
#define F(c) f[i + j + c * L]
                Zx8 f0 = dilate2x8(F(0) - mod2x8), f1 = mulZx8(F(1), r), f2 = mulZx8(F(2), r2),
                    f3 = mulZx8(F(3), r3);
                Zx8 f1f3 = _SmulZx8(f1, f3, w14x8), f02 = addZx8(f0, f2), f13 = addZx8(f1, f3),
                    f_02 = subZx8(f0, f2);
                F(0) = f02 + f13, F(1) = f02 - f13 + mod2x8, F(2) = f_02 + f1f3, F(3) = f_02 - f1f3 + mod2x8;
#undef F
            }
            _r = mulZs(_r, iab4.rt3[cro_32(k)]), _r2 = mulZs(_r, _r), _r3 = mulZs(_r2, _r);
        }
    }
    {
        constexpr Zx8 _r = setu32x8(trans<fixes>(one_Z)),
                      pr2 = { one_Z, one_Z, one_Z, rt1[2], one_Z, one_Z, one_Z, rt1[2] },
                      pr4 = { one_Z, one_Z, one_Z, one_Z, one_Z, rt1[3], rt1[2], w38 };
        Zx8 r = _r;
        for (int i = 0; i < n; ++i) {
            Zx8& fi = f[i];
            fi = mulZx8(fi, r);
            fi = _AmulZx8(Neg<0xf0>(fi), RC(Zx8, swaplohi128(RC(I256, fi))), pr4);
            fi = _AmulZx8(Neg<0xcc>(fi), shuffle<0x4e>(fi), pr2);
            fi = addZx8(Neg<0xaa>(fi), shuffle<0xb1>(fi));
            if constexpr (strict) {
                fi = shrinkx8(fi);
            }
            r = mulZsx8(r, iab4.rt4ix8[cro_32(i)]);
        }
    }
}
template <bool strict = false, int fixes = 0>
void dit_base4x8(Z* A, int lim) {
    STATI(ntt_size, 0, lim)
    int n = lim >> 3, L = 1;
    Zx8* f = RC(Zx8*, A);
    {
        constexpr Zx8 pr2 = { one_Z, one_Z, one_Z, rt1_I[2], one_Z, one_Z, one_Z, rt1_I[2] },
                      pr4 = { one_Z, one_Z, one_Z, one_Z, one_Z, rt1_I[3], rt1_I[2], w38I };
        Zx8 r = setu32x8(trans<fixes + 1>(mod - ((mod - 1) >> crz_32(lim))));
        for (int i = 0; i < n; ++i) {
            Zx8& fi = f[i];
            fi = _AmulZx8(Neg<0xaa>(fi), shuffle<0xb1>(fi), pr2);
            fi = _AmulZx8(Neg<0xcc>(fi), shuffle<0x4e>(fi), pr4);
            fi = _AmulZx8(Neg<0xf0>(fi), RC(Zx8, swaplohi128(RC(I256, fi))), r);
            r = mulZsx8(r, iab4.rt4ix8_I[cro_32(i)]);
        }
    }
    for (int R = L << 2; L < (n >> 1); L <<= 2, R <<= 2) {
        Z _r = one_Z, _r2 = one_Z, _r3 = one_Z;
        for (int i = 0, k = 0; i < n; i += R, ++k) {
            Zx8 r = setu32x8(_r), r2 = setu32x8(_r2), r3 = setu32x8(_r3);
            for (int j = 0; j < L; ++j) {
#define F(c) f[i + j + c * L]
                Zx8 f0 = F(0), f1 = F(1), f2 = F(2), f3 = F(3);
                Zx8 f2f3 = _SmulZx8(f2, f3, w14Ix8), f01 = addZx8(f0, f1), f23 = addZx8(f2, f3),
                    f_01 = subZx8(f0, f1);
                F(0) = addZx8(f01, f23), F(1) = _AmulZx8(f_01, f2f3, r), F(2) = _SmulZx8(f01, f23, r2),
                F(3) = _SmulZx8(f_01, f2f3, r3);
#undef F
            }
            _r = mulZs(_r, iab4.rt3_I[cro_32(k)]), _r2 = mulZs(_r, _r), _r3 = mulZs(_r2, _r);
        }
    }
    if (L != n) {
        for (int j = 0; j < L; ++j) {
            Zx8 x = f[j], y = f[j + L];
            f[j] = addZx8(x, y), f[j + L] = subZx8(x, y);
        }
    }
    if constexpr (strict) {
        for (int i = 0; i < n; ++i) {
            f[i] = shrinkx8(f[i]);
        }
    }
}
template <bool strict = false, int fixes = 0>
void dif(Z* A, int lim) {
    lim > base4thre ? dif_base4x8<strict, fixes>(A, lim) : dif_base2<strict, fixes>(A, lim);
}
template <bool strict = false, int fixes = 0>
void dit(Z* A, int lim) {
    lim > base4thre ? dit_base4x8<strict, fixes>(A, lim) : dit_base2<strict, fixes>(A, lim);
}
}  // namespace f_n_t_t
using f_n_t_t::dif;
using f_n_t_t::dit;
void Conv(Z* f, int lim, Z* g) { dif(f, lim), dif(g, lim), dot(f, lim, g), dit(f, lim); }
}  // namespace poly_base

#undef STATI
namespace poly {
using namespace poly_base;
// Author : QedDust413
void Mul(const Z* f, int n, const Z* g, int m, Z* h) {
    int lim = bit_up(n + m);
    pre_aloc mem;
    Z *F = alocS(lim), *G = alocS(lim);
    Cpy_fl0(f, n + 1, F, lim), Cpy_fl0(g, m + 1, G, lim), Conv(F, lim, G), Cpy(F, n + m + 1, h);
}

void Inv(const Z* g, int n, Z* f) {
    int lim = bit_up(n);
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    f[0] = calc::invZ(g[0]);
    for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
        int xl = std::min<int>(t, n + 1);
        Cpy_fl0(g, xl, o, t), Cpy_fl0(f, mid, h, t), Conv(o, t, h), fl0(o, mid), dif(o, t), dot(o, t, h),
            dit(o, t), NegP(o, mid, xl, f);
    }
}

void _Div(const Z* g, int n, const Z* f, Z* q) {
    if (n == 0) {
        return q[0] = calc::divZ(g[0], f[0]), void();
    }
    int lim = bit_up(n) << 1;
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    Inv(f, n, o), fl0(o + n + 1, o + lim), Cpy_fl0(g, n + 1, h, lim), Conv(h, lim, o), Cpy(h, n + 1, q);
}

void Div(const Z* g, int n, const Z* f, Z* q) {
    using namespace calc;
    if (n <= 64) {
        return _Div(g, n, f, q);
    }
    int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
    pre_aloc mem;
    Z *o = alocS(bn2), *jok = alocS(bn2);
    Inv(f, bn - 1, o), fl0(o + bn, bn), Cpy_fl0(g, bn, jok, bn2), Conv(jok, bn2, o);
    Z *nf = alocS(bn2 * bt), *ng = alocS(bn2 * (bt - 1));
    Cpy(jok, bn, q), Cpy_fl0(f, bn, nf, bn2), dif(nf, bn2);
    for (int bi = 1; bi < bt; ++bi) {
        int xl = std::min<int>(bn, n + 1 - bn * bi);
        Cpy_fl0(f + bi * bn, xl, nf + bi * bn2, bn2), dif(nf + bi * bn2, bn2);
        Cpy_fl0(q + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
            fl0(jok, bn2);
        for (int j = 0; j < bi; ++j) {
            Z *nF = nf + (bi - j) * bn2, *nF1 = nF - bn2, *ngj = ng + j * bn2;
            for (int i = 0; i < bn; i += 8) {
                x8(jok + i) = subZx8(x8(jok + i), _AmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
            }
            for (int i = bn; i < bn2; i += 8) {
                x8(jok + i) = subZx8(x8(jok + i), _SmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
            }
        }
        dit(jok, bn2), fl0(jok + bn, bn), addP(jok, xl, g + bn * bi), dif(jok, bn2), dot(jok, bn2, o),
            dit(jok, bn2), Cpy(jok, xl, q + bn * bi);
    }
}

void Div(const Z* g, int n, const Z* f, int m, Z* q) {
    int lim = n - m + 1, R = std::min<int>(m + 1, lim);
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    Cpy_rev(f + std::max<int>(2 * m - n, 0), R, o), fl0(o + R, o + lim);
    Cpy_rev(g + m, lim, h), Div(h, n - m, o, q), rev(q, lim);
}

void Div(const Z* g, int n, const Z* f, int m, Z* q, Z* r) {
    Div(g, n, f, m, q);
    int lim = bit_up(std::min<int>(n, m + m - 1));
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    Cpy_fl0(f, m, o, lim), Cpy_fl0(q, std::min<int>(n - m + 1, m), h, lim), Conv(o, lim, h), subP(g, o, m, r);
}

void Ln(const Z* g, int n, Z* f) {
    dot(g, seqi(n + 1), n + 1, f), Div(f, n, g, f), dot(f, n + 1, seqiv(n + 1));
}

template <bool calc_inv = true>
void Expi(const Z* g, int n, Z* f, Z* h) {
    f[0] = h[0] = one_Z;
    if (n < 1) {
        return;
    }
    int lim = bit_up(n);
    seqiv.expand(lim);
    pre_aloc mem;
    Z *o = alocS(lim), *A = alocS(lim), *B = alocS(lim);
    A[0] = A[1] = one_Z;
    for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
        int xl = std::min(n + 1, t), dl = std::min(t << 1, lim);
        deriv(g, mid - 1, o), dif(o, mid), dot(o, mid, A), dit(o, mid), deriv(f, mid - 1, o + mid);
        subP(o + mid, mid - 1, o), o[t - 1] = zero_Z, fl0(o, mid - 1), o[mid - 1] = calc::negZ(o[mid - 1]);
        Cpy_fl0(h, mid, B, t), dif(B, t), dif(o, t), dot(o, t, B), dit(o, t);
        integ(o, t - 1, o), fl0(o, mid), subP(o + mid, xl - mid, g + mid);
        dif(o, t), dot(o, t, A), dit(o, t), NegP(o, mid, xl, f);
        if (t != lim || calc_inv) {
            Cpy_fl0(f, xl, A, dl), dif(A, dl), dot(A, B, t, o), dit(o, t), fl0(o, mid);
            dif(o, t), dot(o, t, B), dit(o, t), NegP(o, mid, xl, h);
        }
    }
}

void Exp(const Z* g, int n, Z* f) {
    using namespace calc;
    pre_aloc mem;
    if (n <= 64) {
        Z* h = alocS(n + 1);
        return Expi<false>(g, n, f, h);
    }
    int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
    seqiv.expand(n + 1);
    Z* h = alocS(bn2);
    Expi(g, bn - 1, f, h);
    fl0(h + bn, bn), dif(h, bn2);
    Z *jok = alocS(bn2), *nf = alocS(bn2 * bt), *ng = alocS(bn2 * (bt - 1));
    dot(g, seqi(), bn, nf), fl0(nf + bn, nf + bn2), dif(nf, bn2);
    for (int bi = 1; bi < bt; ++bi) {
        int xl = std::min<int>(bn, n + 1 - bn * bi);
        dot(g + bi * bn, seqi() + bi * bn, xl, nf + bi * bn2), fl0(nf + bi * bn2 + xl, bn2 - xl),
            dif(nf + bi * bn2, bn2);
        Cpy_fl0(f + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
            fl0(jok, bn2);
        for (int j = 0; j < bi; ++j) {
            Z *nF = nf + (bi - j) * bn2, *nF1 = nF - bn2, *ngj = ng + j * bn2;
            for (int i = 0; i < bn; i += 8) {
                x8(jok + i) = addZx8(x8(jok + i), _AmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
            }
            for (int i = bn; i < bn2; i += 8) {
                x8(jok + i) = addZx8(x8(jok + i), _SmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
            }
        }
        dit(jok, bn2), fl0(jok + bn, bn), dif(jok, bn2), dot(jok, bn2, h), dit(jok, bn2), fl0(jok + bn, bn);
        dot(jok, xl, seqiv() + bn * bi), dif(jok, bn2), dot(jok, bn2, ng), dit(jok, bn2),
            Cpy(jok, xl, f + bn * bi);
    }
}

void InvSqrt(const Z* g, int n, Z* f) {
    int lim = bit_up(n);
    pre_aloc mem;
    Z *o = alocS(lim << 1), *h = alocS(lim << 1);
    f[0] = calc::invZ(calc::extend_::sqrtZ(g[0]));
    for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
        int xl = std::min<int>(t, n + 1);
        Cpy_fl0(f, mid, o, t << 1), Cpy_fl0(g, xl, h, t << 1), dif(o, t << 1), dif(h, t << 1),
            dot(h, t << 1, o), dot(o, t << 1, o), dot(h, t << 1, o), dit(h, t << 1),
            mul_t_s(h + mid, xl - mid, f + mid, Const::_neghalf);
    }
}

template <bool calc_inv = true>
void Sqrti(const Z* g, int n, Z* f, Z* h) {
    int lim = bit_up(n);
    f[0] = calc::extend_::sqrtZ(g[0]), h[0] = calc::invZ(f[0]);
    if (n == 0)
        return;
    pre_aloc mem;
    Z *o = alocS(lim), *H = alocS(lim), *ff = alocS(lim);
    ff[0] = f[0];
    for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
        int xl = std::min<int>(t, n + 1);
        dot(ff, mid, ff), dit(ff, mid), subP(ff, g, mid, ff + mid), subP(ff + mid, g + mid, mid, ff + mid),
            fl0(ff, mid), Cpy_fl0(h, mid, H, t), Conv(ff, t, H),
            mul_t_s(ff + mid, xl - mid, f + mid, Const::_neghalf);
        if (t != lim || calc_inv) {
            Cpy(f, t, o), dif(o, t), Cpy(o, t, ff), dot(o, t, H), dit(o, t), fl0(o, mid), dif(o, t),
                dot(o, t, H), dit(o, t), NegP(o, mid, xl, h);
        }
    }
}

void Sqrt(const Z* g, int n, Z* f) {
    using namespace calc;
    pre_aloc mem;
    if (n <= 64) {
        Z* h = alocS(n + 1);
        return Sqrti<false>(g, n, f, h);
    }
    int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
    Z* o = alocS(bn2);
    Sqrti(g, bn - 1, f, o), mul_t_s(o, bn, o, Const::_neghalf), fl0(o + bn, bn), dif(o, bn2);
    Z *jok = alocS(bn2), *ng = alocS(bn2 * (bt - 1));
    for (int bi = 1; bi < bt; ++bi) {
        int xl = std::min<int>(bn, n + 1 - bn * bi);
        Cpy_fl0(f + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
            fl0(jok, bn2);
        {
            Z* nG1 = ng + (bi - 1) * bn2;
            for (int i = 0; i < bn; i += 8) {
                x8(jok + i) = addZx8(x8(jok + i), mulZx8(x8(nG1 + i), x8(ng + i)));
            }
            for (int i = bn; i < bn2; i += 8) {
                x8(jok + i) = subZx8(x8(jok + i), mulZx8(x8(nG1 + i), x8(ng + i)));
            }
        }
        for (int j = 1; j < bi; ++j) {
            Z *nG = ng + (bi - j) * bn2, *nG1 = nG - bn2, *ngj = ng + j * bn2;
            for (int i = 0; i < bn; i += 8) {
                x8(jok + i) = addZx8(x8(jok + i), _AmulZx8(x8(nG1 + i), x8(nG + i), x8(ngj + i)));
            }
            for (int i = bn; i < bn2; i += 8) {
                x8(jok + i) = subZx8(x8(jok + i), _SmulZx8(x8(nG1 + i), x8(nG + i), x8(ngj + i)));
            }
        }
        dit(jok, bn2), fl0(jok + bn, bn), subP(jok, xl, g + bi * bn), dif(jok, bn2), dot(jok, bn2, o),
            dit(jok, bn2), Cpy(jok, xl, f + bi * bn);
    }
}

void Pow(const Z* g, int n, int k, Z* f) {
    if (k == 0) {
        f[0] = one_Z, fl0(f + 1, n);
    } else if (k == 1) {
        Cpy(g, n + 1, f);
    } else {
        pre_aloc mem;
        Z* h = alocS(n + 1);
        Ln(g, n, f), mul_t_s(f, n + 1, h, InZ(k)), Exp(h, n, f);
    }
}

void Pow(const Z* g, int n, int k, int k2, Z* f) {
    if (g[0] != one_Z) {
        pre_aloc mem;
        Z* h = alocS(n + 1);
        mul_t_s(g, n + 1, f, calc::invZ(g[0])), Pow(f, n, k, h), mul_t_s(h, n + 1, f, calc::powZ(g[0], k2));
    } else {
        Pow(g, n, k, f);
    }
}
}  // namespace poly

namespace poly {
constexpr int good_poly = 1, empty_poly = 0, not_exist_poly = -1;
int Safe_Sqrt(const Z* g, int n, Z* f) {
    int shift = Tools::clzP(g, n + 1);
    if (shift > n) {
        return empty_poly;
    }
    if ((shift & 1) || (!calc::extend_::isQR(g[shift]))) {
        return not_exist_poly;
    }
    if (shift) {
        pre_aloc mem;
        Z* h = alocS(n + 1);
        Cpy_fl0(g + shift, n - shift + 1, f, n - (shift >> 1) + 1), Sqrt(f, n - (shift >> 1), h);
        Cpy(h, n - (shift >> 1) + 1, f + (shift >> 1)), fl0(f, shift >> 1);
    } else {
        Sqrt(g, n, f);
    }
    return good_poly;
}

int Safe_Pow(const Z* g, int n, std::string s, Z* f) {
    using namespace Tools;
    int shift = clzP(g, n + 1);
    if (shift) {
        if ((s.size() > size_t(8)) || ((stoll(s) * shift) > i64(n))) {
            return empty_poly;
        }
        int rf = (stoi(s) * shift), l = n + 1 - rf;
        pre_aloc mem;
        Z* h = alocS(l);
        Cpy(g + shift, l, f), Pow(f, l - 1, stoi_m(s, mod), stoi_m(s, mod - 1), h), Cpy(h, l, f + rf),
            fl0(f, rf);
    } else {
        Pow(g, n, stoi_m(s, mod), stoi_m(s, mod - 1), f);
    }
    return good_poly;
}
}  // namespace poly

namespace poly {
// Author : killer_joke
namespace Trigonometric_Function {
void Trig(const Z* g, int n, Z* sint, Z* cost) {
    pre_aloc mem;
    Z *A = alocS(n + 1), *o = alocS(n + 1), *h = alocS(n + 1);
    mul_t_s(g, n + 1, A, Const::img), Expi<true>(A, n, h, o);
    if (sint != nullptr) {
        subP(o, h, n + 1, sint), mul_t_s(sint, n + 1, sint, Const::_imghalf);
    }
    if (cost != nullptr) {
        addP(h, o, n + 1, cost), mul_t_s(cost, n + 1, cost, Const::_half);
    }
}
void Sin(const Z* g, int n, Z* f) { Trig(g, n, f, nullptr); }
void Cos(const Z* g, int n, Z* f) { Trig(g, n, nullptr, f); }
void Tan(const Z* g, int n, Z* f) {
    pre_aloc mem;
    Z* h = alocS(n + 1);
    Trig(g, n, f, h), Div(f, n, h, f);
}
void aSin(const Z* g, int n, Z* f) {
    int lim = bit_up(n) << 1;
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    Cpy_fl0(g, n + 1, o, lim), dif(o, lim), dot(o, lim, o), dit(o, lim), NegP(o, 0, n + 1, o),
        o[0] = calc::addZ(one_Z, o[0]);
    InvSqrt(o, n, h), deriv(g, n, o), fl0(o + n, o + lim), fl0(h + n + 1, h + lim), Conv(o, lim, h),
        integ(o, n, f);
}
void aCos(const Z* g, int n, Z* f) { aSin(g, n, f), NegP(f, 0, n + 1, f); }
void aTan(const Z* g, int n, Z* f) {
    int lim = bit_up(n);
    pre_aloc mem;
    Z* o = alocS(lim << 1);
    Cpy_fl0(g, n + 1, o, lim << 1), dif(o, lim << 1), dot(o, lim << 1, o), dit(o, lim << 1);
    o[0] = calc::addZ(o[0], one_Z), deriv(g, n, o + lim), Div(o + lim, n, o, f), integ(f, n, f);
}
}  // namespace Trigonometric_Function
}  // namespace poly

namespace poly_base {
namespace fac_helper {
void _fac_i(Z* _fac, int l, int r) {
    const Z* _i = seqi(r);
    if (l == 0) {
        _fac[0] = one_Z, l = 1;
    }
    for (int i = l; i < r; ++i) {
        _fac[i] = calc::mulZ(_fac[i - 1], _i[i]);
    }
}
void _fac_iv(Z* _ifac, int l, int r) {
    const Z* _iv = seqiv(r);
    if (l == 0) {
        _ifac[0] = one_Z, l = 1;
    }
    for (int i = l; i < r; ++i) {
        _ifac[i] = calc::mulZ(_ifac[i - 1], _iv[i]);
    }
}
pre_base seqfac(_fac_i), seqifac(_fac_iv);
}  // namespace fac_helper
using fac_helper::seqfac;
using fac_helper::seqifac;
}  // namespace poly_base

namespace poly_base {
// Author : killer_joke
namespace makeseq {
using namespace calc;
void Ci(Z* f, int n, Z c) {
    Z fx = one_Z;
    int i = 0;
    for (; i < std::min(n, 8); ++i, fx = mulZ(fx, c)) {
        f[i] = fx;
    }
    if (n > 16) {
        Zx8 C8 = setu32x8(fx);
        for (; i + 7 < n; i += 8) {
            x8(f + i) = mulZx8(x8(f + i - 8), C8);
        }
    }
    for (; i < n; ++i) {
        f[i] = mulZ(f[i - 1], c);
    }
}
void E_x(Z* f, int n) {
    const Z* ifac = seqifac(n);
    int i = 0;
    for (; i + 7 < n; i += 8) {
        x8(f + i) = Neg<0xaa>(x8(ifac + i));
    }
    for (; i < n; ++i) {
        f[i] = (i & 1) ? negZ(ifac[i]) : ifac[i];
    }
}
void mRisingfac(Z* f, int n, Z m) {
    f[0] = one_Z;
    for (int i = 1; i < n; ++i) {
        f[i] = mulZ(f[i - 1], m), m = addZ(m, one_Z);
    }
}
void mFallingfac(Z* f, int n, Z m) {
    f[0] = one_Z;
    for (int i = 1; i < n; ++i) {
        f[i] = mulZ(f[i - 1], m), m = subZ(m, one_Z);
    }
}
}  // namespace makeseq
}  // namespace poly_base

namespace poly_base {
// Author : killer_joke
namespace f_n_t_t {
void _dfx2_f(Z* ka, int l, int r) {
    for (int i = std::max(1, l); i < r; i <<= 1) {
        makeseq::Ci(ka + i, i, rt1[__qed::lg2d(i) + 1]);
    }
}
pre_base predifx2(_dfx2_f);
template <bool strict = false>
void difx2(Z* f, int lim) {
    Cpy(f, lim, f + lim), dit(f + lim, lim);
    dot(f + lim, lim, predifx2() + lim), dif<strict>(f + lim, lim);
}
template <bool strict = false>
void difx2fxC(Z* f, int lim) {
    constexpr Z two_Z = InZs(2);
    Cpy(f, lim, f + lim), dit(f + lim, lim);
    dot(f + lim, lim, predifx2() + lim), f[lim] = subZ(f[lim], two_Z), dif<strict>(f + lim, lim);
}
void dif_neg(const Z* g, int lim, Z* f) {
    if (lim < 8) {
        if (lim == 1) {
            *f = *g;
        } else {
            for (int i = 0; i < lim; ++i) {
                f[i] = g[i ^ 1];
            }
        }
    } else {
        for (int i = 0; i < lim; i += 8) {
            x8(f + i) = shuffle<0xb1>(x8(g + i));
        }
    }
}
}  // namespace f_n_t_t
using f_n_t_t::difx2;
using f_n_t_t::difx2fxC;
using f_n_t_t::predifx2;
}  // namespace poly_base

namespace field_Z {
namespace calc {
constexpr IL Z _AdMulZ(Z a, Z b, Z c, Z d) { return Space.reduce_s(u64(a) * b + u64(c) * d); }
IL Zx8 _AdMulZx8(const Zx8& a, const Zx8& b, const Zx8& c, const Zx8& d) {
    u32x8 z = (NInvx8 * ((a * b) + (c * d)));
    z = blend<0xaa>(RC(u32x8, (fus_mul(a, b) + fus_mul(c, d) + fus_mul(z, modx8)) >> 32),
                    RC(u32x8, (fus_mul(u32x8(u64x4(a) >> 32), u32x8(u64x4(b) >> 32)) +
                               fus_mul(u32x8(u64x4(c) >> 32), u32x8(u64x4(d) >> 32)) +
                               fus_mul(shuffle<0xf5>(z), modx8)))) -
        modx8;
    return z + (RC(Zx8, RC(i32x8, z) < RC(i32x8, zerox8)) & modx8);
}
}  // namespace calc
}  // namespace field_Z

namespace poly_base {
void adddot(const Z* A, const Z* B, const Z* C, const Z* D, int n, Z* E) {
    int i = 0;
    for (; i + 7 < n; i += 8) {
        x8(E + i) = calc::_AdMulZx8(x8(A + i), x8(B + i), x8(C + i), x8(D + i));
    }
    for (; i < n; ++i) {
        E[i] = calc::_AdMulZ(A[i], B[i], C[i], D[i]);
    }
}
}  // namespace poly_base

namespace poly {
namespace tech {
// Author : QedDust413
void Chirp_Z(const Z* g, int n, Z c, int m, Z* f) {
    using namespace calc;
    pre_aloc mem;
    int lim = bit_up((++n) + m), t = std::max<int>(n, m);
    Z *F = alocS(lim), *G = alocS(lim), *ipwc = alocS(t + 1), cI = invZ(c);
    makeseq::Ci(F, n + m, c);
    for (int i = 0; i < n + m; ++i) {
        F[i + 1] = mulZ(F[i], F[i + 1]);
    }
    fl0(F + n + m, F + lim - 1), F[lim - 1] = one_Z;
    makeseq::Ci(ipwc, t + 1, cI);
    for (int i = 1; i <= t; ++i) {
        ipwc[i] = mulZ(ipwc[i - 1], ipwc[i]);
    }
    for (int i = 1; i < n; ++i) {
        G[i] = mulZ(g[n - i], ipwc[n - i - 1]);
    }
    G[n] = g[0], G[0] = zero_Z, fl0(G + n + 1, G + lim), Conv(F, lim, G), f[0] = F[n - 1];
    for (int i = 1; i < m; ++i) {
        f[i] = mulZ(F[i - 1 + n], ipwc[i - 1]);
    }
}
void translate(const Z* g, int n, Z c, Z* f) {
    int lim = bit_up(n << 1);
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    dot(g, seqfac(n + 1), n + 1, o), rev(o, n + 1), fl0(o + n + 1, o + lim);
    makeseq::Ci(h, n + 1, c), dot(h, n + 1, seqifac(n + 1)), fl0(h + n + 1, h + lim);
    Conv(o, lim, h);
    Cpy_rev(o, n + 1, f), dot(f, n + 1, seqifac());
}
void multi_eva(const Z* f, int n, const Z* a, int m, Z* o) {
    using namespace calc;
    pre_aloc mem;
    int lgn = __qed::lg2u(m), lim = 1 << lgn, lim2 = lim << 1;
    predifx2.expand(lim);
    Z *G = alocS(lgn * lim2), *g = G, *_d = G;
    for (const Z* p = a; p != a + m; ++p, g += 2) {
        g[0] = subZ(one_Z, *p), g[1] = subZ(Const::neg_one, *p);
    }
    std::fill(g, _d += lim2, one_Z), g = _d;
    for (int t = 2, t2 = 4, M = m & -t; t < lim; t = t2, t2 <<= 1, M &= -t) {
        Z *lc = g - lim2, *rc = lc + t, *ed = g + (M << 1);
        for (; g != ed; g += t2, lc += t2, rc += t2) {
            dot(lc, rc, t, g), difx2fxC(g, t);
        }
        if (M < m) {
            dot(lc, rc, t, g), difx2(g, t), g += t2;
        }
        std::fill(g, _d += lim2, one_Z), g = _d;
    }
    int Lim = bit_up(n + m);
    Z *w = alocS(Lim << 1), *h = w + Lim;
    dot(g - lim2, g - lim, lim, w), dit(w, lim);
    if (m == lim) {
        w[0] = subZ(w[0], one_Z), w[lim] = one_Z;
    }
    rev(w, m + 1), fl0(w + m + 1, w + Lim), Inv(w, n, h), rev(h, n + 1), fl0(h + n + 1, h + Lim);
    Cpy_fl0(f, n + 1, w, Lim), Conv(h, Lim, w), Cpy(h + n, m, w), fl0(w + m, w + lim);
    for (int t = lim, t2 = t << 1, M = m & -t, mid = t >> 1; t > 1;
         t2 = t, t = mid, mid >>= 1, M |= (m & -t)) {
        Z *lc = (g -= lim2), *rc = lc + t, *p = w, *ed = w + (M << 1);
        for (; p != ed; p += t2, lc += t2, rc += t2) {
            dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
            Cpy(p + mid, mid, p + t), Cpy(p + t + mid, mid, p);
        }
        int rl = m - M - mid;
        if (rl > 0) {
            dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
            Cpy(p + mid, rl, p + t), Cpy(p + t + rl, mid, p);
        }
    }
    for (int i = 0; i < m; ++i) {
        o[i] = w[i << 1];
    }
}
void fst_intpol(const Z* x, const Z* y, int n, Z* f) {
    using namespace calc;
    pre_aloc mem;
    int lgn = __qed::lg2u(n), lim = 1 << lgn, lim2 = lim << 1;
    predifx2.expand(lim);
    Z *G = alocS(lgn * lim2), *g = G, *_d = G;
    for (const Z* p = x; p != x + n; ++p, g += 2) {
        g[0] = subZ(one_Z, *p), g[1] = subZ(Const::neg_one, *p);
    }
    std::fill(g, _d += lim2, one_Z), g = _d;
    for (int t = 2, t2 = 4, M = n & -t; t < lim; t = t2, t2 <<= 1, M &= -t) {
        Z *lc = g - lim2, *rc = lc + t, *ed = g + (M << 1);
        for (; g != ed; g += t2, lc += t2, rc += t2) {
            dot(lc, rc, t, g), difx2fxC(g, t);
        }
        if (M < n) {
            dot(lc, rc, t, g), difx2(g, t), g += t2;
        }
        std::fill(g, _d += lim2, one_Z), g = _d;
    }
    Z *h = alocS(lim2), *w = alocS(lim2), *o = alocS(n + 1);
    dot(g - lim2, g - lim, lim, w), dit(w, lim);
    if (n == lim) {
        w[0] = subZ(w[0], one_Z), w[lim] = one_Z;
    }
    Cpy_rev(w, n + 1, o), Inv(o, n, h), rev(h, n + 1), fl0(h + n + 1, h + lim2);
    deriv(w, n, w), fl0(w + n, w + lim2), Conv(h, lim2, w), Cpy(h + n, n, w);
    for (int t = lim, t2 = t << 1, M = n & -t, mid = t >> 1; t > 1;
         t2 = t, t = mid, mid >>= 1, M |= (n & -t)) {
        Z *lc = (g -= lim2), *rc = lc + t, *p = w, *ed = w + (M << 1);
        for (; p != ed; p += t2, lc += t2, rc += t2) {
            dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
            Cpy(p + mid, mid, p + t), Cpy(p + t + mid, mid, p);
        }
        int rl = n - M - mid;
        if (rl > 0) {
            dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
            Cpy(p + mid, rl, p + t), Cpy(p + t + rl, mid, p);
        }
    }
    for (int i = 0; i < n; ++i) {
        w[i] = w[i << 1];
    }
    multi_iv(w, n, o);
    for (int i = 0; i < n; ++i) {
        w[i << 1] = w[i << 1 | 1] = mulZs(y[i], o[i]);
    }
    fl0(w + (n << 1), w + lim2);
    for (int t = 2, t2 = 4; t < lim; t = t2, t2 <<= 1, g += lim2) {
        int M = (n + (t - 1)) & -t;
        Z *lc = g, *rc = lc + t, *p = w, *ed = w + (M << 1);
        for (; p != ed; p += t2, lc += t2, rc += t2) {
            adddot(lc, p + t, rc, p, t, p), difx2<true>(p, t);
        }
    }
    adddot(g, w + lim, g + lim, w, lim, w), dit(w, lim), Cpy(w, n, f);
}
Z linear(const Z* f, const Z* a, int k, i64 n) {
    auto _cal_linear = [](Z* p, int l, int r) {
        if (l == 0) {
            p[0] = Const::_half, l = 1;
        }
        for (int i = l; i < r; ++i) {
            p[i] = calc::mulZs(p[i & (i - 1)], f_n_t_t::rt1_I[__qed::crz_32(i) + 2]);
        }
    };
    static pre_base pre_linear(_cal_linear);
    int hlim = bit_up(k), lim = hlim << 1;
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim), *q = alocS(lim);
    const Z* w = pre_linear(hlim);
    predifx2.expand(lim), Cpy_fl0(a, k, h, lim), q[0] = one_Z, Cpy_fl0(f, k, q + 1, lim - 1);
    NegP(q, 1, k + 1, q), Conv(h, lim, q), fl0(h + k, h + lim), dif(h, lim);
    for (; n; n >>= 1) {
        f_n_t_t::dif_neg(q, lim, o), dot(q, lim, o), dot(h, lim, o);
        if (n & 1) {
            for (int i = 0; i < hlim; ++i) {
                h[i] = calc::mulZ(h[i << 1] - h[i << 1 | 1] + mod2, w[i]);
                q[i] = q[i << 1];
            }
        } else {
            for (int i = 0; i < hlim; ++i) {
                h[i] = calc::mulZ(h[i << 1] + h[i << 1 | 1], Const::_half);
                q[i] = q[i << 1];
            }
        }
        n == 1 ? dit(h, hlim) : difx2(h, hlim);
        n == 1 ? dit(q, hlim) : difx2(q, hlim);
    }
    return calc::divZ(h[0], q[0]);
}
}  // namespace tech
}  // namespace poly

namespace poly_base {
void poi_translate(const Z* p, int n, Z c, int m, Z* o) {
    int l = n + m, lim = bit_up(l);
    pre_aloc mem;
    Z *A = alocS(lim), *B = alocS(lim), *fc = alocS(l + 1), *fic = alocS(l + 1);
    const Z* ifac = seqifac(n + 1);
    for (int i = 0; i <= n; ++i) {
        Z fx = calc::mulZ(ifac[i], ifac[n - i]);
        A[i] = calc::mulZ(p[i], ((n - i) & 1) ? mod2 - fx : fx);
    }
    fl0(A + n + 1, A + lim);
    makeseq::mRisingfac(fc, l + 1, calc::subZ(c, InZs(n)));
    multi_iv(fc, l + 1, fic);
    for (int i = 0; i < l; ++i) {
        B[i] = calc::mulZ(fc[i], fic[i + 1]);
    }
    fl0(B + l, B + lim), Conv(A, lim, B);
    for (int i = 0; i < m; ++i) {
        o[i] = calc::mulZ(A[i + n], fc[i + n + 1]);
    }
    dot(o, m, fic);
}
// Author : killer_joke

namespace EGF {
void dot(Z* f, int lim, const Z* g) {
    poly_base::dot(f, lim, seqfac(lim));
    poly_base::dot(f, lim, g);
}
void dot(const Z* f, const Z* g, int lim, Z* h) {
    poly_base::dot(f, seqfac(lim), lim, h);
    poly_base::dot(h, lim, g);
}
void fromOGF(Z* f, int n) { poly_base::dot(f, n + 1, seqifac(n + 1)); }
void toOGF(Z* f, int n) { poly_base::dot(f, n + 1, seqifac(n + 1)); }
}  // namespace EGF
}  // namespace poly_base

namespace Falling_Factorial {
using namespace poly_base;
namespace f_d_t {
// Author : killer_joke
Z* EX[mp2 + 1][2];
void prefdt(int lim) {
    int lg = __qed::lg2d(lim);
    if (EX[lg][0]) {
        return;
    }
    EX[lg][0] = alocH(lim << 1), EX[lg][1] = alocH(lim << 1);
    Cpy_fl0(seqifac(lim), lim, EX[lg][0], lim << 1), dif(EX[lg][0], lim << 1);
    f_n_t_t::dif_neg(EX[lg][0], lim << 1, EX[lg][1]);
}
void fdt(Z* f, int lim) { dif(f, lim << 1), dot(f, lim << 1, EX[__qed::lg2d(lim)][0]), dit(f, lim << 1); }
void ifdt(Z* f, int lim) { dif(f, lim << 1), dot(f, lim << 1, EX[__qed::lg2d(lim)][1]), dit(f, lim << 1); }
void _fdtA(Z* p, int l, int r) {
    for (int i = std::max(1, l); i < r; i <<= 1) {
        makeseq::E_x(p + i, i), rev(p + i, i);
    }
}
void _fdtB(Z* p, int l, int r) {
    seqiv.expand(r);
    for (int i = std::max(2, l); i < r; i <<= 1) {
        Cpy(seqiv(), i, p + i), dif(p + i, i);
    }
}
pre_base dA(_fdtA), dB(_fdtB);
void prefdtx2(int lim) { dA.expand(lim), dB.expand(lim << 1); }
void fdtx2(const Z* g, int lim, Z* f) {
    dot(g, dA() + lim, lim, f), fl0(f + lim, lim), dif(f, lim << 1), dot(f, lim << 1, dB() + (lim << 1));
    dit(f, lim << 1), dot(f + lim, lim, seqifac()), Cpy(g, lim, f);
}
}  // namespace f_d_t
using f_d_t::fdt;
using f_d_t::ifdt;
using f_d_t::prefdt;
void Mul(const Z* f, int n, const Z* g, int m, Z* h) {
    int lim = bit_up(n + m);
    prefdt(lim);
    pre_aloc mem;
    Z *F = alocS(lim << 1), *G = alocS(lim << 1);
    Cpy_fl0(f, n + 1, F, lim << 1), Cpy_fl0(g, m + 1, G, lim << 1);
    fdt(F, lim), fdt(G, lim), EGF::dot(F, lim, G);
    fl0(F + lim, lim), ifdt(F, lim), Cpy(F, n + m + 1, h);
}
void translate(const Z* g, int n, Z c, Z* f) {
    int lim = bit_up(n << 1);
    pre_aloc mem;
    Z *o = alocS(lim), *h = alocS(lim);
    makeseq::mFallingfac(o, n + 1, c);
    EGF::fromOGF(o, n), fl0(o + n + 1, o + lim);
    dot(g, seqfac(n + 1), n + 1, h), rev(h, n + 1), fl0(h + n + 1, h + lim);
    Conv(o, lim, h), Cpy_rev(o, n + 1, f), dot(f, n + 1, seqifac(n + 1));
}
void fall_down(const Z* g, int n, Z* f) {
    int lim = bit_up(n);
    pre_aloc mem;
    Z *o = alocS(lim << 1), *h = alocS(lim), *u = alocS(lim), *r = alocS(lim);
    prefdt(lim), f_d_t::prefdtx2(lim);
    Cpy_fl0(g, n + 1, o, lim), Cpy(seqi(n + 1), n + 1, u);
    for (int i = 0; i <= n; i += 2) {
        o[i + 1] = calc::addZ(o[i], o[i + 1]);
    }
    for (int t = 4, mid = 2, t2 = 8; t <= lim; t <<= 1, mid <<= 1, t2 <<= 1) {
        dot(u, n + 1, u);
        for (int j = 0; j <= n; j += t) {
            f_d_t::fdtx2(o + j, mid, r), f_d_t::fdtx2(o + j + mid, mid, h);
            dot(h, u, t, o + j), addP(o + j, t, r);
        }
    }
    fl0(o + lim, lim), ifdt(o, lim), Cpy(o, n + 1, f);
}
void stand_up(const Z* g, int n, Z* f) {
    int lim = bit_up(n);
    pre_aloc mem;
    Z *F = alocS(lim), *G = alocS(lim), *u = alocS(lim), *r = alocS(lim);
    predifx2.expand(lim);
    std::fill(G, G + lim, one_Z), subP(G, lim, seqi(lim));
    Cpy_fl0(g, n + 1, F, lim);
    for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
        for (int j = 0; j <= n; j += t) {
            Cpy(F + j + mid, mid, u), difx2(u, mid), difx2(F + j, mid);
            Cpy(G + j + mid, mid, r), difx2fxC(r, mid), difx2fxC(G + j, mid);
            dot(u, t, G + j), addP(F + j, t, u), dot(G + j, t, r);
        }
    }
    dit(F, lim), Cpy(F, n + 1, f);
}
}  // namespace Falling_Factorial

namespace Command {
using Stat_Info::report;
void cut_string() { _Exit(0); }
}  // namespace Command

struct auto_reporter {
    ~auto_reporter() { Command::report(); }
} jack;

using poly_base::alocS;
using poly_base::pre_aloc;
using namespace poly_base::Tools;
using namespace field_Z;

void poly_multiply(unsigned* a, int n, unsigned* b, int m, unsigned* c) {
    pre_aloc mem;
    auto F = alocS(n + 1), G = alocS(m + 1), H = alocS(n + m + 1);
    for (int i = 0; i <= n; i++) F[i] = trans<1,0>(a[i]);
    for (int i = 0; i <= m; i++) G[i] = trans<1,0>(b[i]);
    // scanP(F,n+1);
    // scanP(G,m+1);
    poly::Mul(F, n + 1, G, m + 1, H);
    for (int i = 0; i <= n + m; i++) c[i] =trans<-1>(H[i]);
    // printP(H,n+m+1);
    // for (int i = 0; i <= n ; i++) std::cerr<<F[i]<<" ";std::cerr<<"\n";
    // for (int i = 0; i <=  m; i++) std::cerr<<G[i]<<" ";std::cerr<<"\n";
    return;
}

CompilationN/AN/ACompile OKScore: N/A

Testcase #144.062 ms38 MB + 952 KBAcceptedScore: 100


Judge Duck Online | 评测鸭在线
Server Time: 2024-04-29 20:19:50 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用