#include <algorithm>
#include <functional>
#include <iostream>
using i32 = int;
using u32 = unsigned;
using i64 = long long;
using u64 = unsigned long long;
#define IL __inline__ __attribute__((always_inline))
#define RC(T, x) reinterpret_cast<T>(x)
/*
一个高性能的多项式实现
作者为 QedDust413 & killer_joke
*/
namespace Setting {
constexpr u32 mod = 998244353;
constexpr size_t sta_l_MB = 64;
constexpr int Detail = 1;
} // namespace Setting
namespace __qed {
constexpr int bit_up(int x) { return 1 << (32 - __builtin_clz(x)); }
constexpr int crz_32(int x) { return __builtin_ctz(x); }
constexpr int cro_32(int x) { return __builtin_ctz(~x); }
constexpr int lg2u(int x) { return x == 1 ? 0 : (32 - __builtin_clz(x - 1)); }
constexpr int lg2d(int x) { return std::__lg(x); }
constexpr bool ispow2(u64 x) { return (x != 0) && ((x & (x - 1)) == 0); }
constexpr u32 primitive_root(u32 M) {
u32 qed = 0, n = M - 1, d[11] = {};
for (int i = 2; i * i <= n; i++) {
if (n % i == 0) {
d[qed++] = i;
do {
n /= i;
} while (n % i == 0);
}
}
if (n > 1) {
d[qed++] = n;
}
for (u32 g = 2, r = 0;; ++g) {
for (u32 i = 0; i < qed; ++i) {
u32 b = (M - 1) / d[i], a = g;
for (r = 1; b; b >>= 1, a = u64(a) * a % M) {
if (b & 1) {
r = u64(r) * a % M;
}
}
if (r == 1) {
break;
}
}
if (r != 1) {
return g;
}
}
}
} // namespace __qed
namespace Stat_Info {
using Setting::Detail;
i64 ntt_size;
i64 fill0_size, copy_size, rev_size;
size_t max_cost_sta_l;
constexpr const char* Author = "QedDust413 & killer_joke";
constexpr const char* Thanks =
"negiizhao, chaihf, rogeryoungh, EntropyIncreaser, hly1204, Min_25, Qdedawsd2233, bh1234666, yosupo, "
"Pulsating_Dust, KKKKa, qdc, and more.";
template <typename Tf = std::ostream>
void report(Tf& outf = std::clog) {
if constexpr (Detail <= 0) {
outf << "Statistics are turned off.\n";
}
if constexpr (Detail > 0) {
outf << "ntt_size:" << ntt_size << "\n";
}
if constexpr (Detail > 1) {
outf << "fill0_size:" << fill0_size << " copy_size:" << copy_size << " rev_size:" << rev_size
<< "\nmax_cost_sta:" << (double(max_cost_sta_l) / double(1 << 20)) << "\n";
}
outf.flush();
}
} // namespace Stat_Info
namespace mem_helper {
using namespace __qed;
template <size_t align>
void* to_align(void* mem) {
static_assert(ispow2(align));
return RC(void*, (RC(u64, mem) + (align - 1)) & (-align));
}
template <size_t align>
bool is_align(const void* mem) {
static_assert(ispow2(align));
return (RC(u64, mem) & (align - 1)) == 0;
}
char _mem[Setting::sta_l_MB << 20], *_now = _mem;
struct pre_aloc {
char* t;
pre_aloc() : t(_now) {}
~pre_aloc() { _now = t; }
};
template <typename T, size_t align>
T* AlocS(size_t n) {
T* r = RC(T*, to_align<align>(_now));
_now = RC(char*, r + n);
if constexpr (Stat_Info::Detail > 1) {
Stat_Info::max_cost_sta_l = std::max<size_t>(_now - _mem, Stat_Info::max_cost_sta_l);
}
return r;
}
template <size_t align>
void* _alocH(size_t n) {
constexpr size_t ptr_size = sizeof(void*), Extra = (align + ptr_size);
void *base = std::malloc(n + Extra), *p = to_align<align>(RC(char*, base) + ptr_size);
return RC(void**, p)[-1] = base, p;
}
template <typename T, size_t align>
T* AlocH(size_t n) {
return RC(T*, _alocH<align>(n * sizeof(T)));
}
void Freed(void* p) { std::free(RC(void**, p)[-1]); }
template <typename T, size_t align>
struct Trivial_Aligned_Allocator {
static_assert(std::is_trivial<T>::value);
typedef T value_type;
T* allocate(std::size_t n) { return AlocH<T, align>(n); }
template <class T1>
struct rebind {
using other = Trivial_Aligned_Allocator<T1, align>;
};
void deallocate(T* p, std::size_t n) { Freed(p); }
};
} // namespace mem_helper
namespace Montgo {
// Author : QedDust413
struct Mont32 {
u32 Mod, Mod2, Inv, NInv, R2;
constexpr Mont32(u32 n) : Mod(n), Mod2(n << 1), Inv(1), NInv(), R2((-u64(n)) % n) {
for (int i = 0; i < 5; ++i) {
Inv *= 2 - n * Inv;
}
NInv = -Inv;
}
constexpr IL u32 reduce(u64 x) const { return (x + u64(u32(x) * NInv) * Mod) >> 32; }
constexpr IL u32 reduce_s(u64 x) const {
u32 r = (x >> 32) - ((u64(u32(x) * Inv) * Mod) >> 32);
return r >> 31 ? r + Mod : r;
}
constexpr IL u32 mul(u32 x, u32 y) const { return reduce(u64(x) * y); }
constexpr IL u32 mul_s(u32 x, u32 y) const { return reduce_s(u64(x) * y); }
constexpr IL u32 In(u32 x) const { return mul(x, R2); }
constexpr IL u32 In_s(u32 x) const { return mul_s(x, R2); }
constexpr IL u32 Out(u32 x) const {
u32 r = (x + (u64(x * NInv) * Mod)) >> 32;
return r < Mod ? r : r - Mod;
}
};
} // namespace Montgo
namespace field_Z {
// Author : QedDust413
using Setting::mod;
constexpr Montgo::Mont32 Space(mod);
constexpr u32 mod2 = Space.Mod2;
constexpr IL u32 shrink(u32 x) { return x >= mod ? x - mod : x; }
constexpr IL u32 dilate2(u32 x) { return x >> 31 ? x + mod2 : x; }
using Z = u32;
constexpr bool isgood(Z x) { return x < mod2; }
constexpr IL Z InZ(u32 x) { return Space.In(x); }
constexpr IL Z InZs(u32 x) { return Space.In_s(x); }
constexpr Z zero_Z(0), one_Z(InZs(1)), not_exist_Z(-1);
constexpr IL u32 OutZ(Z x) { return Space.Out(x); }
constexpr bool equalZ(Z x, Z y) { return shrink(x) == shrink(y); }
namespace calc {
constexpr IL Z addZ(Z x, Z y) { return dilate2(x + y - mod2); }
constexpr IL Z subZ(Z x, Z y) { return dilate2(x - y); }
constexpr IL Z mulZ(Z x, Z y) { return Space.mul(x, y); }
constexpr Z powZ(Z a, u32 b, Z r = one_Z) {
for (; b; a = mulZ(a, a), b >>= 1) {
if (b & 1) {
r = mulZ(r, a);
}
}
return r;
}
constexpr IL Z invZ(Z x) { return powZ(x, mod - 2); }
constexpr IL Z divZ(Z x, Z y) { return powZ(y, mod - 2, x); }
template <bool strict = true>
constexpr IL Z mulZs(Z x, Z y) {
if constexpr (strict) {
return Space.mul_s(x, y);
}
return mulZ(x, y);
}
constexpr IL Z negZ(Z x) { return mod2 - x; }
namespace extend_ {
constexpr Z absZ(Z x) {
u32 r = OutZ(x);
return InZs(std::min(r, mod - r));
}
constexpr Z legendre(Z x) { return powZ(x, (mod - 1) >> 1); }
constexpr bool isQR(Z x) { return equalZ(legendre(x), one_Z); }
constexpr Z sqrtZ(Z x) {
if (!isQR(x)) {
return not_exist_Z;
}
Z a(1), I_mul(0);
while (isQR(I_mul = subZ(mulZ(a, a), x))) {
++a;
}
struct dZ {
Z r, i;
constexpr void Mul(dZ d, Z I_) {
*this = { addZ(mulZ(r, d.r), mulZ(mulZ(I_, i), d.i)), addZ(mulZ(r, d.i), mulZ(i, d.r)) };
}
};
dZ s = { a, one_Z }, r = { one_Z, zero_Z };
for (u32 b = (mod + 1) >> 1; b; b >>= 1, s.Mul(s, I_mul)) {
if (b & 1) {
r.Mul(s, I_mul);
}
}
return absZ(r.r);
}
} // namespace extend_
} // namespace calc
template <int fixes, bool strict = true>
constexpr Z trans(Z x) {
constexpr Z o = fixes > 0 ? calc::powZ(Space.R2, fixes) : calc::powZ(1, -fixes);
return calc::mulZs<strict>(x, o);
}
template <bool strict = true>
constexpr Z trans(Z x, int fixes) {
return calc::mulZs<strict>(x, fixes > 0 ? calc::powZ(Space.R2, fixes) : calc::powZ(1, -fixes));
}
namespace Const {
constexpr Z _half = InZs((mod + 1) >> 1), _neghalf = InZs((mod - 1) >> 1), neg_one = InZs(mod - 1);
constexpr Z img = calc::extend_::sqrtZ(neg_one), imgI = mod - img, _imghalf = calc::mulZs(_half, img);
} // namespace Const
} // namespace field_Z
#pragma GCC target("avx2")
#include <immintrin.h>
#define VEC(sz, T) __attribute((vector_size(sz))) T
namespace SIMD {
using i32x8 = VEC(32, i32);
using u32x8 = VEC(32, u32);
using i64x4 = VEC(32, i64);
using u64x4 = VEC(32, u64);
using I256 = __m256i;
constexpr IL u32x8 setu32x8(u32 x) { return (u32x8){ x, x, x, x, x, x, x, x }; }
template <int typ>
IL u32x8 shuffle(const u32x8& x) {
return RC(u32x8, _mm256_shuffle_epi32(RC(I256, x), typ));
}
template <int typ>
IL u32x8 blend(const u32x8& x, const u32x8& y) {
return RC(u32x8, _mm256_blend_epi32(RC(I256, x), RC(I256, y), typ));
}
IL I256 swaplohi128(const I256& x) { return _mm256_permute2x128_si256(x, x, 1); }
IL u32x8& x8(u32* data) { return *RC(u32x8*, data); }
IL const u32x8& x8(const u32* data) { return *RC(const u32x8*, data); }
IL I256 loadu(const void* data) { return _mm256_loadu_si256(RC(const __m256i_u*, data)); }
IL void storeu(const I256& x, void* data) { return _mm256_storeu_si256(RC(__m256i_u*, data), x); }
IL u64x4 fus_mul(const u32x8& x, const u32x8& y) {
return RC(u64x4, _mm256_mul_epu32(RC(I256, x), RC(I256, y)));
}
} // namespace SIMD
namespace field_Z {
using SIMD::setu32x8;
using SIMD::u32x8;
using SIMD::x8;
using Zx8 = u32x8;
constexpr u32x8 modx8 = setu32x8(mod), mod2x8 = setu32x8(mod2), NInvx8 = setu32x8(Space.NInv);
constexpr Zx8 one_Zx8 = setu32x8(one_Z), zerox8 = setu32x8(0u);
IL Zx8 dilate2x8(const Zx8& x) {
return x + (RC(Zx8, RC(SIMD::i32x8, x) < RC(SIMD::i32x8, zerox8)) & mod2x8);
}
IL Zx8 shrinkx8(const Zx8& x) { return x - ((x >= modx8) & modx8); }
namespace calc {
// Author : killer_joke
using namespace SIMD;
IL Zx8 addZx8(const Zx8& x, const Zx8& y) { return dilate2x8(x + y - mod2x8); }
IL Zx8 subZx8(const Zx8& x, const Zx8& y) { return dilate2x8(x - y); }
IL Zx8 mulZx8(const Zx8& x, const Zx8& y) {
u32x8 z = (NInvx8 * x * y);
return blend<0xaa>(RC(u32x8, (fus_mul(x, y) + fus_mul(z, modx8)) >> 32),
RC(u32x8, (fus_mul(u32x8(u64x4(x) >> 32), u32x8(u64x4(y) >> 32)) +
fus_mul(shuffle<0xf5>(z), modx8))));
}
IL Zx8 powZx8(const Zx8& y, u32 b, const Zx8& _r = one_Zx8) {
Zx8 x = y, r = _r;
for (; b; x = mulZx8(x, x), b >>= 1) {
if (b & 1) {
r = mulZx8(r, x);
}
}
return r;
}
IL Zx8 invZx8(const Zx8& x) { return powZx8(x, mod - 2); }
IL Zx8 divZx8(const Zx8& x, const Zx8& y) { return powZx8(y, mod - 2, x); }
template <bool strict = true>
IL Zx8 mulZsx8(const Zx8& x, const Zx8& y) {
if constexpr (strict) {
u32x8 z = (NInvx8 * x * y);
z = blend<0xaa>(RC(u32x8, (fus_mul(x, y) + fus_mul(z, modx8)) >> 32),
RC(u32x8, (fus_mul(u32x8(u64x4(x) >> 32), u32x8(u64x4(y) >> 32)) +
fus_mul(shuffle<0xf5>(z), modx8)))) -
modx8;
return z + (RC(Zx8, RC(i32x8, z) < RC(i32x8, zerox8)) & modx8);
}
return mulZx8(x, y);
}
IL Zx8 negZx8(const Zx8& x) { return mod2x8 - x; }
IL Zx8 _AmulZx8(const Zx8& a, const Zx8& b, const Zx8& c) { return mulZx8(a + b, c); }
IL Zx8 _SmulZx8(const Zx8& a, const Zx8& b, const Zx8& c) { return mulZx8(a - b + mod2x8, c); }
template <int typ>
IL u32x8 Neg(const u32x8& x) {
return blend<typ>(x, mod2x8 - x);
}
} // namespace calc
} // namespace field_Z
#undef VEC
#define STATI(ifo, dt, num) \
if constexpr (Stat_Info::Detail > dt) { \
Stat_Info::ifo += (num); \
}
namespace poly_base {
using namespace field_Z;
using __qed::bit_up;
constexpr int mp2 = __qed::crz_32(mod - 1);
void fl0(Z* f, int l) { STATI(fill0_size, 1, l) std::fill_n(f, l, zero_Z); }
void fl0(Z* bg, Z* ed) { STATI(fill0_size, 1, ed - bg) std::fill(bg, ed, zero_Z); }
void Cpy(const Z* f, int l, Z* g) { STATI(copy_size, 1, l) std::copy_n(f, l, g); }
void Cpy(const Z* bg, const Z* ed, Z* g) { STATI(copy_size, 1, ed - bg) std::copy(bg, ed, g); }
void rev(Z* bg, Z* ed) { STATI(rev_size, 1, ed - bg) std::reverse(bg, ed); }
void rev(Z* f, int l) { STATI(rev_size, 1, l) std::reverse(f, f + l); }
void Cpy_fl0(const Z* f, int n, Z* g, int lim) { Cpy(f, n, g), fl0(g + n, g + lim); }
void Cpy_rev(const Z* f, int l, Z* g) {
STATI(rev_size, 1, l) STATI(copy_size, 1, l) std::reverse_copy(f, f + l, g);
}
#define flx(nam, opt) \
void nam(const Z* A, int l, Z* B, Z t) { \
int i = 0; \
if (l > 16) { \
Zx8 tx8 = setu32x8(t); \
for (; i + 7 < l; i += 8) { \
x8(B + i) = calc::opt##Zx8(x8(A + i), tx8); \
} \
} \
for (; i < l; ++i) { \
B[i] = calc::opt##Z(A[i], t); \
} \
}
flx(mul_t_s, mul) flx(add_t_s, add) flx(sub_t_s, sub)
#undef flx
#define flx(nam, opt) \
void nam(Z* A, int l, const Z* B) { \
int i = 0; \
for (; i + 7 < l; i += 8) { \
x8(A + i) = calc::opt##Zx8(x8(A + i), x8(B + i)); \
} \
for (; i < l; ++i) { \
A[i] = calc::opt##Z(A[i], B[i]); \
} \
} \
void nam(const Z* A, const Z* B, int l, Z* C) { \
int i = 0; \
for (; i + 7 < l; i += 8) { \
x8(C + i) = calc::opt##Zx8(x8(A + i), x8(B + i)); \
} \
for (; i < l; ++i) { \
C[i] = calc::opt##Z(A[i], B[i]); \
} \
}
flx(dot, mul) flx(addP, add) flx(subP, sub)
#undef flx
void NegP(const Z* A, int l, int r, Z* B) {
int i = l;
if (r - l >= 16) {
for (; i & 7; ++i) {
B[i] = calc::negZ(A[i]);
}
for (; i + 7 < r; i += 8) {
x8(B + i) = calc::negZx8(x8(A + i));
}
}
for (; i < r; ++i) {
B[i] = calc::negZ(A[i]);
}
}
} // namespace poly_base
namespace poly_base {
using mem_helper::Freed;
using mem_helper::pre_aloc;
const std::function<Z*(size_t)> alocS = mem_helper::AlocS<Z, 32>;
const std::function<Z*(size_t)> alocH = mem_helper::AlocH<Z, 32>;
struct pre_base {
const std::function<void(Z*, int, int)> jok;
Z* p;
int sz;
pre_base(std::function<void(Z*, int, int)> f) : jok(f), p(nullptr), sz(0) {}
~pre_base() {
if (p != nullptr) {
Freed(p);
}
}
void expand(int len) {
if (len > sz) {
len = (len + 7) & -8;
Z* nxtp = alocH(len);
if (p != nullptr) {
Cpy(p, sz, nxtp), Freed(p);
}
jok(p = nxtp, sz, len), sz = len;
}
}
void clear() {
if (p != nullptr) {
Freed(p), sz = 0;
}
}
const Z* operator()(int len) { return expand(len), p; }
const Z* operator()() { return p; }
};
void multi_iv_Zx8(const Zx8* g, int n, Zx8* f) {
using namespace calc;
if (n == 0) {
return;
}
f[0] = g[0];
for (int i = 1; i < n; ++i) {
f[i] = mulZx8(f[i - 1], g[i]);
}
f[n - 1] = invZx8(f[n - 1]);
for (int i = n - 1; i; --i) {
Zx8 ivi = f[i];
f[i] = mulZx8(ivi, f[i - 1]);
f[i - 1] = mulZx8(ivi, g[i]);
}
}
void multi_iv(const Z* g, int n, Z* f) {
int N = n >> 3, i = N << 3;
multi_iv_Zx8(RC(const Zx8*, g), N, RC(Zx8*, f));
for (; i < n; ++i) {
f[i] = calc::invZ(g[i]);
}
}
namespace cal_helper {
using namespace calc;
void _cal_i(Z* dus, int l, int r) {
for (; l < 8; ++l) {
dus[l] = InZ(l);
}
constexpr Zx8 eight_Zx8 = setu32x8(InZs(8));
for (int i = l; i < r; i += 8) {
x8(dus + i) = addZx8(x8(dus + i - 8), eight_Zx8);
}
}
pre_base seqi(_cal_i);
void _cal_iv(Z* kil, int l, int r) {
const Z* _i = seqi(r);
if (l < 8) {
x8(kil) = invZx8(x8(_i)), l = 8;
}
multi_iv_Zx8(RC(const Zx8*, _i + l), (r - l) >> 3, RC(Zx8*, kil + l));
}
pre_base seqiv(_cal_iv);
void deriv(const Z* g, int n, Z* f) {
const Z* _i = seqi(n + 1);
int i = 0;
if (n > 7) {
for (; i < 7; ++i) {
f[i] = mulZ(g[i + 1], _i[i + 1]);
}
for (; i + 7 < n; i += 8) {
storeu(RC(I256, mulZx8(x8(g + i + 1), x8(_i + i + 1))), f + i);
}
}
for (; i < n; ++i) {
f[i] = mulZ(g[i + 1], _i[i + 1]);
}
f[n] = zero_Z;
}
void integ(const Z* g, int n, Z* f) {
const Z* _iv = seqiv(n + 1);
int i = n - 1;
if (i > 7) {
for (; (i & 7) != 6; --i) {
f[i + 1] = mulZ(g[i], _iv[i + 1]);
}
for (i -= 7; ~i; i -= 8) {
x8(f + i + 1) = mulZx8(RC(Zx8, loadu(g + i)), x8(_iv + i + 1));
}
i += 7;
}
for (; ~i; --i) {
f[i + 1] = mulZ(g[i], _iv[i + 1]);
}
f[0] = zero_Z;
}
} // namespace cal_helper
using cal_helper::deriv;
using cal_helper::integ;
using cal_helper::seqi;
using cal_helper::seqiv;
} // namespace poly_base
namespace poly_base {
namespace Tools {
int compP(const Z* f, int l, const Z* g) {
for (int i = 0; i < l; ++i) {
if (!equalZ(f[i], g[i])) {
return i;
}
}
return -1;
}
template <int fixes = 1, typename Tf = std::istream>
void scanP(Z* f, int l, Tf& inf = std::cin) {
for (int i = 0; i < l; ++i) {
inf >> f[i], f[i] = trans<fixes, false>(f[i]);
}
}
template <int fixes = -1, typename Tf = std::ostream>
void printP(const Z* f, int l, Tf& outf = std::cout) {
if (l) {
outf << trans<fixes>(f[0]);
for (int i = 1; i < l; ++i) {
outf << ' ' << trans<fixes>(f[i]);
}
outf << '\n';
}
}
template <typename T = u32, typename lT = u64>
T stoi_m(const std::string& s, const T m) {
T r(0);
for (auto ch : s) {
r = ((lT)r * 10 + (ch - '0')) % m;
}
return r;
}
int clzP(const Z* f, int n) {
int i = 0;
while (i < n && equalZ(f[i], zero_Z)) {
++i;
}
return i;
}
int crzP(const Z* f, int n) {
int i = n;
while (i && equalZ(f[i - 1], zero_Z)) {
--i;
}
return n - i;
}
} // namespace Tools
} // namespace poly_base
namespace poly_base {
// Author : QedDust413
namespace f_n_t_t_base {
using namespace calc;
using namespace __qed;
constexpr int base4thre = 4;
template <bool strict = false>
void mul_t(Z* A, int l, Z t) {
for (int j = 0; j < l; ++j) {
A[j] = mulZs<strict>(A[j], t);
}
}
constexpr Z _g(InZ(primitive_root(mod)));
struct P_R_Tab {
Z t[mp2 + 1];
constexpr P_R_Tab(Z G) : t() {
t[mp2] = shrink(powZ(G, (mod - 1) >> mp2));
for (int i = mp2 - 1; ~i; --i) {
t[i] = mulZs(t[i + 1], t[i + 1]);
}
}
constexpr Z operator[](int i) const { return t[i]; }
} constexpr rt1(_g), rt1_I(invZ(_g));
template <int lim>
struct omega_info_base2 {
Z o[lim >> 1];
constexpr omega_info_base2(const P_R_Tab& Tb) : o() {
o[0] = one_Z;
for (int i = 1, l = lim >> 1; i < l; ++i) {
o[i] = mulZ(o[i & (i - 1)], Tb[crz_32(i) + 2]);
}
}
constexpr Z operator[](int i) const { return o[i]; }
};
const omega_info_base2<base4thre> w(rt1), wI(rt1_I);
constexpr Zx8 powXx8(Z X) {
Z X2 = mulZs(X, X), X3 = mulZs(X2, X), X4 = mulZs(X3, X), X5 = mulZs(X4, X), X6 = mulZs(X5, X),
X7 = mulZs(X6, X);
return (Zx8){ one_Z, X, X2, X3, X4, X5, X6, X7 };
}
struct ntt_info_base4x8 {
Z rt3[mp2 - 2], rt3_I[mp2 - 2];
Zx8 rt4ix8[mp2 - 3], rt4ix8_I[mp2 - 3];
constexpr ntt_info_base4x8() : rt3(), rt3_I(), rt4ix8(), rt4ix8_I() {
Z pr = one_Z, pr_I = one_Z;
for (int i = 0; i < mp2 - 2; ++i) {
rt3[i] = mulZs(pr, rt1[i + 3]), rt3_I[i] = mulZs(pr_I, rt1_I[i + 3]);
pr = mulZs(pr, rt1_I[i + 3]), pr_I = mulZs(pr_I, rt1[i + 3]);
}
pr = one_Z, pr_I = one_Z;
for (int i = 0; i < mp2 - 3; ++i) {
rt4ix8[i] = powXx8(mulZs(pr, rt1[i + 4])), rt4ix8_I[i] = powXx8(mulZs(pr_I, rt1_I[i + 4]));
pr = mulZs(pr, rt1_I[i + 4]), pr_I = mulZs(pr_I, rt1[i + 4]);
}
}
};
} // namespace f_n_t_t_base
// fast_number_theoretic_transform
// Author : QedDust413
namespace f_n_t_t {
using namespace f_n_t_t_base;
template <bool strict = false, int fixes = 0>
void dif_base2(Z* A, int lim) {
STATI(ntt_size, 0, lim)
for (int L = lim >> 1, R = lim; L; L >>= 1, R >>= 1) {
for (int i = 0, k = 0; i < lim; i += R, ++k) {
for (int j = 0; j < L; ++j) {
Z x = dilate2(A[i + j] - mod2), y = mulZ(w[k], A[i + j + L]);
A[i + j] = x + y, A[i + j + L] = x - y + mod2;
}
}
}
if constexpr (fixes) {
mul_t<strict>(A, lim, trans<fixes>(one_Z));
} else {
for (int j = 0; j < lim; ++j) {
A[j] = dilate2(A[j] - mod2);
if constexpr (strict) {
A[j] = shrink(A[j]);
}
}
}
}
template <bool strict = false, int fixes = 0>
void dit_base2(Z* A, int lim) {
STATI(ntt_size, 0, lim)
for (int L = 1, R = 2; L < lim; L <<= 1, R <<= 1) {
for (int i = 0, k = 0; i < lim; i += R, ++k) {
for (int j = 0; j < L; ++j) {
Z x = A[i + j], y = A[i + j + L];
A[i + j] = addZ(x, y), A[i + j + L] = mulZ(x - y + mod2, wI[k]);
}
}
}
mul_t<strict>(A, lim, trans<fixes + 1>(mod - ((mod - 1) >> crz_32(lim))));
}
constexpr Zx8 w14x8 = setu32x8(rt1[2]), w14Ix8 = setu32x8(rt1_I[2]);
constexpr Z w38 = mod - rt1_I[3], w38I = mod - rt1[3];
constexpr ntt_info_base4x8 iab4;
template <bool strict = false, int fixes = 0>
void dif_base4x8(Z* A, int lim) {
STATI(ntt_size, 0, lim)
int n = lim >> 3, L = n >> 1;
Zx8* f = RC(Zx8*, A);
if (crz_32(n) & 1) {
for (int j = 0; j < L; ++j) {
Zx8 x = f[j], y = f[j + L];
f[j] = x + y, f[j + L] = x - y + mod2x8;
}
L >>= 1;
}
L >>= 1;
for (int R = L << 2; L; L >>= 2, R >>= 2) {
Z _r = one_Z, _r2 = one_Z, _r3 = one_Z;
for (int i = 0, k = 0; i < n; i += R, ++k) {
Zx8 r = setu32x8(_r), r2 = setu32x8(_r2), r3 = setu32x8(_r3);
for (int j = 0; j < L; ++j) {
#define F(c) f[i + j + c * L]
Zx8 f0 = dilate2x8(F(0) - mod2x8), f1 = mulZx8(F(1), r), f2 = mulZx8(F(2), r2),
f3 = mulZx8(F(3), r3);
Zx8 f1f3 = _SmulZx8(f1, f3, w14x8), f02 = addZx8(f0, f2), f13 = addZx8(f1, f3),
f_02 = subZx8(f0, f2);
F(0) = f02 + f13, F(1) = f02 - f13 + mod2x8, F(2) = f_02 + f1f3, F(3) = f_02 - f1f3 + mod2x8;
#undef F
}
_r = mulZs(_r, iab4.rt3[cro_32(k)]), _r2 = mulZs(_r, _r), _r3 = mulZs(_r2, _r);
}
}
{
constexpr Zx8 _r = setu32x8(trans<fixes>(one_Z)),
pr2 = { one_Z, one_Z, one_Z, rt1[2], one_Z, one_Z, one_Z, rt1[2] },
pr4 = { one_Z, one_Z, one_Z, one_Z, one_Z, rt1[3], rt1[2], w38 };
Zx8 r = _r;
for (int i = 0; i < n; ++i) {
Zx8& fi = f[i];
fi = mulZx8(fi, r);
fi = _AmulZx8(Neg<0xf0>(fi), RC(Zx8, swaplohi128(RC(I256, fi))), pr4);
fi = _AmulZx8(Neg<0xcc>(fi), shuffle<0x4e>(fi), pr2);
fi = addZx8(Neg<0xaa>(fi), shuffle<0xb1>(fi));
if constexpr (strict) {
fi = shrinkx8(fi);
}
r = mulZsx8(r, iab4.rt4ix8[cro_32(i)]);
}
}
}
template <bool strict = false, int fixes = 0>
void dit_base4x8(Z* A, int lim) {
STATI(ntt_size, 0, lim)
int n = lim >> 3, L = 1;
Zx8* f = RC(Zx8*, A);
{
constexpr Zx8 pr2 = { one_Z, one_Z, one_Z, rt1_I[2], one_Z, one_Z, one_Z, rt1_I[2] },
pr4 = { one_Z, one_Z, one_Z, one_Z, one_Z, rt1_I[3], rt1_I[2], w38I };
Zx8 r = setu32x8(trans<fixes + 1>(mod - ((mod - 1) >> crz_32(lim))));
for (int i = 0; i < n; ++i) {
Zx8& fi = f[i];
fi = _AmulZx8(Neg<0xaa>(fi), shuffle<0xb1>(fi), pr2);
fi = _AmulZx8(Neg<0xcc>(fi), shuffle<0x4e>(fi), pr4);
fi = _AmulZx8(Neg<0xf0>(fi), RC(Zx8, swaplohi128(RC(I256, fi))), r);
r = mulZsx8(r, iab4.rt4ix8_I[cro_32(i)]);
}
}
for (int R = L << 2; L < (n >> 1); L <<= 2, R <<= 2) {
Z _r = one_Z, _r2 = one_Z, _r3 = one_Z;
for (int i = 0, k = 0; i < n; i += R, ++k) {
Zx8 r = setu32x8(_r), r2 = setu32x8(_r2), r3 = setu32x8(_r3);
for (int j = 0; j < L; ++j) {
#define F(c) f[i + j + c * L]
Zx8 f0 = F(0), f1 = F(1), f2 = F(2), f3 = F(3);
Zx8 f2f3 = _SmulZx8(f2, f3, w14Ix8), f01 = addZx8(f0, f1), f23 = addZx8(f2, f3),
f_01 = subZx8(f0, f1);
F(0) = addZx8(f01, f23), F(1) = _AmulZx8(f_01, f2f3, r), F(2) = _SmulZx8(f01, f23, r2),
F(3) = _SmulZx8(f_01, f2f3, r3);
#undef F
}
_r = mulZs(_r, iab4.rt3_I[cro_32(k)]), _r2 = mulZs(_r, _r), _r3 = mulZs(_r2, _r);
}
}
if (L != n) {
for (int j = 0; j < L; ++j) {
Zx8 x = f[j], y = f[j + L];
f[j] = addZx8(x, y), f[j + L] = subZx8(x, y);
}
}
if constexpr (strict) {
for (int i = 0; i < n; ++i) {
f[i] = shrinkx8(f[i]);
}
}
}
template <bool strict = false, int fixes = 0>
void dif(Z* A, int lim) {
lim > base4thre ? dif_base4x8<strict, fixes>(A, lim) : dif_base2<strict, fixes>(A, lim);
}
template <bool strict = false, int fixes = 0>
void dit(Z* A, int lim) {
lim > base4thre ? dit_base4x8<strict, fixes>(A, lim) : dit_base2<strict, fixes>(A, lim);
}
} // namespace f_n_t_t
using f_n_t_t::dif;
using f_n_t_t::dit;
void Conv(Z* f, int lim, Z* g) { dif(f, lim), dif(g, lim), dot(f, lim, g), dit(f, lim); }
} // namespace poly_base
#undef STATI
namespace poly {
using namespace poly_base;
// Author : QedDust413
void Mul(const Z* f, int n, const Z* g, int m, Z* h) {
int lim = bit_up(n + m);
pre_aloc mem;
Z *F = alocS(lim), *G = alocS(lim);
Cpy_fl0(f, n + 1, F, lim), Cpy_fl0(g, m + 1, G, lim), Conv(F, lim, G), Cpy(F, n + m + 1, h);
}
void Inv(const Z* g, int n, Z* f) {
int lim = bit_up(n);
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
f[0] = calc::invZ(g[0]);
for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
int xl = std::min<int>(t, n + 1);
Cpy_fl0(g, xl, o, t), Cpy_fl0(f, mid, h, t), Conv(o, t, h), fl0(o, mid), dif(o, t), dot(o, t, h),
dit(o, t), NegP(o, mid, xl, f);
}
}
void _Div(const Z* g, int n, const Z* f, Z* q) {
if (n == 0) {
return q[0] = calc::divZ(g[0], f[0]), void();
}
int lim = bit_up(n) << 1;
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
Inv(f, n, o), fl0(o + n + 1, o + lim), Cpy_fl0(g, n + 1, h, lim), Conv(h, lim, o), Cpy(h, n + 1, q);
}
void Div(const Z* g, int n, const Z* f, Z* q) {
using namespace calc;
if (n <= 64) {
return _Div(g, n, f, q);
}
int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
pre_aloc mem;
Z *o = alocS(bn2), *jok = alocS(bn2);
Inv(f, bn - 1, o), fl0(o + bn, bn), Cpy_fl0(g, bn, jok, bn2), Conv(jok, bn2, o);
Z *nf = alocS(bn2 * bt), *ng = alocS(bn2 * (bt - 1));
Cpy(jok, bn, q), Cpy_fl0(f, bn, nf, bn2), dif(nf, bn2);
for (int bi = 1; bi < bt; ++bi) {
int xl = std::min<int>(bn, n + 1 - bn * bi);
Cpy_fl0(f + bi * bn, xl, nf + bi * bn2, bn2), dif(nf + bi * bn2, bn2);
Cpy_fl0(q + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
fl0(jok, bn2);
for (int j = 0; j < bi; ++j) {
Z *nF = nf + (bi - j) * bn2, *nF1 = nF - bn2, *ngj = ng + j * bn2;
for (int i = 0; i < bn; i += 8) {
x8(jok + i) = subZx8(x8(jok + i), _AmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
}
for (int i = bn; i < bn2; i += 8) {
x8(jok + i) = subZx8(x8(jok + i), _SmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
}
}
dit(jok, bn2), fl0(jok + bn, bn), addP(jok, xl, g + bn * bi), dif(jok, bn2), dot(jok, bn2, o),
dit(jok, bn2), Cpy(jok, xl, q + bn * bi);
}
}
void Div(const Z* g, int n, const Z* f, int m, Z* q) {
int lim = n - m + 1, R = std::min<int>(m + 1, lim);
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
Cpy_rev(f + std::max<int>(2 * m - n, 0), R, o), fl0(o + R, o + lim);
Cpy_rev(g + m, lim, h), Div(h, n - m, o, q), rev(q, lim);
}
void Div(const Z* g, int n, const Z* f, int m, Z* q, Z* r) {
Div(g, n, f, m, q);
int lim = bit_up(std::min<int>(n, m + m - 1));
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
Cpy_fl0(f, m, o, lim), Cpy_fl0(q, std::min<int>(n - m + 1, m), h, lim), Conv(o, lim, h), subP(g, o, m, r);
}
void Ln(const Z* g, int n, Z* f) {
dot(g, seqi(n + 1), n + 1, f), Div(f, n, g, f), dot(f, n + 1, seqiv(n + 1));
}
template <bool calc_inv = true>
void Expi(const Z* g, int n, Z* f, Z* h) {
f[0] = h[0] = one_Z;
if (n < 1) {
return;
}
int lim = bit_up(n);
seqiv.expand(lim);
pre_aloc mem;
Z *o = alocS(lim), *A = alocS(lim), *B = alocS(lim);
A[0] = A[1] = one_Z;
for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
int xl = std::min(n + 1, t), dl = std::min(t << 1, lim);
deriv(g, mid - 1, o), dif(o, mid), dot(o, mid, A), dit(o, mid), deriv(f, mid - 1, o + mid);
subP(o + mid, mid - 1, o), o[t - 1] = zero_Z, fl0(o, mid - 1), o[mid - 1] = calc::negZ(o[mid - 1]);
Cpy_fl0(h, mid, B, t), dif(B, t), dif(o, t), dot(o, t, B), dit(o, t);
integ(o, t - 1, o), fl0(o, mid), subP(o + mid, xl - mid, g + mid);
dif(o, t), dot(o, t, A), dit(o, t), NegP(o, mid, xl, f);
if (t != lim || calc_inv) {
Cpy_fl0(f, xl, A, dl), dif(A, dl), dot(A, B, t, o), dit(o, t), fl0(o, mid);
dif(o, t), dot(o, t, B), dit(o, t), NegP(o, mid, xl, h);
}
}
}
void Exp(const Z* g, int n, Z* f) {
using namespace calc;
pre_aloc mem;
if (n <= 64) {
Z* h = alocS(n + 1);
return Expi<false>(g, n, f, h);
}
int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
seqiv.expand(n + 1);
Z* h = alocS(bn2);
Expi(g, bn - 1, f, h);
fl0(h + bn, bn), dif(h, bn2);
Z *jok = alocS(bn2), *nf = alocS(bn2 * bt), *ng = alocS(bn2 * (bt - 1));
dot(g, seqi(), bn, nf), fl0(nf + bn, nf + bn2), dif(nf, bn2);
for (int bi = 1; bi < bt; ++bi) {
int xl = std::min<int>(bn, n + 1 - bn * bi);
dot(g + bi * bn, seqi() + bi * bn, xl, nf + bi * bn2), fl0(nf + bi * bn2 + xl, bn2 - xl),
dif(nf + bi * bn2, bn2);
Cpy_fl0(f + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
fl0(jok, bn2);
for (int j = 0; j < bi; ++j) {
Z *nF = nf + (bi - j) * bn2, *nF1 = nF - bn2, *ngj = ng + j * bn2;
for (int i = 0; i < bn; i += 8) {
x8(jok + i) = addZx8(x8(jok + i), _AmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
}
for (int i = bn; i < bn2; i += 8) {
x8(jok + i) = addZx8(x8(jok + i), _SmulZx8(x8(nF + i), x8(nF1 + i), x8(ngj + i)));
}
}
dit(jok, bn2), fl0(jok + bn, bn), dif(jok, bn2), dot(jok, bn2, h), dit(jok, bn2), fl0(jok + bn, bn);
dot(jok, xl, seqiv() + bn * bi), dif(jok, bn2), dot(jok, bn2, ng), dit(jok, bn2),
Cpy(jok, xl, f + bn * bi);
}
}
void InvSqrt(const Z* g, int n, Z* f) {
int lim = bit_up(n);
pre_aloc mem;
Z *o = alocS(lim << 1), *h = alocS(lim << 1);
f[0] = calc::invZ(calc::extend_::sqrtZ(g[0]));
for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
int xl = std::min<int>(t, n + 1);
Cpy_fl0(f, mid, o, t << 1), Cpy_fl0(g, xl, h, t << 1), dif(o, t << 1), dif(h, t << 1),
dot(h, t << 1, o), dot(o, t << 1, o), dot(h, t << 1, o), dit(h, t << 1),
mul_t_s(h + mid, xl - mid, f + mid, Const::_neghalf);
}
}
template <bool calc_inv = true>
void Sqrti(const Z* g, int n, Z* f, Z* h) {
int lim = bit_up(n);
f[0] = calc::extend_::sqrtZ(g[0]), h[0] = calc::invZ(f[0]);
if (n == 0)
return;
pre_aloc mem;
Z *o = alocS(lim), *H = alocS(lim), *ff = alocS(lim);
ff[0] = f[0];
for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
int xl = std::min<int>(t, n + 1);
dot(ff, mid, ff), dit(ff, mid), subP(ff, g, mid, ff + mid), subP(ff + mid, g + mid, mid, ff + mid),
fl0(ff, mid), Cpy_fl0(h, mid, H, t), Conv(ff, t, H),
mul_t_s(ff + mid, xl - mid, f + mid, Const::_neghalf);
if (t != lim || calc_inv) {
Cpy(f, t, o), dif(o, t), Cpy(o, t, ff), dot(o, t, H), dit(o, t), fl0(o, mid), dif(o, t),
dot(o, t, H), dit(o, t), NegP(o, mid, xl, h);
}
}
}
void Sqrt(const Z* g, int n, Z* f) {
using namespace calc;
pre_aloc mem;
if (n <= 64) {
Z* h = alocS(n + 1);
return Sqrti<false>(g, n, f, h);
}
int bn = bit_up(n >> 4), bt = (n / bn) + 1, bn2 = bn << 1;
Z* o = alocS(bn2);
Sqrti(g, bn - 1, f, o), mul_t_s(o, bn, o, Const::_neghalf), fl0(o + bn, bn), dif(o, bn2);
Z *jok = alocS(bn2), *ng = alocS(bn2 * (bt - 1));
for (int bi = 1; bi < bt; ++bi) {
int xl = std::min<int>(bn, n + 1 - bn * bi);
Cpy_fl0(f + (bi - 1) * bn, bn, ng + (bi - 1) * bn2, bn2), dif<true>(ng + (bi - 1) * bn2, bn2),
fl0(jok, bn2);
{
Z* nG1 = ng + (bi - 1) * bn2;
for (int i = 0; i < bn; i += 8) {
x8(jok + i) = addZx8(x8(jok + i), mulZx8(x8(nG1 + i), x8(ng + i)));
}
for (int i = bn; i < bn2; i += 8) {
x8(jok + i) = subZx8(x8(jok + i), mulZx8(x8(nG1 + i), x8(ng + i)));
}
}
for (int j = 1; j < bi; ++j) {
Z *nG = ng + (bi - j) * bn2, *nG1 = nG - bn2, *ngj = ng + j * bn2;
for (int i = 0; i < bn; i += 8) {
x8(jok + i) = addZx8(x8(jok + i), _AmulZx8(x8(nG1 + i), x8(nG + i), x8(ngj + i)));
}
for (int i = bn; i < bn2; i += 8) {
x8(jok + i) = subZx8(x8(jok + i), _SmulZx8(x8(nG1 + i), x8(nG + i), x8(ngj + i)));
}
}
dit(jok, bn2), fl0(jok + bn, bn), subP(jok, xl, g + bi * bn), dif(jok, bn2), dot(jok, bn2, o),
dit(jok, bn2), Cpy(jok, xl, f + bi * bn);
}
}
void Pow(const Z* g, int n, int k, Z* f) {
if (k == 0) {
f[0] = one_Z, fl0(f + 1, n);
} else if (k == 1) {
Cpy(g, n + 1, f);
} else {
pre_aloc mem;
Z* h = alocS(n + 1);
Ln(g, n, f), mul_t_s(f, n + 1, h, InZ(k)), Exp(h, n, f);
}
}
void Pow(const Z* g, int n, int k, int k2, Z* f) {
if (g[0] != one_Z) {
pre_aloc mem;
Z* h = alocS(n + 1);
mul_t_s(g, n + 1, f, calc::invZ(g[0])), Pow(f, n, k, h), mul_t_s(h, n + 1, f, calc::powZ(g[0], k2));
} else {
Pow(g, n, k, f);
}
}
} // namespace poly
namespace poly {
constexpr int good_poly = 1, empty_poly = 0, not_exist_poly = -1;
int Safe_Sqrt(const Z* g, int n, Z* f) {
int shift = Tools::clzP(g, n + 1);
if (shift > n) {
return empty_poly;
}
if ((shift & 1) || (!calc::extend_::isQR(g[shift]))) {
return not_exist_poly;
}
if (shift) {
pre_aloc mem;
Z* h = alocS(n + 1);
Cpy_fl0(g + shift, n - shift + 1, f, n - (shift >> 1) + 1), Sqrt(f, n - (shift >> 1), h);
Cpy(h, n - (shift >> 1) + 1, f + (shift >> 1)), fl0(f, shift >> 1);
} else {
Sqrt(g, n, f);
}
return good_poly;
}
int Safe_Pow(const Z* g, int n, std::string s, Z* f) {
using namespace Tools;
int shift = clzP(g, n + 1);
if (shift) {
if ((s.size() > size_t(8)) || ((stoll(s) * shift) > i64(n))) {
return empty_poly;
}
int rf = (stoi(s) * shift), l = n + 1 - rf;
pre_aloc mem;
Z* h = alocS(l);
Cpy(g + shift, l, f), Pow(f, l - 1, stoi_m(s, mod), stoi_m(s, mod - 1), h), Cpy(h, l, f + rf),
fl0(f, rf);
} else {
Pow(g, n, stoi_m(s, mod), stoi_m(s, mod - 1), f);
}
return good_poly;
}
} // namespace poly
namespace poly {
// Author : killer_joke
namespace Trigonometric_Function {
void Trig(const Z* g, int n, Z* sint, Z* cost) {
pre_aloc mem;
Z *A = alocS(n + 1), *o = alocS(n + 1), *h = alocS(n + 1);
mul_t_s(g, n + 1, A, Const::img), Expi<true>(A, n, h, o);
if (sint != nullptr) {
subP(o, h, n + 1, sint), mul_t_s(sint, n + 1, sint, Const::_imghalf);
}
if (cost != nullptr) {
addP(h, o, n + 1, cost), mul_t_s(cost, n + 1, cost, Const::_half);
}
}
void Sin(const Z* g, int n, Z* f) { Trig(g, n, f, nullptr); }
void Cos(const Z* g, int n, Z* f) { Trig(g, n, nullptr, f); }
void Tan(const Z* g, int n, Z* f) {
pre_aloc mem;
Z* h = alocS(n + 1);
Trig(g, n, f, h), Div(f, n, h, f);
}
void aSin(const Z* g, int n, Z* f) {
int lim = bit_up(n) << 1;
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
Cpy_fl0(g, n + 1, o, lim), dif(o, lim), dot(o, lim, o), dit(o, lim), NegP(o, 0, n + 1, o),
o[0] = calc::addZ(one_Z, o[0]);
InvSqrt(o, n, h), deriv(g, n, o), fl0(o + n, o + lim), fl0(h + n + 1, h + lim), Conv(o, lim, h),
integ(o, n, f);
}
void aCos(const Z* g, int n, Z* f) { aSin(g, n, f), NegP(f, 0, n + 1, f); }
void aTan(const Z* g, int n, Z* f) {
int lim = bit_up(n);
pre_aloc mem;
Z* o = alocS(lim << 1);
Cpy_fl0(g, n + 1, o, lim << 1), dif(o, lim << 1), dot(o, lim << 1, o), dit(o, lim << 1);
o[0] = calc::addZ(o[0], one_Z), deriv(g, n, o + lim), Div(o + lim, n, o, f), integ(f, n, f);
}
} // namespace Trigonometric_Function
} // namespace poly
namespace poly_base {
namespace fac_helper {
void _fac_i(Z* _fac, int l, int r) {
const Z* _i = seqi(r);
if (l == 0) {
_fac[0] = one_Z, l = 1;
}
for (int i = l; i < r; ++i) {
_fac[i] = calc::mulZ(_fac[i - 1], _i[i]);
}
}
void _fac_iv(Z* _ifac, int l, int r) {
const Z* _iv = seqiv(r);
if (l == 0) {
_ifac[0] = one_Z, l = 1;
}
for (int i = l; i < r; ++i) {
_ifac[i] = calc::mulZ(_ifac[i - 1], _iv[i]);
}
}
pre_base seqfac(_fac_i), seqifac(_fac_iv);
} // namespace fac_helper
using fac_helper::seqfac;
using fac_helper::seqifac;
} // namespace poly_base
namespace poly_base {
// Author : killer_joke
namespace makeseq {
using namespace calc;
void Ci(Z* f, int n, Z c) {
Z fx = one_Z;
int i = 0;
for (; i < std::min(n, 8); ++i, fx = mulZ(fx, c)) {
f[i] = fx;
}
if (n > 16) {
Zx8 C8 = setu32x8(fx);
for (; i + 7 < n; i += 8) {
x8(f + i) = mulZx8(x8(f + i - 8), C8);
}
}
for (; i < n; ++i) {
f[i] = mulZ(f[i - 1], c);
}
}
void E_x(Z* f, int n) {
const Z* ifac = seqifac(n);
int i = 0;
for (; i + 7 < n; i += 8) {
x8(f + i) = Neg<0xaa>(x8(ifac + i));
}
for (; i < n; ++i) {
f[i] = (i & 1) ? negZ(ifac[i]) : ifac[i];
}
}
void mRisingfac(Z* f, int n, Z m) {
f[0] = one_Z;
for (int i = 1; i < n; ++i) {
f[i] = mulZ(f[i - 1], m), m = addZ(m, one_Z);
}
}
void mFallingfac(Z* f, int n, Z m) {
f[0] = one_Z;
for (int i = 1; i < n; ++i) {
f[i] = mulZ(f[i - 1], m), m = subZ(m, one_Z);
}
}
} // namespace makeseq
} // namespace poly_base
namespace poly_base {
// Author : killer_joke
namespace f_n_t_t {
void _dfx2_f(Z* ka, int l, int r) {
for (int i = std::max(1, l); i < r; i <<= 1) {
makeseq::Ci(ka + i, i, rt1[__qed::lg2d(i) + 1]);
}
}
pre_base predifx2(_dfx2_f);
template <bool strict = false>
void difx2(Z* f, int lim) {
Cpy(f, lim, f + lim), dit(f + lim, lim);
dot(f + lim, lim, predifx2() + lim), dif<strict>(f + lim, lim);
}
template <bool strict = false>
void difx2fxC(Z* f, int lim) {
constexpr Z two_Z = InZs(2);
Cpy(f, lim, f + lim), dit(f + lim, lim);
dot(f + lim, lim, predifx2() + lim), f[lim] = subZ(f[lim], two_Z), dif<strict>(f + lim, lim);
}
void dif_neg(const Z* g, int lim, Z* f) {
if (lim < 8) {
if (lim == 1) {
*f = *g;
} else {
for (int i = 0; i < lim; ++i) {
f[i] = g[i ^ 1];
}
}
} else {
for (int i = 0; i < lim; i += 8) {
x8(f + i) = shuffle<0xb1>(x8(g + i));
}
}
}
} // namespace f_n_t_t
using f_n_t_t::difx2;
using f_n_t_t::difx2fxC;
using f_n_t_t::predifx2;
} // namespace poly_base
namespace field_Z {
namespace calc {
constexpr IL Z _AdMulZ(Z a, Z b, Z c, Z d) { return Space.reduce_s(u64(a) * b + u64(c) * d); }
IL Zx8 _AdMulZx8(const Zx8& a, const Zx8& b, const Zx8& c, const Zx8& d) {
u32x8 z = (NInvx8 * ((a * b) + (c * d)));
z = blend<0xaa>(RC(u32x8, (fus_mul(a, b) + fus_mul(c, d) + fus_mul(z, modx8)) >> 32),
RC(u32x8, (fus_mul(u32x8(u64x4(a) >> 32), u32x8(u64x4(b) >> 32)) +
fus_mul(u32x8(u64x4(c) >> 32), u32x8(u64x4(d) >> 32)) +
fus_mul(shuffle<0xf5>(z), modx8)))) -
modx8;
return z + (RC(Zx8, RC(i32x8, z) < RC(i32x8, zerox8)) & modx8);
}
} // namespace calc
} // namespace field_Z
namespace poly_base {
void adddot(const Z* A, const Z* B, const Z* C, const Z* D, int n, Z* E) {
int i = 0;
for (; i + 7 < n; i += 8) {
x8(E + i) = calc::_AdMulZx8(x8(A + i), x8(B + i), x8(C + i), x8(D + i));
}
for (; i < n; ++i) {
E[i] = calc::_AdMulZ(A[i], B[i], C[i], D[i]);
}
}
} // namespace poly_base
namespace poly {
namespace tech {
// Author : QedDust413
void Chirp_Z(const Z* g, int n, Z c, int m, Z* f) {
using namespace calc;
pre_aloc mem;
int lim = bit_up((++n) + m), t = std::max<int>(n, m);
Z *F = alocS(lim), *G = alocS(lim), *ipwc = alocS(t + 1), cI = invZ(c);
makeseq::Ci(F, n + m, c);
for (int i = 0; i < n + m; ++i) {
F[i + 1] = mulZ(F[i], F[i + 1]);
}
fl0(F + n + m, F + lim - 1), F[lim - 1] = one_Z;
makeseq::Ci(ipwc, t + 1, cI);
for (int i = 1; i <= t; ++i) {
ipwc[i] = mulZ(ipwc[i - 1], ipwc[i]);
}
for (int i = 1; i < n; ++i) {
G[i] = mulZ(g[n - i], ipwc[n - i - 1]);
}
G[n] = g[0], G[0] = zero_Z, fl0(G + n + 1, G + lim), Conv(F, lim, G), f[0] = F[n - 1];
for (int i = 1; i < m; ++i) {
f[i] = mulZ(F[i - 1 + n], ipwc[i - 1]);
}
}
void translate(const Z* g, int n, Z c, Z* f) {
int lim = bit_up(n << 1);
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
dot(g, seqfac(n + 1), n + 1, o), rev(o, n + 1), fl0(o + n + 1, o + lim);
makeseq::Ci(h, n + 1, c), dot(h, n + 1, seqifac(n + 1)), fl0(h + n + 1, h + lim);
Conv(o, lim, h);
Cpy_rev(o, n + 1, f), dot(f, n + 1, seqifac());
}
void multi_eva(const Z* f, int n, const Z* a, int m, Z* o) {
using namespace calc;
pre_aloc mem;
int lgn = __qed::lg2u(m), lim = 1 << lgn, lim2 = lim << 1;
predifx2.expand(lim);
Z *G = alocS(lgn * lim2), *g = G, *_d = G;
for (const Z* p = a; p != a + m; ++p, g += 2) {
g[0] = subZ(one_Z, *p), g[1] = subZ(Const::neg_one, *p);
}
std::fill(g, _d += lim2, one_Z), g = _d;
for (int t = 2, t2 = 4, M = m & -t; t < lim; t = t2, t2 <<= 1, M &= -t) {
Z *lc = g - lim2, *rc = lc + t, *ed = g + (M << 1);
for (; g != ed; g += t2, lc += t2, rc += t2) {
dot(lc, rc, t, g), difx2fxC(g, t);
}
if (M < m) {
dot(lc, rc, t, g), difx2(g, t), g += t2;
}
std::fill(g, _d += lim2, one_Z), g = _d;
}
int Lim = bit_up(n + m);
Z *w = alocS(Lim << 1), *h = w + Lim;
dot(g - lim2, g - lim, lim, w), dit(w, lim);
if (m == lim) {
w[0] = subZ(w[0], one_Z), w[lim] = one_Z;
}
rev(w, m + 1), fl0(w + m + 1, w + Lim), Inv(w, n, h), rev(h, n + 1), fl0(h + n + 1, h + Lim);
Cpy_fl0(f, n + 1, w, Lim), Conv(h, Lim, w), Cpy(h + n, m, w), fl0(w + m, w + lim);
for (int t = lim, t2 = t << 1, M = m & -t, mid = t >> 1; t > 1;
t2 = t, t = mid, mid >>= 1, M |= (m & -t)) {
Z *lc = (g -= lim2), *rc = lc + t, *p = w, *ed = w + (M << 1);
for (; p != ed; p += t2, lc += t2, rc += t2) {
dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
Cpy(p + mid, mid, p + t), Cpy(p + t + mid, mid, p);
}
int rl = m - M - mid;
if (rl > 0) {
dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
Cpy(p + mid, rl, p + t), Cpy(p + t + rl, mid, p);
}
}
for (int i = 0; i < m; ++i) {
o[i] = w[i << 1];
}
}
void fst_intpol(const Z* x, const Z* y, int n, Z* f) {
using namespace calc;
pre_aloc mem;
int lgn = __qed::lg2u(n), lim = 1 << lgn, lim2 = lim << 1;
predifx2.expand(lim);
Z *G = alocS(lgn * lim2), *g = G, *_d = G;
for (const Z* p = x; p != x + n; ++p, g += 2) {
g[0] = subZ(one_Z, *p), g[1] = subZ(Const::neg_one, *p);
}
std::fill(g, _d += lim2, one_Z), g = _d;
for (int t = 2, t2 = 4, M = n & -t; t < lim; t = t2, t2 <<= 1, M &= -t) {
Z *lc = g - lim2, *rc = lc + t, *ed = g + (M << 1);
for (; g != ed; g += t2, lc += t2, rc += t2) {
dot(lc, rc, t, g), difx2fxC(g, t);
}
if (M < n) {
dot(lc, rc, t, g), difx2(g, t), g += t2;
}
std::fill(g, _d += lim2, one_Z), g = _d;
}
Z *h = alocS(lim2), *w = alocS(lim2), *o = alocS(n + 1);
dot(g - lim2, g - lim, lim, w), dit(w, lim);
if (n == lim) {
w[0] = subZ(w[0], one_Z), w[lim] = one_Z;
}
Cpy_rev(w, n + 1, o), Inv(o, n, h), rev(h, n + 1), fl0(h + n + 1, h + lim2);
deriv(w, n, w), fl0(w + n, w + lim2), Conv(h, lim2, w), Cpy(h + n, n, w);
for (int t = lim, t2 = t << 1, M = n & -t, mid = t >> 1; t > 1;
t2 = t, t = mid, mid >>= 1, M |= (n & -t)) {
Z *lc = (g -= lim2), *rc = lc + t, *p = w, *ed = w + (M << 1);
for (; p != ed; p += t2, lc += t2, rc += t2) {
dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
Cpy(p + mid, mid, p + t), Cpy(p + t + mid, mid, p);
}
int rl = n - M - mid;
if (rl > 0) {
dif(p, t), dot(rc, p, t, p + t), dit(p + t, t), dot(p, t, lc), dit(p, t);
Cpy(p + mid, rl, p + t), Cpy(p + t + rl, mid, p);
}
}
for (int i = 0; i < n; ++i) {
w[i] = w[i << 1];
}
multi_iv(w, n, o);
for (int i = 0; i < n; ++i) {
w[i << 1] = w[i << 1 | 1] = mulZs(y[i], o[i]);
}
fl0(w + (n << 1), w + lim2);
for (int t = 2, t2 = 4; t < lim; t = t2, t2 <<= 1, g += lim2) {
int M = (n + (t - 1)) & -t;
Z *lc = g, *rc = lc + t, *p = w, *ed = w + (M << 1);
for (; p != ed; p += t2, lc += t2, rc += t2) {
adddot(lc, p + t, rc, p, t, p), difx2<true>(p, t);
}
}
adddot(g, w + lim, g + lim, w, lim, w), dit(w, lim), Cpy(w, n, f);
}
Z linear(const Z* f, const Z* a, int k, i64 n) {
auto _cal_linear = [](Z* p, int l, int r) {
if (l == 0) {
p[0] = Const::_half, l = 1;
}
for (int i = l; i < r; ++i) {
p[i] = calc::mulZs(p[i & (i - 1)], f_n_t_t::rt1_I[__qed::crz_32(i) + 2]);
}
};
static pre_base pre_linear(_cal_linear);
int hlim = bit_up(k), lim = hlim << 1;
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim), *q = alocS(lim);
const Z* w = pre_linear(hlim);
predifx2.expand(lim), Cpy_fl0(a, k, h, lim), q[0] = one_Z, Cpy_fl0(f, k, q + 1, lim - 1);
NegP(q, 1, k + 1, q), Conv(h, lim, q), fl0(h + k, h + lim), dif(h, lim);
for (; n; n >>= 1) {
f_n_t_t::dif_neg(q, lim, o), dot(q, lim, o), dot(h, lim, o);
if (n & 1) {
for (int i = 0; i < hlim; ++i) {
h[i] = calc::mulZ(h[i << 1] - h[i << 1 | 1] + mod2, w[i]);
q[i] = q[i << 1];
}
} else {
for (int i = 0; i < hlim; ++i) {
h[i] = calc::mulZ(h[i << 1] + h[i << 1 | 1], Const::_half);
q[i] = q[i << 1];
}
}
n == 1 ? dit(h, hlim) : difx2(h, hlim);
n == 1 ? dit(q, hlim) : difx2(q, hlim);
}
return calc::divZ(h[0], q[0]);
}
} // namespace tech
} // namespace poly
namespace poly_base {
void poi_translate(const Z* p, int n, Z c, int m, Z* o) {
int l = n + m, lim = bit_up(l);
pre_aloc mem;
Z *A = alocS(lim), *B = alocS(lim), *fc = alocS(l + 1), *fic = alocS(l + 1);
const Z* ifac = seqifac(n + 1);
for (int i = 0; i <= n; ++i) {
Z fx = calc::mulZ(ifac[i], ifac[n - i]);
A[i] = calc::mulZ(p[i], ((n - i) & 1) ? mod2 - fx : fx);
}
fl0(A + n + 1, A + lim);
makeseq::mRisingfac(fc, l + 1, calc::subZ(c, InZs(n)));
multi_iv(fc, l + 1, fic);
for (int i = 0; i < l; ++i) {
B[i] = calc::mulZ(fc[i], fic[i + 1]);
}
fl0(B + l, B + lim), Conv(A, lim, B);
for (int i = 0; i < m; ++i) {
o[i] = calc::mulZ(A[i + n], fc[i + n + 1]);
}
dot(o, m, fic);
}
// Author : killer_joke
namespace EGF {
void dot(Z* f, int lim, const Z* g) {
poly_base::dot(f, lim, seqfac(lim));
poly_base::dot(f, lim, g);
}
void dot(const Z* f, const Z* g, int lim, Z* h) {
poly_base::dot(f, seqfac(lim), lim, h);
poly_base::dot(h, lim, g);
}
void fromOGF(Z* f, int n) { poly_base::dot(f, n + 1, seqifac(n + 1)); }
void toOGF(Z* f, int n) { poly_base::dot(f, n + 1, seqifac(n + 1)); }
} // namespace EGF
} // namespace poly_base
namespace Falling_Factorial {
using namespace poly_base;
namespace f_d_t {
// Author : killer_joke
Z* EX[mp2 + 1][2];
void prefdt(int lim) {
int lg = __qed::lg2d(lim);
if (EX[lg][0]) {
return;
}
EX[lg][0] = alocH(lim << 1), EX[lg][1] = alocH(lim << 1);
Cpy_fl0(seqifac(lim), lim, EX[lg][0], lim << 1), dif(EX[lg][0], lim << 1);
f_n_t_t::dif_neg(EX[lg][0], lim << 1, EX[lg][1]);
}
void fdt(Z* f, int lim) { dif(f, lim << 1), dot(f, lim << 1, EX[__qed::lg2d(lim)][0]), dit(f, lim << 1); }
void ifdt(Z* f, int lim) { dif(f, lim << 1), dot(f, lim << 1, EX[__qed::lg2d(lim)][1]), dit(f, lim << 1); }
void _fdtA(Z* p, int l, int r) {
for (int i = std::max(1, l); i < r; i <<= 1) {
makeseq::E_x(p + i, i), rev(p + i, i);
}
}
void _fdtB(Z* p, int l, int r) {
seqiv.expand(r);
for (int i = std::max(2, l); i < r; i <<= 1) {
Cpy(seqiv(), i, p + i), dif(p + i, i);
}
}
pre_base dA(_fdtA), dB(_fdtB);
void prefdtx2(int lim) { dA.expand(lim), dB.expand(lim << 1); }
void fdtx2(const Z* g, int lim, Z* f) {
dot(g, dA() + lim, lim, f), fl0(f + lim, lim), dif(f, lim << 1), dot(f, lim << 1, dB() + (lim << 1));
dit(f, lim << 1), dot(f + lim, lim, seqifac()), Cpy(g, lim, f);
}
} // namespace f_d_t
using f_d_t::fdt;
using f_d_t::ifdt;
using f_d_t::prefdt;
void Mul(const Z* f, int n, const Z* g, int m, Z* h) {
int lim = bit_up(n + m);
prefdt(lim);
pre_aloc mem;
Z *F = alocS(lim << 1), *G = alocS(lim << 1);
Cpy_fl0(f, n + 1, F, lim << 1), Cpy_fl0(g, m + 1, G, lim << 1);
fdt(F, lim), fdt(G, lim), EGF::dot(F, lim, G);
fl0(F + lim, lim), ifdt(F, lim), Cpy(F, n + m + 1, h);
}
void translate(const Z* g, int n, Z c, Z* f) {
int lim = bit_up(n << 1);
pre_aloc mem;
Z *o = alocS(lim), *h = alocS(lim);
makeseq::mFallingfac(o, n + 1, c);
EGF::fromOGF(o, n), fl0(o + n + 1, o + lim);
dot(g, seqfac(n + 1), n + 1, h), rev(h, n + 1), fl0(h + n + 1, h + lim);
Conv(o, lim, h), Cpy_rev(o, n + 1, f), dot(f, n + 1, seqifac(n + 1));
}
void fall_down(const Z* g, int n, Z* f) {
int lim = bit_up(n);
pre_aloc mem;
Z *o = alocS(lim << 1), *h = alocS(lim), *u = alocS(lim), *r = alocS(lim);
prefdt(lim), f_d_t::prefdtx2(lim);
Cpy_fl0(g, n + 1, o, lim), Cpy(seqi(n + 1), n + 1, u);
for (int i = 0; i <= n; i += 2) {
o[i + 1] = calc::addZ(o[i], o[i + 1]);
}
for (int t = 4, mid = 2, t2 = 8; t <= lim; t <<= 1, mid <<= 1, t2 <<= 1) {
dot(u, n + 1, u);
for (int j = 0; j <= n; j += t) {
f_d_t::fdtx2(o + j, mid, r), f_d_t::fdtx2(o + j + mid, mid, h);
dot(h, u, t, o + j), addP(o + j, t, r);
}
}
fl0(o + lim, lim), ifdt(o, lim), Cpy(o, n + 1, f);
}
void stand_up(const Z* g, int n, Z* f) {
int lim = bit_up(n);
pre_aloc mem;
Z *F = alocS(lim), *G = alocS(lim), *u = alocS(lim), *r = alocS(lim);
predifx2.expand(lim);
std::fill(G, G + lim, one_Z), subP(G, lim, seqi(lim));
Cpy_fl0(g, n + 1, F, lim);
for (int t = 2, mid = 1; t <= lim; t <<= 1, mid <<= 1) {
for (int j = 0; j <= n; j += t) {
Cpy(F + j + mid, mid, u), difx2(u, mid), difx2(F + j, mid);
Cpy(G + j + mid, mid, r), difx2fxC(r, mid), difx2fxC(G + j, mid);
dot(u, t, G + j), addP(F + j, t, u), dot(G + j, t, r);
}
}
dit(F, lim), Cpy(F, n + 1, f);
}
} // namespace Falling_Factorial
namespace Command {
using Stat_Info::report;
void cut_string() { _Exit(0); }
} // namespace Command
struct auto_reporter {
~auto_reporter() { Command::report(); }
} jack;
using poly_base::alocS;
using poly_base::pre_aloc;
using namespace poly_base::Tools;
using namespace field_Z;
void poly_multiply(unsigned* a, int n, unsigned* b, int m, unsigned* c) {
pre_aloc mem;
auto F = alocS(n + 1), G = alocS(m + 1), H = alocS(n + m + 1);
for (int i = 0; i <= n; i++) F[i] = trans<1,0>(a[i]);
for (int i = 0; i <= m; i++) G[i] = trans<1,0>(b[i]);
// scanP(F,n+1);
// scanP(G,m+1);
poly::Mul(F, n + 1, G, m + 1, H);
for (int i = 0; i <= n + m; i++) c[i] =trans<-1>(H[i]);
// printP(H,n+m+1);
// for (int i = 0; i <= n ; i++) std::cerr<<F[i]<<" ";std::cerr<<"\n";
// for (int i = 0; i <= m; i++) std::cerr<<G[i]<<" ";std::cerr<<"\n";
return;
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 44.062 ms | 38 MB + 952 KB | Accepted | Score: 100 | 显示更多 |