#define __AVX__ 1
#define __AVX2__ 1
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE2_MATH__ 1
#define __SSE3__ 1
#define __SSE4_1__ 1
#define __SSE4_2__ 1
#define __SSE_MATH__ 1
#define __SSSE3__ 1
#pragma GCC optimize("Ofast,no-stack-protector,unroll-loops,fast-math")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4.1,sse4.2,avx,avx2,popcnt,tune=native")
#include<immintrin.h>
#include<bits/stdc++.h>
#define ADD _mm256_add_epi64
#define SUB _mm256_sub_epi64
#define MUL _mm256_mul_epi32
#define SHIFT _mm256_srli_epi64
const int maxn = 1 << 18;
const int mod = 81788929;
const int inv2 = mod + 1 >> 1;
typedef long long ll;
int n, m;
typedef unsigned long long u64;
typedef unsigned u32;
using AI = u32[maxn];
struct istream {
static const int size = 1 << 19;
char buf[size], *vin;
inline istream() {
scanf("%d%d\n",&n,&m);
fread(buf,1,size,stdin);
vin = buf - 1;
}
inline istream& operator >> (u32 & x) {
x = *++vin & 15, ++ vin;
return * this;
}
} cin;
struct ostream {
static const int size = 1 << 23;
char buf[size], *vout;
unsigned map[10000];
inline ostream() {
for(int i = 0;i < 10000;++i)
map[i] = i % 10 + 48 << 24 | i / 10 % 10 + 48 << 16 | i / 100 % 10 + 48 << 8 | i / 1000 + 48;
vout = buf + size;
}
inline ~ ostream()
{ fwrite(vout,1,buf + size - vout,stdout); }
inline ostream& operator << (u32 x) {
if(x) {
for(;x >= 1000;x /= 10000) *--(unsigned*&)vout = map[x % 10000];
while(x) *--vout = x % 10 + 48, x /= 10;
} else *--vout = 48;
return * this;
}
inline ostream& operator << (char x)
{
*--vout = x;
return * this;
}
} cout;
inline int pow(int a,int b,int ans = 1) {
for(;b;b >>= 1,a = (ll) a * a % mod) if(b & 1)
ans = (ll) ans * a % mod;
return ans;
}
AI wn, rev, wn_;
u64 _1[maxn], _2[maxn];
inline u32 getinv(u32 x) { return ((u64) x << 31) / mod; }
typedef __m256i LL;
inline void print(LL x) {
ll * o = (ll*) & x;
printf("%llu %llu %llu %llu\n", o[0], o[1], o[2], o[3]);
}
namespace poly
{
int lim,invlim;
inline void init(int len) {
for(lim = invlim = 1;lim < len;lim <<= 1) invlim = (ll) invlim * inv2 % mod;
for(int i = 1;i < lim;++i) rev[i] = rev[i >> 1] >> 1 | (i % 2 * lim / 2);
const int W = pow(7,mod / lim); wn[lim >> 1] = 1; wn_[lim >> 1] = getinv(1);
for(int i = 1;i + i < lim;++i)
wn[(lim >> 1) + i] = (u64) wn[(lim >> 1) + i - 1] * W % mod, wn_[(lim >> 1) + i] = getinv(wn[(lim >> 1) + i]);
for(int i = lim >> 1;--i >= 1;) wn[i] = wn[i << 1], wn_[i] = wn_[i << 1];
for(int i = 0;i < lim;++i) _1[i] = wn[i], _2[i] = wn_[i];
}
inline void fft(u32 * a,int type) {
for(int i = 0;i < lim;++i) if(rev[i] > i) std::swap(a[i], a[rev[i]]);
for(int mid = 1;mid < lim;mid <<= 1) {
if(mid <= 2) {
for(int i = 0;i < lim;i += mid + mid) for(int j = 0;j < mid;++j) {
const u32 x = a[i + j + mid] * wn[mid + j] - u32((u64) wn_[mid + j] * a[i + j + mid] >> 31) * mod;
a[i + j + mid] = mod + mod - x + a[i + j]; a[i + j] += x;
}
} else {
const LL mod = _mm256_set_epi64x(::mod, ::mod, ::mod, ::mod);
const LL Tmod = ADD(mod, mod);
static u64 A[maxn];
if(mid == 4) for(int i = 0;i < lim;++i) A[i] = a[i];
LL * aL = (LL*) A, * wnL = (LL*) (mid + _1), *wn_L = (LL*) (mid + _2);
const int M = mid / 4;
for(int i = 0;i < lim / 4;i += M + M) for(int j = 0;j < M;++j) {
LL x = SUB(MUL(aL[M + i + j], wnL[j]), MUL(mod, SHIFT(MUL(aL[M + i + j], wn_L[j]), 31)));
aL[M + i + j] = SUB(ADD(aL[i + j], Tmod), x);
aL[i + j] = ADD(aL[i + j], x);
}
if(mid + mid == lim) for(int i = 0;i < lim;++i) a[i] = A[i];
}
}
if(!type) {
for(int i = 0;i < lim;++i) a[i] = (u64) a[i] * invlim % mod;
std::reverse(a + 1,a + lim);
}
}
}
AI a, b;
int main() {
for(int i = 0;i <= n;++i) cin >> a[i];
for(int i = 0;i <= m;++i) cin >> b[i];
poly::init(n + m + 1);
poly::fft(a, 1);
poly::fft(b, 1);
for(int i = 0;i < poly::lim;++i) a[i] = (ll) a[i] * b[i] % mod;
poly::fft(a, 0);
for(int i = n + m;i >= 0;--i) cout << ' ' << a[i];
}
| Compilation | N/A | N/A | Compile Error | Score: N/A | 显示更多 |