#pragma GCC optimize("-Ofast","-funroll-all-loops","-ffast-math")
#pragma GCC optimize("-fno-math-errno")
#pragma GCC optimize("-funsafe-math-optimizations")
#pragma GCC optimize("-freciprocal-math")
#pragma GCC optimize("-fno-trapping-math")
#pragma GCC optimize("-ffinite-math-only")
#pragma GCC optimize("-fno-stack-protector")
#pragma GCC target ("avx2")
#include <immintrin.h>
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <string>
#include <bitset>
#include <vector>
#include <set>
#include <map>
#include <queue>
#include <algorithm>
#include <sstream>
#include <stack>
#include <iomanip>
#include <assert.h>
using namespace std;
#define pb push_back
#define mp make_pair
typedef pair<int,int> pii;
typedef long long ll;
typedef double ld;
typedef vector<int> vi;
#define fi first
#define se second
#define fe first
#define FO(x) {freopen(#x".in","r",stdin);freopen(#x".out","w",stdout);}
#define Edg int M=0,fst[SZ],vb[SZ],nxt[SZ];void ad_de(int a,int b){++M;nxt[M]=fst[a];fst[a]=M;vb[M]=b;}void adde(int a,int b){ad_de(a,b);ad_de(b,a);}
#define Edgc int M=0,fst[SZ],vb[SZ],nxt[SZ],vc[SZ];void ad_de(int a,int b,int c){++M;nxt[M]=fst[a];fst[a]=M;vb[M]=b;vc[M]=c;}void adde(int a,int b,int c){ad_de(a,b,c);ad_de(b,a,c);}
#define es(x,e) (int e=fst[x];e;e=nxt[e])
#define esb(x,e,b) (int e=fst[x],b=vb[e];e;e=nxt[e],b=vb[e])
#define SZ 2345678
const int MOD=998244353;
ll qp(ll a,ll b)
{
ll ans=1;
while(b)
{
if(b&1) ans=ans*a%MOD;
a=a*a%MOD; b>>=1;
}
return ans;
}
int getK(int n)
{int s=1; while(s<n) s<<=1; return s;}
typedef unsigned us;
typedef unsigned long long ull;
us pool[SZ*4] __attribute__ ((aligned(64))),*ptr=pool;
us *p0[SZ],*p1[SZ],*q0[SZ],*q1[SZ];
__attribute__((always_inline)) void bit_flip(us*p,int t)
{
for(int i=0,j=0;i<t;++i)
{
if(i>j) swap(p[i],p[j]);
for(int l=t>>1;(j^=l)<l;l>>=1);
}
}
void prep(int n)
{
static int t=1;
for(;t<n;t<<=1)
{
int g=qp(3,(MOD-1)/(t*2));
us*p,*q;
p=p0[t]=ptr; ptr+=max(t,16); p[0]=1;
for(int m=1;m<t;++m)
p[m]=p[m-1]*(ull)g%us(MOD);
bit_flip(p,t);
q=q0[t]=ptr; ptr+=max(t,16);
for(int i=0;i<t;++i)
q[i]=(ull(p[i])<<32)/MOD;
g=qp(g,MOD-2);
p=p1[t]=ptr; ptr+=max(t,16); p[0]=1;
for(int m=1;m<t;++m)
p[m]=p[m-1]*(ull)g%us(MOD);
bit_flip(p,t);
q=q1[t]=ptr; ptr+=max(t,16);
for(int i=0;i<t;++i)
q[i]=(ull(p[i])<<32)/MOD;
}
}
typedef unsigned long long ull;
__attribute__((always_inline)) us my_mul(us a,us b,us c)
{
return b*(ull)a-((ull(a)*c)>>32)*ull(998244353);
}
__attribute__((always_inline)) __m128i my_mullo_epu32(const __m128i&a, const __m128i& b)
{
#if defined ( __SSE4_2__ )
return _mm_mullo_epi32(a,b);
#else
__m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
// __m128i prod01 = ; // (-,-,a1*b1,a0*b0)
// __m128i prod23 = ; // (-,-,a3*b3,a2*b2)
__m128i prod = _mm_unpacklo_epi64(_mm_unpacklo_epi32(prod02,prod13),_mm_unpackhi_epi32(prod02,prod13)); // (ab3,ab2,ab1,ab0)
return prod;
#endif
}
__attribute__((always_inline)) __m128i my_mulhi_epu32(const __m128i&a, const __m128i& b)
{
__m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(a, b); // (a2*b2,-,a0*b0,-)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (a3*b3,-,a1*b1,-)
// __m128i prod01 = ; // (a1*b1,a0*b0,-,-)
// __m128i prod23 = ; // (a3*b3,a2*b2,-,-)
__m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02,prod13),_mm_unpackhi_epi32(prod02,prod13)); // (ab3,ab2,ab1,ab0)
return prod;
}
void ntt(us* __restrict__ x,int n)
{
int t=n;
for(int m=1;m<n;m<<=1)
{
t>>=1;
us* __restrict__ p=p0[m];
us* __restrict__ q=q0[m];
if(t==1||t==2)
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
for(int j=0;j<t;++j)
{
us u=xa[j]-(xa[j]>=us(MOD+MOD))*us(MOD+MOD);
us v=my_mul(xb[j],p[i],q[i]);
xa[j]=u+v;
xb[j]=u-v+us(MOD+MOD);
}
}
else if(t==4)
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
{
const __m128i p4=_mm_set1_epi32(p[i]),
q4=_mm_set1_epi32(q[i]),
mm=_mm_set1_epi32(MOD+MOD),
m0=_mm_set1_epi32(0),
m1=_mm_set1_epi32(MOD);
for(int j=0;j<t;j+=4)
{
__m128i u=_mm_load_si128((__m128i*)(xa+j));
u=_mm_sub_epi32(u,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u,mm),
_mm_cmpgt_epi32(m0,u)),mm));
__m128i v=_mm_load_si128((__m128i*)(xb+j));
v=_mm_sub_epi32(my_mullo_epu32(v,p4),
my_mullo_epu32(my_mulhi_epu32(v,q4),m1));
_mm_store_si128((__m128i*)(xa+j),_mm_add_epi32(u,v));
_mm_store_si128((__m128i*)(xb+j),_mm_add_epi32(_mm_sub_epi32(u,v),mm));
}
}
}
else
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
{
const __m128i p4=_mm_set1_epi32(p[i]),
q4=_mm_set1_epi32(q[i]),
mm=_mm_set1_epi32(MOD+MOD),
m0=_mm_set1_epi32(0),
m1=_mm_set1_epi32(MOD);
//unfold 2x
for(int j=0;j<t;j+=8)
{
__m128i u0=_mm_load_si128((__m128i*)(xa+j));
__m128i u1=_mm_load_si128((__m128i*)(xa+j+4));
__m128i v0=_mm_load_si128((__m128i*)(xb+j));
__m128i v1=_mm_load_si128((__m128i*)(xb+j+4));
u0=_mm_sub_epi32(u0,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u0,mm),
_mm_cmpgt_epi32(m0,u0)),mm));
u1=_mm_sub_epi32(u1,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u1,mm),
_mm_cmpgt_epi32(m0,u1)),mm));
v0=_mm_sub_epi32(my_mullo_epu32(v0,p4),
my_mullo_epu32(my_mulhi_epu32(v0,q4),m1));
v1=_mm_sub_epi32(my_mullo_epu32(v1,p4),
my_mullo_epu32(my_mulhi_epu32(v1,q4),m1));
_mm_store_si128((__m128i*)(xa+j),_mm_add_epi32(u0,v0));
_mm_store_si128((__m128i*)(xa+j+4),_mm_add_epi32(u1,v1));
_mm_store_si128((__m128i*)(xb+j),
_mm_add_epi32(_mm_sub_epi32(u0,v0),mm));
_mm_store_si128((__m128i*)(xb+j+4),
_mm_add_epi32(_mm_sub_epi32(u1,v1),mm));
}
}
}
}
for(int i=0;i<n;++i)
x[i]-=(x[i]>=us(MOD+MOD))*us(MOD+MOD),
x[i]-=(x[i]>=us(MOD))*us(MOD);
}
void intt(us* __restrict__ x,int n)
{
int t=1;
for(int m=(n>>1);m;m>>=1)
{
us* __restrict__ p=p1[m];
us* __restrict__ q=q1[m];
if(t==1||t==2)
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
for(int j=0;j<t;++j)
{
us u=xa[j],v=xb[j];
xa[j]=u+v-(u+v>=us(MOD+MOD))*us(MOD+MOD);
xb[j]=my_mul(u-v+us(MOD+MOD),p[i],q[i]);
}
}
else if(t==4)
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
{
const __m128i p4=_mm_set1_epi32(p[i]),
q4=_mm_set1_epi32(q[i]),
mm=_mm_set1_epi32(MOD+MOD),
m0=_mm_set1_epi32(0),
m1=_mm_set1_epi32(MOD);
for(int j=0;j<t;j+=4)
{
__m128i u=_mm_load_si128((__m128i*)(xa+j));
__m128i v=_mm_load_si128((__m128i*)(xb+j));
__m128i uv=_mm_add_epi32(u,v);
_mm_store_si128((__m128i*)(xa+j),_mm_sub_epi32(uv,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv,mm),
_mm_cmpgt_epi32(m0,uv)),mm)));
uv=_mm_add_epi32(_mm_sub_epi32(u,v),mm);
_mm_store_si128((__m128i*)(xb+j),
_mm_sub_epi32(my_mullo_epu32(uv,p4),
my_mullo_epu32(my_mulhi_epu32(uv,q4),m1)));
}
}
}
else
{
us *xa=x,*xb=x+t;
for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
{
const __m128i p4=_mm_set1_epi32(p[i]),
q4=_mm_set1_epi32(q[i]),
mm=_mm_set1_epi32(MOD+MOD),
m0=_mm_set1_epi32(0),
m1=_mm_set1_epi32(MOD);
//unfold 2x
for(int j=0;j<t;j+=8)
{
__m128i u0=_mm_load_si128((__m128i*)(xa+j));
__m128i u1=_mm_load_si128((__m128i*)(xa+j+4));
__m128i v0=_mm_load_si128((__m128i*)(xb+j));
__m128i v1=_mm_load_si128((__m128i*)(xb+j+4));
__m128i uv0=_mm_add_epi32(u0,v0);
__m128i uv1=_mm_add_epi32(u1,v1);
_mm_store_si128((__m128i*)(xa+j),_mm_sub_epi32(uv0,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv0,mm),
_mm_cmpgt_epi32(m0,uv0)),mm)));
_mm_store_si128((__m128i*)(xa+j+4),_mm_sub_epi32(uv1,
_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv1,mm),
_mm_cmpgt_epi32(m0,uv1)),mm)));
uv0=_mm_add_epi32(_mm_sub_epi32(u0,v0),mm);
uv1=_mm_add_epi32(_mm_sub_epi32(u1,v1),mm);
_mm_store_si128((__m128i*)(xb+j),
_mm_sub_epi32(my_mullo_epu32(uv0,p4),
my_mullo_epu32(my_mulhi_epu32(uv0,q4),m1)));
_mm_store_si128((__m128i*)(xb+j+4),
_mm_sub_epi32(my_mullo_epu32(uv1,p4),
my_mullo_epu32(my_mulhi_epu32(uv1,q4),m1)));
}
}
}
t<<=1;
}
us rn=qp(n,MOD-2);
for(int i=0;i<n;++i)
x[i]=x[i]*(ull)rn%MOD;
}
//copied from http://uoj.ac/submission/386710
typedef unsigned uint;
struct IO_Tp
{
const static int _I_Buffer_Size = 2 << 20;
char _I_Buffer[_I_Buffer_Size], *_I_pos = _I_Buffer;
const static int _O_Buffer_Size = 2 << 20;
char _O_Buffer[_O_Buffer_Size], *_O_pos = _O_Buffer;
uint m[10000];
IO_Tp()
{
constexpr uint e0 = '\0\0\0\1', e1 = '\0\0\1\0', e2 = '\0\1\0\0', e3 = '\1\0\0\0';
int x = 0;
for (uint i = 0, c0 = '0000'; i != 10; ++i, c0 += e0)
for (uint j = 0, c1 = c0; j != 10; ++j, c1 += e1)
for (uint k = 0, c2 = c1; k != 10; ++k, c2 += e2)
for (uint l = 0, c3 = c2; l != 10; ++l, c3 += e3)
m[x++] = c3;
fread(_I_Buffer, 1, _I_Buffer_Size, stdin);
}
~IO_Tp() { fwrite(_O_Buffer, 1, _O_pos - _O_Buffer, stdout); }
IO_Tp &operator>>(int &res)
{
while (!isdigit(*_I_pos))
++_I_pos;
res = *_I_pos++ - '0';
while (isdigit(*_I_pos))
res = res * 10 + (*_I_pos++ - '0');
return *this;
}
IO_Tp &operator>>(uint &res)
{
while (!isdigit(*_I_pos))
++_I_pos;
res = *_I_pos++ - '0';
while (isdigit(*_I_pos))
res = res * 10 + (*_I_pos++ - '0');
return *this;
}
IO_Tp &operator<<(uint x)
{
if (x == 0)
{
*_O_pos++ = '0';
return *this;
}
static char _buf[12];
char *_pos = _buf + 12;
if (x >= 10000)
*--reinterpret_cast<uint *&>(_pos) = m[x % 10000], x /= 10000;
if (x >= 10000)
*--reinterpret_cast<uint *&>(_pos) = m[x % 10000], x /= 10000;
*--reinterpret_cast<uint *&>(_pos) = m[x];
_pos += (x < 1000) + (x < 100) + (x < 10);
_O_pos = std::copy(_pos, _buf + 12, _O_pos);
return *this;
}
IO_Tp &operator<<(char ch)
{
*_O_pos++ = ch;
return *this;
}
} IO;
int N,M;
us a[2345678] __attribute__ ((aligned(64)));
us b[2345678] __attribute__ ((aligned(64)));
#ifdef LOCAL
int main()
{
N=100000; M=100000; int t=N+M-1;
for(int i=0;i<N;i++) a[i]=i%9;
for(int i=0;i<M;i++) b[i]=i%8;
printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
int K=getK(t); prep(K);
printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
for(int i=0;i<10;++i)
{
ntt(a,K); ntt(b,K);
for(int i=0;i<K;i++) a[i]=(ll)a[i]*b[i]%MOD;
intt(a,K);
ll su=0;
for(int i=0;i<t;i++) su+=a[i]%MOD;
printf("%lld\n",su);
}
cout<<"ANS=99663681227949"<<"\n";
cout<<"ans=998363825160779\n";
printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
}
#else
int main()
{
IO>>N; IO>>M; ++N; ++M; int t=N+M-1;
for(int i=0;i<N;i++) IO>>a[i];
for(int i=0;i<M;i++) IO>>b[i];
int K=getK(t); prep(K);
ntt(a,K); ntt(b,K);
for(int i=0;i<K;i++) a[i]=(ll)a[i]*b[i]%MOD;
intt(a,K); for(int i=0;i<t;i++) IO<<a[i]<<' ';
}
#endif
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Subtask #1 Testcase #1 | 42.1 us | 116 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #2 | 18.997 ms | 9 MB + 488 KB | Accepted | Score: 100 | 显示更多 |
Subtask #1 Testcase #3 | 8.537 ms | 3 MB + 1004 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #4 | 8.49 ms | 3 MB + 980 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #5 | 45.18 us | 116 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #6 | 42.25 us | 116 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #7 | 42.42 us | 116 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #8 | 17.783 ms | 8 MB + 908 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #9 | 17.787 ms | 8 MB + 908 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #10 | 16.598 ms | 8 MB + 300 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #11 | 18.993 ms | 9 MB + 648 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #12 | 13.432 ms | 7 MB + 408 KB | Accepted | Score: 0 | 显示更多 |
Subtask #1 Testcase #13 | 40.53 us | 100 KB | Accepted | Score: 0 | 显示更多 |