#include <bits/stdc++.h>
#pragma GCC target("avx2")
#include <immintrin.h>
/*
No mozheng.
*/
namespace raw_ntt{using Z=uint32_t;using L=uint64_t;using I=std::size_t;using Y=__m256i;using P=std::array<Z,8>;
#define Il inline
#define Co const
#define Ce constexpr
#define bld _mm256_blend_epi32
#define per _mm256_permutevar8x32_epi32
#define Per _mm256_permute2x128_si256
#define shf _mm256_shuffle_epi32
#define Re return
Il void sty(void*p,Y x){_mm256_store_si256((Y*)p,x);}Il Y ldy(Co void*p){Re _mm256_load_si256((Co Y*)p);}Il Y pkd(Z x){Re _mm256_set1_epi32(x);}Il Y sr(Y x){Re _mm256_srli_epi64(x,32);}Il Y ml(Y x,Y y){Re _mm256_mul_epu32(x,y);}Il Y ad(Y x,Y y){Re _mm256_add_epi32(x,y);}Il Y az(Y x,Y y){Re _mm256_add_epi64(x,y);}Il Y sb(Y x,Y y){Re _mm256_sub_epi32(x,y);}Il Y MN(Y x,Y y){Re _mm256_min_epu32(x,y);}Ce Z CTZ(L x){Re __builtin_ctzll(x);}struct fnt32_ifo{Z m,m2,n,O,r2,r3,im,ni;alignas(32)P B,Bi,S3[24],T3[24],S4,U4,T4,V4;L s4[23],u4[23],t4[23],v4[23];Ce Z SK(Z x)Co{Re std::min(x,x-m);}Ce Z MY(Z x,Z y)Co{L z=L(x)*y;Re(z+L(Z(z)*n)*m)>>32;}Ce Z mS(Z x,Z y)Co{Re SK(MY(x,y));}Ce Z qp(Z a,Z b,Z r)Co{for(;b;b>>=1,a=MY(a,a)){if(b&1){r=MY(r,a);}}Re r;}Ce fnt32_ifo(Z P):m(P),m2(m*2),n([&]{Z n=1;for(Z i=0;i<5;++i){n*=2+m*n;}Re n;}()),O((-m)%m),r2((-L(m))%m),r3(mS(r2,r2)),im{},ni{},B{},Bi{},S3{},T3{},S4{},U4{},T4{},V4{},s4{},u4{},t4{},v4{}{Z _g=2,k=CTZ(m-1),R1[25]={},RI[25]={},w[8]={},W[8]={},pr=O,pi=O;for(;SK(qp(_g,m>>1,O))==O;++_g);_g=qp(_g,m>>k,O),R1[k-2]=_g,RI[k-2]=qp(_g,m-2,O),w[0]=O,W[0]=O;for(Z i=k-2;i>0;--i){R1[i-1]=MY(R1[i],R1[i]),RI[i-1]=MY(RI[i],RI[i]);}for(Z i=0;i<3;++i){for(Z j=1<<i,k=0;k<j;++k){w[j+k]=mS(w[k],R1[i]),W[j+k]=mS(W[k],RI[i]);}}im=w[1],ni=n*im,B={w[2],0,im,0,m-w[3]},Bi={W[2],0,W[1],0,W[3]};for(Z i=0;i<k-2;pr=MY(pr,RI[i+1]),pi=MY(pi,R1[i+1]),++i){Z r=mS(pr,R1[i+1]),R=mS(pi,RI[i+1]),r2=mS(r,r),R2=mS(R,R),r3=mS(r,r2),R3=mS(R,R2);S3[i]={r*n,r,r2*n,r2,r3*n,r3},T3[i]={R*n,R,R2*n,R2,R3*n,R3};}S4={w[7],w[0],w[6],w[1],w[5],w[2],w[4],w[3]},U4={w[1],w[3],w[1],w[0],w[0],w[2],w[0],w[1]},pr=O;T4={W[7],W[0],W[6],W[1],W[5],W[2],W[4],W[3]},V4={W[1],W[3],W[1],W[0],W[0],W[2],W[0],W[1]},pi=O;for(Z i=1;i<k-3;pr=MY(pr,RI[i+2]),pi=MY(pi,R1[i+2]),++i){Z r=mS(pr,R1[i+2]),R=mS(pi,RI[i+2]),r2=mS(r,r),R2=mS(R,R),r4=mS(r2,r2),R4=mS(R2,R2);s4[i]=L(r*n)<<32|r,t4[i]=L(R*n)<<32|R,u4[i]=L(r2)<<32|r4,v4[i]=L(R2)<<32|R4;}}void vdt(Y*f,Y*g,I q)Co{Y M=pkd(m),N=pkd(n);auto RD=[&](Y x,Y y){Y c=ml(x,N),d=ml(y,N),cc=ml(c,M),dd=ml(d,M);Re bld(az(y,dd),sr(az(x,cc)),85);};auto mL=[&](Y x,Y y){Re RD(ml(x,y),ml(sr(x),sr(y)));};for(I i=0;i<q;++i){sty(f+i,mL(ldy(f+i),ldy(g+i)));}}template<Z F>void vdf(Y*f,I q)Co{alignas(32)P st[13];Y M=pkd(m),M2=pkd(m2),N=pkd(n),Im=pkd(im),Ni=pkd(ni),Id=_mm256_set_epi32(4,0,2,0,4,0,2,0),Ir=_mm256_set_epi32(0,1,2,3,4,5,6,7);Z lg=CTZ(q);I v=q>>(lg&1),u=std::min<I>(q,64);auto sK=[&](Y x){Re MN(x,sb(x,M2));};auto sk=[&](Y x){Re MN(x,sb(x,M));};auto dL=[&](Y x){Re MN(x,ad(x,M2));};auto Lb=[&](Y x,Y y){Re ad(x,sb(M2,y));};auto AD=[&](Y x,Y y){Re sK(ad(x,y));};auto SB=[&](Y x,Y y){Re dL(sb(x,y));};auto mF=[&](Y x,Y y,Y z){Y e=ml(x,z),f=ml(sr(x),z),c=ml(x,y),d=ml(sr(x),y);e=ml(e,M),f=ml(f,M);Re bld(az(d,f),sr(az(c,e)),85);};auto RD=[&](Y x,Y y){Y c=ml(x,N),d=ml(y,N),cc=ml(c,M),dd=ml(d,M);Re bld(az(y,dd),sr(az(x,cc)),85);};auto mb=[&](Y x,Y y){Re RD(ml(x,y),ml(sr(x),y));};auto mL=[&](Y x,Y y){Re RD(ml(x,y),ml(sr(x),sr(y)));};auto mU=[&](Y x,Y y){Y c=ml(x,y),d=ml(x,sr(y)),e=ml(c,M);Re sk(sr(az(e,d)));};auto mu=[&](Y x,Y y){Re mF(x,y,sr(y));};std::fill(st,st+(lg>>1),F?Bi:B);if constexpr(F){Z fx=mS(m-((m-1)>>(CTZ(q)+3)),r3);Y Fx=pkd(fx),FN=pkd(fx*n),X0=sk(mb(ldy(&T4),Fx)),X1=ldy(&V4);for(I j=0;j<q;j+=u){for(I i=j;i<j+u;i+=2){Y fi=ldy(f+i),gi=ldy(f+i+1),Z0=X0,Z1=X1;Z k=CTZ(i+2);gi=per(gi,Ir);Y hi=bld(fi,gi,85),oi=bld(gi,fi,85);oi=shf(oi,177),fi=mF(ad(oi,hi),Fx,FN),gi=mL(Lb(oi,hi),Z0),fi=shf(fi,177);hi=bld(gi,fi,102),oi=bld(fi,gi,102),oi=shf(oi,78);gi=mb(Lb(oi,hi),sr(Z1)),X1=mL(X1,_mm256_set1_epi64x(v4[k]));fi=AD(oi,hi),X1=sk(X1),hi=_mm256_unpackhi_epi64(fi,gi),oi=_mm256_unpacklo_epi64(gi,fi);fi=Per(hi,oi,3),gi=Per(hi,oi,18),oi=mb(Lb(fi,gi),Z1),X0=mu(X0,_mm256_set1_epi64x(t4[k]));hi=AD(fi,gi),X0=sk(X0),gi=Per(hi,oi,2),fi=Per(hi,oi,49),gi=per(gi,Ir),sty(f+i,fi),sty(f+i+1,gi);}Z T=CTZ(j+u),t=2,p=0;for(I l=4,L=1;t<=T;L=l,l<<=2,t+=2,++p){I D=j+u-std::max(l,u),i=0;Y rt=ldy(st+p),*g=f+D;if(!D){for(I i=0;i<L;++i){auto p0=f+i,p1=p0+L,p2=p1+L,p3=p2+L;Y f2=ldy(p2),f3=ldy(p3),f0=ldy(p0),f1=ldy(p1),g3=mF(Lb(f3,f2),Im,Ni),g2=AD(f2,f3),g0=AD(f0,f1),g1=SB(f0,f1),h0=AD(g0,g2),h1=AD(g1,g3),h2=SB(g0,g2),h3=SB(g1,g3);if(l==q){h0=sk(h0),h1=sk(h1),h2=sk(h2),h3=sk(h3);}sty(p0,h0),sty(p1,h1),sty(p2,h2),sty(p3,h3);}i=l;}for(I k=(j+i)>>t;i<u;i+=l,++k){Y r1=per(rt,Id),N1=per(ml(rt,N),Id),r2=shf(r1,85),r3=shf(r1,255),N2=shf(N1,85),N3=shf(N1,255);rt=mU(rt,ldy(T3+CTZ(~k)));for(I j=0;j<L;++j){auto p0=g+i+j,p1=p0+L,p2=p1+L,p3=p2+L;Y f2=ldy(p2),f3=ldy(p3),f0=ldy(p0),f1=ldy(p1),g3=mF(Lb(f3,f2),Im,Ni),g2=AD(f2,f3),g0=AD(f0,f1),g1=SB(f0,f1),h2=Lb(g0,g2),h3=Lb(g1,g3),h0=ad(g0,g2),h1=ad(g1,g3),o2=mF(h2,r2,N2),o0=sK(h0),o1=mF(h1,r1,N1),o3=mF(h3,r3,N3);sty(p0,o0),sty(p1,o1),sty(p2,o2),sty(p3,o3);}}sty(st+p,rt);}}if(v!=q){for(I i=0;i<v;++i){Y P=ldy(f+i),Q=ldy(f+v+i),R=AD(P,Q),S=SB(P,Q);sty(f+i,sk(R)),sty(f+v+i,sk(S));}}}else{Y X0=ldy(&S4),X1=ldy(&U4);if(v!=q){for(I i=0;i<v;++i){Y P=ldy(f+i),Q=ldy(f+v+i),R=AD(P,Q),S=Lb(P,Q);sty(f+i,R),sty(f+v+i,S);}}for(I L=v>>2;L;L>>=2){for(I i=0;i<L;++i){auto p0=f+i,p1=p0+L,p2=p1+L,p3=p2+L;Y f1=ldy(p1),f3=ldy(p3),f2=ldy(p2),f0=ldy(p0),g3=mF(Lb(f1,f3),Im,Ni),g1=AD(f1,f3),g0=AD(f0,f2),g2=SB(f0,f2),h0=AD(g0,g1),h1=Lb(g0,g1),h2=ad(g2,g3),h3=Lb(g2,g3);sty(p0,h0),sty(p1,h1),sty(p2,h2),sty(p3,h3);}}Z t=std::min<Z>(6,lg)&-2,p=(t-2)>>1;for(I j=0;j<q;t=CTZ(j+=u)&-2,p=(t-2)>>1){for(I l=I(1)<<t,L=l>>2;L;l=L,L>>=2,t-=2,--p){Y rt=ldy(st+p),*g=f+j;for(I i=(j==0?l:0),k=(j+i)>>t;i<u;i+=l,++k){Y r1=per(rt,Id),N1=per(ml(rt,N),Id),r2=shf(r1,85),r3=shf(r1,255),N2=shf(N1,85),N3=shf(N1,255);rt=mU(rt,ldy(S3+CTZ(~k)));for(I j=0;j<L;++j){auto p0=g+i+j,p1=p0+L,p2=p1+L,p3=p2+L;Y f1=ldy(p1),f3=ldy(p3),f2=ldy(p2),f0=ldy(p0),g1=mF(f1,r1,N1),g3=mF(f3,r3,N3),g2=mF(f2,r2,N2),g0=sK(f0),h3=mF(ad(g1,g3),Im,Ni),h1=SB(g1,g3),h0=AD(g0,g2),h2=SB(g0,g2),o0=ad(h0,h1),o1=Lb(h0,h1),o2=ad(h2,h3),o3=Lb(h2,h3);sty(p0,o0),sty(p1,o1),sty(p2,o2),sty(p3,o3);}}sty(st+p,rt);}for(I i=j;i<j+u;i+=2){Y fi=ldy(f+i),gi=ldy(f+i+1),Z0=X0,Z1=X1;Z k=CTZ(i+2);gi=per(gi,Ir);Y hi=bld(fi,gi,15),oi=bld(gi,fi,15);fi=mb(hi,Z1),X0=mu(X0,_mm256_set1_epi64x(s4[k]));gi=sK(Per(oi,oi,1)),X0=sk(X0),hi=ad(fi,gi),oi=Lb(gi,fi),fi=Per(hi,oi,49),gi=Per(hi,oi,2);hi=bld(fi,gi,51),oi=bld(gi,fi,51),X1=mL(X1,_mm256_set1_epi64x(u4[k])),fi=mb(hi,sr(Z1));gi=sK(shf(oi,78)),X1=sk(X1),hi=ad(fi,gi),oi=Lb(gi,fi);fi=_mm256_unpackhi_epi64(hi,oi),gi=_mm256_unpacklo_epi64(oi,hi);hi=bld(fi,gi,85),oi=bld(gi,fi,85),fi=mL(hi,Z0),gi=sK(shf(oi,177)),hi=AD(fi,gi),oi=SB(gi,fi);hi=shf(hi,177),gi=bld(hi,oi,85),fi=bld(oi,hi,85),gi=per(gi,Ir),sty(f+i,fi),sty(f+i+1,gi);}}}}};
#undef bld
#undef per
#undef Per
#undef shf
#undef Il
#undef Co
#undef Ce
#undef Re
}
alignas(32)uint32_t F[1<<21],G[1<<21];
constexpr int bcl(int x){return x<2?1:2<<std::__lg(x-1);}
void poly_multiply(unsigned*a,int n,unsigned*b,int m,unsigned*c){
constexpr uint32_t mod=998244353;
constexpr raw_ntt::fnt32_ifo fnt(mod);
int lm=bcl(std::max(16,n+m+1));
memcpy(F,a,(n+1)*4),memset(F+n+1,0,(lm-n-1)*4);
memcpy(G,b,(m+1)*4),memset(G+m+1,0,(lm-m-1)*4);
fnt.vdf<0>((__m256i*)F,lm>>3);
fnt.vdf<0>((__m256i*)G,lm>>3);
fnt.vdt((__m256i*)F,(__m256i*)G,lm>>3);
fnt.vdf<1>((__m256i*)F,lm>>3);
memcpy(c,F,(n+m+1)*4);
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 2.613 ms | 16 MB + 32 KB | Runtime Error | Score: 0 | 显示更多 |