#include<iostream>
#include<cstring>
#include<algorithm>
#include<vector>
#include<set>
#include<cstdlib>
#include<map>
#include<bitset>
#include<random>
#include<ctime>
#include<stdint.h>
#include<queue>
#include<cassert>
using namespace std;
#pragma GCC target("avx2")
#include<immintrin.h>
typedef unsigned int u32;
typedef unsigned long long u64;
typedef __uint128_t u128;
typedef __m256i i256;
typedef u32 u32x8 __attribute((vector_size(32)));
typedef u32 u32x4 __attribute((vector_size(16)));
typedef u64 u64x8 __attribute((vector_size(64)));
typedef u64 u64x4 __attribute((vector_size(32)));
namespace PolyTemplate{
constexpr u32 mod=998244353;
constexpr u32 N_siz_lg2=21;
constexpr u32 N_basic=(1<<N_siz_lg2)+1;
constexpr u32 N_ntt_lg2=N_siz_lg2;
constexpr u32 N_mul_lg2=N_siz_lg2;
constexpr u32 N_inv_lg2=N_siz_lg2;
constexpr u32 N_exp_lg2=N_siz_lg2;
constexpr u32 N_sqrtinv_lg2=N_siz_lg2;
namespace Montgomery{
constexpr u32 get_nr(const u32& m){u32 res=1;for(u32 i=0;i<5;++i)res*=2u-m*res;return res;}
constexpr u32 mod_inv=get_nr(mod),nmod_inv=-mod_inv;
constexpr u32 mod_2=mod<<1;
constexpr u32 R32m=(-mod)%mod,R64m=(-(u64)mod)%mod;
constexpr u32 one=R32m,two=(R32m<<1);
constexpr u32 reduce(const u64& x){return x+u64(u32(x)*nmod_inv)*mod>>32;}
constexpr u32 reduce_s(const u64& x){u32 res=(x-u64(u32(x)*mod_inv)*mod)>>32;return (res>>31)?res+mod:res;}
constexpr u32 in_Mont(const u32& x){return reduce((u64)x*R64m);}
constexpr u32 in_Mont_s(const u32& x){return reduce_s((u64)x*R64m);}
constexpr u32 out_Mont(const u32& x){return reduce_s(x);}
constexpr u32 mul(const u32& x,const u32& y){return reduce((u64)x*y);}
constexpr u32 mul_s(const u32& x,const u32& y){return reduce_s((u64)x*y);}
constexpr u32 adm1(const u32& x){return (x>>31)?x+mod:x;}
constexpr u32 adm2(const u32& x){return (x>>31)?x+mod_2:x;}
constexpr u32 shrk(const u32& x){return (x<mod)?x:x-mod;}
constexpr u32 sum(const u32& x,const u32& y){return adm2(x+y-mod_2);}
constexpr u32 sub(const u32& x,const u32& y){return adm2(x-y);}
constexpr u32 neg(const u32& x){return mod_2-x;}
constexpr u32 mhalf(const u32& x){return x+(x&1)*mod>>1;}
constexpr u32 mhalf_s(const u32& x){return mhalf(shrk(x));}
i256 i256_zero=i256((u32x8){0,0,0,0,0,0,0,0});
#define i_blend(mask,x,y) ((u32x8)_mm256_blend_epi32(i256(x),i256(y),mask))
#define i_shuffle(mask,x) ((u32x8)_mm256_shuffle_epi32(i256(x),mask))
#define min_u32x8(x,y) ((u32x8)_mm256_min_epu32(i256(x),i256(y)))
#define mul_t64(x,y) ((u64x4)_mm256_mul_epu32(i256(x),i256(y)))
#define i_swap128(x) ((u32x8)_mm256_permute2x128_si256(i256(x),i256_zero,0x01))
constexpr u32x8 fux8(const u32& x){return (u32x8){x,x,x,x,x,x,x,x};}
constexpr u32x8 mod_invx8=fux8(mod_inv),nmod_invx8=fux8(nmod_inv);
constexpr u32x8 modx8=fux8(mod),mod_2x8=fux8(mod_2);
constexpr u32x8 R32mx8=fux8(R32m);
u32x8 adm1(const u32x8& x){return min_u32x8(x,x+modx8);}
u32x8 adm2(const u32x8& x){return min_u32x8(x,x+mod_2x8);}
u32x8 shrk(const u32x8& x){return min_u32x8(x,x-modx8);}
u32x8 shrk2(const u32x8& x){return min_u32x8(x,x-mod_2x8);}
u32x8 sum(const u32x8& x,const u32x8& y){return shrk2(x+y);}
u32x8 sub(const u32x8& x,const u32x8& y){return adm2(x-y);}
constexpr u32x8 neg(const u32x8& x){return mod_2x8-x;}
template<int mask> u32x8 neg(const u32x8& x){return i_blend(mask,x,mod_2x8-x);}
u32x8 mul(const u32x8& x,const u32x8& y){
u32x8 z=x*y*nmod_invx8;
return i_blend(0xaa,mul_t64(x,y)+mul_t64(z,modx8)>>32,
mul_t64(i_shuffle(0xf5,x),i_shuffle(0xf5,y))+mul_t64(i_shuffle(0xf5,z),modx8));
}
u32x8 mul_s(const u32x8& x,const u32x8& y){return shrk(mul(x,y));}
}
using namespace Montgomery;
constexpr u32 getsiz(const u32& x){return 31-__builtin_clz(x<<1|1);}
template<bool strict=false> constexpr u32 power(u32 x,u32 y,u32 z=one){
for(;y;y>>=1,x=mul(x,x))if(y&1)z=mul(z,x);
if constexpr(strict)z=shrk(z);
return z;
}
constexpr u32 omega[24]={301989884,696254469,691295370,888603487,54518517,568407860,232329203,34051923,5152499,745245250,824925969,816771355,773589419,272471756,671492362,269053879,50676799,280400066,654830683,441361737,219694500,173964571,260083352,781371651};
constexpr u32 omega_inv[24]={301989884,696254469,306948983,307583142,948373842,915129526,188046825,307336038,661378605,859292650,446928399,99752712,592954623,154089990,748293021,861173054,266982935,848074129,685667824,255387832,323328994,662576621,551015594,956329333};
constexpr u32x8 omegax8[24]={fux8(omega[0]),fux8(omega[1]),fux8(omega[2]),fux8(omega[3]),fux8(omega[4]),fux8(omega[5]),fux8(omega[6]),fux8(omega[7]),fux8(omega[8]),fux8(omega[9]),fux8(omega[10]),fux8(omega[11]),fux8(omega[12]),fux8(omega[13]),fux8(omega[14]),fux8(omega[15]),fux8(omega[16]),fux8(omega[17]),fux8(omega[18]),fux8(omega[19]),fux8(omega[20]),fux8(omega[21]),fux8(omega[22]),fux8(omega[23])};
constexpr u32x8 omega_invx8[24]={fux8(omega_inv[0]),fux8(omega_inv[1]),fux8(omega_inv[2]),fux8(omega_inv[3]),fux8(omega_inv[4]),fux8(omega_inv[5]),fux8(omega_inv[6]),fux8(omega_inv[7]),fux8(omega_inv[8]),fux8(omega_inv[9]),fux8(omega_inv[10]),fux8(omega_inv[11]),fux8(omega_inv[12]),fux8(omega_inv[13]),fux8(omega_inv[14]),fux8(omega_inv[15]),fux8(omega_inv[16]),fux8(omega_inv[17]),fux8(omega_inv[18]),fux8(omega_inv[19]),fux8(omega_inv[20]),fux8(omega_inv[21]),fux8(omega_inv[22]),fux8(omega_inv[23])};
constexpr u32 inv2[24]={301989884,150994942,75497471,536870912,268435456,134217728,67108864,33554432,16777216,8388608,4194304,2097152,1048576,524288,262144,131072,65536,32768,16384,8192,4096,2048,1024,512};
namespace DFT_prework{
constexpr u32 omega_prew1[21]={888603487,948373842,468421415,86131707,330734384,533556548,839104681,684645011,923358846,497787421,462449747,700146798,344023611,396787232,708825978,878978812,946564978,854529028,605475755,779834649,697214290};
constexpr u32 omega_prew2[21]={691295370,307583142,566821959,878217029,375146819,138254384,500602490,79119218,790898700,978335284,651424567,308706579,723000027,474797508,683394121,44141573,536892010,945865189,175417726,536169764,831722880};
constexpr u32 omega_prew3[21]={690661211,859521105,429836493,525055482,870009876,545164023,646656617,584256781,257102378,144111458,960130140,464540497,378864839,215784499,107991247,579280589,679126981,900012892,861330565,674987752,528970505};
constexpr u32x8 omega_prew1x8[21]={fux8(omega_prew1[0]),fux8(omega_prew1[1]),fux8(omega_prew1[2]),fux8(omega_prew1[3]),fux8(omega_prew1[4]),fux8(omega_prew1[5]),fux8(omega_prew1[6]),fux8(omega_prew1[7]),fux8(omega_prew1[8]),fux8(omega_prew1[9]),fux8(omega_prew1[10]),fux8(omega_prew1[11]),fux8(omega_prew1[12]),fux8(omega_prew1[13]),fux8(omega_prew1[14]),fux8(omega_prew1[15]),fux8(omega_prew1[16]),fux8(omega_prew1[17]),fux8(omega_prew1[18]),fux8(omega_prew1[19]),fux8(omega_prew1[20])};
constexpr u32x8 omega_prew2x8[21]={fux8(omega_prew2[0]),fux8(omega_prew2[1]),fux8(omega_prew2[2]),fux8(omega_prew2[3]),fux8(omega_prew2[4]),fux8(omega_prew2[5]),fux8(omega_prew2[6]),fux8(omega_prew2[7]),fux8(omega_prew2[8]),fux8(omega_prew2[9]),fux8(omega_prew2[10]),fux8(omega_prew2[11]),fux8(omega_prew2[12]),fux8(omega_prew2[13]),fux8(omega_prew2[14]),fux8(omega_prew2[15]),fux8(omega_prew2[16]),fux8(omega_prew2[17]),fux8(omega_prew2[18]),fux8(omega_prew2[19]),fux8(omega_prew2[20])};
constexpr u32x8 omega_prew3x8[21]={fux8(omega_prew3[0]),fux8(omega_prew3[1]),fux8(omega_prew3[2]),fux8(omega_prew3[3]),fux8(omega_prew3[4]),fux8(omega_prew3[5]),fux8(omega_prew3[6]),fux8(omega_prew3[7]),fux8(omega_prew3[8]),fux8(omega_prew3[9]),fux8(omega_prew3[10]),fux8(omega_prew3[11]),fux8(omega_prew3[12]),fux8(omega_prew3[13]),fux8(omega_prew3[14]),fux8(omega_prew3[15]),fux8(omega_prew3[16]),fux8(omega_prew3[17]),fux8(omega_prew3[18]),fux8(omega_prew3[19]),fux8(omega_prew3[20])};
constexpr u32x8 omega_prew_vec[20]={
(u32x8){301989884,54518517,888603487,431422394,691295370,138723248,690661211,49870511},(u32x8){301989884,915129526,948373842,932575484,307583142,468421415,859521105,751097867},
(u32x8){301989884,235452393,468421415,772130442,566821959,24315851,429836493,842568658},(u32x8){301989884,474372723,86131707,971006002,878217029,690908315,525055482,906708918},
(u32x8){301989884,609034448,330734384,276169454,375146819,921620063,870009876,756223400},(u32x8){301989884,106912412,533556548,801621478,138254384,101510987,545164023,46220011},
(u32x8){301989884,96199266,839104681,767890654,500602490,681492041,646656617,121762987},(u32x8){301989884,701666354,684645011,12999340,79119218,362696872,584256781,565599625},
(u32x8){301989884,597930298,923358846,443838322,790898700,57382558,257102378,223821},(u32x8){301989884,944006537,497787421,23575188,978335284,362728624,144111458,720221312},
(u32x8){301989884,28460047,462449747,746105787,651424567,352278518,960130140,173353844},(u32x8){301989884,408121340,700146798,594219100,308706579,480576817,464540497,422548613},
(u32x8){301989884,724956777,344023611,923048297,723000027,478702233,378864839,643356627},(u32x8){301989884,166637724,396787232,7917957,474797508,852875821,215784499,639155918},
(u32x8){301989884,943159745,708825978,119447747,683394121,643275731,107991247,164225709},(u32x8){301989884,943871701,878978812,257916190,44141573,886131954,579280589,922300255},
(u32x8){301989884,668844333,946564978,481511923,536892010,572214817,679126981,95583585},(u32x8){301989884,197643422,854529028,247490748,945865189,69890551,900012892,625469081},
(u32x8){301989884,284617728,605475755,907926482,175417726,716132619,861330565,762493523},(u32x8){301989884,951464653,779834649,556237284,536169764,672128838,674987752,323571920}
};
}
namespace IDFT_prework{
constexpr u32 omega_inv_prew1[21]={307583142,54518517,743862480,572742501,527327664,628602650,401045132,117454629,874934224,182484872,708741408,287883204,658137619,20510463,167695763,520771192,167771867,997149972,553919641,789316838,592977953};
constexpr u32 omega_inv_prew2[21]={306948983,888603487,138723248,65668869,842568658,953245971,195169681,118717521,792052763,828450244,908724728,218560432,628507989,248210924,566568154,6285593,82571768,49985074,225413092,349167278,61514562};
constexpr u32 omega_inv_prew3[21]={109640866,431422394,83114827,24315851,504534955,141503808,853802025,646549226,677150821,343855971,279990892,78472544,244186231,242288005,563504186,736807976,744291817,268701657,105775518,257603447,776318308};
constexpr u32x8 omega_inv_prew1x8[21]={fux8(omega_inv_prew1[0]),fux8(omega_inv_prew1[1]),fux8(omega_inv_prew1[2]),fux8(omega_inv_prew1[3]),fux8(omega_inv_prew1[4]),fux8(omega_inv_prew1[5]),fux8(omega_inv_prew1[6]),fux8(omega_inv_prew1[7]),fux8(omega_inv_prew1[8]),fux8(omega_inv_prew1[9]),fux8(omega_inv_prew1[10]),fux8(omega_inv_prew1[11]),fux8(omega_inv_prew1[12]),fux8(omega_inv_prew1[13]),fux8(omega_inv_prew1[14]),fux8(omega_inv_prew1[15]),fux8(omega_inv_prew1[16]),fux8(omega_inv_prew1[17]),fux8(omega_inv_prew1[18]),fux8(omega_inv_prew1[19]),fux8(omega_inv_prew1[20])};
constexpr u32x8 omega_inv_prew2x8[21]={fux8(omega_inv_prew2[0]),fux8(omega_inv_prew2[1]),fux8(omega_inv_prew2[2]),fux8(omega_inv_prew2[3]),fux8(omega_inv_prew2[4]),fux8(omega_inv_prew2[5]),fux8(omega_inv_prew2[6]),fux8(omega_inv_prew2[7]),fux8(omega_inv_prew2[8]),fux8(omega_inv_prew2[9]),fux8(omega_inv_prew2[10]),fux8(omega_inv_prew2[11]),fux8(omega_inv_prew2[12]),fux8(omega_inv_prew2[13]),fux8(omega_inv_prew2[14]),fux8(omega_inv_prew2[15]),fux8(omega_inv_prew2[16]),fux8(omega_inv_prew2[17]),fux8(omega_inv_prew2[18]),fux8(omega_inv_prew2[19]),fux8(omega_inv_prew2[20])};
constexpr u32x8 omega_inv_prew3x8[21]={fux8(omega_inv_prew3[0]),fux8(omega_inv_prew3[1]),fux8(omega_inv_prew3[2]),fux8(omega_inv_prew3[3]),fux8(omega_inv_prew3[4]),fux8(omega_inv_prew3[5]),fux8(omega_inv_prew3[6]),fux8(omega_inv_prew3[7]),fux8(omega_inv_prew3[8]),fux8(omega_inv_prew3[9]),fux8(omega_inv_prew3[10]),fux8(omega_inv_prew3[11]),fux8(omega_inv_prew3[12]),fux8(omega_inv_prew3[13]),fux8(omega_inv_prew3[14]),fux8(omega_inv_prew3[15]),fux8(omega_inv_prew3[16]),fux8(omega_inv_prew3[17]),fux8(omega_inv_prew3[18]),fux8(omega_inv_prew3[19]),fux8(omega_inv_prew3[20])};
constexpr u32x8 omega_inv_prew_vec[20]={
(u32x8){301989884,948373842,307583142,859521105,306948983,566821959,109640866,943725836},(u32x8){301989884,568407860,54518517,120027324,888603487,743862480,431422394,284325195},
(u32x8){301989884,88281993,743862480,377662087,138723248,525055482,83114827,375146819},(u32x8){301989884,339604140,572742501,648841455,65668869,964192430,24315851,812902759},
(u32x8){301989884,106620515,527327664,436846382,842568658,428344388,504534955,519167273},(u32x8){301989884,241923136,628602650,219599474,953245971,984583908,141503808,109167150},
(u32x8){301989884,18624901,401045132,460871413,195169681,144548046,853802025,82880647},(u32x8){301989884,345668731,117454629,728610311,118717521,677112829,646549226,219389591},
(u32x8){301989884,38148924,874934224,157210340,792052763,460662290,677150821,868408647},(u32x8){301989884,6645753,182484872,14604058,828450244,601733540,343855971,70851523},
(u32x8){301989884,510299248,708741408,873446491,908724728,979361051,279990892,895289360},(u32x8){301989884,698801115,287883204,744819444,218560432,101949700,78472544,502890042},
(u32x8){301989884,104759820,658137619,323552704,628507989,485840299,244186231,186573054},(u32x8){301989884,240245467,20510463,947733133,248210924,986917012,242288005,645376416},
(u32x8){301989884,51836607,167695763,795571479,566568154,228189739,563504186,905603063},(u32x8){301989884,973922767,520771192,51870192,6285593,593760417,736807976,111843065},
(u32x8){301989884,912478225,167771867,925354465,82571768,461333508,744291817,933608272},(u32x8){301989884,347456907,997149972,732304564,49985074,63762258,268701657,581976751},
(u32x8){301989884,486065515,553919641,364448230,225413092,664762964,105775518,324268423},(u32x8){301989884,49698485,789316838,924870183,349167278,11916879,257603447,78536227}
};
}
u32 jc[N_basic+1],jc_inv[N_basic+1];
u32 fxn[N_basic+1],fxn_inv[N_basic+1];
void init(){
jc[0]=one;
for(u32 i=1;i<=N_basic;++i){
fxn[i]=in_Mont_s(i);
jc[i]=mul_s(jc[i-1],fxn[i]);
}
jc_inv[N_basic]=power<true>(jc[N_basic],mod-2);
for(u32 i=N_basic-1;~i;--i)jc_inv[i]=mul_s(jc_inv[i+1],fxn[i+1]);
for(u32 i=1;i<=N_basic;++i)fxn_inv[i]=mul_s(jc[i-1],jc_inv[i]);
}
struct poly{
vector<u32> dat;
poly(){dat.push_back(0);}
poly(const u32& n){for(u32 i=0;i<=n;++i)dat.push_back(0);}
poly(const poly& p){dat=p.dat;}
poly(const vector<u32>& vc){dat=vc;}
poly(const u32* l,const u32& n,const bool& inMont=true){
for(u32 i=0;i<=n;++i){
if(inMont)dat.push_back(in_Mont(l[i]));
else dat.push_back(l[i]);
}
}
poly(const int* l,const u32& n,const bool& inMont=true){
for(u32 i=0;i<=n;++i){
if(inMont)dat.push_back(in_Mont(l[i]));
else dat.push_back(l[i]);
}
}
poly(const u32* l,const u32* r,const bool& inMont=true){
for(u32 i=0;i<=r-l;++i){
if(inMont)dat.push_back(in_Mont(l[i]));
else dat.push_back(l[i]);
}
}
poly(const int* l,const int* r,const bool& inMont=true){
for(u32 i=0;i<=r-l;++i){
if(inMont)dat.push_back(in_Mont(l[i]));
else dat.push_back(l[i]);
}
}
u32 size()const{return dat.size()-1;}
u32 operator[](const u32& x)const{return dat[x];}
u32& operator[](const u32& x){return dat[x];}
u32 getv(const u32& x)const{return out_Mont(dat[x]);}
void push(const u32& x=0){dat.push_back(in_Mont(x));}
void modify(const u32& x,const u32& y=0){dat[x]=in_Mont(y);}
void operator+=(const poly& a){for(u32 i=0;i<=size();++i)dat[i]=sum(dat[i],a.dat[i]);}
void operator-=(const poly& a){for(u32 i=0;i<=size();++i)dat[i]=sub(dat[i],a.dat[i]);}
poly operator+(const poly& a){
poly res(size());
for(u32 i=0;i<=size();++i)res[i]=sum(dat[i],a.dat[i]);
return res;
}
poly operator-(const poly& a){
poly res(size());
for(u32 i=0;i<=size();++i)res[i]=sub(dat[i],a.dat[i]);
return res;
}
void operator*=(u32 x){x=in_Mont(x);for(u32 i=0;i<=size();++i)dat[i]=mul(dat[i],x);}
poly operator*(u32 x){
poly res(size());x=in_Mont(x);
for(u32 i=0;i<=size();++i)res[i]=mul(dat[i],x);
return res;
}
u32 ord()const{
for(u32 i=0;i<=size();++i)if(dat[i])return i;
return size()+1;
}
void print(const char& endf='\n')const{
for(u32 i=0;i<=size();++i){cout<<out_Mont(dat[i]);if(i<size())cout<<' ';}
if(endf!='\0')cout<<endf;
}
};
namespace BasicPoly{
void chk_mxn(poly& F,const u32& n){
while(F.size()>n)F.dat.pop_back();
}
poly cal_mxn(const poly& F,const u32& n){
poly res=F;
chk_mxn(res,n);
return res;
}
void chk_txn(poly& F,const u32& n){
while(F.size()>n)F.dat.pop_back();
while(F.size()<n)F.dat.push_back(0);
}
poly cal_txn(const poly& F,const u32& n){
poly res=F;
chk_txn(res,n);
return res;
}
void chk_mulxn(poly& F,const u32& n){
for(u32 i=0;i<n;++i)F.dat.push_back(0);
for(u32 i=F.size();i>=n;--i)F[i]=F[i-n];
for(u32 i=0;i<n;++i)F[i]=0;
}
poly cal_mulxn(const poly& F,const u32& n){
poly res=F;
chk_mulxn(res,n);
return res;
}
void chk_divxn(poly& F,const u32& n){
if(F.size()<n){F=poly();return;}
for(u32 i=0;i<=F.size()-n;++i)F[i]=F[i+n];
for(u32 i=0;i<n;++i)F.dat.pop_back();
}
poly cal_divxn(const poly& F,const u32& n){
poly res=F;
chk_divxn(res,n);
return res;
}
void chk_rev(poly& F){
reverse(F.dat.begin(),F.dat.end());
}
poly cal_rev(const poly& F){
poly res=F;
reverse(res.dat.begin(),res.dat.end());
return res;
}
poly dF(const poly& F){
if(F.size()==0)return poly();
poly res(F.size()-1);
for(u32 i=0;i<=res.size();++i)res[i]=mul(fxn[i+1],F[i+1]);
return res;
}
poly fF(const poly& F){
poly res(F.size()+1);
for(u32 i=1;i<=res.size();++i)res[i]=mul(fxn_inv[i],F[i-1]);
return res;
}
}
#define i_ctz(x) __builtin_ctz(x)
namespace rawNTT{
using namespace DFT_prework;
using namespace IDFT_prework;
template<bool strict=false> constexpr void DFT_2(u32& x,u32& y){
u32 a=sum(x,y);
y=sub(x,y);
x=a;
if constexpr(strict){x=shrk(x);y=shrk(y);}
}
template<bool strict=false> constexpr void DFT_4(u32& x,u32& y,u32& z,u32& w){
u32 a=sub(x,z),b=mul(y-w+mod_2,omega[2]);
x=sum(x,z);z=sum(y,w);
y=sub(x,z);w=sub(a,b);
x=sum(x,z);z=sum(a,b);
if constexpr(strict){x=shrk(x);y=shrk(y);z=shrk(z);w=shrk(w);}
}
template<bool strict=false> void DFT_vec8(u32x8* buf,const u32& sz){
u32 L=1<<sz;L>>=1;
if(sz&1){
for(u32 i=0;i<L;++i){
u32x8 a=buf[i]+buf[i|L];
buf[i|L]=buf[i]-buf[i|L]+mod_2x8;
buf[i]=a;
}
L>>=2;
}
else L>>=1;
for(u32 r=L<<2;L;r=L,L>>=2){
u32x8 mul1=R32mx8,mul2=R32mx8,mul3=R32mx8;
u32 j=0;
for(u32x8 *i=buf;i!=buf+(1<<sz);i+=r,j++){
if(j){
mul1=mul_s(mul1,omega_prew1x8[i_ctz(j)]);
mul2=mul_s(mul2,omega_prew2x8[i_ctz(j)]);
mul3=mul_s(mul3,omega_prew3x8[i_ctz(j)]);
}
for(u32x8 *F0=i,*F1=i+L,*F2=i+L*2,*F3=i+L*3;F0!=i+L;++F0,++F1,++F2,++F3){
*F0=shrk2(*F0);*F1=mul(*F1,mul1);*F2=mul(*F2,mul2);*F3=mul(*F3,mul3);
u32x8 a=sub(*F0,*F2),b=mul(*F1-*F3+mod_2x8,omegax8[2]);
*F0=sum(*F0,*F2);*F2=sum(*F1,*F3);
*F1=*F0-*F2+mod_2x8;
*F3=a-b+mod_2x8;
*F0=*F0+*F2;
*F2=a+b;
}
}
}
u32x8 ml=R32mx8;
for(u32 i=0;i<(1<<sz);++i){
if(i)ml=mul_s(ml,omega_prew_vec[i_ctz(i)]);
constexpr u32x8 omega_op_vec8_0=(u32x8){R32m,R32m,R32m,R32m,R32m,R32m,omega[2],omega[2]};
constexpr u32x8 omega_op_vec8_1=(u32x8){R32m,R32m,R32m,omega[2],R32m,omega[3],R32m,mul_s(omega[2],omega[3])};
buf[i]=mul(buf[i],ml);
buf[i]=mul(neg<0xf0>(buf[i])+i_swap128(buf[i]),omega_op_vec8_0);
buf[i]=mul(neg<0xcc>(buf[i])+i_shuffle(0x4e,buf[i]),omega_op_vec8_1);
buf[i]=sum(neg<0xaa>(buf[i]),i_shuffle(0xb1,buf[i]));
if constexpr(strict)buf[i]=shrk(buf[i]);
}
}
template<bool strict=false> void DFT(u32* buf,const u32& sz){
if(sz==0){if constexpr(strict)buf[0]=shrk(buf[0]);}
else if(sz==1)DFT_2<strict>(buf[0],buf[1]);
else if(sz==2)DFT_4<strict>(buf[0],buf[1],buf[2],buf[3]);
else DFT_vec8<strict>((u32x8*)buf,sz-3);
}
template<u32 mlx=R32m> constexpr void IDFT_2(u32& x,u32& y){
constexpr u32 ml=mhalf_s(mlx);
u32 a=x+y;
y=mul(x-y+mod_2,ml);
x=mul(a,ml);
}
template<u32 mlx=R32m> constexpr void IDFT_4(u32& x,u32& y,u32& z,u32& w){
constexpr u32 mlx_i=mul(mlx,omega_inv[2]);
IDFT_2<mlx>(x,y);IDFT_2<mlx_i>(z,w);
IDFT_2<mlx>(x,z);IDFT_2<mlx>(y,w);
}
template<u32 mlx=R32m> void IDFT_vec8(u32x8* buf,const u32& sz){
u32x8 ml=fux8(mul_s(mlx,inv2[sz+3]));
for(u32 i=0;i<(1<<sz);++i){
if(i)ml=mul_s(ml,omega_inv_prew_vec[i_ctz(i)]);
constexpr u32x8 omega_inv_op_vec8_0=(u32x8){R32m,R32m,R32m,omega_inv[2],R32m,omega_inv[3],R32m,mul_s(omega_inv[2],omega_inv[3])};
constexpr u32x8 omega_inv_op_vec8_1=(u32x8){R32m,R32m,R32m,R32m,R32m,R32m,omega_inv[2],omega_inv[2]};
buf[i]=mul(neg<0xaa>(buf[i])+i_shuffle(0xb1,buf[i]),omega_inv_op_vec8_0);
buf[i]=mul(neg<0xcc>(buf[i])+i_shuffle(0x4e,buf[i]),omega_inv_op_vec8_1);
buf[i]=mul(neg<0xf0>(buf[i])+i_swap128(buf[i]),ml);
}
u32 L=1,r=4;
for(;r<=(1<<sz);L=r,r<<=2){
u32x8 mul1=R32mx8,mul2=R32mx8,mul3=R32mx8;
u32 j=0;
for(u32x8 *i=buf;i!=buf+(1<<sz);i+=r,j++){
if(j){
mul1=mul_s(mul1,omega_inv_prew1x8[i_ctz(j)]);
mul2=mul_s(mul2,omega_inv_prew2x8[i_ctz(j)]);
mul3=mul_s(mul3,omega_inv_prew3x8[i_ctz(j)]);
}
for(u32x8 *F0=i,*F1=i+L,*F2=i+L*2,*F3=i+L*3;F0!=i+L;++F0,++F1,++F2,++F3){
u32x8 a=sub(*F0,*F1),b=mul(*F2-*F3+mod_2x8,omega_invx8[2]);
*F0=sum(*F0,*F1);*F1=sum(*F2,*F3);
*F2=*F0-*F1+mod_2x8;*F0=sum(*F0,*F1);
*F1=a+b;*F2=a-b+mod_2x8;
*F1=mul(*F1,mul1);*F2=mul(*F2,mul2);*F3=mul(*F3,mul3);
}
}
}
if(L<(1<<sz)){
for(u32 i=0;i<L;++i){
u32x8 a=sum(buf[i],buf[i|L]);
buf[i|L]=sub(buf[i],buf[i|L]);
buf[i]=a;
}
}
}
template<u32 mlx=R32m> void IDFT(u32* buf,const u32& sz){
if(sz==0);
else if(sz==1)IDFT_2<mlx>(buf[0],buf[1]);
else if(sz==2)IDFT_4<mlx>(buf[0],buf[1],buf[2],buf[3]);
else IDFT_vec8<mlx>((u32x8*)buf,sz-3);
}
}
using rawNTT::DFT;
using rawNTT::IDFT;
using namespace BasicPoly;
namespace op_sv{
void calc_mul_sv(const u32* a,const u32* b,u32* c,const u32& sz){
if(sz<=2){
for(u32 i=0;i<(1<<sz);++i)c[i]=mul(a[i],b[i]);
}
else{
const u32x8 *end=(const u32x8*)(a+(1<<sz));
const u32x8 *i=(const u32x8*)(a);
const u32x8 *j=(const u32x8*)(b);
for(u32x8 *k=(u32x8*)c;i!=end;++i,++j,++k)*k=mul(*i,*j);
}
}
void calc_add_av(const u32* a,const u32* b,u32* c,const u32& sz){
if(sz<=2){
for(u32 i=0;i<(1<<sz);++i)c[i]=sum(a[i],b[i]);
}
else{
const u32x8 *end=(const u32x8*)(a+(1<<sz));
const u32x8 *i=(const u32x8*)(a);
const u32x8 *j=(const u32x8*)(b);
for(u32x8 *k=(u32x8*)c;i!=end;++i,++j,++k)*k=sum(*i,*j);
}
}
u32 op_cyc_buf[9];
void mov_cyc_sv(u32* a,const u32* b,const u32& sz,const u32& n){
if(n<(1<<sz)-1){
memcpy(a,b,n+1<<2);
memset(a+n+1,0,(4<<sz)-(n+1<<2));
}
else{
memcpy(a,b,4<<sz);
u32* targ=a;u32 mask=(1<<sz)-1;
if(sz<=2)targ=op_cyc_buf,mask=7,memset(targ,0,32);
const u32x8 *i=(const u32x8*)(b+(1<<sz));
const u32x8 *end=(const u32x8*)(b+(n+1&~7));
u32x8 *j=(u32x8*)targ;
for(u32 t=0;i!=end;){
*j=sum(*j,*i);
++i,++j,++t;
if(!(t&mask))j=(u32x8*)targ;
}
const u32* ti=(const u32*)i;
for(u32 *tj=(u32*)j;ti!=b+n+1;++ti,++tj)*tj=sum(*tj,*ti);
if(sz<=2){
for(u32 i=0;i<=7;++i)a[i&(1<<sz)-1]=sum(a[i&(1<<sz)-1],targ[i]);
}
}
}
}
using namespace op_sv;
namespace PolyMul{
u32 buf1[(1<<N_mul_lg2)+1],buf2[(1<<N_mul_lg2)+1];
void raw_mul(const u32* a,const u32* b,u32* c,const u32& n1,const u32& n2,const u32& n3,const u32& sz){
mov_cyc_sv(buf1,a,sz,n1);
mov_cyc_sv(buf2,b,sz,n2);
DFT(buf1,sz);
DFT(buf2,sz);
calc_mul_sv(buf1,buf2,buf1,sz);
IDFT(buf1,sz);
memcpy(c,buf1,n3+1<<2);
}
poly mul(const poly& a,const poly& b){
u32 n=a.size()+b.size();
u32 sz=getsiz(n);
memset(buf1,0,4<<sz);
memset(buf2,0,4<<sz);
for(u32 i=0;i<=a.size();++i)buf1[i]=a[i];
for(u32 i=0;i<=b.size();++i)buf2[i]=b[i];
DFT(buf1,sz);
DFT(buf2,sz);
calc_mul_sv(buf1,buf2,buf1,sz);
IDFT(buf1,sz);
return poly(buf1,buf1+n,false);
}
poly power_c(const poly& a,const u32& x){
if(x==0){poly res;res.dat[0]=R32m;return res;}
if(x==1)return a;
u32 n=a.size()*x;
u32 sz=getsiz(n);
memset(buf1,0,4<<sz);
for(u32 i=0;i<=a.size();++i)buf1[i]=a[i];
DFT(buf1,sz);
if(x==2)calc_mul_sv(buf1,buf1,buf1,sz);
else{
for(u32 i=0;i<(1<<sz);++i)buf1[i]=power(buf1[i],x);
}
IDFT(buf1,sz);
return poly(buf1,buf1+n,false);
}
}
}
using PolyTemplate::Montgomery::in_Mont;
using PolyTemplate::Montgomery::out_Mont;
using PolyTemplate::rawNTT::DFT;
using PolyTemplate::rawNTT::IDFT;
using PolyTemplate::poly;
using namespace PolyTemplate::BasicPoly;
using PolyTemplate::PolyMul::mul;
using PolyTemplate::PolyMul::power_c;
void poly_multiply(unsigned *a, int n, unsigned *b, int m, unsigned *c){
poly F(a,a+n),G(b,b+m);
F=mul(F,G);
for(int i=0;i<=n+m;++i)c[i]=F.getv(i);
}
| Compilation | N/A | N/A | Compile Error | Score: N/A | 显示更多 |