提交记录 19660


用户 题目 状态 得分 用时 内存 语言 代码长度
TSKY 1004. 【模板题】高精度乘法 Accepted 100 31.893 ms 14060 KB C++14 27.57 KB
提交时间 评测时间
2023-07-10 22:18:41 2023-07-10 22:18:44
#include<string.h>
#include<stdio.h>
#include<immintrin.h>
namespace int_
{
	#define inline __attribute__((always_inline))
	template<typename ty>ty pow(ty n,unsigned m){if(!m)return 1;ty fl=pow(n,m>>1);fl*=fl;return m&1?fl*n:fl;}
	template<const unsigned mod>
	struct modint{
		unsigned num;
		inline modint(){}
		inline modint(unsigned x){num=static_cast<unsigned>(x);}
		inline modint(int x){num=x;}
		inline modint(unsigned long long x){num=x%mod;}
		inline operator unsigned (){return num;}
		inline modint &operator=(unsigned x){num=x;return *this;}
		inline modint &operator=(modint<mod>x){num=x.num;return *this;}
		inline modint &operator+=(modint<mod>x){num+=x.num;num=num<mod?num:num-mod;return *this;}
		inline modint &operator-=(modint<mod>x){num=num<x.num?num+mod-x.num:num-x.num;return *this;}
		inline modint &operator*=(modint<mod>x)
		{num=static_cast<unsigned>(static_cast<unsigned long long>(num)*x.num%mod);return *this;}
		inline modint &operator/=(modint<mod>x)
		{num=static_cast<unsigned>(static_cast<unsigned long long>(num)*pow(x,mod-2)%mod);return *this;}
		inline modint operator+(modint<mod>x){x.num+=num;return x.num<mod?x.num:x.num-mod;}
		inline modint operator-(modint<mod>x){return num<x.num?num+mod-x.num:num-x.num;}
		inline modint operator*(modint<mod>x)
		{return static_cast<unsigned>(static_cast<unsigned long long>(num)*x.num%mod);}
		inline modint operator/(modint<mod>x)
		{return static_cast<unsigned>(static_cast<unsigned long long>(num)*pow(x,mod-2)%mod);}
		inline modint &operator+=(unsigned x){num+=x;num=num<mod?num:num-mod;return *this;}
		inline modint &operator-=(unsigned x){num=num<x?num+mod-x:num-x;return *this;}
		inline modint &operator*=(unsigned x)
		{num=static_cast<unsigned>(static_cast<unsigned long long>(num)*x%mod);return *this;}
		inline modint &operator/=(unsigned x)
		{num=static_cast<unsigned>(static_cast<unsigned long long>(num)*pow(modint<mod>(x),mod-2)%mod);return *this;}
		inline modint operator+(unsigned x){x+=num;return x<mod?x:x-mod;}
		inline modint operator-(unsigned x){return num<x?num+mod-x:num-x;}
		inline modint operator*(unsigned x)
		{return static_cast<unsigned>(static_cast<unsigned long long>(num)*x%mod);}
		inline modint operator/(unsigned x)
		{return static_cast<unsigned>(static_cast<unsigned long long>(num)*pow(modint<mod>(x),mod-2)%mod);}
	};
	#undef inline
}
namespace check
{
	inline void print_bit_char(char c)
	{
		for(int i=7;i>=0;i--)
			if(c&(1<<i)) putchar('1');
			else putchar('0');
	}
	template<typename ty>
	inline void print_bit(ty *num,const int sz=sizeof(ty))
	{
		const char *end=((char*)(num))+sz;
		for(char *c=(char*)num;c!=end;c++)
		{
			char get=*c;
			print_bit_char(get);
			putchar(' ');
		}
		putchar('\n');
	}	
}
#pragma GCC target("avx2")
namespace poly
{
	template<typename ty>__attribute__((always_inline)) void swap(ty &x,ty &y){ty z=x;x=y;y=z;}
	template<typename ty>ty get_pow(ty n,unsigned m){if(!m)return 1;ty fl=get_pow(n,m>>1);fl*=fl;return m&1?fl*n:fl;}
	static unsigned mod,r,n2_;
	static __m256i a0,mod1,mod2,R,hi32,n2;
	inline __m256i add(__m256i num1,__m256i num2)
	{
		__m256i apb=_mm256_add_epi32(num1,num2),ret=_mm256_sub_epi32(apb,mod2);
		__m256i cmp=_mm256_cmpgt_epi32(a0,ret),add=_mm256_and_si256(cmp,mod2);
		return _mm256_add_epi32(add,ret);
	}
	inline __m256i sub(__m256i num1,__m256i num2)
	{
		__m256i ret=_mm256_sub_epi32(num1,num2),cmp=_mm256_cmpgt_epi32(a0,ret),add=_mm256_and_si256(cmp,mod2);
		return _mm256_add_epi32(add,ret);
	}
	inline __m256i mul(__m256i _num1,__m256i _num2)
	{
		__m256i _num3=_num1,_num4,_num5=_num2;
		_num2=_mm256_mul_epu32(_num1,_num2);
		_num1=_mm256_mul_epu32(_mm256_mul_epu32(_num2,R),mod1);
	    _num4=_mm256_srli_epi64(_mm256_add_epi64(_num1,_num2),32);
		_num1=_mm256_srli_si256(_num3,4);_num2=_mm256_srli_si256(_num5,4);
		_num2=_mm256_mul_epu32(_num1,_num2);
		_num1=_mm256_mul_epu32(_mm256_mul_epu32(_num2,R),mod1);
	    _num1=_mm256_and_si256(_mm256_add_epi64(_num1,_num2),hi32);
	    return _mm256_or_si256(_num1,_num4);
	}
	long long exgcd(long long a,long long b,long long &x,long long &y)
	{
		long long d=a;
		if(b==0) x=1,y=0;
		else d=exgcd(b,a%b,y,x),y-=a/b*x;
		return d;
	}
	inline unsigned mul(unsigned x,unsigned y)
	{
		unsigned long long z=(unsigned long long)x*y;
		return (z+(unsigned long long)(unsigned(z)*r)*mod)>>32;
	}
	inline unsigned mon_in(unsigned x){return mul(x,n2_);}
	inline __m256i mon_in(__m256i x){return mul(x,n2);}
	inline unsigned mon_out(unsigned x)
	{unsigned ret=((x+(unsigned long long)(unsigned(x)*r)*mod)>>32);return ret<mod?ret:ret-mod;}
	inline unsigned add(unsigned x,unsigned y){x+=y;return x<mod<<1?x:x-(mod<<1);}
	inline unsigned sub(unsigned x,unsigned y){return x<y?x+(mod<<1)-y:x-y;}
	template<typename ty>
	inline void init_root(const unsigned mxlim,ty *root,unsigned G)
	{
		const int sz=32-__builtin_clz(mxlim-1);mod=(int)(ty(0)-ty(1))+1;long long x,y;
		n2_=-(unsigned long long)mod%mod;exgcd(mod,1ll<<32,x,y);r=-unsigned(x);
		root[0]=0;root[mxlim>>1]=mon_in(1);ty w=get_pow(ty(G),((int)(ty(0)-1)>>1)>>sz);w*=w;w=mon_in(w);
		for(unsigned i=mxlim/2+1;i^mxlim;root[i]=mul(root[i-1],w),++i);
		for(unsigned i=mxlim/2-1;i;root[i]=root[i<<1],--i);
		mod=0;
	}
	inline void init(const unsigned get_mod)
	{
		if(mod!=get_mod)
		{
			mod=get_mod;long long x,y;
			n2_=-(unsigned long long)mod%mod;exgcd(mod,1ll<<32,x,y);r=-unsigned(x);
			n2=_mm256_set1_epi32(n2_);
			a0=_mm256_setzero_si256(),mod2=_mm256_set1_epi32(mod<<1),
			mod1=_mm256_set1_epi32(mod),R=_mm256_set1_epi32(r);
			hi32=_mm256_set_epi32(-1,0,-1,0,-1,0,-1,0);
		}
	}
	template<typename ty>
	inline void in_mon(const unsigned len,ty *num)
	{
		const ty *end=num+len;
		init((int)(ty(0)-ty(1))+1);
		ty *p=num;
		for(;p+8<=end;p+=8)
			_mm256_storeu_si256((__m256i*)p,mon_in(_mm256_loadu_si256((__m256i*)p)));
		for(;p!=end;p++)
			p->num=mon_in(p->num);
	}
	template<typename ty>
	inline void out_mon(const unsigned len,ty *num)
	{
		const ty *end=num+len;
		init((int)(ty(0)-ty(1))+1);
		for(ty *p=num;p!=end;p++)
			p->num=mon_out(p->num);
	}
	template<typename ty>
	inline void dif(const unsigned lim,const ty *root,ty *num,bool type=1)
	{
		const ty *end=num+lim;
		unsigned *fl;
		init((int)(ty(0)-ty(1))+1);
		if(type) in_mon(lim,num);
		if(lim<=8)
		{
			for(unsigned i=lim>>1,i2=lim;i;i2=i,i>>=1)
				for(ty *p=num;p!=end;p+=i)
				{
					unsigned j=i;
					ty *rt=(ty*)root+j;
					for(ty fl;j^i2;rt++,++p,++j)
					{
						fl=*p;
						*p=add(*p,*(p+i));
						*(p+i)=mul(sub(fl,*(p+i)),*rt);
					}
				}
		}
		else
		{
			for(unsigned i=lim>>1,i2=lim;i>=8;i2=i,i>>=1)
				for(ty *p=num,*pi;p!=end;p+=i)
				{
					__m256i num1,num2,num3,num4,num5;
					ty *rt=(ty*)root+i;
					for(unsigned j=i;j+8<=i2;rt+=8,p+=8,j+=8)
					{
						pi=p+i;
						num1=_mm256_loadu_si256((__m256i*)p);
						num2=_mm256_loadu_si256((__m256i*)pi);
						_mm256_storeu_si256((__m256i*)p,add(num1,num2));
						num4=_mm256_sub_epi32(num1,num2);
						num5=_mm256_cmpgt_epi32(a0,num4);
						num3=_mm256_add_epi32(_mm256_and_si256(num5,mod2),num4);
						num5=_mm256_loadu_si256((__m256i*)rt);
						num1=num3;num2=num5;
						num2=_mm256_mul_epu32(num1,num2);
						num1=_mm256_mul_epu32(_mm256_mul_epu32(num2,R),mod1);
	                    num4=_mm256_srli_epi64(_mm256_add_epi64(num1,num2),32);
						num1=_mm256_srli_si256(num3,4);num2=_mm256_srli_si256(num5,4);
						num2=_mm256_mul_epu32(num1,num2);
						num1=_mm256_mul_epu32(_mm256_mul_epu32(num2,R),mod1);
	                    num1=_mm256_and_si256(_mm256_add_epi64(num1,num2),hi32);
	                    _mm256_storeu_si256((__m256i*)pi,_mm256_or_si256(num1,num4));
					}
				}
			for(unsigned i=4,i2=8;i;i2=i,i>>=1)
			{
				if(i==4)
				{
					ty *p=num;
					__m256i _rt=_mm256_set_epi32(0,0,0,0,(ty)root[7],(ty)root[6],(ty)root[5],(ty)root[4]),num1,num2,num3,num4;
					for(;p+8<=end;p+=8)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)(p+4));
						num3=add(num1,num2);
						num4=sub(num1,num2);
						num4=mul(num4,_rt);
						memcpy(p,&num3,16);
						memcpy(p+4,&num4,16);
					}
				}
				if(i==2)
				{
					ty *p=num;
					__m256i _rt=_mm256_set_epi32((ty)root[3],(ty)root[3],(ty)root[2],(ty)root[2],
					(ty)root[3],(ty)root[3],(ty)root[2],(ty)root[2]),num1,num2,num3,num4;
					for(;p+16<=end;p+=16)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)p+1);
						//num1:12345678,num2:9abcdefg
						num3=_mm256_unpacklo_epi32(num1,num2);//192a5d6e
						num4=_mm256_unpackhi_epi32(num1,num2);//3b4c7f8g
						num1=add(num3,num4);//192a5d6e
						num2=sub(num3,num4);
						num2=mul(num2,_rt);//3b4c7f8g
						num3=_mm256_unpacklo_epi32(num1,num2);//139b57df
						num4=_mm256_unpackhi_epi32(num1,num2);//24ac68eg
						num1=_mm256_unpacklo_epi32(num3,num4);//12345678
						num2=_mm256_unpackhi_epi32(num3,num4);//9abcdefg
						_mm256_storeu_si256((__m256i*)p,num1);
						_mm256_storeu_si256((__m256i*)p+1,num2);
					}
				}
				if(i==1)
				{
					ty *p=num;
					__m256i num1,num2,num3,num4,num5,num6;
					for(;p+16<=end;p+=16)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)p+1);
						//num1:1234,num2:5678
						num3=_mm256_slli_epi64(num1,32);//num3:0103
						num4=_mm256_slli_epi64(num2,32);//num4:0507
						num5=_mm256_or_si256(_mm256_srli_si256(num3,4),num4);//num5:1537
						num3=_mm256_srli_epi64(num1,32);//num3:2040
						num4=_mm256_srli_epi64(num2,32);//num4:6080
						num6=_mm256_or_si256(_mm256_slli_si256(num4,4),num3);//num5:2648
						num1=add(num5,num6);
						num2=sub(num5,num6);
						num3=_mm256_slli_epi64(num1,32);
						num4=_mm256_slli_epi64(num2,32);
						num5=_mm256_or_si256(_mm256_srli_si256(num3,4),num4);
						num3=_mm256_srli_epi64(num1,32);
						num4=_mm256_srli_epi64(num2,32);
						num6=_mm256_or_si256(_mm256_slli_si256(num4,4),num3);
						_mm256_storeu_si256((__m256i*)p,num5);
						_mm256_storeu_si256((__m256i*)p+1,num6);
					}
				}
			}			
		}
		if(type) out_mon(lim,num);
	}
	template<typename ty>
	inline void dit(const unsigned lim,const ty *root,ty *num,bool type=1)
	{
		const ty *end=num+lim;
		init((int)(ty(0)-ty(1))+1);
		if(type) in_mon(lim,num);
		if(lim<=8)
		{
			for(unsigned i=1,i2=2;i^lim;i=i2,i2<<=1)
				for(ty *p=num;p!=end;p+=i)
				{
					unsigned j=i;
					ty *rt=(ty*)root+j;
					for(ty fl;j^i2;rt++,p++,++j)
					{
						fl=mul(*(p+i),*rt);
						*(p+i)=sub(*p,fl);
						*p=add(*p,fl);
					}
			}
		}
		else
		{
			for(unsigned i=1,i2=2;i^8;i=i2,i2<<=1)
			{
				if(i==4)
				{
					ty *p=num;
					__m256i _rt=_mm256_set_epi32(0,0,0,0,(ty)root[7],(ty)root[6],(ty)root[5],(ty)root[4]),num1,num2,num3,num4;
					for(;p+8<=end;p+=8)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)(p+4));
						num2=mul(num2,_rt);
						num3=add(num1,num2);
						num4=sub(num1,num2);
						memcpy(p,&num3,16);
						memcpy(p+4,&num4,16);
					}	
				}
				if(i==2)
				{
					ty *p=num;
					__m256i _rt=_mm256_set_epi32((ty)root[3],(ty)root[3],(ty)root[2],(ty)root[2],
					(ty)root[3],(ty)root[3],(ty)root[2],(ty)root[2]),num1,num2,num3,num4;
					for(;p+16<=end;p+=16)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)p+1);
						//num1:12345678,num2:9abcdefg
						num3,num4;
						num3=_mm256_unpacklo_epi32(num1,num2);//192a5d6e
						num4=_mm256_unpackhi_epi32(num1,num2);//3b4c7f8g
						num4=mul(num4,_rt);//192a5d6e
						num1=add(num3,num4);
						num2=sub(num3,num4);//3b4c7f8g
						num3=_mm256_unpacklo_epi32(num1,num2);//139b57df
						num4=_mm256_unpackhi_epi32(num1,num2);//24ac68eg
						num1=_mm256_unpacklo_epi32(num3,num4);//12345678
						num2=_mm256_unpackhi_epi32(num3,num4);//9abcdefg
						_mm256_storeu_si256((__m256i*)p,num1);
						_mm256_storeu_si256((__m256i*)p+1,num2);
					}	
				}
				if(i==1)
				{
					ty *p=num;
					__m256i num1,num2,num3,num4,num5,num6;
					for(;p+16<=end;p+=16)
					{
						num1=_mm256_loadu_si256((__m256i*)p),num2=_mm256_loadu_si256((__m256i*)p+1);
						//num1:1234,num2:5678
						num3=_mm256_slli_epi64(num1,32);//num3:0103
						num4=_mm256_slli_epi64(num2,32);//num4:0507
						num5=_mm256_or_si256(_mm256_srli_si256(num3,4),num4);//num5:1537
						num3=_mm256_srli_epi64(num1,32);//num3:2040
						num4=_mm256_srli_epi64(num2,32);//num4:6080
						num6=_mm256_or_si256(_mm256_slli_si256(num4,4),num3);//num5:2648
						num1=add(num5,num6);
						num2=sub(num5,num6);
						num3=_mm256_slli_epi64(num1,32);
						num4=_mm256_slli_epi64(num2,32);
						num5=_mm256_or_si256(_mm256_srli_si256(num3,4),num4);
						num3=_mm256_srli_epi64(num1,32);
						num4=_mm256_srli_epi64(num2,32);
						num6=_mm256_or_si256(_mm256_slli_si256(num4,4),num3);
						_mm256_storeu_si256((__m256i*)p,num5);
						_mm256_storeu_si256((__m256i*)p+1,num6);
					}
				}
			}
			for(unsigned i=4,i2=8;i^lim;i=i2,i2<<=1)
				for(ty *p=num,*pi;p!=end;p+=i)
				{
					__m256i num1,num2,num3,num4,num5;
					ty *rt=(ty*)root+i;
					for(unsigned j=i;j+8<=i2;rt+=8,p+=8,j+=8)
					{
						pi=p+i;
						num4=_mm256_loadu_si256((__m256i*)pi);
						num5=_mm256_loadu_si256((__m256i*)rt);
						num1=num5;num2=num4;
						num2=_mm256_mul_epu32(num1,num2);
						num1=_mm256_mul_epu32(_mm256_mul_epu32(num2,R),mod1);
	                    num3=_mm256_srli_epi64(_mm256_add_epi64(num1,num2),32);
	                    num1=_mm256_srli_si256(num4,4);num2=_mm256_srli_si256(num5,4);
						num2=_mm256_mul_epu32(num1,num2);
						num1=_mm256_mul_epu32(_mm256_mul_epu32(num2,R),mod1);
	                    num1=_mm256_and_si256(_mm256_add_epi64(num1,num2),hi32);
	                    num2=_mm256_or_si256(num1,num3);
	                    num1=_mm256_loadu_si256((__m256i*)p);
						_mm256_storeu_si256((__m256i*)p,add(num1,num2));
						_mm256_storeu_si256((__m256i*)pi,sub(num1,num2));
					}
			}			
		}
		if(type) out_mon(lim,num);
		ty inv=1,x;inv/=lim;int k=1;
		for(ty *i=num+1,*j=num+lim-1;k<lim>>1;++i,--j,++k) x=*i,*i=*j,*j=x;
		for(ty *i=num;i!=end;i++) *i*=inv;
	}
	template<typename ty>
	inline ty* align(ty *use)
	{
		while((long long)use&31) use++;
		return use;
	}
	template<typename ty>
	inline void mul(const unsigned lim,const ty *root,ty *num1,ty *num2,unsigned len1=0,unsigned len2=0,unsigned len3=0,bool type=1)
	{
		init(int(ty(0)-ty(1))+1);
		if(!len1) len1=lim;
		if(!len2) len2=lim;
		if(!len3) len3=lim;
		if(len1<=64||len2<=64||len3<=128)
		{
			ty *use=new ty[len3];
			int I,J;
			for(I=len1-1;I;I--) if(num1[I]) break;
			for(J=len2-1;J;J--) if(num2[J]) break;
			memset(use,0,sizeof(ty)*len3);
			if(type)
			for(int i=0;i<=I;i++)
				for(int j=0;j<=J&&i+j<len3;j++)
					use[i+j]+=num1[i]*num2[j];
			else
			for(int i=0;i<=I;i++)
				for(int j=0;j<=J&&i+j<len3;j++)
					use[i+j]=add(use[i+j],mul(num1[i],num2[j]));
			memcpy(num1,use,sizeof(ty)*len3);
			delete(use);
			return ;
		}
		if(num1==num2&&len1==len2)
		{
			ty *old2=new ty[lim-len3+1];
			memcpy(old2,num1+len3,sizeof(ty)*(lim-len3));memset(num1+len1,0,sizeof(ty)*(lim-len1));
			if(type) in_mon(len1,num1);
			dif(lim,root,num1,0);
			for(int i=0;i<lim;i++) num1[i]=mul(num1[i],num1[i]);
			dit(lim,root,num1,0);
			if(type) out_mon(len3,num1);
			memcpy(num1+len3,old2,sizeof(ty)*(lim-len3));
			delete(old2);
			return ;
		}
		if(num1!=num2)
		{
			ty *old=new ty[lim],*old2=new ty[lim-len3+1];
			memcpy(old,num2,sizeof(ty)*len2);memset(old+len2,0,sizeof(ty)*(lim-len2));
			memcpy(old2,num1+len3,sizeof(ty)*(lim-len3));memset(num1+len1,0,sizeof(ty)*(lim-len1));
			if(type)
			{
				in_mon(len1,num1);dif(lim,root,num1,0);dif(lim,root,old,0);
				for(int i=0;i<lim;i++)num1[i]*=old[i];
				dit(lim,root,num1,0);out_mon(len3,num1);
			}
			else
			{
				dif(lim,root,num1,0);dif(lim,root,old,0);
				for(int i=0;i<lim;i++)num1[i]=mul(num1[i],old[i]);
				dit(lim,root,num1,0);
			}
			memcpy(num1+len3,old2,sizeof(ty)*(lim-len3));
			delete(old);delete(old2);
			return ;
		}
	}
	template<typename ty>
	inline void inverse(const unsigned lim,const ty *root,ty *num,bool type=1)
	{
		static unsigned use2[1024];
		init(int(ty(0)-ty(1))+1);
		ty *use=new ty[lim];
		ty *A=new ty[lim+8],*B=new ty[lim+8];
		ty *a=align(A),*b=align(B);
		int len=2,mid=1;
		memset(use,0,lim*sizeof(ty));
		if(type)
		{
			use[0]=mon_in(ty(1)/num[0]);
			in_mon(lim,num);			
		}
		else use[0]=mon_in(ty(1)/mon_out(num[0]));
		while(len<=lim)
		{
			memcpy(a,num,len*sizeof(ty));memcpy(b,use,len*sizeof(ty)>>1);
			memset(b+mid,0,len*sizeof(ty)>>1);
			if(len>128)
			{
				dif(len,root,a,0);dif(len,root,b,0);
				int i;
				for(i=0;i+8<=len;i+=8)
					*(__m256i*)(a+i)=mul(*(__m256i*)(a+i),*(__m256i*)(b+i));
				for(;i<len;i++)a[i]=mul(a[i],b[i]);
				dit(len,root,a,0),memset(a,0,len*sizeof(ty)>>1),dif(len,root,a,0);
				for(i=0;i+8<=len;i+=8)
					*(__m256i*)(a+i)=mul(*(__m256i*)(a+i),*(__m256i*)(b+i));
				for(;i<len;i++)a[i]=mul(a[i],b[i]);
				dit(len,root,a,0);		
			}
			else
			{
				memset(use2,0,len*sizeof(int));
				for(int i=0;i<len;i++)
					for(int j=0;j<len-i;j++)
						use2[i+j]=add(use2[i+j],mul(a[i],b[j]));
				memset(a,0,len*sizeof(ty));
				for(int i=0;i<len;i++)
					for(int j=0;j<len-i;j++)
						a[i+j]=add((unsigned)a[i+j],mul(use2[i],b[j]));
			}
			for(int i=mid;i<len;i++)use[i]=sub(use[i],a[i]);
			len<<=1;mid<<=1;
		}
		if(type) out_mon(lim,use);
		memcpy(num,use,lim*sizeof(ty));
		delete(use);delete(A);delete(B);
	}
	template<typename ty>
	inline void inverse(const unsigned lim,const ty *root,ty *num,int old_lim,ty *old,ty *use,ty *a,ty *b)
	{
		static unsigned use2[1024];
		int len,mid;
		if(old_lim<=1) 
		{
			len=2,mid=1;
			memset(use,0,lim*sizeof(ty));
			use[0]=mon_in(ty(1)/mon_out(num[0]));
		}
		else
		{
			len=old_lim;old_lim>>=1;mid=old_lim;
			memcpy(use,old,old_lim*sizeof(ty));
			memset(use+old_lim,0,(lim-old_lim)*sizeof(ty));			
		}
		while(len<=lim)
		{
			memcpy(a,num,len*sizeof(ty));memcpy(b,use,len*sizeof(ty)>>1);
			memset(b+mid,0,len*sizeof(ty)>>1);
			if(len>128)
			{
				dif(len,root,a,0);dif(len,root,b,0);
				for(int i=0;i+8<=len;i+=8)
					*(__m256i*)(a+i)=mul(*(__m256i*)(a+i),*(__m256i*)(b+i));
				if(len^lim) dit(len,root,a,0),memset(a,0,len*sizeof(ty)>>1),dif(len,root,a,0);
				for(int i=0;i+8<=len;i+=8)
					*(__m256i*)(a+i)=mul(*(__m256i*)(a+i),*(__m256i*)(b+i));
				dit(len,root,a,0);
			}
			else
			{
				memset(use2,0,len*sizeof(int));
				for(int i=0;i<len;i++)
					for(int j=0;j<len-i;j++)
						use2[i+j]=add(use2[i+j],mul(a[i],b[j]));
				memset(a,0,len*sizeof(ty));
				for(int i=0;i<len;i++)
					for(int j=0;j<len-i;j++)
						a[i+j]=add((unsigned)a[i+j],mul(use2[i],b[j]));
			}
			for(int i=mid;i<len;i++)use[i]=sub(use[i],a[i]);
			len<<=1;mid<<=1;
		}
		memcpy(old,use,lim*sizeof(ty));
	}
	template<typename ty>
	inline void differ(const unsigned lim,ty *num)
	{
		for(int i=1;i<lim;i++) num[i]*=i;
		memmove(num,num+1,(lim-1)*sizeof(ty));
		num[lim-1]=0;
	}
	template<typename ty>
	inline void integral_no_mon(const unsigned lim,ty *num)
	{
		static ty *inv,nowl=0;
		const int mod=static_cast<int>(ty(0)-1)+1;
		if(lim>nowl)
		{
			inv=(ty*)realloc(inv,lim*sizeof(ty));
			if(!nowl) inv[0]=inv[1]=1,nowl=2;
			for(int i=nowl;i<lim;i++) inv[i]=inv[mod%i]*(mod-mod/i);
			nowl=lim;
		}
		for(int i=lim-1;i;i--) num[i]=num[i-1]*inv[i];
		num[0]=0;
	}
	template<typename ty>
	inline void integral(const unsigned lim,ty *num,bool type=1)
	{
		if(type==1)
		{
			integral_no_mon(lim,num);
			return ;
		}
		static ty *inv=0,nowl=0;
		static int lstmod=-1;
		const int mod=static_cast<int>(ty(0)-1)+1;
		if(lstmod!=mod)
		{
			lstmod=mod;
			if(inv) delete(inv);
			inv=0;
			nowl=0;
		}
		if(lim>nowl)
		{
			inv=(ty*)realloc(inv,lim*sizeof(ty));
			if(!nowl) inv[0]=inv[1]=mon_in(1),nowl=2;
			for(int i=nowl;i<lim;i++) inv[i]=inv[mod%i]*(mod-mod/i);
			nowl=lim;
		}
		for(int i=lim-1;i;i--) num[i]=mul(num[i-1],inv[i]);
		num[0]=0;
	}
	template<typename ty>
	inline void Ln(const unsigned lim,const ty *root,ty *num,bool type=1)//Double memory required
	{
		if(type) in_mon(lim,num);
		ty *use=new ty[lim<<1];
		memcpy(use,num,sizeof(ty)*lim*2);
		differ(lim,num);
		inverse(lim,root,use,0);
		mul(lim<<1,root,num,use,0,0,0,0);
		integral(lim,num,0);
		delete(use);
		if(type) out_mon(lim,num);
	}
	template<typename ty>
	inline void Ln(const unsigned lim,const ty *root,ty *num,int old_lim,ty *old_inv,ty *use,ty *use2,ty *use3,ty *use4)
	{
		memcpy(use,num,sizeof(ty)*lim);
		differ(lim,num);
		inverse(lim,root,use,old_lim,old_inv,use2,use3,use4);
		mul(lim<<1,root,num,old_inv,0,0,0,0);
		integral(lim,num,0);
	}
	template<typename ty>
	void Exp(const unsigned lim,const ty *root,ty *num,ty *use1,ty *use2,ty *use3,ty *use4,ty *use5,ty *use6,ty *inv)
	{
		if(lim==1){use1[0]=mon_in(1);return ;}
		Exp(lim>>1,root,num,use1,use2,use3,use4,use5,use6,inv);
		memcpy(use2,use1,lim*sizeof(ty));
		Ln(lim,root,use2,lim>>1,inv,use3,use4,use5,use6);
		for(int i=0;i<lim;i++)use2[i]=sub(num[i],use2[i]);
		mul(lim<<1,root,use1,use2,0,0,lim,0);
	}
	template<typename ty>
	inline void Exp(const unsigned lim,const ty *root,ty *num,bool type=1)//Double memory required
	{
		if(type) in_mon(lim,num);
		num[0]+=mon_in(1);
		ty *use1=new ty[lim<<1],*use2=new ty[lim<<1],*use3=new ty[lim<<1],
		*use4=new ty[lim],*use5=new ty[lim+8],*use6=new ty[lim+8],*inv=new ty[lim<<1];
		memset(use1,0,lim*sizeof(ty)*2);memset(use2,0,lim*sizeof(ty)*2);
		memset(inv,0,lim*sizeof(ty)*2);
		Exp(lim,root,num,use1,use2,use3,use4,align(use5),align(use6),inv);
		memcpy(num,use1,lim*sizeof(ty));
		delete(use1);delete(use2);delete(use3);
		delete(use4);delete(use5);delete(use6);delete(inv);
		if(type) out_mon(lim,num);
	}
	template<typename ty>
	inline void mul(ty *f,const ty *l,const int m)
	{
		__m256i M=_mm256_set1_epi32(m);
		for(;(long long)f&31&&f!=l;f++) *f=mul(*f,m);
		for(;f+8<=l;f+=8) *(__m256i*)f=mul(*(__m256i*)f,M);
		for(;f!=l;f++) *f=mul(*f,m);
	}
	template<typename ty>
	inline void sqrt(const unsigned lim,const ty *root,ty *num,bool type=1)
	{
		if(type) in_mon(lim,num);
		init(int(ty(0)-ty(1))+1);
		ty *use=new ty[lim];
		ty *a=new ty[lim],*b=new ty[lim];
		ty *use1=new ty[lim],*use2=new ty[lim+8],*use3=new ty[lim+8];
		ty inv2=mon_in(ty(1)/ty(2));
		int len=2,mid=1;
		memset(use,0,lim*sizeof(ty));
		memset(a,0,lim*sizeof(ty));
		memset(b,0,lim*sizeof(ty));
		use[0]=mon_in(1);
		mul(num,num+lim,inv2);
		while(len<=lim)
		{
			memcpy(a,use,len*sizeof(ty));
			inverse(len,root,a,len>>1,b,use1,align(use2),align(use3));
			memcpy(a,b,len*sizeof(ty));
			mul(len<<1,root,a,num,len,len,len,0);
			mul(use+(len>>1),use+len,inv2);
			for(int i=len>>1;i<len;i++)
				use[i]=add(use[i],a[i]);
			len<<=1;
		}
		if(type) out_mon(lim,use);
		memcpy(num,use,lim*sizeof(ty));
		delete(use);delete(a);delete(b);
	}
	template<typename ty>
	void qpow(const unsigned lim,const ty *root,ty *num,const int m,bool type=1)//Double memory required
	{
		if(type) in_mon(lim,num);
		Ln(lim,root,num,0);
		mul(num,num+lim,mon_in(m)); 
		Exp(lim,root,num,0);
		if(type) out_mon(lim,num);
	}
	template<typename ty>
	void spow(const unsigned lim,const ty *root,ty *num,int m)
	{
		ty *ret=(ty*)malloc(lim*sizeof(ty)*2);
		memset(ret,0,lim*sizeof(ty)*2);
		ret[0]=1;
		while(m)
		{
			if(m&1) mul(lim<<1,root,ret,num,lim,lim,lim);
			mul(lim<<1,root,num,num,lim,lim,lim);
			m>>=1;
		}
		memcpy(num,ret,lim*sizeof(ty));
		delete(ret);
	}
}
using namespace poly;
struct __uint256_t{
	__uint128_t hi,lo;
	#define il __attribute__((always_inline))
	__uint256_t(){}
	template<typename ty>
	__uint256_t(ty x){hi=0;lo=x;}
	__uint256_t(__uint128_t x,__uint128_t y){hi=x;lo=y;}
	template<typename ty>
	il void operator = (ty x){hi=0;lo=x;}
	il void operator = (__uint256_t x){hi=x.hi;lo=x.lo;}
	template<typename ty>
	il operator ty () {return lo;}
	il void operator += (__uint256_t x){lo+=x.lo;hi+=x.hi;if(lo<x.lo)hi++;}
	il void operator -= (__uint256_t x){if(lo<x.lo)hi--;lo-=x.lo;hi-=x.hi;}
	il __uint256_t operator + (__uint256_t x){__uint256_t y=*this;y+=x;return y;}
	il __uint256_t operator - (__uint256_t x){__uint256_t y=*this;y-=x;return y;}
	il void operator *= (__uint256_t x)
	{
		__uint128_t _x[4];
		_x[0]=(lo&-1ull)*(x.lo&-1ull);
		_x[1]=(lo&-1ull)*(x.lo>>64)+(lo>>64)*(x.lo&-1ull);
		_x[2]=(lo>>64)*(x.lo>>64)+(hi&-1ull)*(x.lo&-1ull)+(x.hi&-1ull)*(lo&-1ull);
		_x[3]=(hi>>64)*(x.lo&-1ull)+(x.hi>>64)*(lo&-1ull)+(lo>>64)*(x.hi&-1ull)+(x.lo>>64)*(hi&-1ull);
		lo=_x[0]+(_x[1]<<64);
		hi=(_x[1]>>64)+_x[2]+(_x[3]<<64);
	}
	il __uint256_t operator * (__uint256_t x){__uint256_t y=*this;y*=x;return y;}
	il void operator |= (__uint256_t x){lo|=x.lo;hi|=x.hi;}
	il __uint256_t operator | (__uint256_t x){return {hi|x.hi,lo|x.lo};}
	il void operator ^= (__uint256_t x){lo^=x.lo;hi^=x.hi;}
	il __uint256_t operator ^ (__uint256_t x){return {hi^x.hi,lo^x.lo};}
	il void operator &= (__uint256_t x){lo&=x.lo;hi&=x.hi;}
	il __uint256_t operator & (__uint256_t x){return {hi&x.hi,lo&x.lo};}
	il void operator <<= (unsigned x){hi=(hi<<x)^(lo>>(128-x));lo<<=x;}
	il __uint256_t operator << (unsigned x){return {(hi<<x)^(lo>>(128-x)),lo<<x};}
	il void operator >>= (unsigned x){lo=(lo>>x)^(hi<<(128-x));hi>>=x;}
	il __uint256_t operator >> (unsigned x){return {hi>>x,(lo>>x)^(hi<<(128-x))};}
	il __uint256_t operator ~ () {return {~hi,~lo};}
	il bool operator < (__uint256_t x){return hi!=x.hi?hi<x.hi:lo<x.lo;}
	il bool operator <=(__uint256_t x){return !(hi!=x.hi?hi>x.hi:lo>x.lo);}
	il bool operator > (__uint256_t x){return hi!=x.hi?hi>x.hi:lo>x.lo;}
	il bool operator >=(__uint256_t x){return !(hi!=x.hi?hi<x.hi:lo<x.lo);}
	il operator bool () {return hi||lo;};
	il void operator /= (__uint256_t x)
	{
		__uint256_t ans=0,fl=~__uint256_t(0);
		unsigned cnt=256;
		while(cnt--)
		{
			fl>>=1;
			if(x&fl) continue;
			if(x<<cnt<=*this) *this-=(x<<cnt),ans|=__uint256_t(1)<<cnt;
		}
		*this=ans;
	}
	il __uint256_t operator / (__uint256_t x)
	{
		__uint256_t ans=0,fl=~__uint256_t(0);
		unsigned cnt=256;
		while(cnt--)
		{
			fl<<=1;
			if(x&fl) continue;
			if(x<<cnt<=*this) *this-=(x<<cnt),ans|=__uint256_t(1)<<cnt;
		}
		return ans;
	}
	#undef il
};
#define mod1 754974721
#define mod2 167772161
#define mod3 469762049
#define mod12 126663740442542081ll
#define mod13 354658471880163329ll
#define mod23 78812994116517889ll
const __uint256_t mod123=(__uint128_t)mod1*mod2*mod3;
#define inv1 190329765
#define inv2 58587104
#define inv3 187290749
#include<cstdio>
#include<cstring>
#define N 2000005
int lena,lenb;
char re[N];
char *a;
char *b;
int num1[N],num2[N];
int_::modint<mod1> root1[N],num1_1[N];
int_::modint<mod2> root2[N],num1_2[N];
int_::modint<mod3> root3[N],num1_3[N];
char pr1[N];
#define sz 8
int main()
{
	int lim=1,i;
	fread(re,1,sizeof(re),stdin);
	a=re;
	for(i=0;re[i]>32;i++);
	lena=i;re[i]=0;
	for(i=i+1;re[i]<33;i++);
	b=re+i;lenb=i;
	for(i=i+1;re[i]>32;i++);
	lenb=i-lenb;re[i]=0;
	if(lena<=1&&!num1[0]||lenb<=1&&!num2[0])
	{
		putchar('0');
		return 0;
	}
	int cnt=(lena-1)%sz+1;
	for(int i=(lena+sz-1)/sz-1,j=0;i>=0;i--)
	{
		for(int k=0;k<cnt;k++)
			num1[i]=num1[i]*10+(a[j++]^48);
		cnt=sz;
	}
	memcpy(num1_1,num1,sizeof(int)*((lena+sz-1)/sz));
	memcpy(num1_2,num1,sizeof(int)*((lena+sz-1)/sz));
	memcpy(num1_3,num1,sizeof(int)*((lena+sz-1)/sz));
	cnt=(lenb-1)%sz+1;
	for(int i=(lenb+sz-1)/sz-1,j=0;i>=0;i--)
	{
		for(int k=0;k<cnt;k++)
			num2[i]=num2[i]*10+(b[j++]^48);
		cnt=sz;
	}
	while(lim<=(lena+sz-1)/sz+(lenb+sz-1)/sz) lim<<=1;
	init_root(lim,root1,11);init_root(lim,root2,3);init_root(lim,root3,3);
	mul(lim,root1,num1_1,(int_::modint<mod1>*)num2);
	mul(lim,root2,num1_2,(int_::modint<mod2>*)num2);
	mul(lim,root3,num1_3,(int_::modint<mod3>*)num2);
	__uint128_t num=0;
	pr1[N-1]=0;
	int len=N-1;
	__uint256_t brt=(__uint256_t(1)<<128)/mod123;
	for(int i=0;i<lim;i++)
	{
		__uint256_t modans=((__uint128_t)((int)num1_1[i])*inv1*mod23+
				(__uint128_t)((int)num1_2[i])*inv2*mod13+(__uint128_t)((int)num1_3[i])*inv3*mod12);
		modans=modans-__uint256_t((modans*brt).hi)*mod123;
		while(modans>=mod123) modans-=mod123;
		num+=__uint128_t(modans);
		unsigned long long fl1=num/100000000,fl=num-fl1*100000000;num=fl1;
		for(int k=0;k<sz;k++)
			pr1[--len]=fl%10^48,fl/=10;
		if(i>(lena+lenb+sz)/sz&&num==0) break;
	}
	while(pr1[len]=='0') len++;
	puts(pr1+len);
	return 0;
}

CompilationN/AN/ACompile OKScore: N/A

Testcase #131.893 ms13 MB + 748 KBAcceptedScore: 100


Judge Duck Online | 评测鸭在线
Server Time: 2025-09-14 23:00:12 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠