提交记录 12584


用户 题目 状态 得分 用时 内存 语言 代码长度
fjzzq2002 1002i. 【模板题】多项式乘法 Accepted 100 18.997 ms 9864 KB C++ 11.08 KB
提交时间 评测时间
2020-04-23 08:32:20 2020-08-01 02:56:41
#pragma GCC optimize("-Ofast","-funroll-all-loops","-ffast-math")
#pragma GCC optimize("-fno-math-errno")
#pragma GCC optimize("-funsafe-math-optimizations")
#pragma GCC optimize("-freciprocal-math")
#pragma GCC optimize("-fno-trapping-math")
#pragma GCC optimize("-ffinite-math-only")
#pragma GCC optimize("-fno-stack-protector")
#pragma GCC target ("avx2")
#include <immintrin.h>
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <string>
#include <bitset>
#include <vector>
#include <set>
#include <map>
#include <queue>
#include <algorithm>
#include <sstream>
#include <stack>
#include <iomanip>
#include <assert.h>
using namespace std;
#define pb push_back
#define mp make_pair
typedef pair<int,int> pii;
typedef long long ll;
typedef double ld;
typedef vector<int> vi;
#define fi first
#define se second
#define fe first
#define FO(x) {freopen(#x".in","r",stdin);freopen(#x".out","w",stdout);}
#define Edg int M=0,fst[SZ],vb[SZ],nxt[SZ];void ad_de(int a,int b){++M;nxt[M]=fst[a];fst[a]=M;vb[M]=b;}void adde(int a,int b){ad_de(a,b);ad_de(b,a);}
#define Edgc int M=0,fst[SZ],vb[SZ],nxt[SZ],vc[SZ];void ad_de(int a,int b,int c){++M;nxt[M]=fst[a];fst[a]=M;vb[M]=b;vc[M]=c;}void adde(int a,int b,int c){ad_de(a,b,c);ad_de(b,a,c);}
#define es(x,e) (int e=fst[x];e;e=nxt[e])
#define esb(x,e,b) (int e=fst[x],b=vb[e];e;e=nxt[e],b=vb[e])
#define SZ 2345678
const int MOD=998244353;
ll qp(ll a,ll b)
{
	ll ans=1;
	while(b)
	{
		if(b&1) ans=ans*a%MOD;
		a=a*a%MOD; b>>=1;
	}
	return ans;
}
int getK(int n)
{int s=1; while(s<n) s<<=1; return s;}
typedef unsigned us;
typedef unsigned long long ull;
us pool[SZ*4] __attribute__ ((aligned(64))),*ptr=pool;
us *p0[SZ],*p1[SZ],*q0[SZ],*q1[SZ];
__attribute__((always_inline)) void bit_flip(us*p,int t)
{
	for(int i=0,j=0;i<t;++i)
	{
		if(i>j) swap(p[i],p[j]);
		for(int l=t>>1;(j^=l)<l;l>>=1);
	}
}
void prep(int n)
{
	static int t=1;
	for(;t<n;t<<=1)
	{
		int g=qp(3,(MOD-1)/(t*2));
		us*p,*q;
		p=p0[t]=ptr; ptr+=max(t,16); p[0]=1;
		for(int m=1;m<t;++m)
			p[m]=p[m-1]*(ull)g%us(MOD);
		bit_flip(p,t);
		q=q0[t]=ptr; ptr+=max(t,16);
		for(int i=0;i<t;++i)
			q[i]=(ull(p[i])<<32)/MOD;
		g=qp(g,MOD-2);
		p=p1[t]=ptr; ptr+=max(t,16); p[0]=1;
		for(int m=1;m<t;++m)
			p[m]=p[m-1]*(ull)g%us(MOD);
		bit_flip(p,t);
		q=q1[t]=ptr; ptr+=max(t,16);
		for(int i=0;i<t;++i)
			q[i]=(ull(p[i])<<32)/MOD;
	}
}
typedef unsigned long long ull;
__attribute__((always_inline)) us my_mul(us a,us b,us c)
{
	return b*(ull)a-((ull(a)*c)>>32)*ull(998244353);
}
__attribute__((always_inline)) __m128i my_mullo_epu32(const __m128i&a, const __m128i& b)
{
	#if defined ( __SSE4_2__ )
	return _mm_mullo_epi32(a,b);
	#else
	__m128i a13    = _mm_shuffle_epi32(a, 0xF5);          // (-,a3,-,a1)
	__m128i b13    = _mm_shuffle_epi32(b, 0xF5);          // (-,b3,-,b1)
	__m128i prod02 = _mm_mul_epu32(a, b);                 // (-,a2*b2,-,a0*b0)
	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
//	__m128i prod01 = ;   // (-,-,a1*b1,a0*b0) 
//	__m128i prod23 = ;   // (-,-,a3*b3,a2*b2) 
	__m128i prod   = _mm_unpacklo_epi64(_mm_unpacklo_epi32(prod02,prod13),_mm_unpackhi_epi32(prod02,prod13));   // (ab3,ab2,ab1,ab0)
	return prod;
	#endif
}
__attribute__((always_inline)) __m128i my_mulhi_epu32(const __m128i&a, const __m128i& b)
{
	__m128i a13    = _mm_shuffle_epi32(a, 0xF5);          // (-,a3,-,a1)
	__m128i b13    = _mm_shuffle_epi32(b, 0xF5);          // (-,b3,-,b1)
	__m128i prod02 = _mm_mul_epu32(a, b);                 // (a2*b2,-,a0*b0,-)
	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (a3*b3,-,a1*b1,-)
//	__m128i prod01 = ;   // (a1*b1,a0*b0,-,-) 
//	__m128i prod23 = ;   // (a3*b3,a2*b2,-,-) 
	__m128i prod   = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02,prod13),_mm_unpackhi_epi32(prod02,prod13));   // (ab3,ab2,ab1,ab0)
	return prod;
}
void ntt(us* __restrict__ x,int n)
{
	int t=n;
	for(int m=1;m<n;m<<=1)
	{
		t>>=1;
		us* __restrict__ p=p0[m];
		us* __restrict__ q=q0[m];
		if(t==1||t==2)
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
				for(int j=0;j<t;++j)
				{
					us u=xa[j]-(xa[j]>=us(MOD+MOD))*us(MOD+MOD);
					us v=my_mul(xb[j],p[i],q[i]);
					xa[j]=u+v;
					xb[j]=u-v+us(MOD+MOD);
				}
		}
		else if(t==4)
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
			{
				const __m128i p4=_mm_set1_epi32(p[i]),
				q4=_mm_set1_epi32(q[i]),
				mm=_mm_set1_epi32(MOD+MOD),
				m0=_mm_set1_epi32(0),
				m1=_mm_set1_epi32(MOD);
				for(int j=0;j<t;j+=4)
				{
					__m128i u=_mm_load_si128((__m128i*)(xa+j));
					u=_mm_sub_epi32(u,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u,mm),
					_mm_cmpgt_epi32(m0,u)),mm));
					__m128i v=_mm_load_si128((__m128i*)(xb+j));
					v=_mm_sub_epi32(my_mullo_epu32(v,p4),
					my_mullo_epu32(my_mulhi_epu32(v,q4),m1));
					_mm_store_si128((__m128i*)(xa+j),_mm_add_epi32(u,v));
					_mm_store_si128((__m128i*)(xb+j),_mm_add_epi32(_mm_sub_epi32(u,v),mm));
				}
			}
		}
		else
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
			{
				const __m128i p4=_mm_set1_epi32(p[i]),
				q4=_mm_set1_epi32(q[i]),
				mm=_mm_set1_epi32(MOD+MOD),
				m0=_mm_set1_epi32(0),
				m1=_mm_set1_epi32(MOD);
				//unfold 2x
				for(int j=0;j<t;j+=8)
				{
					__m128i u0=_mm_load_si128((__m128i*)(xa+j));
					__m128i u1=_mm_load_si128((__m128i*)(xa+j+4));
					__m128i v0=_mm_load_si128((__m128i*)(xb+j));
					__m128i v1=_mm_load_si128((__m128i*)(xb+j+4));
					u0=_mm_sub_epi32(u0,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u0,mm),
					_mm_cmpgt_epi32(m0,u0)),mm));
					u1=_mm_sub_epi32(u1,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(u1,mm),
					_mm_cmpgt_epi32(m0,u1)),mm));
					v0=_mm_sub_epi32(my_mullo_epu32(v0,p4),
					my_mullo_epu32(my_mulhi_epu32(v0,q4),m1));
					v1=_mm_sub_epi32(my_mullo_epu32(v1,p4),
					my_mullo_epu32(my_mulhi_epu32(v1,q4),m1));
					_mm_store_si128((__m128i*)(xa+j),_mm_add_epi32(u0,v0));
					_mm_store_si128((__m128i*)(xa+j+4),_mm_add_epi32(u1,v1));
					_mm_store_si128((__m128i*)(xb+j),
					_mm_add_epi32(_mm_sub_epi32(u0,v0),mm));
					_mm_store_si128((__m128i*)(xb+j+4),
					_mm_add_epi32(_mm_sub_epi32(u1,v1),mm));
				}
			}
		}
	}
	for(int i=0;i<n;++i)
		x[i]-=(x[i]>=us(MOD+MOD))*us(MOD+MOD),
		x[i]-=(x[i]>=us(MOD))*us(MOD);
}
void intt(us* __restrict__ x,int n)
{
	int t=1;
	for(int m=(n>>1);m;m>>=1)
	{
		us* __restrict__ p=p1[m];
		us* __restrict__ q=q1[m];
		if(t==1||t==2)
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
				for(int j=0;j<t;++j)
				{
					us u=xa[j],v=xb[j];
					xa[j]=u+v-(u+v>=us(MOD+MOD))*us(MOD+MOD);
					xb[j]=my_mul(u-v+us(MOD+MOD),p[i],q[i]);
				}
		}
		else if(t==4)
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
			{
				const __m128i p4=_mm_set1_epi32(p[i]),
				q4=_mm_set1_epi32(q[i]),
				mm=_mm_set1_epi32(MOD+MOD),
				m0=_mm_set1_epi32(0),
				m1=_mm_set1_epi32(MOD);
				for(int j=0;j<t;j+=4)
				{
					__m128i u=_mm_load_si128((__m128i*)(xa+j));
					__m128i v=_mm_load_si128((__m128i*)(xb+j));
					__m128i uv=_mm_add_epi32(u,v);
					_mm_store_si128((__m128i*)(xa+j),_mm_sub_epi32(uv,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv,mm),
					_mm_cmpgt_epi32(m0,uv)),mm)));
					uv=_mm_add_epi32(_mm_sub_epi32(u,v),mm);
					_mm_store_si128((__m128i*)(xb+j),
					_mm_sub_epi32(my_mullo_epu32(uv,p4),
					my_mullo_epu32(my_mulhi_epu32(uv,q4),m1)));
				}
			}
		}
		else
		{
			us *xa=x,*xb=x+t;
			for(int i=0;i<m;++i,xa+=t+t,xb+=t+t)
			{
				const __m128i p4=_mm_set1_epi32(p[i]),
				q4=_mm_set1_epi32(q[i]),
				mm=_mm_set1_epi32(MOD+MOD),
				m0=_mm_set1_epi32(0),
				m1=_mm_set1_epi32(MOD);
				//unfold 2x
				for(int j=0;j<t;j+=8)
				{
					__m128i u0=_mm_load_si128((__m128i*)(xa+j));
					__m128i u1=_mm_load_si128((__m128i*)(xa+j+4));
					__m128i v0=_mm_load_si128((__m128i*)(xb+j));
					__m128i v1=_mm_load_si128((__m128i*)(xb+j+4));
					__m128i uv0=_mm_add_epi32(u0,v0);
					__m128i uv1=_mm_add_epi32(u1,v1);
					_mm_store_si128((__m128i*)(xa+j),_mm_sub_epi32(uv0,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv0,mm),
					_mm_cmpgt_epi32(m0,uv0)),mm)));
					_mm_store_si128((__m128i*)(xa+j+4),_mm_sub_epi32(uv1,
					_mm_and_si128(_mm_or_si128(_mm_cmpgt_epi32(uv1,mm),
					_mm_cmpgt_epi32(m0,uv1)),mm)));
					uv0=_mm_add_epi32(_mm_sub_epi32(u0,v0),mm);
					uv1=_mm_add_epi32(_mm_sub_epi32(u1,v1),mm);
					_mm_store_si128((__m128i*)(xb+j),
					_mm_sub_epi32(my_mullo_epu32(uv0,p4),
					my_mullo_epu32(my_mulhi_epu32(uv0,q4),m1)));
					_mm_store_si128((__m128i*)(xb+j+4),
					_mm_sub_epi32(my_mullo_epu32(uv1,p4),
					my_mullo_epu32(my_mulhi_epu32(uv1,q4),m1)));
				}
			}
		}
		t<<=1;
	}
	us rn=qp(n,MOD-2);
	for(int i=0;i<n;++i)
		x[i]=x[i]*(ull)rn%MOD;
}
//copied from http://uoj.ac/submission/386710
typedef unsigned uint;
struct IO_Tp
{
	const static int _I_Buffer_Size = 2 << 20;
	char _I_Buffer[_I_Buffer_Size], *_I_pos = _I_Buffer;
	
	const static int _O_Buffer_Size = 2 << 20;
	char _O_Buffer[_O_Buffer_Size], *_O_pos = _O_Buffer;
	
	uint m[10000];
	
	IO_Tp()
	{
		constexpr uint e0 = '\0\0\0\1', e1 = '\0\0\1\0', e2 = '\0\1\0\0', e3 = '\1\0\0\0';
		int x = 0;
		for (uint i = 0, c0 = '0000'; i != 10; ++i, c0 += e0)
			for (uint j = 0, c1 = c0; j != 10; ++j, c1 += e1)
				for (uint k = 0, c2 = c1; k != 10; ++k, c2 += e2)
					for (uint l = 0, c3 = c2; l != 10; ++l, c3 += e3)
						m[x++] = c3;
		
		fread(_I_Buffer, 1, _I_Buffer_Size, stdin);
	}
	~IO_Tp() { fwrite(_O_Buffer, 1, _O_pos - _O_Buffer, stdout); }
	
	IO_Tp &operator>>(int &res)
	{
		while (!isdigit(*_I_pos))
			++_I_pos;
		res = *_I_pos++ - '0';
		while (isdigit(*_I_pos))
			res = res * 10 + (*_I_pos++ - '0');
		return *this;
	}
	
	IO_Tp &operator>>(uint &res)
	{
		while (!isdigit(*_I_pos))
			++_I_pos;
		res = *_I_pos++ - '0';
		while (isdigit(*_I_pos))
			res = res * 10 + (*_I_pos++ - '0');
		return *this;
	}
	
	IO_Tp &operator<<(uint x)
	{
		if (x == 0)
		{
			*_O_pos++ = '0';
			return *this; 
		}
		static char _buf[12];
		char *_pos = _buf + 12;
		if (x >= 10000)
			*--reinterpret_cast<uint *&>(_pos) = m[x % 10000], x /= 10000;
		if (x >= 10000)
			*--reinterpret_cast<uint *&>(_pos) = m[x % 10000], x /= 10000;
		*--reinterpret_cast<uint *&>(_pos) = m[x];
		_pos += (x < 1000) + (x < 100) + (x < 10);
		_O_pos = std::copy(_pos, _buf + 12, _O_pos);
		return *this;
	}
	
	IO_Tp &operator<<(char ch)
	{
		*_O_pos++ = ch;
		return *this;
	}
} IO;
int N,M;
us a[2345678] __attribute__ ((aligned(64)));
us b[2345678] __attribute__ ((aligned(64)));
#ifdef LOCAL
int main()
{
	N=100000; M=100000; int t=N+M-1;
	for(int i=0;i<N;i++) a[i]=i%9;
	for(int i=0;i<M;i++) b[i]=i%8;
	printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
	int K=getK(t); prep(K);
	printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
	for(int i=0;i<10;++i)
	{
	ntt(a,K); ntt(b,K);
	for(int i=0;i<K;i++) a[i]=(ll)a[i]*b[i]%MOD;
	intt(a,K);
	ll su=0;
	for(int i=0;i<t;i++) su+=a[i]%MOD;
	printf("%lld\n",su);
	}
	cout<<"ANS=99663681227949"<<"\n";
	cout<<"ans=998363825160779\n";
	printf("%.3lf\n",clock()*1.0/CLOCKS_PER_SEC);
}
#else
int main()
{
	IO>>N; IO>>M; ++N; ++M; int t=N+M-1;
	for(int i=0;i<N;i++) IO>>a[i];
	for(int i=0;i<M;i++) IO>>b[i];
	int K=getK(t); prep(K);
	ntt(a,K); ntt(b,K);
	for(int i=0;i<K;i++) a[i]=(ll)a[i]*b[i]%MOD;
	intt(a,K); for(int i=0;i<t;i++) IO<<a[i]<<' ';
}
#endif

CompilationN/AN/ACompile OKScore: N/A

Subtask #1 Testcase #142.1 us116 KBAcceptedScore: 0

Subtask #1 Testcase #218.997 ms9 MB + 488 KBAcceptedScore: 100

Subtask #1 Testcase #38.537 ms3 MB + 1004 KBAcceptedScore: 0

Subtask #1 Testcase #48.49 ms3 MB + 980 KBAcceptedScore: 0

Subtask #1 Testcase #545.18 us116 KBAcceptedScore: 0

Subtask #1 Testcase #642.25 us116 KBAcceptedScore: 0

Subtask #1 Testcase #742.42 us116 KBAcceptedScore: 0

Subtask #1 Testcase #817.783 ms8 MB + 908 KBAcceptedScore: 0

Subtask #1 Testcase #917.787 ms8 MB + 908 KBAcceptedScore: 0

Subtask #1 Testcase #1016.598 ms8 MB + 300 KBAcceptedScore: 0

Subtask #1 Testcase #1118.993 ms9 MB + 648 KBAcceptedScore: 0

Subtask #1 Testcase #1213.432 ms7 MB + 408 KBAcceptedScore: 0

Subtask #1 Testcase #1340.53 us100 KBAcceptedScore: 0


Judge Duck Online | 评测鸭在线
Server Time: 2025-01-19 02:24:02 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠