// Kept in case code outside this view still expands it; solve() below no
// longer uses hand-written inline assembly.
#define asmv asm volatile

// The AVX2 kernel relies on GNU C vector extensions and the x86 `target`
// attribute; everywhere else only the portable scalar kernel is built.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#define SOLVE_HAVE_AVX2_KERNEL 1
#else
#define SOLVE_HAVE_AVX2_KERNEL 0
#endif

// Portable kernel: number of indices i in [0, len) with a[i] == b[i].
static unsigned count_equal_scalar(const char *a, const char *b, unsigned len) {
	unsigned matches = 0;
	for (unsigned i = 0; i < len; ++i) {
		matches += (a[i] == b[i]);
	}
	return matches;
}

#if SOLVE_HAVE_AVX2_KERNEL
// 32 unsigned bytes processed as a single vector value.
typedef unsigned char solve_u8x32 __attribute__((vector_size(32)));

// AVX2 kernel with the same contract as count_equal_scalar().
//
// Each step compares 64 bytes (two 32-byte vectors). A byte compare yields
// 0xFF (-1) per equal lane, so `acc -= cmp` adds 1 to the lane's counter.
// The counters are 8 bits wide, therefore they are flushed into the 32-bit
// total after at most 255 steps, before any lane can wrap.
static __attribute__((target("avx2")))
unsigned count_equal_avx2(const char *a, const char *b, unsigned len) {
	const unsigned step_bytes = 64;
	const unsigned max_steps = 255;
	unsigned matches = 0;

	while (len >= step_bytes) {
		unsigned steps = len / step_bytes;
		if (steps > max_steps) {
			steps = max_steps;
		}

		solve_u8x32 acc_lo = {0};
		solve_u8x32 acc_hi = {0};
		for (unsigned i = 0; i < steps; ++i) {
			solve_u8x32 a_lo, a_hi, b_lo, b_hi;
			// memcpy expresses an unaligned load without aliasing problems.
			__builtin_memcpy(&a_lo, a, 32);
			__builtin_memcpy(&a_hi, a + 32, 32);
			__builtin_memcpy(&b_lo, b, 32);
			__builtin_memcpy(&b_hi, b + 32, 32);
			acc_lo -= (solve_u8x32)(a_lo == b_lo);
			acc_hi -= (solve_u8x32)(a_hi == b_hi);
			a += step_bytes;
			b += step_bytes;
		}

		// Horizontal sum of all 64 byte counters of this group.
		unsigned char lanes[64];
		__builtin_memcpy(lanes, &acc_lo, 32);
		__builtin_memcpy(lanes + 32, &acc_hi, 32);
		for (unsigned i = 0; i < 64; ++i) {
			matches += lanes[i];
		}

		len -= steps * step_bytes;
	}

	// Fewer than 64 bytes remain: finish them one by one.
	return matches + count_equal_scalar(a, b, len);
}
#endif

// Answers q substring-comparison queries.
//
// First every byte of s1[0..n) that is '0', '1' or '2' is replaced in place
// by '1', '2' or '0' respectively (other bytes are left untouched). Then, for
// each query k, ans[k] receives the number of offsets i in [0, q_len[k]) for
// which the remapped s1[q_x[k] + i] equals s2[q_y[k] + i]. A non-positive
// q_len[k] yields 0.
//
// The caller must make sure every queried range lies inside s1 / s2; no
// bytes outside [q_x[k], q_x[k] + q_len[k]) and [q_y[k], q_y[k] + q_len[k])
// are read.
void solve(int n, int q, char *s1, char *s2, int *q_x, int *q_y, int *q_len, unsigned *ans) {
	for (int i = 0; i < n; ++i) {
		// Wraps to a large value for bytes below '0', so one test covers both ends.
		unsigned digit = (unsigned)((unsigned char)s1[i] - '0');
		if (digit < 3u) {
			s1[i] = "120"[digit];
		}
	}

#if SOLVE_HAVE_AVX2_KERNEL
	// Pick the kernel once; the CPU (and OS) must actually support AVX2.
	__builtin_cpu_init();
	const int have_avx2 = __builtin_cpu_supports("avx2") != 0;
#endif

	for (int k = 0; k < q; ++k) {
		const char *lhs = s1 + q_x[k];
		const char *rhs = s2 + q_y[k];
		unsigned len = q_len[k] > 0 ? (unsigned)q_len[k] : 0u;

#if SOLVE_HAVE_AVX2_KERNEL
		if (have_avx2) {
			ans[k] = count_equal_avx2(lhs, rhs, len);
			continue;
		}
#endif
		ans[k] = count_equal_scalar(lhs, rhs, len);
	}
}
				
				
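/*
 * Judge results as pasted together with this source:
 *
 * | Stage       | Time       | Memory        | Verdict      | Score      | Details   |
 * | Compilation | N/A        | N/A           | Compile OK   | Score: N/A | Show more |
 * | Testcase #1 | 161.67 us  | 40 KB         | Wrong Answer | Score: 0   | Show more |
 * | Testcase #2 | 848.713 ms | 5 MB + 176 KB | Wrong Answer | Score: 0   | Show more |
 */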