// The AVX2 kernel below is one asm statement per group with all operands and
// clobbers declared, so it makes no assumption about whether the compiler
// uses the ymm registers elsewhere.
// The asmv macro is not used by solve(); it is kept in case other code in
// this file relies on it.
#define asmv asm volatile

// The inline-asm kernel is only built for x86-64 with a GNU-compatible
// compiler; everything else takes the scalar path.
#if defined(__x86_64__) && defined(__GNUC__)
#define SOLVE_AVX2_KERNEL 1
#else
#define SOLVE_AVX2_KERNEL 0
#endif

enum {
    SOLVE_BLOCK_BYTES = 64,   // bytes compared per kernel iteration (2 x 32-byte vectors)
    SOLVE_GROUP_BLOCKS = 255  // max iterations before an 8-bit per-lane counter could wrap
};

#if SOLVE_AVX2_KERNEL
// Counts the positions i in [0, 64 * blocks) with a[i] == b[i].
// Requires 1 <= blocks <= SOLVE_GROUP_BLOCKS and a CPU with AVX2.
static unsigned solve_count_group_avx2(const char *a, const char *b,
                                       unsigned long long blocks)
{
    unsigned long long matches;

    __asm__ __volatile__(
        "vpxor %%ymm0, %%ymm0, %%ymm0\n\t"        // per-byte counters, first 32 bytes of each block
        "vpxor %%ymm1, %%ymm1, %%ymm1\n\t"        // per-byte counters, second 32 bytes
        "1:\n\t"
        "vmovdqu (%[a]), %%ymm2\n\t"
        "vmovdqu 32(%[a]), %%ymm3\n\t"
        "vpcmpeqb (%[b]), %%ymm2, %%ymm2\n\t"     // 0xFF where the bytes are equal
        "vpcmpeqb 32(%[b]), %%ymm3, %%ymm3\n\t"
        "vpsubb %%ymm2, %%ymm0, %%ymm0\n\t"       // counter - (-1) == counter + 1
        "vpsubb %%ymm3, %%ymm1, %%ymm1\n\t"
        "add $64, %[a]\n\t"
        "add $64, %[b]\n\t"
        "dec %[n]\n\t"
        "jnz 1b\n\t"
        "vpxor %%ymm2, %%ymm2, %%ymm2\n\t"
        "vpsadbw %%ymm2, %%ymm0, %%ymm0\n\t"      // sum each run of 8 byte counters into a 64-bit lane
        "vpsadbw %%ymm2, %%ymm1, %%ymm1\n\t"
        "vpaddq %%ymm1, %%ymm0, %%ymm0\n\t"
        "vextracti128 $1, %%ymm0, %%xmm1\n\t"
        "vpaddq %%xmm1, %%xmm0, %%xmm0\n\t"
        "vpshufd $0x4e, %%xmm0, %%xmm1\n\t"       // swap the two 64-bit halves
        "vpaddq %%xmm1, %%xmm0, %%xmm0\n\t"
        "vmovq %%xmm0, %[sum]\n\t"
        "vzeroupper"                              // zeroes the upper half of every ymm register
        : [a] "+r"(a), [b] "+r"(b), [n] "+r"(blocks), [sum] "=&r"(matches)
        :
        // All vector registers are listed because vzeroupper touches all of
        // them; "memory" because the asm reads through a and b.
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
          "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
          "cc", "memory");

    return (unsigned)matches;
}
#endif

// Counts the positions i in [0, len) with a[i] == b[i].
// When use_avx2 is non-zero (and the kernel was compiled in), whole 64-byte
// blocks go through the AVX2 kernel in groups of at most SOLVE_GROUP_BLOCKS;
// the remaining len % 64 bytes are compared one at a time.
static unsigned solve_count_equal(const char *a, const char *b, unsigned len,
                                  int use_avx2)
{
    unsigned total = 0;

#if SOLVE_AVX2_KERNEL
    if (use_avx2) {
        unsigned blocks_left = len / (unsigned)SOLVE_BLOCK_BYTES;

        while (blocks_left > 0) {
            unsigned group = blocks_left < (unsigned)SOLVE_GROUP_BLOCKS
                                 ? blocks_left
                                 : (unsigned)SOLVE_GROUP_BLOCKS;

            // Each group is reduced to a scalar right away, so no narrow
            // vector lane is carried across groups.
            total += solve_count_group_avx2(a, b, group);
            a += group * (unsigned)SOLVE_BLOCK_BYTES;
            b += group * (unsigned)SOLVE_BLOCK_BYTES;
            blocks_left -= group;
        }
        len %= (unsigned)SOLVE_BLOCK_BYTES;  // tail is the leftover BYTES, not leftover blocks
    }
#else
    (void)use_avx2;
#endif

    for (unsigned i = 0; i < len; ++i) {
        total += (a[i] == b[i]);
    }
    return total;
}

// Rewrites s1[0..n) in place ('0' -> '1', '1' -> '2', '2' -> '0'), then for
// each query k in [0, q) stores in ans[k] the number of offsets i in
// [0, q_len[k]) with s1[q_x[k] + i] == s2[q_y[k] + i] (s1 after the rewrite).
//
// NOTE(review): assumes s1 holds only the characters '0'..'2' and that every
// query range lies inside both buffers; neither is checked here -- confirm
// against the caller.
void solve(int n, int q, char *s1, char *s2, int *q_x, int *q_y, int *q_len, unsigned *ans) {
#if SOLVE_AVX2_KERNEL
    // Decide once per call whether the running CPU can execute the kernel.
    const int use_avx2 = __builtin_cpu_supports("avx2") ? 1 : 0;
#else
    const int use_avx2 = 0;
#endif

    for (int i = 0; i < n; ++i) {
        s1[i] = "120"[s1[i] - '0'];
    }

    for (int k = 0; k < q; ++k) {
        ans[k] = solve_count_equal(s1 + q_x[k], s2 + q_y[k],
                                   (unsigned)q_len[k], use_avx2);
    }
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 161.6 us | 40 KB | Wrong Answer | Score: 0 | 显示更多 |
Testcase #2 | 848.148 ms | 5 MB + 176 KB | Wrong Answer | Score: 0 | 显示更多 |