#pragma GCC optimize("Ofast")
#include <bits/stdc++.h>
using u32 = unsigned int;
using u64 = unsigned long long;
namespace {
typedef u64 v4ul __attribute__((vector_size(32)));
constexpr size_t N = 3e5 + 5;
v4ul a[3][256][(N >> 6) + 1];
v4ul b[3][256][(N >> 6) + 1];
} // namespace
void solve(int n, int q, char *s1, char *s2, int *qx, int *qy, int *qlen,
unsigned *ans) {
for (int _ = 0; _ < 3; _++)
for (int i = 0; i < 256; i++)
for (int j = 0; (j << 8) + i < n; j++)
#pragma GCC unroll(8)
for (int k = (j << 8) + i; k < std::min(((j + 1) << 8) + i, n); k++)
a[_][i][j][(k - i - (j << 8)) >> 6] |= u64(s1[k] == '0' + _)
<< ((k - i) & 63);
for (int _ = 0; _ < 3; _++)
for (int i = 0; i < 256; i++)
for (int j = 0; (j << 8) + i < n; j++)
#pragma GCC unroll(8)
for (int k = (j << 8) + i; k < std::min(((j + 1) << 8) + i, n); k++)
b[_][i][j][(k - i - (j << 8)) >> 6] |= u64(s2[k] == '0' + _)
<< ((k - i) & 63);
for (int _ = 0; _ < q; _++) {
int x = qx[_], y = qy[_], l = qlen[_];
auto _sum = [&](int p, int q) -> u32 {
u32 res0 = 0, res1 = 0, res2 = 0, res3 = 0;
#pragma GCC unroll(4)
for (int i = 0; i < (l >> 8); i++) {
v4ul tmp = a[p][x & 255][i + (x >> 8)] & b[q][y & 255][i + (y >> 8)];
res0 += __builtin_popcountll(tmp[0]);
res1 += __builtin_popcountll(tmp[1]);
res2 += __builtin_popcountll(tmp[2]);
res3 += __builtin_popcountll(tmp[3]);
}
int _id = l >> 8, _cur = 0;
while (_cur < ((l >> 6) & 3)) {
u64 tmp = a[p][x & 255][_id + (x >> 8)][_cur] &
b[q][y & 255][_id + (y >> 8)][_cur];
res0 += __builtin_popcountll(tmp);
_cur++;
}
res0 += __builtin_popcountll(a[p][x & 255][_id + (x >> 8)][_cur] &
b[q][y & 255][_id + (y >> 8)][_cur] &
((1ull << (l & 63)) - 1));
return res0 + res1 + res2 + res3;
};
u32 _ans0 = _sum(0, 1), _ans1 = _sum(1, 2), _ans2 = _sum(2, 0);
ans[_] = _ans0 + _ans1 + _ans2;
}
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 4.07 ms | 6 MB + 48 KB | Accepted | Score: 50 | 显示更多 |
Testcase #2 | 3 s | 65 MB + 148 KB | Time Limit Exceeded | Score: 0 | 显示更多 |