#pragma GCC optimize("Ofast")
#pragma GCC target("avx2,popcnt,bmi,bmi2")
template<class T> struct vector_int {};
#define vector_int(T) template <> struct vector_int<T> { using type = __attribute((vector_size(32))) T; };
vector_int(char) vector_int(short) vector_int(int) vector_int(long) vector_int(long long)
#undef vector_int
template<class T, int N, class = typename vector_int<T>::type> struct btree_sum {
using vec = typename vector_int<T>::type;
static constexpr int B = 64 / sizeof(T), b = __builtin_ctz(B);
static constexpr int height(int n) {
return n <= B ? 1 : height(n >> b) + 1;
}
static constexpr int offset(int h) {
int s = 0, n = N;
while(h--) s += (n + B - 1) & -B, n = (n + B - 1) >> b;
return s;
}
static constexpr int H = height(N);
struct precalc {
alignas(64) T mask[B][B];
int offset_v[H];
constexpr precalc() :mask{}, offset_v{} {
for(int i = 0; i < B; i++)
for(int j = i + 1; j < B; j++) mask[i][j] = -1;
for(int i = 0; i < H; i++) offset_v[i] = offset(i);
}
};
static constexpr auto p = precalc();
alignas(64) T a[offset(H)];
inline T sum(unsigned k) {
T s = 0;
#pragma GCC unroll 64
for(int h = 0; h < H; h++)
s += a[p.offset_v[h] + (k >> (h * b))];
return s;
}
inline void add(unsigned k) {
const auto v = 1 + vec{};
#pragma GCC unroll 64
for(int h = 0; h < H; h++) {
auto t = (vec*)&a[p.offset_v[h] + (k >> (h * b) & -B)];
auto m = (vec*)p.mask[k >> (h * b) & (B - 1)];
t[0] += v & m[0], t[1] += v & m[1];
}
}
};
using u32 = unsigned;
using u64 = unsigned long long;
const int N = 1e7 + 64;
template<int id> void radixSort(int n, u32 a[N][4], u32 b[N][4]) {
u32 buc[3][256] = {};
#define GET(i, k) buc[k][i >> (k * 8) & 255]
for(int i = 0; i < n; i++) {
GET(a[i][id], 0)++, GET(a[i][id], 1)++, GET(a[i][id], 2)++;
}
for(int k = 0; k < 3; k++) {
u32 s = 0;
for(int i = 0; i < 256; i++) {
u32 t = buc[k][i];
buc[k][i] = s, s += t;
}
}
for(int i = 0; i < n; i++) __builtin_memcpy(b[GET(a[i][id], 0)++], a[i], 16);
for(int i = 0; i < n; i++) __builtin_memcpy(a[GET(b[i][id], 1)++], b[i], 16);
for(int i = 0; i < n; i++) __builtin_memcpy(b[GET(a[i][id], 2)++], a[i], 16);
}
void count_2d(int n, const u32* x, const u32* y, u32* out) {
static btree_sum<int, (N >> 6)> tree;
static u32 a[N][4], b[N][4];
static u64 mask[N >> 6];
for(int i = 0; i < n; i++) a[i][0] = x[i], a[i][1] = y[i], a[i][3] = i;
radixSort<1>(n, a, b);
for(int l = 0, r; l < n; l = r) {
u32 tmp = b[l][1];
for(r = l; b[r][1] == tmp; r++)
b[r][1] = l, b[r][2] = r;
}
radixSort<0>(n, b, a);
for(int l = 0, r; l < n; l = r) {
for(r = l; a[r][0] == a[l][0]; r++) {
u32 p = a[r][1];
out[a[r][3]] = tree.sum(p >> 6) + __builtin_popcountll(mask[p >> 6] & (1ULL << (p & 63)) - 1);
}
while(l < r) {
u32 p = a[l++][2];
tree.add(p >> 6), mask[p >> 6] |= 1ULL << (p & 63);
}
}
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 510.265 ms | 345 MB + 164 KB | Accepted | Score: 100 | 显示更多 |