#pragma GCC optimize("Ofast,inline,unroll-loops")
#include <bits/stdc++.h>
#include <immintrin.h>
#include <algorithm>
using namespace std;
    uint  b[1<<27];
template<int n>
struct foo {
template<class T>
static void F(uint* __restrict__ buc, uint* __restrict__ a, uint* __restrict__ b, T lambda) {
    for (int i = 0; i < n; i += 16) {
        #pragma GCC unroll 16
        for (int j = 0; j < 16; j++)
            b[buc[lambda(a[i + j])]++] = a[i + j];
    }
}
static void sort(uint* a, int) {
    uint buc[4][256] = {};
    for (int i = 0; i < n; i++) {
        buc[0][a[i] & 255]++;
        buc[1][a[i] >> 8 & 255]++;
        buc[2][a[i] >> 16 & 255]++;
        buc[3][(a[i] >> 16) >> 8 & 255]++;
    }
    for (int k = 0; k < 4; k++) {
        uint32_t offset = 0;
        for (int i = 0; i < 256; i++)
            swap(buc[k][i], offset), offset += buc[k][i];
    }
    F(buc[0], a, b, [](uint x) { return x & 255; });
    F(buc[1], b, a, [](uint x) { return x >> 8 & 255; });
    F(buc[2], a, b, [](uint x) { return x >> 16 & 255; });
    F(buc[3], b, n==34217728?b+34217728:a, [](uint x) { return x >> 24; });
}
};
void sort(uint* a, int __n) {
    // nth_element(a, a+34217728, a+134217728);
    foo<100000000>::sort(a+34217728,100000000);
    foo<34217728>::sort(a, 34217728);
    int p = 0, q = 34217728, r = 34217728; 
    while (p!=r) a[p++] = r==134217728||b[q]<a[r]?b[q++]:a[r++];
}
				
				
				| Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 | 
| Testcase #1 | 1.411 s | 893 MB + 520 KB | Accepted | Score: 100 | 显示更多 |