#include <bits/stdc++.h>
using namespace std;
// Number of elements generated and sorted by this program (one hundred million).
constexpr int n = 100 * 1000 * 1000;
/// One stable counting-sort (radix) pass over `a`, keyed by `lambda`.
///
/// Elements are grouped by key in ascending key order, and elements that share
/// a key keep their relative order. That stability is what lets successive
/// passes over increasingly significant digits compose into a full sort.
///
/// @param a      Array to reorder in place; must hold at least `count` elements.
/// @param b      Scratch buffer with room for at least `count` elements; its
///               previous contents are overwritten.
/// @param lambda Callable mapping an element to its bucket index; every result
///               must lie in [0, 256).
/// @param count  Number of elements to process. Defaults to the file-level `n`
///               so existing three-argument calls keep their meaning.
template <class T>
void F(uint* a, uint* b, T lambda, std::size_t count = n) {
  constexpr std::size_t kBuckets = 256;
  if (count == 0) return;

  // Count every bucket's population first. Sizing buckets exactly (instead of
  // handing each one a fixed slice of `b`) means skewed input can no longer
  // make a bucket spill into its neighbour or past the end of the scratch
  // buffer.
  std::size_t slot[kBuckets] = {};
  for (std::size_t i = 0; i < count; ++i) ++slot[lambda(a[i])];

  // Exclusive prefix sum: slot[k] becomes the first free index of bucket k.
  std::size_t offset = 0;
  for (std::size_t k = 0; k < kBuckets; ++k) {
    const std::size_t population = slot[k];
    slot[k] = offset;
    offset += population;
  }

  // Scatter in input order, which is what keeps the pass stable.
  for (std::size_t i = 0; i < count; ++i) b[slot[lambda(a[i])]++] = a[i];

  // The buckets are contiguous in b, so a single copy moves the result back.
  std::memcpy(a, b, count * sizeof(uint));
}

/// Sorts the first `__n` elements of `a` into ascending order.
///
/// Least-significant-digit radix sort: four stable passes of F, one per byte,
/// starting from the lowest byte. Does nothing when `a` is null or `__n` < 2.
///
/// @param a   Array to sort in place.
/// @param __n Number of elements in `a` to sort.
/// @throws std::bad_alloc if the scratch buffer cannot be allocated.
void sort(uint* a, int __n) {
  // The final pass uses `x >> 24` directly as a bucket index, which is only a
  // value below 256 when uint has exactly 32 bits.
  static_assert(sizeof(uint) == 4, "radix passes assume a 32-bit uint");

  if (a == nullptr || __n < 2) return;
  const std::size_t count = static_cast<std::size_t>(__n);

  // The scratch buffer is owned here so it is released on every exit path.
  // new[] leaves the elements uninitialised, which avoids a pointless zeroing
  // pass over memory that F overwrites anyway.
  const std::unique_ptr<uint[]> scratch(new uint[count]);
  uint* const b = scratch.get();

  F(a, b, [](uint x) { return x & 255; }, count);
  F(a, b, [](uint x) { return x >> 8 & 255; }, count);
  F(a, b, [](uint x) { return x >> 16 & 255; }, count);
  F(a, b, [](uint x) { return x >> 24; }, count);
}
int main() {
uint* a = (uint*)malloc(n * sizeof(uint));
uint* b = (uint*)malloc(n * sizeof(uint));
mt19937 gen;
for (int i = 0; i < n; i++) a[i] = gen();
memcpy(b, a, n * 4);
sort(a, n), sort(b, b + n);
if (memcmp(a, b, n * 4))
cout << "wa" << endl;
}