#pragma GCC optimize("Ofast,unroll-loops")
#pragma GCC target("avx2,tune=native")
typedef unsigned u32;

// Per-byte histograms (cntK) and running bucket offsets (ptrK) for the four
// radix passes: K = 1 is the least significant byte, K = 4 the most
// significant. They stay at file scope so the symbols remain available, but
// sort() fully reinitialises them on every call.
u32 cnt1[1 << 8], cnt2[1 << 8], cnt3[1 << 8], cnt4[1 << 8], ptr1[1<<8], ptr2[1<<8], ptr3[1<<8], ptr4[1<<8];

// One stable counting-sort pass: copies the `n` keys of `src` into `dst`,
// grouped by the byte selected with `shift` (0, 8, 16 or 24).
//
// `next[b]` must hold the index in `dst` where the first key whose selected
// byte equals `b` belongs; it is advanced as keys are placed. Walking `src`
// front to back while post-incrementing keeps equal bytes in their input
// order, which is what makes chaining the passes (LSD radix sort) correct.
static void radix_scatter_pass(const u32 *src, u32 *dst, u32 n, u32 *next, unsigned shift)
{
    for (u32 i = 0; i < n; ++i) {
        const u32 key = src[i];
        dst[next[(key >> shift) & 0xff]++] = key;
    }
}

// Sorts the `n` unsigned 32-bit keys in `a` into ascending order, in place,
// using a four-pass least-significant-byte-first radix sort.
//
// a: array of at least `n` keys; overwritten with the sorted result.
// n: number of keys. Values below 2 leave `a` untouched.
//
// Allocates a temporary buffer of `n` keys with new[] and releases it before
// returning. Uses the file-scope cnt*/ptr* tables as scratch space, so
// concurrent calls must not overlap.
void sort(unsigned *a, int n)
{
    // Nothing to order; this also keeps a non-positive n away from new[].
    if (n < 2)
        return;

    const u32 len = (u32)n;
    u32 *tmp = new u32[len];

    // The tables are global, so counts from a previous call must be cleared.
    for (int b = 0; b < 256; ++b)
        cnt1[b] = cnt2[b] = cnt3[b] = cnt4[b] = 0;

    // A single sweep over the input builds all four byte histograms.
    for (u32 i = 0; i < len; ++i) {
        const u32 key = a[i];
        ++cnt1[key & 0xff];
        ++cnt2[(key >> 8) & 0xff];
        ++cnt3[(key >> 16) & 0xff];
        ++cnt4[key >> 24];
    }

    // Exclusive prefix sums: ptrK[b] becomes the first output slot of bucket b.
    // Index 0 is written explicitly so no stale value survives between calls.
    u32 run1 = 0, run2 = 0, run3 = 0, run4 = 0;
    for (int b = 0; b < 256; ++b) {
        ptr1[b] = run1; run1 += cnt1[b];
        ptr2[b] = run2; run2 += cnt2[b];
        ptr3[b] = run3; run3 += cnt3[b];
        ptr4[b] = run4; run4 += cnt4[b];
    }

    // Ping-pong between `a` and `tmp`; four passes leave the result in `a`.
    radix_scatter_pass(a, tmp, len, ptr1, 0);
    radix_scatter_pass(tmp, a, len, ptr2, 8);
    radix_scatter_pass(a, tmp, len, ptr3, 16);
    radix_scatter_pass(tmp, a, len, ptr4, 24);

    delete[] tmp;
}
/*
 * Online-judge output pasted alongside the code:
 *
 * Compilation | N/A | N/A | Compile OK | Score: N/A | Show more |
 * Testcase #1 | 862.61 us | 816 KB | Runtime Error | Score: 0 | Show more |
 * Testcase #2 | 534.379 ms | 761 MB + 472 KB | Runtime Error | Score: 0 | Show more |
 * Testcase #3 | 1.07 s | 1522 MB + 856 KB | Runtime Error | Score: 0 | Show more |
 */