#pragma GCC optimize("Ofast")
#pragma GCC target("tune=native")
#include <immintrin.h>
typedef unsigned u32;
u32 cnt1[1 << 8], cnt2[1 << 8], cnt3[1 << 8], cnt4[1 << 8];
void sort(unsigned *a, int n)
{
u32* const tmp = new u32[n];
u32* const end= a + n;
register u32 *i, *j, *k, *l;
for (i = a; i != end; i+=8)
++cnt1[*i & 0xff], ++cnt2[(*i >> 8) & 0xff], ++cnt3[(*i >> 16) & 0xff], ++cnt4[*i >> 24],
++cnt1[*(i + 1) & 0xff], ++cnt2[(*(i + 1) >> 8) & 0xff], ++cnt3[(*(i + 1) >> 16) & 0xff], ++cnt4[*(i + 1) >> 24],
++cnt1[*(i + 2) & 0xff], ++cnt2[(*(i + 2) >> 8) & 0xff], ++cnt3[(*(i + 2) >> 16) & 0xff], ++cnt4[*(i + 2) >> 24],
++cnt1[*(i + 3) & 0xff], ++cnt2[(*(i + 3) >> 8) & 0xff], ++cnt3[(*(i + 3) >> 16) & 0xff], ++cnt4[*(i + 3) >> 24],
++cnt1[*(i + 4) & 0xff], ++cnt2[(*(i + 4) >> 8) & 0xff], ++cnt3[(*(i + 4) >> 16) & 0xff], ++cnt4[*(i + 4) >> 24],
++cnt1[*(i + 5) & 0xff], ++cnt2[(*(i + 5) >> 8) & 0xff], ++cnt3[(*(i + 5) >> 16) & 0xff], ++cnt4[*(i + 5) >> 24],
++cnt1[*(i + 6) & 0xff], ++cnt2[(*(i + 6) >> 8) & 0xff], ++cnt3[(*(i + 6) >> 16) & 0xff], ++cnt4[*(i + 6) >> 24],
++cnt1[*(i + 7) & 0xff], ++cnt2[(*(i + 7) >> 8) & 0xff], ++cnt3[(*(i + 7) >> 16) & 0xff], ++cnt4[*(i + 7) >> 24];
for(i = cnt1 + 1, j = cnt2 + 1, k = cnt3 + 1, l = cnt4 + 1; i <= cnt1 + 0xf8; i+=8, j+=8, k+=8, l+=8)
*i += *(i - 1), *j += *(j - 1), *k += *(k - 1), *l += *(l - 1),
*(i + 1) += *i, *(j + 1) += *j, *(k + 1) += *k, *(l + 1) += *l,
*(i + 2) += *(i + 1), *(j + 2) += *(j + 1), *(k + 2) += *(k + 1), *(l + 2) += *(l + 1),
*(i + 3) += *(i + 2), *(j + 3) += *(j + 2), *(k + 3) += *(k + 2), *(l + 3) += *(l + 2),
*(i + 4) += *(i + 3), *(j + 4) += *(j + 3), *(k + 4) += *(k + 3), *(l + 4) += *(l + 3),
*(i + 5) += *(i + 4), *(j + 5) += *(j + 4), *(k + 5) += *(k + 4), *(l + 5) += *(l + 4),
*(i + 6) += *(i + 5), *(j + 6) += *(j + 5), *(k + 6) += *(k + 5), *(l + 6) += *(l + 5),
*(i + 7) += *(i + 6), *(j + 7) += *(j + 6), *(k + 7) += *(k + 6), *(l + 7) += *(l + 6);
*i += *(i - 1), *j += *(j - 1), *k += *(k - 1), *l += *(l - 1),
*(i + 1) += *i, *(j + 1) += *j, *(k + 1) += *k, *(l + 1) += *l,
*(i + 2) += *(i + 1), *(j + 2) += *(j + 1), *(k + 2) += *(k + 1), *(l + 2) += *(l + 1),
*(i + 3) += *(i + 2), *(j + 3) += *(j + 2), *(k + 3) += *(k + 2), *(l + 3) += *(l + 2),
*(i + 4) += *(i + 3), *(j + 4) += *(j + 3), *(k + 4) += *(k + 3), *(l + 4) += *(l + 3),
*(i + 5) += *(i + 4), *(j + 5) += *(j + 4), *(k + 5) += *(k + 4), *(l + 5) += *(l + 4),
*(i + 6) += *(i + 5), *(j + 6) += *(j + 5), *(k + 6) += *(k + 5), *(l + 6) += *(l + 5);
for(i = end - 1; i != a - 1; i-=8)
_mm_prefetch(i - 256, _MM_HINT_NTA),
tmp[--cnt1[*i & 0xff]] = *i,
tmp[--cnt1[*(i - 1) & 0xff]] = *(i - 1),
tmp[--cnt1[*(i - 2) & 0xff]] = *(i - 2),
tmp[--cnt1[*(i - 3) & 0xff]] = *(i - 3),
tmp[--cnt1[*(i - 4) & 0xff]] = *(i - 4),
tmp[--cnt1[*(i - 5) & 0xff]] = *(i - 5),
tmp[--cnt1[*(i - 6) & 0xff]] = *(i - 6),
tmp[--cnt1[*(i - 7) & 0xff]] = *(i - 7);
for(i = tmp + n - 1; i != tmp - 1; i-=8)
_mm_prefetch(i - 256, _MM_HINT_NTA),
a[--cnt2[(*i >> 8) & 0xff]] = *i,
a[--cnt2[(*(i - 1) >> 8) & 0xff]] = *(i - 1),
a[--cnt2[(*(i - 2) >> 8) & 0xff]] = *(i - 2),
a[--cnt2[(*(i - 3) >> 8) & 0xff]] = *(i - 3),
a[--cnt2[(*(i - 4) >> 8) & 0xff]] = *(i - 4),
a[--cnt2[(*(i - 5) >> 8) & 0xff]] = *(i - 5),
a[--cnt2[(*(i - 6) >> 8) & 0xff]] = *(i - 6),
a[--cnt2[(*(i - 7) >> 8) & 0xff]] = *(i - 7);
for(i = end - 1; i != a - 1; i-=8)
_mm_prefetch(i - 256, _MM_HINT_NTA),
tmp[--cnt3[(*i >> 16) & 0xff]] = *i,
tmp[--cnt3[(*(i - 1) >> 16) & 0xff]] = *(i - 1),
tmp[--cnt3[(*(i - 2) >> 16) & 0xff]] = *(i - 2),
tmp[--cnt3[(*(i - 3) >> 16) & 0xff]] = *(i - 3),
tmp[--cnt3[(*(i - 4) >> 16) & 0xff]] = *(i - 4),
tmp[--cnt3[(*(i - 5) >> 16) & 0xff]] = *(i - 5),
tmp[--cnt3[(*(i - 6) >> 16) & 0xff]] = *(i - 6),
tmp[--cnt3[(*(i - 7) >> 16) & 0xff]] = *(i - 7);
for(i = tmp + n - 1; i != tmp - 1; i-=8)
_mm_prefetch(i - 256, _MM_HINT_NTA),
a[--cnt4[*i >> 24]] = *i,
a[--cnt4[*(i - 1) >> 24]] = *(i - 1),
a[--cnt4[*(i - 2) >> 24]] = *(i - 2),
a[--cnt4[*(i - 3) >> 24]] = *(i - 3),
a[--cnt4[*(i - 4) >> 24]] = *(i - 4),
a[--cnt4[*(i - 5) >> 24]] = *(i - 5),
a[--cnt4[*(i - 6) >> 24]] = *(i - 6),
a[--cnt4[*(i - 7) >> 24]] = *(i - 7);
delete[] tmp;
}
Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
Testcase #1 | 1.018 ms | 804 KB | Accepted | Score: 34 | 显示更多 |
Testcase #2 | 1.109 s | 762 MB + 988 KB | Accepted | Score: 33 | 显示更多 |
Testcase #3 | 2.219 s | 1525 MB + 924 KB | Accepted | Score: 33 | 显示更多 |