#if defined(__GNUC__) && !defined(__clang__)
#define O2 __attribute__((optimize("-O2")))
#else
#define O2
#endif
#include <stdio.h>
#include <stdlib.h>
#define BUCKET_MOD (0xFFU)
#define BUCKET_SIZE (BUCKET_MOD + 1U)
#ifndef MAX_SIZE
#define MAX_SIZE (1000000U)
#endif
#define SPLIT_LOOP_1(p, key, mod, value, index) \
*(--(key[value[index] mod].addr)) = value[index]
#define SPLIT_LOOP_2(p, key, mod, value, index) \
SPLIT_LOOP_1(p, key, mod, value, index + 1); \
SPLIT_LOOP_1(p, key, mod, value, index)
#define SPLIT_LOOP_4(p, key, mod, value, index) \
SPLIT_LOOP_2(p, key, mod, value, index + 2); \
SPLIT_LOOP_2(p, key, mod, value, index)
#define _SPLIT_LOOP_8_(p, key, mod, value) \
SPLIT_LOOP_4(p, key, mod, value, 4); \
SPLIT_LOOP_4(p, key, mod, value, 0)
#define ADD_LOOP_1(key, mod, v1) \
++key[v1 mod].offset
#define ADD_LOOP_2(key, mod, v1, v2) \
ADD_LOOP_1(key, mod, v1); \
ADD_LOOP_1(key, mod, v2)
#define ADD_LOOP_4(key, mod, v1, v2, v3, v4) \
ADD_LOOP_2(key, mod, v1, v2); \
ADD_LOOP_2(key, mod, v3, v4)
#define ADD_LOOP_8(key, mod, v1, v2, v3, v4, v5, v6, v7, v8) \
ADD_LOOP_4(key, mod, v1, v2, v3, v4) \
ADD_LOOP_4(key, mod, v5, v6, v7, v8)
#ifndef LOOP
#define SPLIT(dst, src, size, key, mod, p) \
for (p = src + size; p > src; ) { \
p -= 8; \
_SPLIT_LOOP_8_(dst, key, mod, p); \
}
#else
#define SPLIT(dst, src, size, key, mod, p) \
for (p = src + size; p > src; ) { \
register sort_t tmp = *(--p); \
*(--key[tmp mod].addr) = tmp; \
}
#endif
#define CAL_OFFSET_INIT(key, init, begin) \
key[init].addr = begin + key[init].offset;
#define CAL_OFFSET(key, i) \
key[i].addr = key[i - 1].addr + key[i].offset
typedef unsigned sort_t;
typedef union int_p_t {
unsigned int offset;
sort_t* addr;
} int_p_t;
// sort_t arr[MAX_SIZE];
O2 void sort(sort_t *data, int n) {
unsigned int i;
int_p_t k1[BUCKET_SIZE] = {0}, k2[BUCKET_SIZE] = {0}, k3[BUCKET_SIZE] = {0}, k4[BUCKET_SIZE] = {0};
sort_t *arr = (sort_t*)malloc(sizeof(data[0]) * n);
sort_t *p = data, *end = data + n;
for (; p < end; ) {
register sort_t tmp = *(p++);
ADD_LOOP_1(k4, & BUCKET_MOD, tmp);
ADD_LOOP_1(k3, >> 8 & BUCKET_MOD, tmp);
ADD_LOOP_1(k2, >> 16 & BUCKET_MOD, tmp);
ADD_LOOP_1(k1, >> 24, tmp);
// register sort_t tmp = p[0], tmp_ = p[1], tmp__ = p[2], tmp___ = p[3];
// ADD_LOOP_4(k4, & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k3, >> 8 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k2, >> 16 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k1, >> 24, tmp, tmp_, tmp__, tmp___);
// p += 4;
// register sort_t tmp = p[0], tmp_ = p[1], tmp__ = p[2], tmp___ = p[3];
// ADD_LOOP_4(k4, & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k3, >> 8 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k2, >> 16 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k1, >> 24, tmp, tmp_, tmp__, tmp___);
// tmp = p[4], tmp_ = p[5], tmp__ = p[6], tmp___ = p[7];
// ADD_LOOP_4(k4, & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k3, >> 8 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k2, >> 16 & BUCKET_MOD, tmp, tmp_, tmp__, tmp___);
// ADD_LOOP_4(k1, >> 24, tmp, tmp_, tmp__, tmp___);
// tmp = p[4], tmp_ = p[5], tmp__ = p[6], tmp___ = p[7];
// p += 8;
}
CAL_OFFSET_INIT(k4, 0, arr);
CAL_OFFSET_INIT(k3, 0, data);
CAL_OFFSET_INIT(k2, 0, arr);
CAL_OFFSET_INIT(k1, 0, data);
for (i = 1; i < BUCKET_SIZE; ++i) {
CAL_OFFSET(k4, i);
CAL_OFFSET(k3, i);
CAL_OFFSET(k2, i);
CAL_OFFSET(k1, i);
}
SPLIT(arr, data, n, k4, & BUCKET_MOD, p);
SPLIT(data, arr, n, k3, >> 8 & BUCKET_MOD, p);
SPLIT(arr, data, n, k2, >> 16 & BUCKET_MOD, p);
SPLIT(data, arr, n, k1, >> 24, p);
free(arr);
}
| Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
| Testcase #1 | 805.142 ms | 762 MB + 984 KB | Accepted | Score: 100 | 显示更多 |