#include <stdio.h>
#include <stdlib.h>
#define BUCKET_MOD (0xFFU)
#define BUCKET_SIZE (0x100U)
#define SPLIT_LOOP_1(p,key,mod,value,index) *(--(key[value[index] mod].addr))=value[index]
#define SPLIT_LOOP_2(p,key,mod,value,index)\
SPLIT_LOOP_1(p,key,mod,value,index+1);\
SPLIT_LOOP_1(p,key,mod,value,index)
#define SPLIT_LOOP_4(p,key,mod,value,index)\
SPLIT_LOOP_2(p,key,mod,value,index+2);\
SPLIT_LOOP_2(p,key,mod,value,index)
#define _SPLIT_LOOP_8_(p,key,mod,value)\
SPLIT_LOOP_4(p,key,mod,value,4);\
SPLIT_LOOP_4(p,key,mod,value,0)
#define ADD_LOOP_1(key,mod,v1) ++key[v1 mod].offset
#define ADD_LOOP_2(key,mod,v1,v2)\
ADD_LOOP_1(key,mod,v1);\
ADD_LOOP_1(key,mod,v2)
#define ADD_LOOP_4(key,mod,v1,v2,v3,v4)\
ADD_LOOP_2(key,mod,v1,v2);\
ADD_LOOP_2(key,mod,v3,v4)
#define ADD_LOOP_8(key,mod,v1,v2,v3,v4,v5,v6,v7,v8)\
ADD_LOOP_4(key,mod,v1,v2,v3,v4)\
ADD_LOOP_4(key,mod,v5,v6,v7,v8)
#define SPLIT_END_LOOP(dst,src,end,key,mod,p)\
for(p=end;p>src;){\
p-=8;\
_SPLIT_LOOP_8_(dst,key,mod,p);\
}
#define SPLIT_LOOP(dst,src,size,key,mod,p) SPLIT_END_LOOP(dst,src,src+size,key,mod,p)
#define SPLIT_END_NOLOOP(dst,src,end,key,mod,p)\
for(p=end;p>src;){\
sort_t tmp=*(--p);\
*(--key[tmp mod].addr)=tmp;\
}
#define SPLIT_NOLOOP(dst,src,size,key,mod,p) SPLIT_END_NOLOOP(dst,src,src+size,key,mod,p)
#ifndef LOOP
#define SPLIT(dst,src,size,key,mod,p) SPLIT_LOOP(dst,src,size,key,mod,p)
#else
#define SPLIT(dst,src,size,key,mod,p) SPLIT_NOLOOP(dst,src,size,key,mod,p)
#endif
#define CAL_OFFSET_INIT(key,init,begin) key[init].addr=begin+key[init].offset;
#define CAL_OFFSET(key,i) key[i].addr=key[i-1].addr+key[i].offset
typedef unsigned sort_t;
typedef union int_p_t{unsigned int offset;sort_t* addr;} int_p_t;
#define LSB_SORT(dst,src,dst_end,src_end)\
do{\
int_p_t k2[BUCKET_SIZE]={0},k3[BUCKET_SIZE]={0},k4[BUCKET_SIZE]={0};\
sort_t *p=src;\
for(;p<src_end;){\
sort_t tmp=*(p++);\
ADD_LOOP_1(k4,&BUCKET_MOD,tmp);\
ADD_LOOP_1(k3,>>8&BUCKET_MOD,tmp);\
ADD_LOOP_1(k2,>>16&BUCKET_MOD,tmp);\
}\
CAL_OFFSET_INIT(k4,0,dst);\
CAL_OFFSET_INIT(k3,0,src);\
CAL_OFFSET_INIT(k2,0,dst);\
for(unsigned int i=1;i<BUCKET_SIZE;++i){\
CAL_OFFSET(k4,i);\
CAL_OFFSET(k3,i);\
CAL_OFFSET(k2,i);\
}\
SPLIT_END_NOLOOP(dst,src,src_end,k4,&BUCKET_MOD,p);\
SPLIT_END_NOLOOP(src,dst,dst_end,k3,>>8&BUCKET_MOD,p);\
SPLIT_END_NOLOOP(dst,src,src_end,k2,>>16&BUCKET_MOD,p);\
}while(0)
void sort(sort_t *data,int n){
int_p_t k1[BUCKET_SIZE]={0};
sort_t *arr=(sort_t*)malloc(sizeof(data[0])*n);
sort_t *p=data,*end=data+n;
for(;p<end;){
sort_t tmp=p[0],tmp_=p[1],tmp__=p[2],tmp___=p[3];
ADD_LOOP_4(k1,>>24,tmp,tmp_,tmp__,tmp___);
p+=4;
}
CAL_OFFSET_INIT(k1,0,arr);
for(unsigned int i=1;i<BUCKET_SIZE;++i){
CAL_OFFSET(k1,i);
}
SPLIT(arr,data,n,k1,>>24,p);
end=arr+n;
sort_t* data_end=data+n;
for (unsigned int i=BUCKET_SIZE;i;){
sort_t* begin=k1[--i].addr;
sort_t* data_begin=data_end-(end-begin);
LSB_SORT(data_begin,begin,data_end,end);
data_end=data_begin;
end=begin;
}
free(arr);
}
#undef BUCKET_MOD
#undef BUCKET_SIZE
#undef SPLIT_LOOP_1
#undef SPLIT_LOOP_2
#undef SPLIT_LOOP_4
#undef _SPLIT_LOOP_8_
#undef ADD_LOOP_1
#undef ADD_LOOP_2
#undef ADD_LOOP_4
#undef ADD_LOOP_8
#undef SPLIT_END_LOOP
#undef SPLIT_LOOP
#undef SPLIT_END_NOLOOP
#undef SPLIT_NOLOOP
#undef SPLIT
#undef CAL_OFFSET_INIT
#undef CAL_OFFSET
#undef LSB_SORT