提交记录 27817
提交时间 |
评测时间 |
2025-01-27 13:35:18 |
2025-01-27 13:35:19 |
//written by deepseek-r1
#include <algorithm>
#include <cstdlib>
#include <immintrin.h>
#pragma GCC target("avx2")
// 每个基数桶的缓冲区大小(按256位SIMD对齐)
constexpr int BUF_SIZE = 1024;
void sort(unsigned *a, int n) {
const size_t alignment = 64;
unsigned *tmp = static_cast<unsigned*>(aligned_alloc(alignment, n * sizeof(unsigned)));
unsigned *src = a;
unsigned *dst = tmp;
for (int k = 0; k < 4; ++k) {
const unsigned shift = k * 8;
unsigned count[256] = {0};
// Phase 1: 统计分布 (SIMD加速)
unsigned *p = src;
const unsigned *end = src + n;
for (; p + 8 <= end; p += 8) {
__m256i vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
for (int i = 0; i < 8; ++i) {
unsigned x = _mm256_extract_epi32(vec, i);
count[(x >> shift) & 0xFF]++;
}
}
for (; p < end; ++p) {
count[(*p >> shift) & 0xFF]++;
}
// Phase 2: 计算前缀和
unsigned sum = 0;
for (int i = 0; i < 256; ++i) {
unsigned temp = count[i];
count[i] = sum;
sum += temp;
}
// Phase 3: 数据重排 (SIMD优化)
struct alignas(32) SIMDBuffer {
unsigned data[BUF_SIZE];
};
SIMDBuffer buffers[256];
unsigned buf_counts[256] = {0};
p = src;
end = src + n;
for (; p < end; ++p) {
unsigned x = *p;
unsigned radix = (x >> shift) & 0xFF;
// 填充缓冲区
buffers[radix].data[buf_counts[radix]++] = x;
// 缓冲区满时用SIMD批量写入
if (buf_counts[radix] == BUF_SIZE) {
unsigned *dest = dst + count[radix];
__m256i *src_vec = reinterpret_cast<__m256i*>(buffers[radix].data);
// 每次处理8个元素
for (int i = 0; i < BUF_SIZE; i += 8) {
__m256i data = _mm256_load_si256(src_vec + i/8);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dest + i), data);
}
count[radix] += BUF_SIZE;
buf_counts[radix] = 0;
}
}
// 处理剩余数据
for (int r = 0; r < 256; ++r) {
if (buf_counts[r] > 0) {
unsigned *dest = dst + count[r];
__m256i *src_vec = reinterpret_cast<__m256i*>(buffers[r].data);
// SIMD批量写入剩余元素
int i = 0;
for (; i + 8 <= buf_counts[r]; i += 8) {
__m256i data = _mm256_load_si256(src_vec + i/8);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dest + i), data);
}
// 处理尾部元素
for (; i < buf_counts[r]; ++i) {
dest[i] = buffers[r].data[i];
}
count[r] += buf_counts[r];
}
}
std::swap(src, dst);
}
// 结果拷贝优化
if (src == tmp) {
__builtin_memcpy(a, tmp, n * sizeof(unsigned));
}
free(tmp);
}
Compilation | N/A | N/A | Compile Error | Score: N/A | 显示更多 |
Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 13:54:37 | Loaded in 0 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠