提交记录 27818
提交时间 |
评测时间 |
2025-01-27 14:01:43 |
2025-01-27 14:01:45 |
//written by deepseek-r1
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <immintrin.h>
#pragma GCC target("avx2")
// 内存对齐分配
inline unsigned* aligned_alloc(size_t alignment, size_t size) {
void *ptr = nullptr;
posix_memalign(&ptr, alignment, size);
return static_cast<unsigned*>(ptr);
}
void sort(unsigned *a, int n) {
constexpr size_t alignment = 64;
unsigned *tmp = aligned_alloc(alignment, n * sizeof(unsigned));
unsigned *src = a;
unsigned *dst = tmp;
// 每个基数桶的SIMD缓冲区
constexpr int SIMD_WIDTH = 8;
alignas(32) unsigned simd_buffer[256][SIMD_WIDTH];
unsigned buffer_counts[256] = {0};
for (int k = 0; k < 4; ++k) {
const unsigned shift = k * 8;
unsigned count[256] = {0};
// Phase 1: 统计分布(SIMD优化)
const int prefetch_offset = 64; // 预取提前量
unsigned *p = src;
for (; p <= src + n - 32; p += 32) {
// 预取未来数据
_mm_prefetch(reinterpret_cast<const char*>(p + prefetch_offset), _MM_HINT_T0);
// 批量处理32个元素(4个SIMD向量)
__m256i vec[4];
for (int i = 0; i < 4; ++i) {
vec[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p + i*8));
}
// 统计每个元素的基数
alignas(32) unsigned temp_store[32];
for (int i = 0; i < 4; ++i) {
_mm256_store_si256(reinterpret_cast<__m256i*>(temp_store + i*8), vec[i]);
}
for (int i = 0; i < 32; ++i) {
++count[(temp_store[i] >> shift) & 0xFF];
}
}
// 处理尾部剩余元素
for (; p < src + n; ++p) {
++count[(*p >> shift) & 0xFF];
}
// Phase 2: 前缀和计算
unsigned sum = 0;
for (int i = 0; i < 256; ++i) {
unsigned temp = count[i];
count[i] = sum;
sum += temp;
}
// Phase 3: 数据重排(SIMD优化)
p = src;
for (; p <= src + n - 8; p += 8) {
// 加载8个元素
__m256i vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
// 提取并处理每个元素
alignas(32) unsigned elements[8];
_mm256_store_si256(reinterpret_cast<__m256i*>(elements), vec);
for (int i = 0; i < 8; ++i) {
const unsigned x = elements[i];
const unsigned radix = (x >> shift) & 0xFF;
// 填充SIMD缓冲区
simd_buffer[radix][buffer_counts[radix]++] = x;
// 缓冲区满时批量写入
if (buffer_counts[radix] == SIMD_WIDTH) {
__m256i data = _mm256_load_si256(reinterpret_cast<__m256i*>(simd_buffer[radix]));
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + count[radix]),
data
);
count[radix] += SIMD_WIDTH;
buffer_counts[radix] = 0;
}
}
}
// 处理尾部元素
for (; p < src + n; ++p) {
const unsigned x = *p;
const unsigned radix = (x >> shift) & 0xFF;
dst[count[radix]++] = x;
}
// 处理剩余的缓冲区数据
for (int r = 0; r < 256; ++r) {
if (buffer_counts[r] > 0) {
__m256i data = _mm256_maskz_loadu_epi32(
(1U << buffer_counts[r]) - 1,
simd_buffer[r]
);
_mm256_mask_storeu_epi32(
dst + count[r],
(1U << buffer_counts[r]) - 1,
data
);
count[r] += buffer_counts[r];
buffer_counts[r] = 0;
}
}
std::swap(src, dst);
}
// 结果拷贝
if (src == tmp) {
if (n >= 64) {
__m256i *src_vec = reinterpret_cast<__m256i*>(tmp);
__m256i *dst_vec = reinterpret_cast<__m256i*>(a);
for (int i = 0; i < n/8; ++i) {
_mm256_storeu_si256(dst_vec + i, _mm256_load_si256(src_vec + i));
}
} else {
std::memcpy(a, tmp, n * sizeof(unsigned));
}
}
free(tmp);
}
Compilation | N/A | N/A | Compile Error | Score: N/A | 显示更多 |
Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 14:02:33 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠