提交记录 27821


用户 题目 状态 得分 用时 内存 语言 代码长度
Izumi_Sagiri wc2017b1. 【WC2017】挑战-任务1 Compile Error 0 0 ns 0 KB C++17 2.71 KB
提交时间 评测时间
2025-01-27 15:06:40 2025-01-27 15:06:42
//written by deepseek-r1
#include <cstring>
#include <immintrin.h>
#include <stdlib.h>

#define ALIGNED_ALLOC(align, size) aligned_alloc(align, (size + (align)-1) & ~((align)-1))

#pragma GCC target("avx2")

inline void radix_sort(unsigned* a, int n) {
    constexpr int RADIX_BITS = 8;
    constexpr int BUCKETS = 1 << RADIX_BITS;
    constexpr int PASSES = 32 / RADIX_BITS;

    unsigned* tmp = (unsigned*)ALIGNED_ALLOC(64, n * sizeof(unsigned));
    unsigned* src = a;
    unsigned* dst = tmp;

    for (int shift = 0; shift < 32; shift += RADIX_BITS) {
        unsigned count[BUCKETS] __attribute__((aligned(64))) = {0};
        
        // 直方图统计:SIMD加速+循环展开
        const int BLOCK = 32;
        for (int i = 0; i < n - BLOCK; i += BLOCK) {
            __m256i v0 = _mm256_loadu_si256((__m256i*)(src + i));
            __m256i v1 = _mm256_loadu_si256((__m256i*)(src + i + 8));
            __m256i v2 = _mm256_loadu_si256((__m256i*)(src + i + 16));
            __m256i v3 = _mm256_loadu_si256((__m256i*)(src + i + 24));

            v0 = _mm256_srli_epi32(v0, shift);
            v1 = _mm256_srli_epi32(v1, shift);
            v2 = _mm256_srli_epi32(v2, shift);
            v3 = _mm256_srli_epi32(v3, shift);

            v0 = _mm256_and_si256(v0, _mm256_set1_epi32(0xFF));
            v1 = _mm256_and_si256(v1, _mm256_set1_epi32(0xFF));
            v2 = _mm256_and_si256(v2, _mm256_set1_epi32(0xFF));
            v3 = _mm256_and_si256(v3, _mm256_set1_epi32(0xFF));

            alignas(32) unsigned buf[32];
            _mm256_store_si256((__m256i*)buf, v0);
            _mm256_store_si256((__m256i*)(buf + 8), v1);
            _mm256_store_si256((__m256i*)(buf + 16), v2);
            _mm256_store_si256((__m256i*)(buf + 24), v3);

            for (int j = 0; j < BLOCK; j++) 
                count[buf[j]]++;
        }

        // 处理剩余元素
        for (int i = (n / BLOCK) * BLOCK; i < n; i++)
            count[(src[i] >> shift) & 0xFF]++;

        // 前缀和计算:展开循环
        unsigned prefix[BUCKETS];
        prefix[0] = 0;
        for (int i = 1; i < BUCKETS; i += 4) {
            prefix[i] = prefix[i-1] + count[i-1];
            prefix[i+1] = prefix[i] + count[i];
            prefix[i+2] = prefix[i+1] + count[i+1];
            prefix[i+3] = prefix[i+2] + count[i+2];
        }

        // 元素重排:指针数组优化
        unsigned* pos[BUCKETS];
        for (int b = 0; b < BUCKETS; b++)
            pos[b] = dst + prefix[b];

        for (int i = 0; i < n; i++) {
            const unsigned val = src[i];
            const unsigned b = (val >> shift) & 0xFF;
            *pos[b]++ = val;
        }

        std::swap(src, dst);
    }

    if (src == tmp)
        memcpy(a, tmp, n * sizeof(unsigned));

    free(tmp);
}

void sort(unsigned* a, int n) {
    radix_sort(a, n);
}

CompilationN/AN/ACompile ErrorScore: N/A


Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 13:56:40 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠