提交记录 27815


用户 题目 状态 得分 用时 内存 语言 代码长度
Izumi_Sagiri wc2017b1. 【WC2017】挑战-任务1 Compile Error 0 0 ns 0 KB C++17 2.62 KB
提交时间 评测时间
2025-01-27 13:22:51 2025-01-27 13:22:53
//written by deepseek-R1
#include <algorithm>
#include <cstdlib>
#include <immintrin.h>

void sort(unsigned *a, int n) {
    const size_t alignment = 64; // 缓存行对齐
    unsigned *tmp = static_cast<unsigned*>(aligned_alloc(alignment, n * sizeof(unsigned)));
    unsigned *src = a;
    unsigned *dst = tmp;

    for (int k = 0; k < 4; ++k) {
        const unsigned shift = k * 8;
        unsigned count[256] = {0};

        // 统计循环:指针遍历+预取
        unsigned *p = src;
        unsigned *end = src + n;
        for (; p < end - 32; p += 32) {
            _mm_prefetch(p + 64, _MM_HINT_T0); // 预取未来64元素
            for (int i = 0; i < 32; ++i) {
                unsigned x = p[i];
                count[(x >> shift) & 0xFF]++;
            }
        }
        // 处理剩余元素
        for (; p < end; ++p) {
            unsigned x = *p;
            count[(x >> shift) & 0xFF]++;
        }

        // 前缀和计算
        unsigned sum = 0;
        for (int i = 0; i < 256; ++i) {
            unsigned temp = count[i];
            count[i] = sum;
            sum += temp;
        }

        // 重新排列:批量写入优化
        const int BUF_SIZE = 1024;
        unsigned buffer[256][BUF_SIZE];
        unsigned buf_counts[256] = {0};

        p = src;
        end = src + n;
        for (; p < end; ++p) {
            unsigned x = *p;
            unsigned radix = (x >> shift) & 0xFF;
            buffer[radix][buf_counts[radix]] = x;
            if (++buf_counts[radix] == BUF_SIZE) {
                unsigned *dst_ptr = dst + count[radix];
                for (int j = 0; j < BUF_SIZE; j += 8) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i*>(dst_ptr + j),
                        _mm256_loadu_si256(reinterpret_cast<__m256i*>(&buffer[radix][j]))
                    );
                }
                count[radix] += BUF_SIZE;
                buf_counts[radix] = 0;
            }
        }

        // 处理剩余缓冲数据
        for (int r = 0; r < 256; ++r) {
            if (buf_counts[r] > 0) {
                unsigned *dst_ptr = dst + count[r];
                for (int j = 0; j < buf_counts[r]; j += 8) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i*>(dst_ptr + j),
                        _mm256_loadu_si256(reinterpret_cast<__m256i*>(&buffer[r][j]))
                    );
                }
                count[r] += buf_counts[r];
            }
        }

        std::swap(src, dst);
    }

    if (src == tmp) {
        _mm256_storeu_si256(
            reinterpret_cast<__m256i*>(a),
            _mm256_loadu_si256(reinterpret_cast<__m256i*>(src))
        );
        std::copy(src, src + n, a);
    }

    free(tmp);
}

CompilationN/AN/ACompile ErrorScore: N/A


Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 14:18:48 | Loaded in 3 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠