提交记录 27817


用户 题目 状态 得分 用时 内存 语言 代码长度
Izumi_Sagiri wc2017b1. 【WC2017】挑战-任务1 Compile Error 0 0 ns 0 KB C++17 3.08 KB
提交时间 评测时间
2025-01-27 13:35:18 2025-01-27 13:35:19
//written by deepseek-r1
#include <algorithm>
#include <cstdlib>
#include <immintrin.h>

#pragma GCC target("avx2")

// 每个基数桶的缓冲区大小(按256位SIMD对齐)
constexpr int BUF_SIZE = 1024;

void sort(unsigned *a, int n) {
    const size_t alignment = 64;
    unsigned *tmp = static_cast<unsigned*>(aligned_alloc(alignment, n * sizeof(unsigned)));
    unsigned *src = a;
    unsigned *dst = tmp;

    for (int k = 0; k < 4; ++k) {
        const unsigned shift = k * 8;
        unsigned count[256] = {0};

        // Phase 1: 统计分布 (SIMD加速)
        unsigned *p = src;
        const unsigned *end = src + n;
        for (; p + 8 <= end; p += 8) {
            __m256i vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
            for (int i = 0; i < 8; ++i) {
                unsigned x = _mm256_extract_epi32(vec, i);
                count[(x >> shift) & 0xFF]++;
            }
        }
        for (; p < end; ++p) {
            count[(*p >> shift) & 0xFF]++;
        }

        // Phase 2: 计算前缀和
        unsigned sum = 0;
        for (int i = 0; i < 256; ++i) {
            unsigned temp = count[i];
            count[i] = sum;
            sum += temp;
        }

        // Phase 3: 数据重排 (SIMD优化)
        struct alignas(32) SIMDBuffer {
            unsigned data[BUF_SIZE];
        };
        SIMDBuffer buffers[256];
        unsigned buf_counts[256] = {0};

        p = src;
        end = src + n;
        for (; p < end; ++p) {
            unsigned x = *p;
            unsigned radix = (x >> shift) & 0xFF;
            
            // 填充缓冲区
            buffers[radix].data[buf_counts[radix]++] = x;
            
            // 缓冲区满时用SIMD批量写入
            if (buf_counts[radix] == BUF_SIZE) {
                unsigned *dest = dst + count[radix];
                __m256i *src_vec = reinterpret_cast<__m256i*>(buffers[radix].data);
                
                // 每次处理8个元素
                for (int i = 0; i < BUF_SIZE; i += 8) {
                    __m256i data = _mm256_load_si256(src_vec + i/8);
                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest + i), data);
                }
                
                count[radix] += BUF_SIZE;
                buf_counts[radix] = 0;
            }
        }

        // 处理剩余数据
        for (int r = 0; r < 256; ++r) {
            if (buf_counts[r] > 0) {
                unsigned *dest = dst + count[r];
                __m256i *src_vec = reinterpret_cast<__m256i*>(buffers[r].data);
                
                // SIMD批量写入剩余元素
                int i = 0;
                for (; i + 8 <= buf_counts[r]; i += 8) {
                    __m256i data = _mm256_load_si256(src_vec + i/8);
                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest + i), data);
                }
                
                // 处理尾部元素
                for (; i < buf_counts[r]; ++i) {
                    dest[i] = buffers[r].data[i];
                }
                
                count[r] += buf_counts[r];
            }
        }

        std::swap(src, dst);
    }

    // 结果拷贝优化
    if (src == tmp) {
        __builtin_memcpy(a, tmp, n * sizeof(unsigned));
    }

    free(tmp);
}

CompilationN/AN/ACompile ErrorScore: N/A


Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 13:54:37 | Loaded in 0 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠