提交记录 27818


用户 题目 状态 得分 用时 内存 语言 代码长度
Izumi_Sagiri wc2017b1. 【WC2017】挑战-任务1 Compile Error 0 0 ns 0 KB C++17 4.17 KB
提交时间 评测时间
2025-01-27 14:01:43 2025-01-27 14:01:45
//written by deepseek-r1
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <immintrin.h>

#pragma GCC target("avx2")

// 内存对齐分配
inline unsigned* aligned_alloc(size_t alignment, size_t size) {
    void *ptr = nullptr;
    posix_memalign(&ptr, alignment, size);
    return static_cast<unsigned*>(ptr);
}

void sort(unsigned *a, int n) {
    constexpr size_t alignment = 64;
    unsigned *tmp = aligned_alloc(alignment, n * sizeof(unsigned));
    unsigned *src = a;
    unsigned *dst = tmp;

    // 每个基数桶的SIMD缓冲区
    constexpr int SIMD_WIDTH = 8;
    alignas(32) unsigned simd_buffer[256][SIMD_WIDTH];
    unsigned buffer_counts[256] = {0};

    for (int k = 0; k < 4; ++k) {
        const unsigned shift = k * 8;
        unsigned count[256] = {0};

        // Phase 1: 统计分布(SIMD优化)
        const int prefetch_offset = 64; // 预取提前量
        unsigned *p = src;
        for (; p <= src + n - 32; p += 32) {
            // 预取未来数据
            _mm_prefetch(reinterpret_cast<const char*>(p + prefetch_offset), _MM_HINT_T0);
            
            // 批量处理32个元素(4个SIMD向量)
            __m256i vec[4];
            for (int i = 0; i < 4; ++i) {
                vec[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p + i*8));
            }
            
            // 统计每个元素的基数
            alignas(32) unsigned temp_store[32];
            for (int i = 0; i < 4; ++i) {
                _mm256_store_si256(reinterpret_cast<__m256i*>(temp_store + i*8), vec[i]);
            }
            
            for (int i = 0; i < 32; ++i) {
                ++count[(temp_store[i] >> shift) & 0xFF];
            }
        }

        // 处理尾部剩余元素
        for (; p < src + n; ++p) {
            ++count[(*p >> shift) & 0xFF];
        }

        // Phase 2: 前缀和计算
        unsigned sum = 0;
        for (int i = 0; i < 256; ++i) {
            unsigned temp = count[i];
            count[i] = sum;
            sum += temp;
        }

        // Phase 3: 数据重排(SIMD优化)
        p = src;
        for (; p <= src + n - 8; p += 8) {
            // 加载8个元素
            __m256i vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
            
            // 提取并处理每个元素
            alignas(32) unsigned elements[8];
            _mm256_store_si256(reinterpret_cast<__m256i*>(elements), vec);
            
            for (int i = 0; i < 8; ++i) {
                const unsigned x = elements[i];
                const unsigned radix = (x >> shift) & 0xFF;
                
                // 填充SIMD缓冲区
                simd_buffer[radix][buffer_counts[radix]++] = x;
                
                // 缓冲区满时批量写入
                if (buffer_counts[radix] == SIMD_WIDTH) {
                    __m256i data = _mm256_load_si256(reinterpret_cast<__m256i*>(simd_buffer[radix]));
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i*>(dst + count[radix]),
                        data
                    );
                    count[radix] += SIMD_WIDTH;
                    buffer_counts[radix] = 0;
                }
            }
        }

        // 处理尾部元素
        for (; p < src + n; ++p) {
            const unsigned x = *p;
            const unsigned radix = (x >> shift) & 0xFF;
            dst[count[radix]++] = x;
        }

        // 处理剩余的缓冲区数据
        for (int r = 0; r < 256; ++r) {
            if (buffer_counts[r] > 0) {
                __m256i data = _mm256_maskz_loadu_epi32(
                    (1U << buffer_counts[r]) - 1,
                    simd_buffer[r]
                );
                _mm256_mask_storeu_epi32(
                    dst + count[r],
                    (1U << buffer_counts[r]) - 1,
                    data
                );
                count[r] += buffer_counts[r];
                buffer_counts[r] = 0;
            }
        }

        std::swap(src, dst);
    }

    // 结果拷贝
    if (src == tmp) {
        if (n >= 64) {
            __m256i *src_vec = reinterpret_cast<__m256i*>(tmp);
            __m256i *dst_vec = reinterpret_cast<__m256i*>(a);
            for (int i = 0; i < n/8; ++i) {
                _mm256_storeu_si256(dst_vec + i, _mm256_load_si256(src_vec + i));
            }
        } else {
            std::memcpy(a, tmp, n * sizeof(unsigned));
        }
    }

    free(tmp);
}

CompilationN/AN/ACompile ErrorScore: N/A


Judge Duck Online | 评测鸭在线
Server Time: 2025-04-05 14:02:33 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠