提交记录 27818 - Judge Duck Online

用户	题目	状态	得分	用时	内存	语言	代码长度
Izumi_Sagiri	wc2017b1. 【WC2017】挑战-任务1	Compile Error	0	0 ns	0 KB	C++17	4.17 KB

提交时间	评测时间
2025-01-27 14:01:43	2025-01-27 14:01:45

代码

//written by deepseek-r1
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <immintrin.h>

#pragma GCC target("avx2")

// 内存对齐分配
inline unsigned* aligned_alloc(size_t alignment, size_t size) {
    void *ptr = nullptr;
    posix_memalign(&ptr, alignment, size);
    return static_cast<unsigned*>(ptr);
}

void sort(unsigned *a, int n) {
    constexpr size_t alignment = 64;
    unsigned *tmp = aligned_alloc(alignment, n * sizeof(unsigned));
    unsigned *src = a;
    unsigned *dst = tmp;

    // 每个基数桶的SIMD缓冲区
    constexpr int SIMD_WIDTH = 8;
    alignas(32) unsigned simd_buffer[256][SIMD_WIDTH];
    unsigned buffer_counts[256] = {0};

    for (int k = 0; k < 4; ++k) {
        const unsigned shift = k * 8;
        unsigned count[256] = {0};

        // Phase 1: 统计分布（SIMD优化）
        const int prefetch_offset = 64; // 预取提前量
        unsigned *p = src;
        for (; p <= src + n - 32; p += 32) {
            // 预取未来数据
            _mm_prefetch(reinterpret_cast<const char*>(p + prefetch_offset), _MM_HINT_T0);
            
            // 批量处理32个元素（4个SIMD向量）
            __m256i vec[4];
            for (int i = 0; i < 4; ++i) {
                vec[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p + i*8));
            }
            
            // 统计每个元素的基数
            alignas(32) unsigned temp_store[32];
            for (int i = 0; i < 4; ++i) {
                _mm256_store_si256(reinterpret_cast<__m256i*>(temp_store + i*8), vec[i]);
            }
            
            for (int i = 0; i < 32; ++i) {
                ++count[(temp_store[i] >> shift) & 0xFF];
            }
        }

        // 处理尾部剩余元素
        for (; p < src + n; ++p) {
            ++count[(*p >> shift) & 0xFF];
        }

        // Phase 2: 前缀和计算
        unsigned sum = 0;
        for (int i = 0; i < 256; ++i) {
            unsigned temp = count[i];
            count[i] = sum;
            sum += temp;
        }

        // Phase 3: 数据重排（SIMD优化）
        p = src;
        for (; p <= src + n - 8; p += 8) {
            // 加载8个元素
            __m256i vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
            
            // 提取并处理每个元素
            alignas(32) unsigned elements[8];
            _mm256_store_si256(reinterpret_cast<__m256i*>(elements), vec);
            
            for (int i = 0; i < 8; ++i) {
                const unsigned x = elements[i];
                const unsigned radix = (x >> shift) & 0xFF;
                
                // 填充SIMD缓冲区
                simd_buffer[radix][buffer_counts[radix]++] = x;
                
                // 缓冲区满时批量写入
                if (buffer_counts[radix] == SIMD_WIDTH) {
                    __m256i data = _mm256_load_si256(reinterpret_cast<__m256i*>(simd_buffer[radix]));
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i*>(dst + count[radix]),
                        data
                    );
                    count[radix] += SIMD_WIDTH;
                    buffer_counts[radix] = 0;
                }
            }
        }

        // 处理尾部元素
        for (; p < src + n; ++p) {
            const unsigned x = *p;
            const unsigned radix = (x >> shift) & 0xFF;
            dst[count[radix]++] = x;
        }

        // 处理剩余的缓冲区数据
        for (int r = 0; r < 256; ++r) {
            if (buffer_counts[r] > 0) {
                __m256i data = _mm256_maskz_loadu_epi32(
                    (1U << buffer_counts[r]) - 1,
                    simd_buffer[r]
                );
                _mm256_mask_storeu_epi32(
                    dst + count[r],
                    (1U << buffer_counts[r]) - 1,
                    data
                );
                count[r] += buffer_counts[r];
                buffer_counts[r] = 0;
            }
        }

        std::swap(src, dst);
    }

    // 结果拷贝
    if (src == tmp) {
        if (n >= 64) {
            __m256i *src_vec = reinterpret_cast<__m256i*>(tmp);
            __m256i *dst_vec = reinterpret_cast<__m256i*>(a);
            for (int i = 0; i < n/8; ++i) {
                _mm256_storeu_si256(dst_vec + i, _mm256_load_si256(src_vec + i));
            }
        } else {
            std::memcpy(a, tmp, n * sizeof(unsigned));
        }
    }

    free(tmp);
}

评测结果

Compilation

N/A

Compile Error

Score: N/A

显示更多

jp_data/tasks/2f4a48f6dc7411ef8cd0c970e0538c36/contestant.cpp:10:18: error: ambiguating new declaration of ‘unsigned int* aligned_alloc(size_t, size_t)’
   10 | inline unsigned* aligned_alloc(size_t alignment, size_t size) {
      |                  ^~~~~~~~~~~~~
In file included from /usr/include/c++/9/cstdlib:75,
                 from /usr/include/c++/9/bits/stl_algo.h:59,
                 from /usr/include/c++/9/algorithm:62,
                 from jp_data/tasks/2f4a48f6dc7411ef8cd0c970e0538c36/contestant.cpp:2:
/usr/include/stdlib.h:586:14: note: old declaration ‘void* aligned_alloc(size_t, size_t)’
  586 | extern void *aligned_alloc (size_t __alignment, size_t __size)
      |              ^~~~~~~~~~~~~
jp_data/tasks/2f4a48f6dc7411ef8cd0c970e0538c36/contestant.cpp: In function ‘void sort(unsigned int*, int)’:
jp_data/tasks/2f4a48f6dc7411ef8cd0c970e0538c36/contestant.cpp:18:34: error: invalid conversion from ‘void*’ to ‘unsigned int*’ [-fpermissive]
   18 |     unsigned *tmp = aligned_alloc(alignment, n * sizeof(unsigned));
      |                     ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                  |
      |                                  void*