提交记录 11115


用户 题目 状态 得分 用时 内存 语言 代码长度
rd0x01 test. 自定义测试 Compile Error 0 0 ns 0 KB C++11 8.40 KB
提交时间 评测时间
2019-10-28 17:20:34 2023-09-03 19:38:57
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>

#include <x86intrin.h>

// Baseline kernel: everything after this pragma (up to the next optimize
// pragma) is requested to be compiled by GCC at -O0.
// NOTE(review): how the pragma interacts with templates instantiated later
// in the file is compiler-dependent -- confirm with the generated assembly.
#pragma GCC optimize("O0")
// Straightforward left-to-right reduction.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives IDENT op v[0] op v[1] op ... op v[n-1]
// eval() also uses this kernel as the reference answer for all other kernels.
template <class T, class OP, int IDENT>
struct compile_o0
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};

// Switch GCC back to -O2; no later optimize pragma follows in this file, so
// this setting is the one requested for all remaining kernels.
#pragma GCC optimize("O2")
// Same plain sequential reduction as the O0 baseline, but defined after the
// O2 pragma so the compiler is free to optimize the loop.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives IDENT op v[0] op v[1] op ... op v[n-1]
template <class T, class OP, int IDENT>
struct compile_o2
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};

// Loop unrolled by 2 with a single accumulator: both elements of a pair are
// folded into the same running value, so the dependency chain is unchanged.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll2x1
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        const int lastPairStart = n - 2;
        int pos = 0;
        while (pos <= lastPairStart)
        {
            acc = combine(acc, v[pos]);
            acc = combine(acc, v[pos + 1]);
            pos += 2;
        }
        // Leftover element when n is odd.
        while (pos < n)
        {
            acc = combine(acc, v[pos]);
            ++pos;
        }
        *dest = acc;
    }
};

// Loop unrolled by 2 with re-association: the two inputs of a pair are
// combined with each other first, and only that partial result is folded
// into the single accumulator.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll2x1a
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        const int lastPairStart = n - 2;
        int pos = 0;
        while (pos <= lastPairStart)
        {
            const T pairValue = combine(v[pos], v[pos + 1]);
            acc = combine(acc, pairValue);
            pos += 2;
        }
        // Leftover element when n is odd.
        while (pos < n)
        {
            acc = combine(acc, v[pos]);
            ++pos;
        }
        *dest = acc;
    }
};

// Loop unrolled by 2 with two independent accumulators: even-indexed
// elements go into one, odd-indexed elements into the other, and the two
// partial results are merged after the main loop.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll2x2
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T evenAcc = IDENT;
        T oddAcc = IDENT;
        const int lastPairStart = n - 2;
        int pos = 0;
        while (pos <= lastPairStart)
        {
            evenAcc = combine(evenAcc, v[pos]);
            oddAcc = combine(oddAcc, v[pos + 1]);
            pos += 2;
        }
        T total = combine(evenAcc, oddAcc);
        // Leftover element when n is odd.
        while (pos < n)
        {
            total = combine(total, v[pos]);
            ++pos;
        }
        *dest = total;
    }
};

// Loop unrolled by 4 with re-association: each group of four inputs is
// reduced as a small balanced tree, (a op b) op (c op d), and the group
// result is folded into the single accumulator.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll4x1a
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        const int lastGroupStart = n - 4;
        int pos = 0;
        while (pos <= lastGroupStart)
        {
            const T lowPair = combine(v[pos], v[pos + 1]);
            const T highPair = combine(v[pos + 2], v[pos + 3]);
            acc = combine(acc, combine(lowPair, highPair));
            pos += 4;
        }
        // Up to three leftover elements.
        while (pos < n)
        {
            acc = combine(acc, v[pos]);
            ++pos;
        }
        *dest = acc;
    }
};

// Loop unrolled by 4 with four independent accumulators: element i of each
// group of four feeds accumulator i, and the four partial results are merged
// pairwise after the main loop.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll4x4
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T lane0 = IDENT;
        T lane1 = IDENT;
        T lane2 = IDENT;
        T lane3 = IDENT;
        const int lastGroupStart = n - 4;
        int pos = 0;
        while (pos <= lastGroupStart)
        {
            lane0 = combine(lane0, v[pos]);
            lane1 = combine(lane1, v[pos + 1]);
            lane2 = combine(lane2, v[pos + 2]);
            lane3 = combine(lane3, v[pos + 3]);
            pos += 4;
        }
        const T lowPair = combine(lane0, lane1);
        const T highPair = combine(lane2, lane3);
        T total = combine(lowPair, highPair);
        // Up to three leftover elements.
        while (pos < n)
        {
            total = combine(total, v[pos]);
            ++pos;
        }
        *dest = total;
    }
};

// Loop unrolled by 8 with re-association: each group of eight inputs is
// reduced as a balanced tree (pairs, then quads, then the full group) before
// the group result is folded into the single accumulator.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll8x1a
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        const int lastGroupStart = n - 8;
        int pos = 0;
        while (pos <= lastGroupStart)
        {
            const T lowQuad = combine(combine(v[pos + 0], v[pos + 1]),
                                      combine(v[pos + 2], v[pos + 3]));
            const T highQuad = combine(combine(v[pos + 4], v[pos + 5]),
                                       combine(v[pos + 6], v[pos + 7]));
            acc = combine(acc, combine(lowQuad, highQuad));
            pos += 8;
        }
        // Up to seven leftover elements.
        while (pos < n)
        {
            acc = combine(acc, v[pos]);
            ++pos;
        }
        *dest = acc;
    }
};

// Loop unrolled by 8 with eight independent accumulators: element i of each
// group of eight feeds accumulator i, and the eight partial results are
// merged as a balanced tree after the main loop.
//   n    - number of elements to combine
//   v    - input array holding at least n elements
//   dest - receives the reduction of v[0..n) starting from IDENT
template <class T, class OP, int IDENT>
struct unroll8x8
{
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T lane0 = IDENT;
        T lane1 = IDENT;
        T lane2 = IDENT;
        T lane3 = IDENT;
        T lane4 = IDENT;
        T lane5 = IDENT;
        T lane6 = IDENT;
        T lane7 = IDENT;
        const int lastGroupStart = n - 8;
        int pos = 0;
        while (pos <= lastGroupStart)
        {
            lane0 = combine(lane0, v[pos]);
            lane1 = combine(lane1, v[pos + 1]);
            lane2 = combine(lane2, v[pos + 2]);
            lane3 = combine(lane3, v[pos + 3]);
            lane4 = combine(lane4, v[pos + 4]);
            lane5 = combine(lane5, v[pos + 5]);
            lane6 = combine(lane6, v[pos + 6]);
            lane7 = combine(lane7, v[pos + 7]);
            pos += 8;
        }
        const T lowQuad = combine(combine(lane0, lane1), combine(lane2, lane3));
        const T highQuad = combine(combine(lane4, lane5), combine(lane6, lane7));
        T total = combine(lowQuad, highQuad);
        // Up to seven leftover elements.
        while (pos < n)
        {
            total = combine(total, v[pos]);
            ++pos;
        }
        *dest = total;
    }
};

// Ask GCC to use core-avx2 as both the instruction-set architecture and the
// tuning target for the function definitions that follow this pragma.
#pragma GCC target "arch=core-avx2,tune=core-avx2"

template <class T, class OP, int IDENT>
struct vec_avx2;

template <class T, class OP>
struct vec_avx2<T, OP, 0>
{
    typedef T vec __attribute__ ((vector_size (256)));
    static constexpr int LEN = 256 / (8 * sizeof(T));

    void operator()(int n, const T *v, T *dest)
    {
        T t = 0;
        int i;
        if (LEN == 4)
        {
            vec tt = {0, 0, 0, 0};
            for (i = 0; i <= n - LEN; i += LEN)
                tt += vec{v[i], v[i + 1], v[i + 2], v[i + 3]};
            t = tt[0] + tt[1] + tt[2] + tt[3];
        } else if (LEN == 8)
        {
            vec tt = {0, 0, 0, 0, 0, 0, 0, 0};
            for (i = 0; i <= n - LEN; i += LEN)
                tt += vec{v[i], v[i + 1], v[i + 2], v[i + 3], v[i + 4], v[i + 5], v[i + 6], v[i + 7]};
            t = tt[0] + tt[1] + tt[2] + tt[3] + tt[4] + tt[5] + tt[6] + tt[7];
        } else
            assert(false);
        for (; i < n; i++)
            t += v[i];
        *dest = t;
    }
};

template <class T, class OP>
struct vec_avx2<T, OP, 1>
{
    typedef T vec __attribute__ ((vector_size (256)));
    static constexpr int LEN = 256 / (8 * sizeof(T));

    void operator()(int n, const T *v, T *dest)
    {
        T t = 1;
        int i;
        if (LEN == 4)
        {
            vec tt = {1, 1, 1, 1};
            for (i = 0; i <= n - LEN; i += LEN)
                tt *= vec{v[i], v[i + 1], v[i + 2], v[i + 3]};
            t = tt[0] * tt[1] * tt[2] * tt[3];
        } else if (LEN == 8)
        {
            vec tt = {1, 1, 1, 1, 1, 1, 1, 1};
            for (i = 0; i <= n - LEN; i += LEN)
                tt *= vec{v[i], v[i + 1], v[i + 2], v[i + 3], v[i + 4], v[i + 5], v[i + 6], v[i + 7]};
            t = tt[0] * tt[1] * tt[2] * tt[3] * tt[4] * tt[5] * tt[6] * tt[7];
        } else
            assert(false);
        for (; i < n; i++)
            t *= v[i];
        *dest = t;
    }
};

/*
template <class T, class OP, int IDENT>
struct vec_avx2;

template <class T, class OP>
struct vec_avx2<T, OP, 0>
{
    void operator()(int n, const T *v, T *dest)
    {
        OP op;
        T t = 0;
#pragma omp simd reduction(+: t)
        for (int i = 0; i < n; i++)
            t = op(t, v[i]);
        *dest = t;
    }
};

template <class T, class OP>
struct vec_avx2<T, OP, 1>
{
    void operator()(int n, const T *v, T *dest)
    {
        OP op;
        T t = 1;
#pragma omp simd reduction(*: t)
        for (int i = 0; i < n; i++)
            t = op(t, v[i]);
        *dest = t;
    }
};
*/

// Benchmarks one reduction kernel and validates it against compile_o0.
//   T, OP, IDENT - element type, binary operation and identity value used to
//                  build the compile_o0 reference
//   n            - number of random input elements (each in [0, 999])
//   f            - kernel under test, callable as f(n, const T*, T*)
//   name         - label of the configuration; currently unused
// Returns the average number of TSC ticks per call over TEST_RUN timed calls,
// or -1 if the kernel's result disagrees with the reference.
//
// NOTE(review): the relative tolerance is 1e-7 * correct (not its absolute
// value), so it is only meaningful while the reference result is
// non-negative. Also, when both results are non-finite, dest - correct is
// NaN and the check passes. Confirm both are acceptable for this benchmark.
template <class T, class OP, int IDENT, class U>
int64_t eval(int n, U f, const char *name)
{
    (void)name;

    // Owned by a vector so the buffer is released on every return path
    // (the early "-1" return used to leak it).
    std::vector<T> buff(n > 0 ? n : 0);
    for (int i = 0; i < n; i++)
        buff[i] = std::rand() % 1000;

    // The input never changes, so the reference answer is computed once.
    T correct;
    compile_o0<T, OP, IDENT>()(n, buff.data(), &correct);

    T dest;
    f(n, buff.data(), &dest); // drop the first run

    const int TEST_RUN = 1000;
    int64_t clocks = 0;
    for (int t = 0; t < TEST_RUN; t++)
    {
        const int64_t start = __rdtsc();
        f(n, buff.data(), &dest);
        const int64_t end = __rdtsc();
        clocks += end - start;
        // Mismatch only if both the absolute and the relative tolerance are exceeded.
        if (std::abs(dest - correct) > 1e-7 && std::abs(dest - correct) > 1e-7 * correct)
            return -1;
    }
    return clocks / TEST_RUN;
}

// Prints one result cell to std::cout: a tab followed by either the value or
// "WA" when x is the -1 marker.
void output(int64_t x)
{
    std::cout << "\t";
    if (x == -1)
    {
        std::cout << "WA";
        return;
    }
    std::cout << x;
}

// Prints one table row for kernel template `f`: its name followed by the
// eval() result for each (type, operation) combination, in the order
// i32 +, i32 *, i64 +, i64 *, f32 +, f32 *, f64 +, f64 *.
// `f` must be a class template taking <T, OP, IDENT>; `n` is the input length.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe inside unbraced if/else bodies; invoke it as `eval_all(n, f);`.
#define eval_all(n, f) \
    do { \
        std::cout << #f; \
        output(eval<int32_t, std::plus<int32_t>, 0>(n, f<int32_t, std::plus<int32_t>, 0>(), #f " i32 +")); \
        output(eval<int32_t, std::multiplies<int32_t>, 1>(n, f<int32_t, std::multiplies<int32_t>, 1>(), #f " i32 *")); \
        output(eval<int64_t, std::plus<int64_t>, 0>(n, f<int64_t, std::plus<int64_t>, 0>(), #f " i64 +")); \
        output(eval<int64_t, std::multiplies<int64_t>, 1>(n, f<int64_t, std::multiplies<int64_t>, 1>(), #f " i64 *")); \
        output(eval<float, std::plus<float>, 0>(n, f<float, std::plus<float>, 0>(), #f " f32 +")); \
        output(eval<float, std::multiplies<float>, 1>(n, f<float, std::multiplies<float>, 1>(), #f " f32 *")); \
        output(eval<double, std::plus<double>, 0>(n, f<double, std::plus<double>, 0>(), #f " f64 +")); \
        output(eval<double, std::multiplies<double>, 1>(n, f<double, std::multiplies<double>, 1>(), #f " f64 *")); \
        std::cout << std::endl; \
    } while (0)

// Prints a tab-separated table: one header row, then one row per kernel with
// the result of each (type, operation) benchmark.
int main()
{
    // Input length shared by every benchmark row.
    const int N = 200;

    // Header columns match the order produced by eval_all ("f64+" was
    // previously misspelled as "64+").
    std::cout << "name\t\ti32+\ti32*\ti64+\ti64*\tf32+\tf32*\tf64+\tf64*" << std::endl;
    eval_all(N, compile_o0);
    eval_all(N, compile_o2);
    eval_all(N, unroll2x1);
    eval_all(N, unroll2x1a);
    eval_all(N, unroll2x2);
    eval_all(N, unroll4x1a);
    eval_all(N, unroll4x4);
    eval_all(N, unroll8x1a);
    eval_all(N, unroll8x8);
    eval_all(N, vec_avx2);
    return 0;
}


CompilationN/AN/ACompile ErrorScore: N/A


Judge Duck Online | 评测鸭在线
Server Time: 2026-03-28 08:23:03 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠