#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>

#include <x86intrin.h>
// Ask GCC to build the functions defined from this point on at -O0, so that
// compile_o0 serves as the unoptimized baseline.
#pragma GCC optimize("O0")
// Plain left-to-right reduction: ((IDENT op v[0]) op v[1]) op ... op v[n-1].
// eval() also uses this functor to compute the reference result that the
// other variants are checked against.
//   T     - element / accumulator type
//   OP    - default-constructible binary functor, called as op(acc, element)
//   IDENT - integer starting value of the accumulator (converted to T)
template <class T, class OP, int IDENT>
struct compile_o0
{
    // Reduces the first n elements of v and stores the result in *dest.
    // When n <= 0 the loop body never runs and *dest receives IDENT.
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};
// Switch GCC to -O2 for the functions defined from this point on. No later
// pragma in the visible part of this file changes the level again.
#pragma GCC optimize("O2")
// Same straight-line reduction as the baseline, but defined after the -O2
// pragma: ((IDENT op v[0]) op v[1]) op ... op v[n-1].
//   T     - element / accumulator type
//   OP    - default-constructible binary functor, called as op(acc, element)
//   IDENT - integer starting value of the accumulator (converted to T)
template <class T, class OP, int IDENT>
struct compile_o2
{
    // Reduces the first n elements of v and stores the result in *dest.
    // When n <= 0 the loop body never runs and *dest receives IDENT.
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};
// Reduction unrolled by two with a single accumulator: each pass of the main
// loop folds two consecutive elements into the same running value, so the
// evaluation order is identical to the straight-line loop.
//   T     - element / accumulator type
//   OP    - default-constructible binary functor, called as op(acc, element)
//   IDENT - integer starting value of the accumulator (converted to T)
template <class T, class OP, int IDENT>
struct unroll2x1
{
    // Reduces the first n elements of v and stores the result in *dest.
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        // Main loop: runs while a full pair [idx, idx + 1] is still available.
        while (idx <= n - 2)
        {
            acc = combine(acc, v[idx]);
            acc = combine(acc, v[idx + 1]);
            idx += 2;
        }
        // Tail: at most one leftover element when n is odd.
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};
// Reduction unrolled by two with a single accumulator and re-associated
// operations: the two new elements are combined with each other first, and
// only that partial result is folded into the running value, i.e.
// acc = acc op (v[i] op v[i + 1]).
//   T     - element / accumulator type
//   OP    - default-constructible binary functor
//   IDENT - integer starting value of the accumulator (converted to T)
template <class T, class OP, int IDENT>
struct unroll2x1a
{
    // Reduces the first n elements of v and stores the result in *dest.
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T acc = IDENT;
        int idx = 0;
        // Main loop: runs while a full pair [idx, idx + 1] is still available.
        while (idx <= n - 2)
        {
            acc = combine(acc, combine(v[idx], v[idx + 1]));
            idx += 2;
        }
        // Tail: at most one leftover element when n is odd.
        while (idx < n)
        {
            acc = combine(acc, v[idx]);
            ++idx;
        }
        *dest = acc;
    }
};
// Reduction unrolled by two with two independent accumulators: even-indexed
// elements are folded into one running value and odd-indexed elements into
// another; the two partial results are combined after the main loop.
//   T     - element / accumulator type
//   OP    - default-constructible binary functor
//   IDENT - integer starting value of every accumulator (converted to T)
template <class T, class OP, int IDENT>
struct unroll2x2
{
    // Reduces the first n elements of v and stores the result in *dest.
    void operator()(int n, const T *v, T *dest)
    {
        OP combine;
        T even_acc = IDENT;
        T odd_acc = IDENT;
        int idx = 0;
        // Main loop: runs while a full pair [idx, idx + 1] is still available.
        while (idx <= n - 2)
        {
            even_acc = combine(even_acc, v[idx]);
            odd_acc = combine(odd_acc, v[idx + 1]);
            idx += 2;
        }
        // Merge the two lanes, then fold in the (at most one) leftover element.
        T total = combine(even_acc, odd_acc);
        while (idx < n)
        {
            total = combine(total, v[idx]);
            ++idx;
        }
        *dest = total;
    }
};
// Reduction unrolled by four with four independent accumulators: element
// i + k (k = 0..3) of each group of four is folded into accumulator k, and
// the four partial results are combined pairwise after the main loop.
//   T     - element / accumulator type
//   OP    - default-constructible binary functor
//   IDENT - integer starting value of every accumulator (converted to T)
template <class T, class OP, int IDENT>
struct unroll4x4
{
    // Reduces the first n elements of v and stores the result in *dest.
    void operator()(int n, const T *v, T *dest)
    {
        OP op;
        T t1 = IDENT, t2 = IDENT, t3 = IDENT, t4 = IDENT;
        int i;
        // The body consumes v[i] .. v[i + 3], so a pass is only allowed while
        // four full elements remain; a looser bound reads past v[n - 1]
        // whenever n is not a multiple of 4.
        for (i = 0; i <= n - 4; i += 4)
        {
            t1 = op(t1, v[i]);
            t2 = op(t2, v[i + 1]);
            t3 = op(t3, v[i + 2]);
            t4 = op(t4, v[i + 3]);
        }
        // Pairwise merge of the four lanes.
        T t = op(op(t1, t2), op(t3, t4));
        // Tail: up to three leftover elements.
        for (; i < n; i++)
            t = op(t, v[i]);
        *dest = t;
    }
};
// Reduction unrolled by eight with eight independent accumulators: element
// i + k (k = 0..7) of each group of eight is folded into accumulator k, and
// the eight partial results are combined as a balanced tree after the main
// loop.
//   T     - element / accumulator type
//   OP    - default-constructible binary functor
//   IDENT - integer starting value of every accumulator (converted to T)
template <class T, class OP, int IDENT>
struct unroll8x8
{
    // Reduces the first n elements of v and stores the result in *dest.
    void operator()(int n, const T *v, T *dest)
    {
        OP op;
        T t1 = IDENT, t2 = IDENT, t3 = IDENT, t4 = IDENT;
        T t5 = IDENT, t6 = IDENT, t7 = IDENT, t8 = IDENT;
        int i;
        // The body consumes v[i] .. v[i + 7], so a pass is only allowed while
        // eight full elements remain; a looser bound reads past v[n - 1]
        // whenever n is not a multiple of 8.
        for (i = 0; i <= n - 8; i += 8)
        {
            t1 = op(t1, v[i]);
            t2 = op(t2, v[i + 1]);
            t3 = op(t3, v[i + 2]);
            t4 = op(t4, v[i + 3]);
            t5 = op(t5, v[i + 4]);
            t6 = op(t6, v[i + 5]);
            t7 = op(t7, v[i + 6]);
            t8 = op(t8, v[i + 7]);
        }
        // Balanced-tree merge of the eight lanes.
        T t = op(op(op(t1, t2), op(t3, t4)), op(op(t5, t6), op(t7, t8)));
        // Tail: up to seven leftover elements.
        for (; i < n; i++)
            t = op(t, v[i]);
        *dest = t;
    }
};
// Benchmarks the reduction functor `f` and validates its output.
//
// A buffer of n values, each rand() % 1000, is reduced once as a warm-up and
// then 1000 more times with every call bracketed by __rdtsc(). Each timed
// result is compared against compile_o0<T, OP, IDENT> on the same buffer.
//
//   T, OP, IDENT - element type, binary functor and starting value used for
//                  the reference reduction
//   n            - number of elements to reduce
//   f            - callable invoked as f(n, data, &result)
//   name         - label for the measured variant; currently not used
//
// Returns the average __rdtsc() delta per timed call, or -1 when a result
// differs from the reference by more than 1e-7 both absolutely and relative
// to the magnitude of the reference.
template <class T, class OP, int IDENT, class U>
int64_t eval(int n, U f, const char *name)
{
    (void)name; // kept in the signature for callers; nothing reads it yet

    // The vector owns the buffer, so it is released on every return path,
    // including the early "wrong answer" exit below.
    std::vector<T> buff(n);
    T *data = buff.data();
    for (int i = 0; i < n; i++)
        data[i] = rand() % 1000;

    T dest, correct;
    f(n, data, &dest); // drop the first run

    // The input never changes, so the reference value is computed once
    // instead of once per timed iteration.
    compile_o0<T, OP, IDENT>()(n, data, &correct);

    const int TEST_RUN = 1000;
    int64_t clocks = 0;
    for (int t = 0; t < TEST_RUN; t++)
    {
        int64_t start = __rdtsc();
        f(n, data, &dest);
        int64_t end = __rdtsc();
        clocks += end - start;
        // Relative tolerance is taken against |correct| so that it stays
        // meaningful when the reference value is negative.
        if (std::abs(dest - correct) > 1e-7 && std::abs(dest - correct) > 1e-7 * std::abs(correct))
            return -1;
    }
    return clocks / TEST_RUN;
}
// Writes one tab-prefixed table cell to std::cout: the text "WA" when x is
// the sentinel -1, otherwise the number x itself.
void output(int64_t x)
{
    if (x != -1)
    {
        std::cout << "\t" << x;
        return;
    }
    std::cout << "\tWA";
}
// Prints one result row for the reduction template `f`: its name followed by
// the eval() outcome for + and * on int32_t, int64_t, float and double, with
// `n` elements per run. The body is wrapped in do { ... } while (0) so the
// macro expands to a single statement and is safe inside unbraced if/else;
// invoke it with a trailing semicolon.
#define eval_all(n, f) \
    do \
    { \
        std::cout << #f; \
        output(eval<int32_t, std::plus<int32_t>, 0>(n, f<int32_t, std::plus<int32_t>, 0>(), #f " i32 +")); \
        output(eval<int32_t, std::multiplies<int32_t>, 1>(n, f<int32_t, std::multiplies<int32_t>, 1>(), #f " i32 *")); \
        output(eval<int64_t, std::plus<int64_t>, 0>(n, f<int64_t, std::plus<int64_t>, 0>(), #f " i64 +")); \
        output(eval<int64_t, std::multiplies<int64_t>, 1>(n, f<int64_t, std::multiplies<int64_t>, 1>(), #f " i64 *")); \
        output(eval<float, std::plus<float>, 0>(n, f<float, std::plus<float>, 0>(), #f " f32 +")); \
        output(eval<float, std::multiplies<float>, 1>(n, f<float, std::multiplies<float>, 1>(), #f " f32 *")); \
        output(eval<double, std::plus<double>, 0>(n, f<double, std::plus<double>, 0>(), #f " f64 +")); \
        output(eval<double, std::multiplies<double>, 1>(n, f<double, std::multiplies<double>, 1>(), #f " f64 *")); \
        std::cout << std::endl; \
    } while (0)
// Prints a tab-separated header and then one row of results per reduction
// variant, each measured on 200 elements.
int main()
{
    // One column per element type / operation pair, in the order eval_all
    // emits them; the double-precision sum column is labelled "f64+" to match
    // its neighbours.
    std::cout << "name\t\ti32+\ti32*\ti64+\ti64*\tf32+\tf32*\tf64+\tf64*" << std::endl;
    eval_all(200, compile_o0);
    eval_all(200, compile_o2);
    eval_all(200, unroll2x1);
    eval_all(200, unroll2x1a);
    eval_all(200, unroll2x2);
    eval_all(200, unroll4x4);
    eval_all(200, unroll8x8);
    return 0;
}
// Online-judge result for this submission:
// | Compilation | N/A | N/A | Compile OK | Score: N/A |
// | Testcase #1 | 38.075 ms | 40 KB | Accepted | Score: 100 |