#include <iostream>
#include <chrono>
#include <immintrin.h>
#pragma GCC target("avx2,fma")
//#pragma GCC target("avx512f")
/// Minimal stopwatch for timing a code region.
///
/// The interval between start() and stop() is measured in nanoseconds and
/// reported by duration() as `nanoseconds / (rate * 1000)`, i.e. in
/// microseconds when `rate` is 1.
class StopWatch
{
private:
    // steady_clock is monotonic: unlike system_clock it can never run
    // backwards when the wall clock is adjusted, so the measured interval
    // is always non-negative.
    using Clock = std::chrono::steady_clock;

    bool is_stop = false;
    bool is_start = false;
    std::chrono::nanoseconds elapsed{0};
    double rate = 1.0;
    Clock::time_point begin{};
    Clock::time_point end{};

public:
    /// Creates a stopped watch with a rate of 1.
    StopWatch() = default;

    /// Creates a stopped watch whose duration() result is divided by
    /// `rate_in` (in addition to the fixed factor of 1000).
    StopWatch(double rate_in) : rate(rate_in) {}

    /// Clears any previous measurement and begins timing from now.
    void start()
    {
        reset();
        is_start = true;
        begin = Clock::now();
    }

    /// Records the time elapsed since start(). Does nothing if the watch
    /// was never started. Calling it again re-measures from the same start.
    void stop()
    {
        if (!is_start)
        {
            return;
        }
        end = Clock::now();
        elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
        is_stop = true;
    }

    /// Returns the watch to its initial state with zero elapsed time.
    void reset()
    {
        is_start = false;
        is_stop = false;
        elapsed = std::chrono::nanoseconds{0};
    }

    /// Returns the measured interval as `nanoseconds / (rate * 1000)`.
    ///
    /// If the watch has been stopped, the stored interval is returned.
    /// If it is still running, the interval up to now is sampled and the
    /// watch keeps running. If it was never started, the result is 0.
    double duration()
    {
        if (!is_stop)
        {
            // Take a lap reading, then clear the flag again so the watch
            // still counts as running.
            stop();
            is_stop = false;
        }
        return static_cast<double>(elapsed.count()) / (rate * 1000);
    }
};
// Global stopwatch used by MFLOPS_AVX2. With a rate of 1, duration()
// returns nanoseconds / 1000, i.e. microseconds.
StopWatch watch_default(1);
/// Measures double-precision FMA throughput using 256-bit AVX2 registers.
///
/// Executes `test_cycle` iterations of eight fused multiply-adds, each on its
/// own accumulator, timed with the global `watch_default`. Afterwards it
/// prints the sum of all accumulator lanes, the elapsed time in milliseconds
/// and the achieved rate.
///
/// @param test_cycle Number of loop iterations to run.
/// @return Achieved rate in Mflops (floating-point operations per
///         microsecond), truncated to an integer; 0 when the measured time
///         is zero.
size_t MFLOPS_AVX2(size_t test_cycle)
{
    // Number of accumulators actually updated inside the timed loop. Must be
    // kept in sync with the loop body: a8..a11 are declared but not updated.
    constexpr size_t kActiveAccumulators = 8;
    constexpr size_t kLanesPerRegister = 4; // doubles in one __m256d
    constexpr size_t kFlopsPerFma = 2;      // one multiply + one add

    __m256d a0 = _mm256_set1_pd(1);
    __m256d a1 = _mm256_set1_pd(2);
    __m256d a2 = _mm256_set1_pd(3);
    __m256d a3 = _mm256_set1_pd(4);
    __m256d a4 = _mm256_set1_pd(5);
    __m256d a5 = _mm256_set1_pd(6);
    __m256d a6 = _mm256_set1_pd(7);
    __m256d a7 = _mm256_set1_pd(8);
    // a8..a11 keep their initial values; they only contribute to the printed sum.
    const __m256d a8 = _mm256_set1_pd(9);
    const __m256d a9 = _mm256_set1_pd(10);
    const __m256d a10 = _mm256_set1_pd(11);
    const __m256d a11 = _mm256_set1_pd(12);
    const __m256d b = _mm256_set1_pd(2);

    watch_default.start();
    for (size_t i = 0; i < test_cycle; i++)
    {
        // Each statement computes b * a + a and depends only on its own
        // accumulator, so the eight chains are independent of each other.
        a0 = _mm256_fmadd_pd(b, a0, a0);
        a1 = _mm256_fmadd_pd(b, a1, a1);
        a2 = _mm256_fmadd_pd(b, a2, a2);
        a3 = _mm256_fmadd_pd(b, a3, a3);
        a4 = _mm256_fmadd_pd(b, a4, a4);
        a5 = _mm256_fmadd_pd(b, a5, a5);
        a6 = _mm256_fmadd_pd(b, a6, a6);
        a7 = _mm256_fmadd_pd(b, a7, a7);
    }
    watch_default.stop();

    // Spill every register and print the total. Presumably this exists so the
    // results are observable and the loop cannot be optimised away. The
    // accumulators triple on every iteration, so the sum becomes inf for
    // large iteration counts.
    double ary[48];
    _mm256_storeu_pd(ary, a0);
    _mm256_storeu_pd(ary + 4, a1);
    _mm256_storeu_pd(ary + 8, a2);
    _mm256_storeu_pd(ary + 12, a3);
    _mm256_storeu_pd(ary + 16, a4);
    _mm256_storeu_pd(ary + 20, a5);
    _mm256_storeu_pd(ary + 24, a6);
    _mm256_storeu_pd(ary + 28, a7);
    _mm256_storeu_pd(ary + 32, a8);
    _mm256_storeu_pd(ary + 36, a9);
    _mm256_storeu_pd(ary + 40, a10);
    _mm256_storeu_pd(ary + 44, a11);
    double sum = 0;
    for (const double lane : ary)
    {
        sum += lane;
    }
    std::cout << sum << std::endl;

    // Count only the FMAs that were really executed.
    const size_t flops = test_cycle * kActiveAccumulators * kLanesPerRegister * kFlopsPerFma;

    // Keep the elapsed time as a double: truncating a sub-microsecond run to
    // an integer 0 would turn the rate computation into an integer division
    // by zero.
    const double us = watch_default.duration();
    const size_t mflops = (us > 0.0) ? static_cast<size_t>(flops / us) : 0;

    std::cout << "time: " << static_cast<size_t>(us / 1000) << "ms\t" << mflops << "Mflops" << std::endl;
    return mflops;
}
/// Entry point: prints a heading and runs the AVX2 benchmark once.
int main()
{
    // Iteration count for the benchmark loop: 1e8.
    const size_t avx2_iterations = static_cast<size_t>(1e8);

    std::cout << "AVX2:\n";
    MFLOPS_AVX2(avx2_iterations);

    // AVX-512 run is disabled (MFLOPS_AVX512 is not defined in the visible source):
    // std::cout << "AVX512:\n";
    // MFLOPS_AVX512(1e9); // loop 1e9 times
    return 0;
}
// Online-judge result pasted with the source (kept for reference):
// Compilation | N/A | N/A | Compile OK | Score: N/A | (show more) |
// Testcase #1 | 111.25 ms | 44 KB | Runtime Error | Score: 0 | (show more) |