测试硬件:CPU-i5-4590
命令行:/arch:AVX
优化项:/O2
main.cpp
#include <iostream>
#include <vector>
#include "method.h"
#include <random>
#include <time.h>
using std::default_random_engine;
using std::uniform_real_distribution;
// Benchmark driver: fills two small float buffers with random values, runs
// each dot-product implementation cntLoop times and prints the last result
// together with the elapsed CPU time. The sum and max benchmarks are kept
// below as disabled (commented-out) sections.
int main(int argc, char* argv[])
{
    // Converts the clock() ticks elapsed since `since` into whole
    // milliseconds. clock_t is not an int, so it must not be handed to
    // printf("%d") directly, and ticks are only milliseconds on platforms
    // where CLOCKS_PER_SEC happens to be 1000.
    const auto elapsedMs = [](clock_t since) {
        return static_cast<long>(1000.0 * static_cast<double>(clock() - since) / CLOCKS_PER_SEC);
    };

    // Multiply-accumulate (dot product) benchmark
    {
        constexpr int size = 33;
        // 32-byte alignment satisfies the strictest requirement of aligned
        // 256-bit AVX loads, which malloc does not guarantee.
        alignas(32) float input1[size];
        alignas(32) float input2[size];
        default_random_engine e;
        uniform_real_distribution<float> u(0, 1); // random number distribution object
        for (int i = 0; i < size; i++)
        {
            input1[i] = u(e);
            input2[i] = u(e);
        }
        int cntLoop = 10000000;

        clock_t start_t = clock();
        float org = 0.0;
        for (int i = 0; i < cntLoop; i++)
            org = MathMulAdd(input1, input2, size);
        printf("org = %f\t", org);
        printf("cost time: %ld(ms)\n", elapsedMs(start_t));

        start_t = clock();
        float sse = 0.0;
        for (int i = 0; i < cntLoop; i++)
            sse = SSEMulAdd(input1, input2, size);
        printf("sse = %f\t", sse);
        printf("cost time: %ld(ms)\n", elapsedMs(start_t));

        start_t = clock();
        float sse_ = 0.0;
        for (int i = 0; i < cntLoop; i++)
            sse_ = SSEFmAdd(input1, input2, size);
        printf("sse_= %f\t", sse_);
        printf("cost time: %ld(ms)\n", elapsedMs(start_t));

        start_t = clock();
        float avx = 0.0;
        for (int i = 0; i < cntLoop; i++)
            avx = AVXMulAdd(input1, input2, size);
        printf("avx = %f\t", avx);
        printf("cost time: %ld(ms)\n", elapsedMs(start_t));

        start_t = clock();
        float avx_ = 0.0;
        for (int i = 0; i < cntLoop; i++)
            avx_ = AVXFmAdd(input1, input2, size);
        printf("avx_= %f\t", avx_);
        printf("cost time: %ld(ms)\n", elapsedMs(start_t));
    }
    // Results:
    //org = 11.216135 cost time : 174(ms)
    //sse = 11.216136 cost time : 102(ms)
    //sse_ = 11.216136 cost time : 119(ms)
    //avx = 11.216136 cost time : 63(ms)
    //avx_ = 11.216136 cost time : 61(ms)

    // NOTE: the disabled sections below are kept as originally written; they
    // still allocate with malloc and print clock_t with %d, so give them the
    // same treatment as the active section above before re-enabling them.

    // Addition (sum) benchmark (disabled)
    //{
    // int size = 27;
    // float *input = (float *)malloc(sizeof(float) * size);
    // for (int i = 0; i < size; i++)
    // input[i] = 0.0025;
    // int cntLoop = 300000000;
    // clock_t start_t = clock();
    // float org = 0.0;
    // for (int i = 0; i < cntLoop; i++)
    // org = MathSum(input, size);
    // printf("org = %f\t", org);
    // printf("cost time: %d\n", clock() - start_t);
    // start_t = clock();
    // float sse = 0.0;
    // for (int i = 0; i < cntLoop; i++)
    // sse = SSESum(input, size);
    // printf("sse = %f\t", sse);
    // printf("cost time: %d\n", clock() - start_t);
    // start_t = clock();
    // float avx = 0.0;
    // for (int i = 0; i < cntLoop; i++)
    // avx = AVXSum(input, size);
    // printf("avx = %f\t", avx);
    // printf("cost time: %d\n", clock() - start_t);
    // free(input);
    //}
    // Results:
    //org = 0.067500 cost time : 3062
    //sse = 0.067500 cost time : 2283
    //avx = 0.067500 cost time : 1829

    // Maximum / minimum benchmark (disabled)
    //{
    // int size = 58;
    // float *input = (float *)malloc(sizeof(float) * size);
    // default_random_engine e;
    // uniform_real_distribution<float> u(0, 3); // random number distribution object
    // for (int i = 0; i < size; i++)
    // {
    // input[i] = u(e);
    // printf("%f ", input[i]);
    // if ((i + 1) % 8 == 0)
    // printf("\n");
    // }
    // printf("\n");
    // int cntLoop = 100000000;
    // clock_t start_t = clock();
    // float org;
    // for (int i = 0; i < cntLoop; i++)
    // org = MathMax(input, size);
    // printf("org = %f\t", org);
    // printf("cost time: %d(ms)\n", clock() - start_t);
    // start_t = clock();
    // float sse;
    // for (int i = 0; i < cntLoop; i++)
    // sse = SSEMax(input, size);
    // printf("sse = %f\t", sse);
    // printf("cost time: %d(ms)\n", clock() - start_t);
    // start_t = clock();
    // float avx;
    // for (int i = 0; i < cntLoop; i++)
    // avx = AVXMax(input, size);
    // printf("avx = %f\t", avx);
    // printf("cost time: %d(ms)\n", clock() - start_t);
    // free(input);
    //}
    // Results:
    //org = 2.989384 cost time : 9491(ms)
    //sse = 2.989384 cost time : 1261(ms)
    //avx = 2.989384 cost time : 1413(ms)
    return 0;
}
method.h
#pragma once
#include <intrin.h>
#include <stdio.h>
// Multiply-accumulate (dot product): sum of input1[i] * input2[i] over `size`
// elements.
//   MathMulAdd - plain scalar loop; performs no null check.
//   SSEMulAdd  - 4 floats per step (__m128), separate multiply and add.
//   SSEFmAdd   - 4 floats per step (__m128), fused multiply-add intrinsic.
//   AVXMulAdd  - 8 floats per step (__m256), separate multiply and add.
//   AVXFmAdd   - 8 floats per step (__m256), fused multiply-add intrinsic.
// The SSE/AVX variants print "input data is null" and return -1 when either
// pointer is null; elements left over after the last full vector step are
// handled with a scalar loop.
float MathMulAdd(const float *input1, const float *input2, int size);
float SSEMulAdd(const float *input1, const float *input2, int size);
float SSEFmAdd(const float *input1, const float *input2, int size);
float AVXMulAdd(const float *input1, const float *input2, int size);
float AVXFmAdd(const float *input1, const float *input2, int size);
// Sum of the first `size` elements of `input`.
//   MathSum - plain scalar loop; performs no null check.
//   SSESum  - 4 floats per step (__m128).
//   AVXSum  - 8 floats per step (__m256).
// The SSE/AVX variants print "input data is null" and return -1 on a null
// pointer.
float MathSum(const float *input, int size);
float SSESum(const float *input, int size);
float AVXSum(const float *input, int size);
// Largest of the first `size` elements of `input`.
//   MathMax - plain scalar loop; reads input[0] unconditionally and performs
//             no null check.
//   SSEMax  - 4 floats per step (__m128).
//   AVXMax  - 8 floats per step (__m256).
// The SSE/AVX variants print "input data is null" and return -1 on a null
// pointer.
float MathMax(const float *input, int size);
float SSEMax(const float *input, int size);
float AVXMax(const float *input, int size);
method.cpp
#include "method.h"
// Scalar reference implementation of the dot product.
// Walks both arrays front to back, accumulating input1[i] * input2[i] in a
// single float. Returns 0 when size is not positive. The pointers are not
// checked for null.
float MathMulAdd(const float *input1, const float *input2, int size)
{
    const float *lhs = input1;
    const float *rhs = input2;
    float acc = 0.0f;
    for (int remaining = size; remaining > 0; --remaining)
    {
        acc += (*lhs) * (*rhs);
        ++lhs;
        ++rhs;
    }
    return acc;
}
// Dot product of two float arrays, four lanes at a time with SSE
// (separate multiply and add).
//
// input1, input2: arrays of at least `size` floats; no particular alignment
//                 is required.
// size:           number of elements; a non-positive size yields 0.
// Returns the sum of input1[i] * input2[i], or -1 (after printing a message)
// when either pointer is null.
float SSEMulAdd(const float *input1, const float *input2, int size)
{
    if (input1 == nullptr || input2 == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }
    const int nBlockWidth = 4;
    const int cntBlock = size / nBlockWidth;
    const int cntRem = size % nBlockWidth;

    const float *p1 = input1;
    const float *p2 = input2;
    __m128 sumData = _mm_setzero_ps();
    for (int i = 0; i < cntBlock; i++)
    {
        // Unaligned loads: nothing in the signature promises 16-byte aligned
        // pointers, and an aligned load on a misaligned address faults.
        const __m128 loadData1 = _mm_loadu_ps(p1);
        const __m128 loadData2 = _mm_loadu_ps(p2);
        sumData = _mm_add_ps(sumData, _mm_mul_ps(loadData1, loadData2));
        p1 += nBlockWidth;
        p2 += nBlockWidth;
    }

    // Horizontal sum of the four lanes as (s0 + s1) + (s2 + s3). This is the
    // same association a double _mm_hadd_ps produces, but it needs only
    // baseline SSE shuffles instead of SSE3.
    const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pairSums = _mm_add_ps(sumData, swapped);
    const __m128 total = _mm_add_ss(pairSums, _mm_movehl_ps(pairSums, pairSums));
    // _mm_cvtss_f32 reads lane 0 without relying on the MSVC-only m128_f32 member.
    float output = _mm_cvtss_f32(total);

    // Scalar tail for the elements that do not fill a whole vector.
    for (int i = 0; i < cntRem; i++)
    {
        output += p1[i] * p2[i];
    }
    return output;
}
float SSEFmAdd(const float *input1, const float *input2, int size)
{
if (input1 == nullptr || input2 == nullptr)
{
printf("input data is null\n");
return -1;
}
int nBlockWidth = 4;
int cntBlock = size / nBlockWidth;
int cntRem = size % nBlockWidth;
float output = 0;
__m128 loadData1, loadData2;
__m128 sumData = _mm_setzero_ps();
const float *p1 = input1;
const float *p2 = input2;
for (int i = 0; i < cntBlock; i++)
{
loadData1 = _mm_load_ps(p1);
loadData2 = _mm_load_ps(p2);
sumData = _mm_fmadd_ps(loadData1, loadData2, sumData);
p1 += nBlockWidth;
p2 += nBlockWidth;
}
sumData = _mm_hadd_ps(sumData, sumData); // p[0] + p[1] + p[4] + p[5] + ...
sumData = _mm_hadd_ps(sumData, sumData); // p[2] + p[3] + p[6] + p[7] + ...
output += sumData.m128_f32[(0)]; // 前4组
for (int i = 0; i < cntRem; i++)
{
output += p1[i] * p2[i];
}
return output;
}
float AVXMulAdd(const float *input1, const float *input2, int size)
{
if (input1 == nullptr || input2 == nullptr)
{
printf("input data is null\n");
return -1;
}
int nBlockWidth = 8;
int cntBlock = size / nBlockWidth;
int cntRem = size % nBlockWidth;
float output = 0;
__m256 loadData1, loadData2;
__m256 mulData = _mm256_setzero_ps();
__m256 sumData = _mm256_setzero_ps();
const float *p1 = input1;
const float *p2 = input2;
for (int i = 0; i < cntBlock; i++)
{
loadData1 = _mm256_load_ps(p1);
loadData2 = _mm256_load_ps(p2);
mulData = _mm256_mul_ps(loadData1, loadData2);
sumData = _mm256_add_ps(sumData, mulData);
p1 += nBlockWidth;
p2 += nBlockWidth;
}
sumData = _mm256_hadd_ps(sumData, sumData); // p[0] + p[1] + p[4] + p[5] + p[8] + p[9] + p[12] + p[13] + ...
sumData = _mm256_hadd_ps(sumData, sumData); // p[2] + p[3] + p[6] + p[7] + p[10] + p[11] + p[14] + p[15] + ...
output += sumData.m256_f32[(0)]; // 前4组
output += sumData.m256_f32[(4)]; // 后4组
for (int i = 0; i < cntRem; i++)
{
output += p1[i] * p2[i];
}
return output;
}
float AVXFmAdd(const float *input1, const float *input2, int size)
{
if (input1 == nullptr || input2 == nullptr)
{
printf("input data is null\n");
return -1;
}
int nBlockWidth = 8;
int cntBlock = size / nBlockWidth;
int cntRem = size % nBlockWidth;
float output = 0;
__m256 loadData1, loadData2;
__m256 sumData = _mm256_setzero_ps();
const float *p1 = input1;
const float *p2 = input2;
for (int i = 0; i < cntBlock; i++)
{
loadData1 = _mm256_load_ps(p1);
loadData2 = _mm256_load_ps(p2);
sumData = _mm256_fmadd_ps(loadData1, loadData2, sumData);
p1 += nBlockWidth;
p2 += nBlockWidth;
}
sumData = _mm256_hadd_ps(sumData, sumData); // p[0] + p[1] + p[4] + p[5] + p[8] + p[9] + p[12] + p[13] + ...
sumData = _mm256_hadd_ps(sumData, sumData); // p[2] + p[3] + p[6] + p[7] + p[10] + p[11] + p[14] + p[15] + ...
output += sumData.m256_f32[(0)]; // 前4组
output += sumData.m256_f32[(4)]; // 后4组
for (int i = 0; i < cntRem; i++)
{
output += p1[i] * p2[i];
}
return output;
}
// Scalar reference implementation of the array sum.
// Adds the first `size` elements of `input` front to back into one float.
// Returns 0 when size is not positive. The pointer is not checked for null.
float MathSum(const float *input, int size)
{
    const float *cursor = input;
    float total = 0.0f;
    for (int remaining = size; remaining > 0; --remaining)
    {
        total += *cursor;
        ++cursor;
    }
    return total;
}
// Sum of a float array, four lanes at a time with SSE.
//
// input: array of at least `size` floats; no particular alignment is required.
// size:  number of elements; a non-positive size yields 0.
// Returns the sum of the elements, or -1 (after printing a message) when
// `input` is null.
float SSESum(const float *input, int size)
{
    if (input == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }
    const int nBlockWidth = 4;
    const int cntBlock = size / nBlockWidth;
    const int cntRem = size % nBlockWidth;

    const float *p = input;
    __m128 sumData = _mm_setzero_ps();
    for (int i = 0; i < cntBlock; i++)
    {
        // Unaligned load: nothing in the signature promises a 16-byte aligned
        // pointer, and an aligned load on a misaligned address faults.
        sumData = _mm_add_ps(sumData, _mm_loadu_ps(p));
        p += nBlockWidth;
    }

    // Horizontal sum of the four lanes as (s0 + s1) + (s2 + s3). This is the
    // same association a double _mm_hadd_ps produces, but it needs only
    // baseline SSE shuffles instead of SSE3.
    const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pairSums = _mm_add_ps(sumData, swapped);
    const __m128 total = _mm_add_ss(pairSums, _mm_movehl_ps(pairSums, pairSums));
    // _mm_cvtss_f32 reads lane 0 without relying on the MSVC-only m128_f32 member.
    float output = _mm_cvtss_f32(total);

    // Scalar tail for the elements that do not fill a whole vector.
    for (int i = 0; i < cntRem; i++)
    {
        output += p[i];
    }
    return output;
}
float AVXSum(const float *input, int size)
{
if (input == nullptr)
{
printf("input data is null\n");
return -1;
}
int nBlockWidth = 8;
int cntBlock = size / nBlockWidth;
int cntRem = size % nBlockWidth;
float output = 0;
__m256 loadData;
__m256 sumData = _mm256_setzero_ps();
const float *p = input;
for (int i = 0; i < cntBlock; i++)
{
loadData = _mm256_load_ps(p);
sumData = _mm256_add_ps(sumData, loadData);
p += nBlockWidth;
}
sumData = _mm256_hadd_ps(sumData, sumData); // p[0] + p[1] + p[4] + p[5] + p[8] + p[9] + p[12] + p[13] + ...
sumData = _mm256_hadd_ps(sumData, sumData); // p[2] + p[3] + p[6] + p[7] + p[10] + p[11] + p[14] + p[15] + ...
output += sumData.m256_f32[(0)]; // 前4组
output += sumData.m256_f32[(4)]; // 后4组
for (int i = 0; i < cntRem; i++)
{
output += p[i];
}
return output;
}
// Scalar reference implementation of the array maximum.
// Starts from input[0] and scans the remaining elements in order, so `input`
// must be non-null and hold at least one element; neither is checked here.
float MathMax(const float *input, int size)
{
    float best = input[0];
    int idx = 1;
    while (idx < size)
    {
        // Keep the running value only when it is strictly greater.
        if (!(best > input[idx]))
        {
            best = input[idx];
        }
        ++idx;
    }
    return best;
}
// Maximum of a float array, four lanes at a time with SSE.
//
// input: array of at least `size` floats; no particular alignment is required.
// size:  number of elements; must be at least 1.
// Returns the largest element. Returns -1 after printing a message when
// `input` is null or `size` is not positive.
float SSEMax(const float *input, int size)
{
    if (input == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }
    if (size <= 0)
    {
        // An empty range has no maximum; report it the same way as a null input.
        printf("input size is invalid\n");
        return -1;
    }
    const int nBlockWidth = 4;
    const int cntBlock = size / nBlockWidth;
    const int cntRem = size % nBlockWidth;

    const float *p = input;
    // Seed with a real element so arrays shorter than one vector are handled
    // by the scalar tail alone, without reading past the end of the buffer.
    float maxVal_ = input[0];

    if (cntBlock > 0)
    {
        // Unaligned loads: nothing in the signature promises a 16-byte
        // aligned pointer.
        __m128 maxVal = _mm_loadu_ps(p);
        p += nBlockWidth;
        for (int i = 1; i < cntBlock; i++)
        {
            maxVal = _mm_max_ps(maxVal, _mm_loadu_ps(p));
            p += nBlockWidth;
        }
        // Spill the per-lane maxima and fold them into the scalar result.
        alignas(16) float output[4];
        _mm_store_ps(output, maxVal);
        for (int i = 0; i < nBlockWidth; i++)
        {
            maxVal_ = maxVal_ > output[i] ? maxVal_ : output[i];
        }
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (int i = 0; i < cntRem; i++)
    {
        maxVal_ = maxVal_ > p[i] ? maxVal_ : p[i];
    }
    return maxVal_;
}
float AVXMax(const float *input, int size)
{
if (input == nullptr)
{
printf("input data is null\n");
return -1;
}
int nBlockWidth = 8;
int cntBlock = size / nBlockWidth;
int cntRem = size % nBlockWidth;
__declspec(align(32)) float output[8];
__m256 loadData;
const float *p = input;
__m256 maxVal = _mm256_load_ps(p);
p += nBlockWidth;
for (int i = 1; i < cntBlock; i++)
{
loadData = _mm256_load_ps(p);
maxVal = _mm256_max_ps(maxVal, loadData);
p += nBlockWidth;
}
_mm256_store_ps(output, maxVal);
float maxVal_ = output[0];
for (int i = 1; i < 8; i++)
{
maxVal_ = maxVal_ > output[i] ? maxVal_ : output[i];
}
for (int i = 0; i < cntRem; i++)
{
maxVal_ = maxVal_ > p[i] ? maxVal_ : p[i];
}
return maxVal_;
}