我正在尝试使用valarray,因为它在对向量和矩阵进行运算时很像MATLAB。我先做了一些性能测试,发现valarray无法达到Stroustrup在《The C++ Programming Language》一书中所宣称的性能。
这个测试程序实际上做了约500万次(5*1024*1024次)双精度浮点数乘法。我原以为c = a*b至少可以与用for循环逐元素相乘double的速度相当,但我完全错了。
我在几台电脑上分别用微软的Visual C++ 6.0和Visual Studio 2008进行了尝试。
顺便说一下,我在MATLAB上使用以下代码进行了测试:
% Reference timing: element-wise product of two random column vectors.
% The length equals 5*1024*1024, the same element count as N in the C++ code.
len = 5*1024*1024;
% Operands: len-by-1 columns of random values.
a = rand(len, 1);
b = rand(len, 1);
% c is created before the timer starts; the assignment below replaces it.
c = zeros(len, 1);
% Only the element-wise multiply sits between tic and toc.
tic;
c = a.*b;
toc;
结果是46ms。这个时间不是很精确;它只作为一个参考。
代码是:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
// Benchmark driver: times three ways of computing the element-wise product
// c = a * b over N doubles, and repeats the whole measurement 8 times.
// Each timing is printed in milliseconds as reported by gettime_hp().
int main()
{
// Number of elements in each array.
enum { N = 5*1024*1024 };
valarray<double> a(N), b(N), c(N);
// Store the performance-counter frequency in the global that gettime_hp() reads.
QueryPerformanceFrequency(&sys_freq);
int i, j;
for (j=0 ; j<8 ; ++j)
{
// Refill both operands with fresh rand() values on every pass.
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
// Raw pointers to the first element of each array, used by variant 1.
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
// Variant 1: plain loop over the raw double pointers.
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n";
// Variant 2: whole-array valarray multiplication followed by assignment.
dtime = gettime_hp();
c = a*b ;
dtime = gettime_hp() - dtime;
cout << "valarray operator* " << dtime << " ms\n";
// Variant 3: explicit loop using valarray::operator[] on each element.
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n";
cout << "------------------------------------------------------\n";
}
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
}
运行结果:(最大速度优化的发布模式)
double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------
同样优化的调试模式:
double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------
发布于 2011-07-28 05:24:26
我刚刚在Linux x86-64系统(Sandy Bridge CPU)上尝试过:
gcc 4.5.0:
double operator* 9.64185 ms
valarray operator* 9.36987 ms
valarray[i] operator* 9.35815 ms
英特尔ICC 12.0.2:
double operator* 7.76757 ms
valarray operator* 9.60208 ms
valarray[i] operator* 7.51409 ms
在这两种情况下,我都只使用了-O3,没有使用其他与优化相关的标志。
看起来MS C++编译器和/或valarray实现很烂。
下面是为Linux修改的OP代码:
#include <iostream>
#include <valarray>
#include <iostream>
#include <ctime>
using namespace std ;
double gettime_hp();
// Benchmark driver (Linux port): times three ways of computing the
// element-wise product c = a * b over N doubles, repeated 8 times.
// Each timing is printed in milliseconds as reported by gettime_hp().
int main()
{
// Number of elements in each array.
enum { N = 5*1024*1024 };
valarray<double> a(N), b(N), c(N) ;
int i,j;
for( j=0 ; j<8 ; ++j )
{
// Refill both operands with fresh rand() values on every pass.
for( i=0 ; i<N ; ++i )
{
a[i]=rand();
b[i]=rand();
}
// Raw pointers to the first element of each array, used by variant 1.
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
// Variant 1: plain loop over the raw double pointers.
double dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
dtime=gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n" ;
// Variant 2: whole-array valarray multiplication followed by assignment.
dtime=gettime_hp();
c = a*b ;
dtime=gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n" ;
// Variant 3: explicit loop using valarray::operator[] on each element.
dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
dtime=gettime_hp()-dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n" ;
cout << "------------------------------------------------------\n" ;
}
}
// Returns the current CLOCK_REALTIME reading converted to milliseconds.
// Callers subtract two readings to obtain an elapsed time.
// NOTE(review): CLOCK_REALTIME is the wall clock and can be adjusted while
// the program runs; CLOCK_MONOTONIC may be preferable for interval timing.
double gettime_hp()
{
    struct timespec timestamp;
    clock_gettime(CLOCK_REALTIME, &timestamp);
    // Seconds -> ms plus nanoseconds -> ms.
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}
发布于 2011-09-07 05:12:11
valarray的全部意义在于在向量机上运行得快,而x86机器并不是向量机。
在非向量机上,一个好的实现应该能够达到与下面这种循环相当的性能:
// Baseline: plain element-wise multiply over raw double pointers.
for (i=0; i < N; ++i)
c1[i] = a1[i] * b1[i];
而一个糟糕的实现当然做不到。除非硬件中有某种可以加速并行处理的机制,否则这已经非常接近你所能达到的最佳性能了。
发布于 2011-07-28 14:18:17
我最终通过使用延迟求值得到了这个结果。代码可能很难看,因为我刚刚开始学习这些C++高级概念。
代码如下:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
// Goal: speed up c = a*b. The plain version first builds a temporary array,
// assigns it to 'c', and then destroys the temporary,
// which makes the program really slow.
// The solution is an expression template: operator* only records its operands,
// and the work is done once the whole expression (including the target) is known.
// This is delayed (lazy) evaluation.
//typedef valarray<double> Vector;
class Vector;

// Deferred element-wise product of two Vectors (a minimal expression template).
//
// operator* builds one of these instead of computing a temporary array; the
// multiplication itself happens only when the node is assigned to a Vector or
// converted to one. The node holds references, so both operands must outlive it.
class VecMul
{
public:
    const Vector& va;  // left-hand operand
    const Vector& vb;  // right-hand operand

    VecMul(const Vector& v1, const Vector& v2) : va(v1), vb(v2) {}

    // Evaluates the product into a newly constructed Vector (defined below,
    // once Vector is a complete type).
    operator Vector();
};

// Array of doubles whose elements live in the std::valarray<double> base.
//
// Keeping the data in the base (instead of a separately new'ed valarray that
// was never deleted) removes the leak, makes copies independent deep copies,
// and lets inherited valarray members such as sum() see the real elements.
class Vector : public std::valarray<double>
{
public:
    // Creates n value-initialized (zero) elements.
    explicit Vector(int n) : std::valarray<double>(n) {}

    // Evaluates the deferred product directly into this Vector, without any
    // intermediate array.
    // Precondition: m.va and m.vb have the same length.
    // If this Vector is shorter than the operands it is grown first, so the
    // loop never writes out of bounds; a longer destination keeps its size.
    Vector& operator=(const VecMul& m)
    {
        const int n = m.va.size();
        if (size() < n)
            resize(n);
        for (int i = 0; i < n; ++i)
            (*this)[i] = m.va[i] * m.vb[i];
        return *this;
    }

    // Element access. Returns a mutable reference even through a const
    // Vector, which existing callers rely on (VecMul reads its operands
    // through const references).
    double& operator[](int i) const
    {
        // Go through the base's non-const operator[] to obtain double&.
        std::valarray<double>& self = const_cast<Vector&>(*this);
        return self[i];
    }

    // Number of elements, as int (hides the size_t-returning base version).
    int size() const { return static_cast<int>(std::valarray<double>::size()); }
};

// Materializes the deferred product as a standalone Vector.
inline VecMul::operator Vector()
{
    Vector result(va.size());
    result = *this;
    return result;
}
// Builds the deferred-product node for v1 * v2. No element is multiplied
// here; the node only records references to both operands.
inline VecMul operator*(const Vector& v1, const Vector& v2)
{
    VecMul product(v1, v2);
    return product;
}
// Benchmark driver for the expression-template Vector: times three ways of
// computing the element-wise product c = a * b over N doubles, 8 times over.
// Each timing is printed in milliseconds as reported by gettime_hp().
int main()
{
// Number of elements in each array.
enum {N = 5*1024*1024};
Vector a(N), b(N), c(N);
// Store the performance-counter frequency in the global that gettime_hp() reads.
QueryPerformanceFrequency(&sys_freq);
int i, j;
for (j=0 ; j<8 ; ++j)
{
// Refill both operands with fresh rand() values on every pass.
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
// Raw pointers to the first element of each Vector, used by variant 1.
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
// Variant 1: plain loop over the raw double pointers.
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n";
// Variant 2: a*b yields a VecMul, and Vector::operator=(const VecMul&)
// evaluates it into c. The printed label still says "valarray".
dtime = gettime_hp();
c = a*b;
dtime = gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n";
// Variant 3: explicit loop using Vector::operator[] on each element.
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime << " ms\n";
cout << "------------------------------------------------------\n";
}
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}
在Visual studio上运行的结果是:
double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms
https://stackoverflow.com/questions/6850807
复制相似问题