#include #include #include using namespace Eigen; #ifndef SIZE #define SIZE 50 #endif #ifndef REPEAT #define REPEAT 10000 #endif typedef float Scalar; __attribute__((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); __attribute__((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); __attribute__((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); int main(int argc, char* argv[]) { int size = SIZE * 8; int size2 = size * size; Scalar* a = internal::aligned_new(size2); Scalar* b = internal::aligned_new(size2 + 4) + 1; Scalar* c = internal::aligned_new(size2); for (int i = 0; i < size; ++i) { a[i] = b[i] = c[i] = 0; } BenchTimer timer; timer.reset(); for (int k = 0; k < 10; ++k) { timer.start(); benchVec(a, b, c, size2); timer.stop(); } std::cout << timer.value() << "s " << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) << " GFlops\n"; return 0; for (int innersize = size; innersize > 2; --innersize) { if (size2 % innersize == 0) { int outersize = size2 / innersize; MatrixXf ma = Map(a, innersize, outersize); MatrixXf mb = Map(b, innersize, outersize); MatrixXf mc = Map(c, innersize, outersize); timer.reset(); for (int k = 0; k < 3; ++k) { timer.start(); benchVec(ma, mb, mc); timer.stop(); } std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) << " GFlops\n"; } } VectorXf va = Map(a, size2); VectorXf vb = Map(b, size2); VectorXf vc = Map(c, size2); timer.reset(); for (int k = 0; k < 3; ++k) { timer.start(); benchVec(va, vb, vc); timer.stop(); } std::cout << timer.value() << "s " << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) << " GFlops\n"; return 0; } void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) { for (int k = 0; k < REPEAT; ++k) a = a + b; } void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) { for (int k = 0; k < REPEAT; ++k) a = a + b; } void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) { typedef internal::packet_traits::type PacketScalar; const int PacketSize = internal::packet_traits::size; PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; for (int k = 0; k < REPEAT; ++k) for (int i = 0; i < size; i += PacketSize * 8) { // a0 = internal::pload(&a[i]); // b0 = internal::pload(&b[i]); // a1 = internal::pload(&a[i+1*PacketSize]); // b1 = internal::pload(&b[i+1*PacketSize]); // a2 = internal::pload(&a[i+2*PacketSize]); // b2 = internal::pload(&b[i+2*PacketSize]); // a3 = internal::pload(&a[i+3*PacketSize]); // b3 = internal::pload(&b[i+3*PacketSize]); // internal::pstore(&a[i], internal::padd(a0, b0)); // a0 = internal::pload(&a[i+4*PacketSize]); // b0 = internal::pload(&b[i+4*PacketSize]); // // internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); // a1 = internal::pload(&a[i+5*PacketSize]); // b1 = internal::pload(&b[i+5*PacketSize]); // // internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); // a2 = internal::pload(&a[i+6*PacketSize]); // b2 = internal::pload(&b[i+6*PacketSize]); // // internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); // a3 = internal::pload(&a[i+7*PacketSize]); // b3 = internal::pload(&b[i+7*PacketSize]); // // internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); // internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); // internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); // internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); internal::pstore(&a[i + 2 * PacketSize], internal::padd(internal::ploadu(&a[i + 2 * PacketSize]), internal::ploadu(&b[i + 2 * PacketSize]))); internal::pstore(&a[i + 3 * PacketSize], internal::padd(internal::ploadu(&a[i + 3 * PacketSize]), internal::ploadu(&b[i + 3 * PacketSize]))); internal::pstore(&a[i + 4 * PacketSize], internal::padd(internal::ploadu(&a[i + 4 * PacketSize]), internal::ploadu(&b[i + 4 * PacketSize]))); internal::pstore(&a[i + 5 * PacketSize], internal::padd(internal::ploadu(&a[i + 5 * PacketSize]), internal::ploadu(&b[i + 5 * PacketSize]))); internal::pstore(&a[i + 6 * PacketSize], internal::padd(internal::ploadu(&a[i + 6 * PacketSize]), internal::ploadu(&b[i + 6 * PacketSize]))); internal::pstore(&a[i + 7 * PacketSize], internal::padd(internal::ploadu(&a[i + 7 * PacketSize]), internal::ploadu(&b[i + 7 * PacketSize]))); } }