diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 3477d7182..c2f084c82 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -104,13 +104,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
   // - the sizes are large enough
 
   // compute the maximal number of threads from the size of the product:
-  // FIXME this has to be fine tuned
+  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
   Index size = transpose ? rows : cols;
-  Index pb_max_threads = std::max<Index>(1,size / 32);
+  Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
+
   // compute the maximal number of threads from the total amount of work:
   double work = static_cast<double>(rows) * static_cast<double>(cols) *
       static_cast<double>(depth);
-  double kMinTaskSize = 50000;  // Heuristic.
+  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
   pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
 
   // compute the number of threads we are going to use