diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 2906d0acd9..33610c65f7 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -838,6 +838,24 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
     TensorProto::DataType math_type) {
+  // loop over matrices in the batch
+  for (int i = 0; i < batch_size; ++i) {
+    Gemm<at::Half, CUDAContext>(
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   const int lda = (trans_A == CblasNoTrans) ? K : M;
@@ -912,6 +930,7 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
       CAFFE_THROW("Unsupported math type");
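The visible part of the first hunk shows the essence of the change: GemmBatched for FP16 on CUDA falls back to calling the single-matrix Gemm<at::Half, CUDAContext> specialization once per batch entry. The sketch below spells that pattern out in full. It is a hedged reconstruction, not the verbatim patch: the complete parameter list (the trans_B/N/alpha/beta arguments and the per-matrix pointer arrays A/B/C) is assumed from the visible fragment and from the existing Gemm signature in the same file, and the code relies on Caffe2's surrounding declarations rather than compiling on its own.

// Hedged sketch of the fallback this hunk appears to add: implement the
// batched FP16 GEMM by delegating each matrix multiply to the existing
// single-matrix Gemm<at::Half, CUDAContext> specialization.
// The exact signature (per-matrix pointer arrays) is an assumption.
template <>
CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const at::Half** A,   // A[i]: i-th M x K (or K x M, if transposed) matrix
    const at::Half** B,   // B[i]: i-th K x N (or N x K, if transposed) matrix
    const float beta,
    at::Half** C,         // C[i]: i-th M x N output matrix
    CUDAContext* context,
    TensorProto::DataType math_type) {
  // loop over matrices in the batch, delegating each multiply to the
  // single-matrix Gemm specialization (which already handles the
  // row-major to column-major cuBLAS conversion and the math_type switch)
  for (int i = 0; i < batch_size; ++i) {
    Gemm<at::Half, CUDAContext>(
        trans_A,
        trans_B,
        M,
        N,
        K,
        alpha,
        A[i],
        B[i],
        beta,
        C[i],
        context,
        math_type);
  }
}

The trade-off of this style is one cuBLAS call (and kernel launch) per batch entry rather than a single strided-batched call, so a per-matrix loop like this typically serves as a portable fallback path rather than the primary implementation.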