1 // RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
2 // RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
3 // RUN: %libomptarget-run-generic
4 // RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
5 // RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && \
6 // RUN: %libomptarget-run-generic
9 // UNSUPPORTED: nvidiagpu
10 // UNSUPPORTED: amdgpu
21 const int rowsPerBlock
;
22 const int colsPerBlock
;
25 const int nBlocksPerRow
;
26 const int nBlocksPerCol
;
27 std::vector
<std::vector
<std::unique_ptr
<float[]>>> Blocks
;
30 BlockMatrix(const int _rowsPerBlock
, const int _colsPerBlock
,
31 const long _nRows
, const long _nCols
)
32 : rowsPerBlock(_rowsPerBlock
), colsPerBlock(_colsPerBlock
), nRows(_nRows
),
33 nCols(_nCols
), nBlocksPerRow(_nRows
/ _rowsPerBlock
),
34 nBlocksPerCol(_nCols
/ _colsPerBlock
), Blocks(nBlocksPerCol
) {
35 for (int i
= 0; i
< nBlocksPerCol
; i
++) {
36 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
37 Blocks
[i
].emplace_back(new float[_rowsPerBlock
* _colsPerBlock
]);
42 // Initialize the BlockMatrix from 2D arrays
43 void Initialize(const std::vector
<float> &matrix
) {
44 for (int i
= 0; i
< nBlocksPerCol
; i
++)
45 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
46 float *CurrBlock
= GetBlock(i
, j
);
47 for (int ii
= 0; ii
< colsPerBlock
; ++ii
)
48 for (int jj
= 0; jj
< rowsPerBlock
; ++jj
) {
49 int curri
= i
* colsPerBlock
+ ii
;
50 int currj
= j
* rowsPerBlock
+ jj
;
51 CurrBlock
[ii
+ jj
* colsPerBlock
] = matrix
[curri
+ currj
* nCols
];
56 void Compare(const std::vector
<float> &matrix
) const {
57 for (int i
= 0; i
< nBlocksPerCol
; i
++)
58 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
59 float *CurrBlock
= GetBlock(i
, j
);
60 for (int ii
= 0; ii
< colsPerBlock
; ++ii
)
61 for (int jj
= 0; jj
< rowsPerBlock
; ++jj
) {
62 int curri
= i
* colsPerBlock
+ ii
;
63 int currj
= j
* rowsPerBlock
+ jj
;
64 float m_value
= matrix
[curri
+ currj
* nCols
];
65 float bm_value
= CurrBlock
[ii
+ jj
* colsPerBlock
];
66 assert(std::fabs(bm_value
- m_value
) <
67 std::numeric_limits
<float>::epsilon());
72 float *GetBlock(int i
, int j
) const {
73 assert(i
< nBlocksPerCol
&& j
< nBlocksPerRow
&& "Accessing outside block");
74 return Blocks
[i
][j
].get();
78 constexpr const int BS
= 16;
79 constexpr const int N
= 256;
81 int BlockMatMul_TargetNowait(BlockMatrix
&A
, BlockMatrix
&B
, BlockMatrix
&C
) {
84 for (int i
= 0; i
< N
/ BS
; ++i
)
85 for (int j
= 0; j
< N
/ BS
; ++j
) {
86 float *BlockC
= C
.GetBlock(i
, j
);
87 for (int k
= 0; k
< N
/ BS
; ++k
) {
88 float *BlockA
= A
.GetBlock(i
, k
);
89 float *BlockB
= B
.GetBlock(k
, j
);
91 #pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0]) \
92 map(to: BlockA[:BS * BS], BlockB[:BS * BS]) \
93 map(tofrom: BlockC[:BS * BS]) nowait
95 #pragma omp parallel for
96 for (int ii
= 0; ii
< BS
; ii
++)
97 for (int jj
= 0; jj
< BS
; jj
++) {
98 for (int kk
= 0; kk
< BS
; ++kk
)
99 BlockC
[ii
+ jj
* BS
] +=
100 BlockA
[ii
+ kk
* BS
] * BlockB
[kk
+ jj
* BS
];
107 void Matmul(const std::vector
<float> &a
, const std::vector
<float> &b
,
108 std::vector
<float> &c
) {
109 for (int i
= 0; i
< N
; ++i
) {
110 for (int j
= 0; j
< N
; ++j
) {
112 for (int k
= 0; k
< N
; ++k
) {
113 sum
= sum
+ a
[i
* N
+ k
] * b
[k
* N
+ j
];
120 int main(int argc
, char *argv
[]) {
121 std::vector
<float> a(N
* N
);
122 std::vector
<float> b(N
* N
);
123 std::vector
<float> c(N
* N
, 0.0);
125 for (int i
= 0; i
< N
; ++i
) {
126 for (int j
= 0; j
< N
; ++j
) {
127 a
[i
* N
+ j
] = b
[i
* N
+ j
] = i
+ j
% 100;
131 auto BlockedA
= BlockMatrix(BS
, BS
, N
, N
);
132 auto BlockedB
= BlockMatrix(BS
, BS
, N
, N
);
133 auto BlockedC
= BlockMatrix(BS
, BS
, N
, N
);
134 BlockedA
.Initialize(a
);
135 BlockedB
.Initialize(b
);
136 BlockedC
.Initialize(c
);
142 BlockMatMul_TargetNowait(BlockedA
, BlockedB
, BlockedC
);
146 std::cout
<< "PASS\n";