1 // RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
2 // RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
3 // RUN: %libomptarget-run-generic
4 // RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
5 // RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && \
6 // RUN: %libomptarget-run-generic
8 // UNSUPPORTED: x86_64-pc-linux-gnu
9 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
10 // UNSUPPORTED: aarch64-unknown-linux-gnu
11 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
12 // UNSUPPORTED: amdgcn-amd-amdhsa
13 // UNSUPPORTED: nvptx64-nvidia-cuda
14 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
25 const int rowsPerBlock
;
26 const int colsPerBlock
;
29 const int nBlocksPerRow
;
30 const int nBlocksPerCol
;
31 std::vector
<std::vector
<std::unique_ptr
<float[]>>> Blocks
;
34 BlockMatrix(const int _rowsPerBlock
, const int _colsPerBlock
,
35 const long _nRows
, const long _nCols
)
36 : rowsPerBlock(_rowsPerBlock
), colsPerBlock(_colsPerBlock
), nRows(_nRows
),
37 nCols(_nCols
), nBlocksPerRow(_nRows
/ _rowsPerBlock
),
38 nBlocksPerCol(_nCols
/ _colsPerBlock
), Blocks(nBlocksPerCol
) {
39 for (int i
= 0; i
< nBlocksPerCol
; i
++) {
40 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
41 Blocks
[i
].emplace_back(new float[_rowsPerBlock
* _colsPerBlock
]);
46 // Initialize the BlockMatrix from 2D arrays
47 void Initialize(const std::vector
<float> &matrix
) {
48 for (int i
= 0; i
< nBlocksPerCol
; i
++)
49 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
50 float *CurrBlock
= GetBlock(i
, j
);
51 for (int ii
= 0; ii
< colsPerBlock
; ++ii
)
52 for (int jj
= 0; jj
< rowsPerBlock
; ++jj
) {
53 int curri
= i
* colsPerBlock
+ ii
;
54 int currj
= j
* rowsPerBlock
+ jj
;
55 CurrBlock
[ii
+ jj
* colsPerBlock
] = matrix
[curri
+ currj
* nCols
];
60 void Compare(const std::vector
<float> &matrix
) const {
61 for (int i
= 0; i
< nBlocksPerCol
; i
++)
62 for (int j
= 0; j
< nBlocksPerRow
; j
++) {
63 float *CurrBlock
= GetBlock(i
, j
);
64 for (int ii
= 0; ii
< colsPerBlock
; ++ii
)
65 for (int jj
= 0; jj
< rowsPerBlock
; ++jj
) {
66 int curri
= i
* colsPerBlock
+ ii
;
67 int currj
= j
* rowsPerBlock
+ jj
;
68 float m_value
= matrix
[curri
+ currj
* nCols
];
69 float bm_value
= CurrBlock
[ii
+ jj
* colsPerBlock
];
70 assert(std::fabs(bm_value
- m_value
) <
71 std::numeric_limits
<float>::epsilon());
76 float *GetBlock(int i
, int j
) const {
77 assert(i
< nBlocksPerCol
&& j
< nBlocksPerRow
&& "Accessing outside block");
78 return Blocks
[i
][j
].get();
// Edge length, in elements, of each square block (main builds BS x BS blocks).
constexpr int BS = 16;
// Edge length, in elements, of the square matrices used by the test.
constexpr int N = 256;
85 int BlockMatMul_TargetNowait(BlockMatrix
&A
, BlockMatrix
&B
, BlockMatrix
&C
) {
88 for (int i
= 0; i
< N
/ BS
; ++i
)
89 for (int j
= 0; j
< N
/ BS
; ++j
) {
90 float *BlockC
= C
.GetBlock(i
, j
);
91 for (int k
= 0; k
< N
/ BS
; ++k
) {
92 float *BlockA
= A
.GetBlock(i
, k
);
93 float *BlockB
= B
.GetBlock(k
, j
);
95 #pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0]) \
96 map(to: BlockA[:BS * BS], BlockB[:BS * BS]) \
97 map(tofrom: BlockC[:BS * BS]) nowait
99 #pragma omp parallel for
100 for (int ii
= 0; ii
< BS
; ii
++)
101 for (int jj
= 0; jj
< BS
; jj
++) {
102 for (int kk
= 0; kk
< BS
; ++kk
)
103 BlockC
[ii
+ jj
* BS
] +=
104 BlockA
[ii
+ kk
* BS
] * BlockB
[kk
+ jj
* BS
];
111 void Matmul(const std::vector
<float> &a
, const std::vector
<float> &b
,
112 std::vector
<float> &c
) {
113 for (int i
= 0; i
< N
; ++i
) {
114 for (int j
= 0; j
< N
; ++j
) {
116 for (int k
= 0; k
< N
; ++k
) {
117 sum
= sum
+ a
[i
* N
+ k
] * b
[k
* N
+ j
];
124 int main(int argc
, char *argv
[]) {
125 std::vector
<float> a(N
* N
);
126 std::vector
<float> b(N
* N
);
127 std::vector
<float> c(N
* N
, 0.0);
129 for (int i
= 0; i
< N
; ++i
) {
130 for (int j
= 0; j
< N
; ++j
) {
131 a
[i
* N
+ j
] = b
[i
* N
+ j
] = i
+ j
% 100;
135 auto BlockedA
= BlockMatrix(BS
, BS
, N
, N
);
136 auto BlockedB
= BlockMatrix(BS
, BS
, N
, N
);
137 auto BlockedC
= BlockMatrix(BS
, BS
, N
, N
);
138 BlockedA
.Initialize(a
);
139 BlockedB
.Initialize(b
);
140 BlockedC
.Initialize(c
);
146 BlockMatMul_TargetNowait(BlockedA
, BlockedB
, BlockedC
);
150 std::cout
<< "PASS\n";