4 * This file is part of OpenTTD.
5 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
6 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
7 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
10 /** @file viewport_sprite_sorter_sse.cpp Sprite sorter that uses SSE4.1. */
16 #include "smmintrin.h"
17 #include "viewport_sprite_sorter.h"
19 #include "safeguards.h"
/* Compile-time check that sizeof(ParentSpriteToDraw) is a multiple of 16 bytes. */
22 assert_compile((sizeof(ParentSpriteToDraw
) % 16) == 0);
/* LOAD_128 names the 128-bit integer load used by the sorter: _mm_load_si128 requires a
 * 16-byte aligned address, _mm_loadu_si128 does not.
 * NOTE(review): two definitions are visible here but the preprocessor conditionals that
 * choose between them are not part of this view - confirm which condition selects which. */
23 #define LOAD_128 _mm_load_si128
25 #define LOAD_128 _mm_loadu_si128
28 /**
 * Sort parent sprites pointer array using SSE4.1 optimizations.
 *
 * Walks the pointer array from Begin() to End(). Each entry is compared against the
 * entries that follow it using 128-bit integer compares on the bounding box fields,
 * and a later entry can be moved in front of the current one.
 * NOTE(review): several lines of this function are not visible in this view, so this
 * description only covers the statements that can be read here.
 * @param psdv Vector of ParentSpriteToDraw pointers to sort.
 */
29 void ViewportSortParentSpritesSSE41(ParentSpriteToSortVector
*psdv
)
/* Byte mask with the lowest 12 bytes set and the highest 4 bytes clear, so the PTEST
 * calls below only consider the first three 32-bit lanes of a comparison result. */
31 const __m128i mask_ptest
= _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0);
/* psd iterates from Begin() until it reaches psdvend, the value returned by End(). */
32 ParentSpriteToDraw
** const psdvend
= psdv
->End();
33 ParentSpriteToDraw
**psd
= psdv
->Begin();
34 while (psd
!= psdvend
) {
35 ParentSpriteToDraw
* const ps
= *psd
;
/* NOTE(review): the body of the following branch is not visible in this view. */
37 if (ps
->comparison_done
) {
/* Flag ps as processed before scanning the entries located after it. */
42 ps
->comparison_done
= true;
44 for (ParentSpriteToDraw
**psd2
= psd
+ 1; psd2
!= psdvend
; psd2
++) {
45 ParentSpriteToDraw
* const ps2
= *psd2
;
/* Entries that are already flagged comparison_done are skipped. */
47 if (ps2
->comparison_done
) continue;
50 * Decide which comparator to use, based on whether the bounding boxes overlap
53 * if (ps->xmax >= ps2->xmin && ps->xmin <= ps2->xmax && // overlap in X?
54 * ps->ymax >= ps2->ymin && ps->ymin <= ps2->ymax && // overlap in Y?
55 * ps->zmax >= ps2->zmin && ps->zmin <= ps2->zmax) { // overlap in Z?
57 * Above conditions are equivalent to:
58 * 1/ !( (ps->xmax >= ps2->xmin) && (ps->ymax >= ps2->ymin) && (ps->zmax >= ps2->zmin) && (ps->xmin <= ps2->xmax) && (ps->ymin <= ps2->ymax) && (ps->zmin <= ps2->zmax) )
59 * 2/ !( (ps->xmax >= ps2->xmin) && (ps->ymax >= ps2->ymin) && (ps->zmax >= ps2->zmin) && (ps2->xmax >= ps->xmin) && (ps2->ymax >= ps->ymin) && (ps2->zmax >= ps->zmin) )
60 * 3/ !( ( (ps->xmax >= ps2->xmin) && (ps->ymax >= ps2->ymin) && (ps->zmax >= ps2->zmin) ) && ( (ps2->xmax >= ps->xmin) && (ps2->ymax >= ps->ymin) && (ps2->zmax >= ps->zmin) ) )
61 * 4/ !( !( (ps->xmax < ps2->xmin) || (ps->ymax < ps2->ymin) || (ps->zmax < ps2->zmin) ) && !( (ps2->xmax < ps->xmin) || (ps2->ymax < ps->ymin) || (ps2->zmax < ps->zmin) ) )
62 * 5/ PTEST <---------------------------------- rslt1 ----------------------------------> <------------------------------ rslt2 -------------------------------------->
/* Load 128 bits starting at ps->xmax and at ps2->xmin, then compare them lane by lane
 * as signed 32-bit integers (ps1_max < ps2_min); a lane is all ones where that holds.
 * NOTE(review): assumes xmax/ymax/zmax and xmin/ymin/zmin are each laid out as three
 * consecutive 32-bit fields in ParentSpriteToDraw - confirm against the struct. */
64 __m128i ps1_max
= LOAD_128((__m128i
*) &ps
->xmax
);
65 __m128i ps2_min
= LOAD_128((__m128i
*) &ps2
->xmin
);
66 __m128i rslt1
= _mm_cmplt_epi32(ps1_max
, ps2_min
);
/* _mm_testz_si128 yields 1 only when (mask_ptest AND rslt1) is all zero, so the negated
 * test is true when at least one of the three masked lanes compared less-than.
 * NOTE(review): the statement controlled by this if is not visible in this view. */
67 if (!_mm_testz_si128(mask_ptest
, rslt1
))
/* Mirror comparison: lane-wise signed 32-bit test of ps2_max < ps1_min. */
70 __m128i ps1_min
= LOAD_128((__m128i
*) &ps
->xmin
);
71 __m128i ps2_max
= LOAD_128((__m128i
*) &ps2
->xmax
);
72 __m128i rslt2
= _mm_cmplt_epi32(ps2_max
, ps1_min
);
/* Entered when none of the three masked lanes of rslt2 is set. */
73 if (_mm_testz_si128(mask_ptest
, rslt2
)) {
74 /* Use X+Y+Z as the sorting order, so sprites closer to the bottom of
75 * the screen and with higher Z elevation, are drawn in front.
76 * Here X,Y,Z are the coordinates of the "center of mass" of the sprite,
77 * i.e. X=(left+right)/2, etc.
78 * However, since we only care about order, don't actually divide / 2
80 if (ps
->xmin
+ ps
->xmax
+ ps
->ymin
+ ps
->ymax
+ ps
->zmin
+ ps
->zmax
<=
81 ps2
->xmin
+ ps2
->xmax
+ ps2
->ymin
+ ps2
->ymax
+ ps2
->zmin
+ ps2
->zmax
) {
86 /* Move ps2 in front of ps */
87 ParentSpriteToDraw
* const temp
= ps2
;
/* psd3 starts at psd2 and steps downwards while it is still above psd.
 * NOTE(review): the loop body is not visible in this view. */
88 for (ParentSpriteToDraw
**psd3
= psd2
; psd3
> psd
; psd3
--) {
97 * Check whether the current CPU supports SSE 4.1.
98 * @return True iff the CPU supports SSE 4.1.
100 bool ViewportSortParentSpritesSSE41Checker()
/* NOTE(review): presumably the arguments mean CPUID leaf 1, register index 2 (ECX) and
 * bit 19, which is the SSE4.1 feature bit - confirm the parameter order against the
 * declaration of HasCPUIDFlag. */
102 return HasCPUIDFlag(1, 2, 19);
105 #endif /* WITH_SSE */