1 /**************************************************************************
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * Copyright 2009 VMware, Inc. All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 **************************************************************************/
30 * Generate SPU per-fragment code (actually per-quad code).
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
49 * Generate SPE code to perform Z/depth testing.
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
58 * Returns TRUE if the Z-buffer needs to be updated.
61 gen_depth_test(struct spe_function
*f
,
62 const struct pipe_depth_stencil_alpha_state
*dsa
,
63 int mask_reg
, int ifragZ_reg
, int ifbZ_reg
, int zmask_reg
)
65 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
66 * quantities. This only makes a difference for 32-bit Z values though.
68 ASSERT(dsa
->depth
.enabled
);
70 switch (dsa
->depth
.func
) {
72 /* zmask = (ifragZ == ref) */
73 spe_ceq(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
74 /* mask = (mask & zmask) */
75 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
78 case PIPE_FUNC_NOTEQUAL
:
79 /* zmask = (ifragZ == ref) */
80 spe_ceq(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
81 /* mask = (mask & ~zmask) */
82 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
85 case PIPE_FUNC_GREATER
:
86 /* zmask = (ifragZ > ref) */
87 spe_clgt(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
88 /* mask = (mask & zmask) */
89 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
93 /* zmask = (ref > ifragZ) */
94 spe_clgt(f
, zmask_reg
, ifbZ_reg
, ifragZ_reg
);
95 /* mask = (mask & zmask) */
96 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
99 case PIPE_FUNC_LEQUAL
:
100 /* zmask = (ifragZ > ref) */
101 spe_clgt(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
102 /* mask = (mask & ~zmask) */
103 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
106 case PIPE_FUNC_GEQUAL
:
107 /* zmask = (ref > ifragZ) */
108 spe_clgt(f
, zmask_reg
, ifbZ_reg
, ifragZ_reg
);
109 /* mask = (mask & ~zmask) */
110 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
113 case PIPE_FUNC_NEVER
:
114 spe_il(f
, mask_reg
, 0); /* mask = {0,0,0,0} */
115 spe_move(f
, zmask_reg
, mask_reg
); /* zmask = mask */
118 case PIPE_FUNC_ALWAYS
:
120 spe_il(f
, zmask_reg
, ~0); /* zmask = {~0,~0,~0,~0} */
128 if (dsa
->depth
.writemask
) {
130 * If (ztest passed) {
131 * framebufferZ = fragmentZ;
134 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
136 spe_selb(f
, ifbZ_reg
, ifbZ_reg
, ifragZ_reg
, mask_reg
);
145 * Generate SPE code to perform alpha testing.
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
153 gen_alpha_test(const struct pipe_depth_stencil_alpha_state
*dsa
,
154 struct spe_function
*f
, int mask_reg
, int fragA_reg
)
156 int ref_reg
= spe_allocate_available_register(f
);
157 int amask_reg
= spe_allocate_available_register(f
);
159 ASSERT(dsa
->alpha
.enabled
);
161 if ((dsa
->alpha
.func
!= PIPE_FUNC_NEVER
) &&
162 (dsa
->alpha
.func
!= PIPE_FUNC_ALWAYS
)) {
163 /* load/splat the alpha reference float value */
164 spe_load_float(f
, ref_reg
, dsa
->alpha
.ref_value
);
167 /* emit code to do the alpha comparison, updating 'mask' */
168 switch (dsa
->alpha
.func
) {
169 case PIPE_FUNC_EQUAL
:
170 /* amask = (fragA == ref) */
171 spe_fceq(f
, amask_reg
, fragA_reg
, ref_reg
);
172 /* mask = (mask & amask) */
173 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
176 case PIPE_FUNC_NOTEQUAL
:
177 /* amask = (fragA == ref) */
178 spe_fceq(f
, amask_reg
, fragA_reg
, ref_reg
);
179 /* mask = (mask & ~amask) */
180 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
183 case PIPE_FUNC_GREATER
:
184 /* amask = (fragA > ref) */
185 spe_fcgt(f
, amask_reg
, fragA_reg
, ref_reg
);
186 /* mask = (mask & amask) */
187 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
191 /* amask = (ref > fragA) */
192 spe_fcgt(f
, amask_reg
, ref_reg
, fragA_reg
);
193 /* mask = (mask & amask) */
194 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
197 case PIPE_FUNC_LEQUAL
:
198 /* amask = (fragA > ref) */
199 spe_fcgt(f
, amask_reg
, fragA_reg
, ref_reg
);
200 /* mask = (mask & ~amask) */
201 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
204 case PIPE_FUNC_GEQUAL
:
205 /* amask = (ref > fragA) */
206 spe_fcgt(f
, amask_reg
, ref_reg
, fragA_reg
);
207 /* mask = (mask & ~amask) */
208 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
211 case PIPE_FUNC_NEVER
:
212 spe_il(f
, mask_reg
, 0); /* mask = [0,0,0,0] */
215 case PIPE_FUNC_ALWAYS
:
216 /* no-op, mask unchanged */
225 /* if mask == {0,0,0,0} we're all done, return */
227 /* re-use amask reg here */
228 int tmp_reg
= amask_reg
;
229 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
230 spe_orx(f
, tmp_reg
, mask_reg
);
231 /* if tmp[0] == 0 then return from function call */
232 spe_biz(f
, tmp_reg
, SPE_REG_RA
, 0, 0);
236 spe_release_register(f
, ref_reg
);
237 spe_release_register(f
, amask_reg
);
/**
 * This pair of functions is used inline to allocate and deallocate
 * optional constant registers.  Once a constant is discovered to be
 * needed, we will likely need it again, so we don't want to deallocate
 * it and have to allocate and load it again unnecessarily.
 *
 * \param f  SPE function to allocate the register from
 * \param r  in/out: register index; -1 means "not yet allocated"
 */
static void
setup_optional_register(struct spe_function *f, int *r)
{
   /* only allocate on first use; -1 is the "unallocated" sentinel */
   if (*r < 0)
      *r = spe_allocate_available_register(f);
}
/**
 * Release an optional register previously set up with
 * setup_optional_register().  A register that was never allocated
 * (still -1) is silently skipped.
 */
static void
release_optional_register(struct spe_function *f, int r)
{
   if (r >= 0)
      spe_release_register(f, r);
}
/**
 * Allocate (if needed) an optional register and load/splat a float
 * constant into it.  If the register was already set up by an earlier
 * call, the load is skipped so the constant is only emitted once.
 *
 * \param f     SPE function to append instructions onto
 * \param r     in/out: register index; -1 means "not yet allocated"
 * \param value float constant to splat into the register
 */
static void
setup_const_register(struct spe_function *f, int *r, float value)
{
   /* already allocated and loaded -- nothing to do */
   if (*r >= 0)
      return;
   setup_optional_register(f, r);
   spe_load_float(f, *r, value);
}
/**
 * Release an optional constant register set up with
 * setup_const_register() (no-op if it was never allocated).
 */
static void
release_const_register(struct spe_function *f, int r)
{
   release_optional_register(f, r);
}
284 * Unpack/convert framebuffer colors from four 32-bit packed colors
285 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
286 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
289 unpack_colors(struct spe_function
*f
,
290 enum pipe_format color_format
,
292 int fbR_reg
, int fbG_reg
, int fbB_reg
, int fbA_reg
)
294 int mask0_reg
= spe_allocate_available_register(f
);
295 int mask1_reg
= spe_allocate_available_register(f
);
296 int mask2_reg
= spe_allocate_available_register(f
);
297 int mask3_reg
= spe_allocate_available_register(f
);
299 spe_load_int(f
, mask0_reg
, 0xff);
300 spe_load_int(f
, mask1_reg
, 0xff00);
301 spe_load_int(f
, mask2_reg
, 0xff0000);
302 spe_load_int(f
, mask3_reg
, 0xff000000);
304 spe_comment(f
, 0, "Unpack framebuffer colors, convert to floats");
306 switch (color_format
) {
307 case PIPE_FORMAT_B8G8R8A8_UNORM
:
308 /* fbB = fbRGBA & mask */
309 spe_and(f
, fbB_reg
, fbRGBA_reg
, mask0_reg
);
311 /* fbG = fbRGBA & mask */
312 spe_and(f
, fbG_reg
, fbRGBA_reg
, mask1_reg
);
314 /* fbR = fbRGBA & mask */
315 spe_and(f
, fbR_reg
, fbRGBA_reg
, mask2_reg
);
317 /* fbA = fbRGBA & mask */
318 spe_and(f
, fbA_reg
, fbRGBA_reg
, mask3_reg
);
321 spe_roti(f
, fbG_reg
, fbG_reg
, -8);
323 /* fbR = fbR >> 16 */
324 spe_roti(f
, fbR_reg
, fbR_reg
, -16);
326 /* fbA = fbA >> 24 */
327 spe_roti(f
, fbA_reg
, fbA_reg
, -24);
330 case PIPE_FORMAT_A8R8G8B8_UNORM
:
331 /* fbA = fbRGBA & mask */
332 spe_and(f
, fbA_reg
, fbRGBA_reg
, mask0_reg
);
334 /* fbR = fbRGBA & mask */
335 spe_and(f
, fbR_reg
, fbRGBA_reg
, mask1_reg
);
337 /* fbG = fbRGBA & mask */
338 spe_and(f
, fbG_reg
, fbRGBA_reg
, mask2_reg
);
340 /* fbB = fbRGBA & mask */
341 spe_and(f
, fbB_reg
, fbRGBA_reg
, mask3_reg
);
344 spe_roti(f
, fbR_reg
, fbR_reg
, -8);
346 /* fbG = fbG >> 16 */
347 spe_roti(f
, fbG_reg
, fbG_reg
, -16);
349 /* fbB = fbB >> 24 */
350 spe_roti(f
, fbB_reg
, fbB_reg
, -24);
357 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
358 spe_cuflt(f
, fbR_reg
, fbR_reg
, 8);
359 spe_cuflt(f
, fbG_reg
, fbG_reg
, 8);
360 spe_cuflt(f
, fbB_reg
, fbB_reg
, 8);
361 spe_cuflt(f
, fbA_reg
, fbA_reg
, 8);
363 spe_release_register(f
, mask0_reg
);
364 spe_release_register(f
, mask1_reg
);
365 spe_release_register(f
, mask2_reg
);
366 spe_release_register(f
, mask3_reg
);
371 * Generate SPE code to implement the given blend mode for a quad of pixels.
372 * \param f SPE function to append instruction onto.
373 * \param fragR_reg register with fragment red values (float) (in/out)
374 * \param fragG_reg register with fragment green values (float) (in/out)
375 * \param fragB_reg register with fragment blue values (float) (in/out)
376 * \param fragA_reg register with fragment alpha values (float) (in/out)
377 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
380 gen_blend(const struct pipe_blend_state
*blend
,
381 const struct pipe_blend_color
*blend_color
,
382 struct spe_function
*f
,
383 enum pipe_format color_format
,
384 int fragR_reg
, int fragG_reg
, int fragB_reg
, int fragA_reg
,
387 int term1R_reg
= spe_allocate_available_register(f
);
388 int term1G_reg
= spe_allocate_available_register(f
);
389 int term1B_reg
= spe_allocate_available_register(f
);
390 int term1A_reg
= spe_allocate_available_register(f
);
392 int term2R_reg
= spe_allocate_available_register(f
);
393 int term2G_reg
= spe_allocate_available_register(f
);
394 int term2B_reg
= spe_allocate_available_register(f
);
395 int term2A_reg
= spe_allocate_available_register(f
);
397 int fbR_reg
= spe_allocate_available_register(f
);
398 int fbG_reg
= spe_allocate_available_register(f
);
399 int fbB_reg
= spe_allocate_available_register(f
);
400 int fbA_reg
= spe_allocate_available_register(f
);
402 int tmp_reg
= spe_allocate_available_register(f
);
404 /* Optional constant registers we might or might not end up using;
405 * if we do use them, make sure we only allocate them once by
406 * keeping a flag on each one.
409 int constR_reg
= -1, constG_reg
= -1, constB_reg
= -1, constA_reg
= -1;
411 ASSERT(blend
->rt
[0].blend_enable
);
413 /* packed RGBA -> float colors */
414 unpack_colors(f
, color_format
, fbRGBA_reg
,
415 fbR_reg
, fbG_reg
, fbB_reg
, fbA_reg
);
418 * Compute Src RGB terms. We're actually looking for the value
419 * of (the appropriate RGB factors) * (the incoming source RGB color),
420 * because in some cases (like PIPE_BLENDFACTOR_ONE and
421 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
423 switch (blend
->rt
[0].rgb_src_factor
) {
424 case PIPE_BLENDFACTOR_ONE
:
425 /* factors = (1,1,1), so term = (R,G,B) */
426 spe_move(f
, term1R_reg
, fragR_reg
);
427 spe_move(f
, term1G_reg
, fragG_reg
);
428 spe_move(f
, term1B_reg
, fragB_reg
);
430 case PIPE_BLENDFACTOR_ZERO
:
431 /* factors = (0,0,0), so term = (0,0,0) */
432 spe_load_float(f
, term1R_reg
, 0.0f
);
433 spe_load_float(f
, term1G_reg
, 0.0f
);
434 spe_load_float(f
, term1B_reg
, 0.0f
);
436 case PIPE_BLENDFACTOR_SRC_COLOR
:
437 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
438 spe_fm(f
, term1R_reg
, fragR_reg
, fragR_reg
);
439 spe_fm(f
, term1G_reg
, fragG_reg
, fragG_reg
);
440 spe_fm(f
, term1B_reg
, fragB_reg
, fragB_reg
);
442 case PIPE_BLENDFACTOR_SRC_ALPHA
:
443 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
444 spe_fm(f
, term1R_reg
, fragR_reg
, fragA_reg
);
445 spe_fm(f
, term1G_reg
, fragG_reg
, fragA_reg
);
446 spe_fm(f
, term1B_reg
, fragB_reg
, fragA_reg
);
448 case PIPE_BLENDFACTOR_INV_SRC_COLOR
:
449 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
450 * or in other words term = (R-R*R, G-G*G, B-B*B)
451 * fnms(a,b,c,d) computes a = d - b*c
453 spe_fnms(f
, term1R_reg
, fragR_reg
, fragR_reg
, fragR_reg
);
454 spe_fnms(f
, term1G_reg
, fragG_reg
, fragG_reg
, fragG_reg
);
455 spe_fnms(f
, term1B_reg
, fragB_reg
, fragB_reg
, fragB_reg
);
457 case PIPE_BLENDFACTOR_DST_COLOR
:
458 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
459 spe_fm(f
, term1R_reg
, fragR_reg
, fbR_reg
);
460 spe_fm(f
, term1G_reg
, fragG_reg
, fbG_reg
);
461 spe_fm(f
, term1B_reg
, fragB_reg
, fbB_reg
);
463 case PIPE_BLENDFACTOR_INV_DST_COLOR
:
464 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
465 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
466 * fnms(a,b,c,d) computes a = d - b*c
468 spe_fnms(f
, term1R_reg
, fragR_reg
, fbR_reg
, fragR_reg
);
469 spe_fnms(f
, term1G_reg
, fragG_reg
, fbG_reg
, fragG_reg
);
470 spe_fnms(f
, term1B_reg
, fragB_reg
, fbB_reg
, fragB_reg
);
472 case PIPE_BLENDFACTOR_INV_SRC_ALPHA
:
473 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
474 * or term = (R-R*A,G-G*A,B-B*A)
475 * fnms(a,b,c,d) computes a = d - b*c
477 spe_fnms(f
, term1R_reg
, fragR_reg
, fragA_reg
, fragR_reg
);
478 spe_fnms(f
, term1G_reg
, fragG_reg
, fragA_reg
, fragG_reg
);
479 spe_fnms(f
, term1B_reg
, fragB_reg
, fragA_reg
, fragB_reg
);
481 case PIPE_BLENDFACTOR_DST_ALPHA
:
482 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
483 spe_fm(f
, term1R_reg
, fragR_reg
, fbA_reg
);
484 spe_fm(f
, term1G_reg
, fragG_reg
, fbA_reg
);
485 spe_fm(f
, term1B_reg
, fragB_reg
, fbA_reg
);
487 case PIPE_BLENDFACTOR_INV_DST_ALPHA
:
488 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
489 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
490 * fnms(a,b,c,d) computes a = d - b*c
492 spe_fnms(f
, term1R_reg
, fragR_reg
, fbA_reg
, fragR_reg
);
493 spe_fnms(f
, term1G_reg
, fragG_reg
, fbA_reg
, fragG_reg
);
494 spe_fnms(f
, term1B_reg
, fragB_reg
, fbA_reg
, fragB_reg
);
496 case PIPE_BLENDFACTOR_CONST_COLOR
:
497 /* We need the optional constant color registers */
498 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
499 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
500 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
501 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
502 spe_fm(f
, term1R_reg
, fragR_reg
, constR_reg
);
503 spe_fm(f
, term1G_reg
, fragG_reg
, constG_reg
);
504 spe_fm(f
, term1B_reg
, fragB_reg
, constB_reg
);
506 case PIPE_BLENDFACTOR_CONST_ALPHA
:
507 /* we'll need the optional constant alpha register */
508 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
509 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
510 spe_fm(f
, term1R_reg
, fragR_reg
, constA_reg
);
511 spe_fm(f
, term1G_reg
, fragG_reg
, constA_reg
);
512 spe_fm(f
, term1B_reg
, fragB_reg
, constA_reg
);
514 case PIPE_BLENDFACTOR_INV_CONST_COLOR
:
515 /* We need the optional constant color registers */
516 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
517 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
518 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
519 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
520 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
521 * fnms(a,b,c,d) computes a = d - b*c
523 spe_fnms(f
, term1R_reg
, fragR_reg
, constR_reg
, fragR_reg
);
524 spe_fnms(f
, term1G_reg
, fragG_reg
, constG_reg
, fragG_reg
);
525 spe_fnms(f
, term1B_reg
, fragB_reg
, constB_reg
, fragB_reg
);
527 case PIPE_BLENDFACTOR_INV_CONST_ALPHA
:
528 /* We need the optional constant color registers */
529 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
530 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
531 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
532 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
533 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
534 * fnms(a,b,c,d) computes a = d - b*c
536 spe_fnms(f
, term1R_reg
, fragR_reg
, constA_reg
, fragR_reg
);
537 spe_fnms(f
, term1G_reg
, fragG_reg
, constA_reg
, fragG_reg
);
538 spe_fnms(f
, term1B_reg
, fragB_reg
, constA_reg
, fragB_reg
);
540 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE
:
541 /* We'll need the optional {1,1,1,1} register */
542 setup_const_register(f
, &one_reg
, 1.0f
);
543 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
544 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
545 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
546 * as long as a is positive), but then we'd have to do three
547 * spe_float_min() functions instead of one, so this is simpler.
550 spe_fs(f
, tmp_reg
, one_reg
, fbA_reg
);
551 /* tmp = min(A,tmp) */
552 spe_float_min(f
, tmp_reg
, fragA_reg
, tmp_reg
);
554 spe_fm(f
, term1R_reg
, fragR_reg
, tmp_reg
);
555 spe_fm(f
, term1G_reg
, fragG_reg
, tmp_reg
);
556 spe_fm(f
, term1B_reg
, fragB_reg
, tmp_reg
);
559 /* These are special D3D cases involving a second color output
560 * from the fragment shader. I'm not sure we can support them
563 case PIPE_BLENDFACTOR_SRC1_COLOR
:
564 case PIPE_BLENDFACTOR_SRC1_ALPHA
:
565 case PIPE_BLENDFACTOR_INV_SRC1_COLOR
:
566 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA
:
573 * Compute Src Alpha term. Like the above, we're looking for
574 * the full term A*factor, not just the factor itself, because
575 * in many cases we can avoid doing unnecessary multiplies.
577 switch (blend
->rt
[0].alpha_src_factor
) {
578 case PIPE_BLENDFACTOR_ZERO
:
579 /* factor = 0, so term = 0 */
580 spe_load_float(f
, term1A_reg
, 0.0f
);
583 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE
: /* fall through */
584 case PIPE_BLENDFACTOR_ONE
:
585 /* factor = 1, so term = A */
586 spe_move(f
, term1A_reg
, fragA_reg
);
589 case PIPE_BLENDFACTOR_SRC_COLOR
:
590 /* factor = A, so term = A*A */
591 spe_fm(f
, term1A_reg
, fragA_reg
, fragA_reg
);
593 case PIPE_BLENDFACTOR_SRC_ALPHA
:
594 spe_fm(f
, term1A_reg
, fragA_reg
, fragA_reg
);
597 case PIPE_BLENDFACTOR_INV_SRC_ALPHA
: /* fall through */
598 case PIPE_BLENDFACTOR_INV_SRC_COLOR
:
599 /* factor = 1-A, so term = A*(1-A) = A-A*A */
600 /* fnms(a,b,c,d) computes a = d - b*c */
601 spe_fnms(f
, term1A_reg
, fragA_reg
, fragA_reg
, fragA_reg
);
604 case PIPE_BLENDFACTOR_DST_ALPHA
: /* fall through */
605 case PIPE_BLENDFACTOR_DST_COLOR
:
606 /* factor = Afb, so term = A*Afb */
607 spe_fm(f
, term1A_reg
, fragA_reg
, fbA_reg
);
610 case PIPE_BLENDFACTOR_INV_DST_ALPHA
: /* fall through */
611 case PIPE_BLENDFACTOR_INV_DST_COLOR
:
612 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
613 /* fnms(a,b,c,d) computes a = d - b*c */
614 spe_fnms(f
, term1A_reg
, fragA_reg
, fbA_reg
, fragA_reg
);
617 case PIPE_BLENDFACTOR_CONST_ALPHA
: /* fall through */
618 case PIPE_BLENDFACTOR_CONST_COLOR
:
619 /* We need the optional constA_reg register */
620 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
621 /* factor = Ac, so term = A*Ac */
622 spe_fm(f
, term1A_reg
, fragA_reg
, constA_reg
);
625 case PIPE_BLENDFACTOR_INV_CONST_ALPHA
: /* fall through */
626 case PIPE_BLENDFACTOR_INV_CONST_COLOR
:
627 /* We need the optional constA_reg register */
628 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
629 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
630 /* fnms(a,b,c,d) computes a = d - b*c */
631 spe_fnms(f
, term1A_reg
, fragA_reg
, constA_reg
, fragA_reg
);
634 /* These are special D3D cases involving a second color output
635 * from the fragment shader. I'm not sure we can support them
638 case PIPE_BLENDFACTOR_SRC1_COLOR
:
639 case PIPE_BLENDFACTOR_SRC1_ALPHA
:
640 case PIPE_BLENDFACTOR_INV_SRC1_COLOR
:
641 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA
:
647 * Compute Dest RGB term. Like the above, we're looking for
648 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
649 * in many cases we can avoid doing unnecessary multiplies.
651 switch (blend
->rt
[0].rgb_dst_factor
) {
652 case PIPE_BLENDFACTOR_ONE
:
653 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
654 spe_move(f
, term2R_reg
, fbR_reg
);
655 spe_move(f
, term2G_reg
, fbG_reg
);
656 spe_move(f
, term2B_reg
, fbB_reg
);
658 case PIPE_BLENDFACTOR_ZERO
:
659 /* factor s= (0,0,0), so term = (0,0,0) */
660 spe_load_float(f
, term2R_reg
, 0.0f
);
661 spe_load_float(f
, term2G_reg
, 0.0f
);
662 spe_load_float(f
, term2B_reg
, 0.0f
);
664 case PIPE_BLENDFACTOR_SRC_COLOR
:
665 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
666 spe_fm(f
, term2R_reg
, fbR_reg
, fragR_reg
);
667 spe_fm(f
, term2G_reg
, fbG_reg
, fragG_reg
);
668 spe_fm(f
, term2B_reg
, fbB_reg
, fragB_reg
);
670 case PIPE_BLENDFACTOR_INV_SRC_COLOR
:
671 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
672 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
673 * fnms(a,b,c,d) computes a = d - b*c
675 spe_fnms(f
, term2R_reg
, fragR_reg
, fbR_reg
, fbR_reg
);
676 spe_fnms(f
, term2G_reg
, fragG_reg
, fbG_reg
, fbG_reg
);
677 spe_fnms(f
, term2B_reg
, fragB_reg
, fbB_reg
, fbB_reg
);
679 case PIPE_BLENDFACTOR_SRC_ALPHA
:
680 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
681 spe_fm(f
, term2R_reg
, fbR_reg
, fragA_reg
);
682 spe_fm(f
, term2G_reg
, fbG_reg
, fragA_reg
);
683 spe_fm(f
, term2B_reg
, fbB_reg
, fragA_reg
);
685 case PIPE_BLENDFACTOR_INV_SRC_ALPHA
:
686 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
687 /* fnms(a,b,c,d) computes a = d - b*c */
688 spe_fnms(f
, term2R_reg
, fbR_reg
, fragA_reg
, fbR_reg
);
689 spe_fnms(f
, term2G_reg
, fbG_reg
, fragA_reg
, fbG_reg
);
690 spe_fnms(f
, term2B_reg
, fbB_reg
, fragA_reg
, fbB_reg
);
692 case PIPE_BLENDFACTOR_DST_COLOR
:
693 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
694 spe_fm(f
, term2R_reg
, fbR_reg
, fbR_reg
);
695 spe_fm(f
, term2G_reg
, fbG_reg
, fbG_reg
);
696 spe_fm(f
, term2B_reg
, fbB_reg
, fbB_reg
);
698 case PIPE_BLENDFACTOR_INV_DST_COLOR
:
699 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
700 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
701 * fnms(a,b,c,d) computes a = d - b*c
703 spe_fnms(f
, term2R_reg
, fbR_reg
, fbR_reg
, fbR_reg
);
704 spe_fnms(f
, term2G_reg
, fbG_reg
, fbG_reg
, fbG_reg
);
705 spe_fnms(f
, term2B_reg
, fbB_reg
, fbB_reg
, fbB_reg
);
708 case PIPE_BLENDFACTOR_DST_ALPHA
:
709 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
710 spe_fm(f
, term2R_reg
, fbR_reg
, fbA_reg
);
711 spe_fm(f
, term2G_reg
, fbG_reg
, fbA_reg
);
712 spe_fm(f
, term2B_reg
, fbB_reg
, fbA_reg
);
714 case PIPE_BLENDFACTOR_INV_DST_ALPHA
:
715 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
716 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
717 * fnms(a,b,c,d) computes a = d - b*c
719 spe_fnms(f
, term2R_reg
, fbR_reg
, fbA_reg
, fbR_reg
);
720 spe_fnms(f
, term2G_reg
, fbG_reg
, fbA_reg
, fbG_reg
);
721 spe_fnms(f
, term2B_reg
, fbB_reg
, fbA_reg
, fbB_reg
);
723 case PIPE_BLENDFACTOR_CONST_COLOR
:
724 /* We need the optional constant color registers */
725 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
726 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
727 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
728 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
729 spe_fm(f
, term2R_reg
, fbR_reg
, constR_reg
);
730 spe_fm(f
, term2G_reg
, fbG_reg
, constG_reg
);
731 spe_fm(f
, term2B_reg
, fbB_reg
, constB_reg
);
733 case PIPE_BLENDFACTOR_CONST_ALPHA
:
734 /* we'll need the optional constant alpha register */
735 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
736 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
737 spe_fm(f
, term2R_reg
, fbR_reg
, constA_reg
);
738 spe_fm(f
, term2G_reg
, fbG_reg
, constA_reg
);
739 spe_fm(f
, term2B_reg
, fbB_reg
, constA_reg
);
741 case PIPE_BLENDFACTOR_INV_CONST_COLOR
:
742 /* We need the optional constant color registers */
743 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
744 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
745 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
746 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
747 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
748 * fnms(a,b,c,d) computes a = d - b*c
750 spe_fnms(f
, term2R_reg
, fbR_reg
, constR_reg
, fbR_reg
);
751 spe_fnms(f
, term2G_reg
, fbG_reg
, constG_reg
, fbG_reg
);
752 spe_fnms(f
, term2B_reg
, fbB_reg
, constB_reg
, fbB_reg
);
754 case PIPE_BLENDFACTOR_INV_CONST_ALPHA
:
755 /* We need the optional constant color registers */
756 setup_const_register(f
, &constR_reg
, blend_color
->color
[0]);
757 setup_const_register(f
, &constG_reg
, blend_color
->color
[1]);
758 setup_const_register(f
, &constB_reg
, blend_color
->color
[2]);
759 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
760 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
761 * fnms(a,b,c,d) computes a = d - b*c
763 spe_fnms(f
, term2R_reg
, fbR_reg
, constA_reg
, fbR_reg
);
764 spe_fnms(f
, term2G_reg
, fbG_reg
, constA_reg
, fbG_reg
);
765 spe_fnms(f
, term2B_reg
, fbB_reg
, constA_reg
, fbB_reg
);
767 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE
: /* not supported for dest RGB */
771 /* These are special D3D cases involving a second color output
772 * from the fragment shader. I'm not sure we can support them
775 case PIPE_BLENDFACTOR_SRC1_COLOR
:
776 case PIPE_BLENDFACTOR_SRC1_ALPHA
:
777 case PIPE_BLENDFACTOR_INV_SRC1_COLOR
:
778 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA
:
785 * Compute Dest Alpha term. Like the above, we're looking for
786 * the full term Afb*factor, not just the factor itself, because
787 * in many cases we can avoid doing unnecessary multiplies.
789 switch (blend
->rt
[0].alpha_dst_factor
) {
790 case PIPE_BLENDFACTOR_ONE
:
791 /* factor = 1, so term = Afb */
792 spe_move(f
, term2A_reg
, fbA_reg
);
794 case PIPE_BLENDFACTOR_ZERO
:
795 /* factor = 0, so term = 0 */
796 spe_load_float(f
, term2A_reg
, 0.0f
);
799 case PIPE_BLENDFACTOR_SRC_ALPHA
: /* fall through */
800 case PIPE_BLENDFACTOR_SRC_COLOR
:
801 /* factor = A, so term = Afb*A */
802 spe_fm(f
, term2A_reg
, fbA_reg
, fragA_reg
);
805 case PIPE_BLENDFACTOR_INV_SRC_ALPHA
: /* fall through */
806 case PIPE_BLENDFACTOR_INV_SRC_COLOR
:
807 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
808 /* fnms(a,b,c,d) computes a = d - b*c */
809 spe_fnms(f
, term2A_reg
, fbA_reg
, fragA_reg
, fbA_reg
);
812 case PIPE_BLENDFACTOR_DST_ALPHA
: /* fall through */
813 case PIPE_BLENDFACTOR_DST_COLOR
:
814 /* factor = Afb, so term = Afb*Afb */
815 spe_fm(f
, term2A_reg
, fbA_reg
, fbA_reg
);
818 case PIPE_BLENDFACTOR_INV_DST_ALPHA
: /* fall through */
819 case PIPE_BLENDFACTOR_INV_DST_COLOR
:
820 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
821 /* fnms(a,b,c,d) computes a = d - b*c */
822 spe_fnms(f
, term2A_reg
, fbA_reg
, fbA_reg
, fbA_reg
);
825 case PIPE_BLENDFACTOR_CONST_ALPHA
: /* fall through */
826 case PIPE_BLENDFACTOR_CONST_COLOR
:
827 /* We need the optional constA_reg register */
828 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
829 /* factor = Ac, so term = Afb*Ac */
830 spe_fm(f
, term2A_reg
, fbA_reg
, constA_reg
);
833 case PIPE_BLENDFACTOR_INV_CONST_ALPHA
: /* fall through */
834 case PIPE_BLENDFACTOR_INV_CONST_COLOR
:
835 /* We need the optional constA_reg register */
836 setup_const_register(f
, &constA_reg
, blend_color
->color
[3]);
837 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
838 /* fnms(a,b,c,d) computes a = d - b*c */
839 spe_fnms(f
, term2A_reg
, fbA_reg
, constA_reg
, fbA_reg
);
842 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE
: /* not supported for dest alpha */
846 /* These are special D3D cases involving a second color output
847 * from the fragment shader. I'm not sure we can support them
850 case PIPE_BLENDFACTOR_SRC1_COLOR
:
851 case PIPE_BLENDFACTOR_SRC1_ALPHA
:
852 case PIPE_BLENDFACTOR_INV_SRC1_COLOR
:
853 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA
:
859 * Combine Src/Dest RGB terms as per the blend equation.
861 switch (blend
->rt
[0].rgb_func
) {
863 spe_fa(f
, fragR_reg
, term1R_reg
, term2R_reg
);
864 spe_fa(f
, fragG_reg
, term1G_reg
, term2G_reg
);
865 spe_fa(f
, fragB_reg
, term1B_reg
, term2B_reg
);
867 case PIPE_BLEND_SUBTRACT
:
868 spe_fs(f
, fragR_reg
, term1R_reg
, term2R_reg
);
869 spe_fs(f
, fragG_reg
, term1G_reg
, term2G_reg
);
870 spe_fs(f
, fragB_reg
, term1B_reg
, term2B_reg
);
872 case PIPE_BLEND_REVERSE_SUBTRACT
:
873 spe_fs(f
, fragR_reg
, term2R_reg
, term1R_reg
);
874 spe_fs(f
, fragG_reg
, term2G_reg
, term1G_reg
);
875 spe_fs(f
, fragB_reg
, term2B_reg
, term1B_reg
);
878 spe_float_min(f
, fragR_reg
, term1R_reg
, term2R_reg
);
879 spe_float_min(f
, fragG_reg
, term1G_reg
, term2G_reg
);
880 spe_float_min(f
, fragB_reg
, term1B_reg
, term2B_reg
);
883 spe_float_max(f
, fragR_reg
, term1R_reg
, term2R_reg
);
884 spe_float_max(f
, fragG_reg
, term1G_reg
, term2G_reg
);
885 spe_float_max(f
, fragB_reg
, term1B_reg
, term2B_reg
);
892 * Combine Src/Dest A term
894 switch (blend
->rt
[0].alpha_func
) {
896 spe_fa(f
, fragA_reg
, term1A_reg
, term2A_reg
);
898 case PIPE_BLEND_SUBTRACT
:
899 spe_fs(f
, fragA_reg
, term1A_reg
, term2A_reg
);
901 case PIPE_BLEND_REVERSE_SUBTRACT
:
902 spe_fs(f
, fragA_reg
, term2A_reg
, term1A_reg
);
905 spe_float_min(f
, fragA_reg
, term1A_reg
, term2A_reg
);
908 spe_float_max(f
, fragA_reg
, term1A_reg
, term2A_reg
);
914 spe_release_register(f
, term1R_reg
);
915 spe_release_register(f
, term1G_reg
);
916 spe_release_register(f
, term1B_reg
);
917 spe_release_register(f
, term1A_reg
);
919 spe_release_register(f
, term2R_reg
);
920 spe_release_register(f
, term2G_reg
);
921 spe_release_register(f
, term2B_reg
);
922 spe_release_register(f
, term2A_reg
);
924 spe_release_register(f
, fbR_reg
);
925 spe_release_register(f
, fbG_reg
);
926 spe_release_register(f
, fbB_reg
);
927 spe_release_register(f
, fbA_reg
);
929 spe_release_register(f
, tmp_reg
);
931 /* Free any optional registers that actually got used */
932 release_const_register(f
, one_reg
);
933 release_const_register(f
, constR_reg
);
934 release_const_register(f
, constG_reg
);
935 release_const_register(f
, constB_reg
);
936 release_const_register(f
, constA_reg
);
941 gen_logicop(const struct pipe_blend_state
*blend
,
942 struct spe_function
*f
,
943 int fragRGBA_reg
, int fbRGBA_reg
)
945 /* We've got four 32-bit RGBA packed pixels in each of
946 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
947 * reds, greens, blues, and alphas.
949 ASSERT(blend
->logicop_enable
);
951 switch(blend
->logicop_func
) {
952 case PIPE_LOGICOP_CLEAR
: /* 0 */
953 spe_zero(f
, fragRGBA_reg
);
955 case PIPE_LOGICOP_NOR
: /* ~(s | d) */
956 spe_nor(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
958 case PIPE_LOGICOP_AND_INVERTED
: /* ~s & d */
959 /* andc R, A, B computes R = A & ~B */
960 spe_andc(f
, fragRGBA_reg
, fbRGBA_reg
, fragRGBA_reg
);
962 case PIPE_LOGICOP_COPY_INVERTED
: /* ~s */
963 spe_complement(f
, fragRGBA_reg
, fragRGBA_reg
);
965 case PIPE_LOGICOP_AND_REVERSE
: /* s & ~d */
966 /* andc R, A, B computes R = A & ~B */
967 spe_andc(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
969 case PIPE_LOGICOP_INVERT
: /* ~d */
970 /* Note that (A nor A) == ~(A|A) == ~A */
971 spe_nor(f
, fragRGBA_reg
, fbRGBA_reg
, fbRGBA_reg
);
973 case PIPE_LOGICOP_XOR
: /* s ^ d */
974 spe_xor(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
976 case PIPE_LOGICOP_NAND
: /* ~(s & d) */
977 spe_nand(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
979 case PIPE_LOGICOP_AND
: /* s & d */
980 spe_and(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
982 case PIPE_LOGICOP_EQUIV
: /* ~(s ^ d) */
983 spe_xor(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
984 spe_complement(f
, fragRGBA_reg
, fragRGBA_reg
);
986 case PIPE_LOGICOP_NOOP
: /* d */
987 spe_move(f
, fragRGBA_reg
, fbRGBA_reg
);
989 case PIPE_LOGICOP_OR_INVERTED
: /* ~s | d */
990 /* orc R, A, B computes R = A | ~B */
991 spe_orc(f
, fragRGBA_reg
, fbRGBA_reg
, fragRGBA_reg
);
993 case PIPE_LOGICOP_COPY
: /* s */
995 case PIPE_LOGICOP_OR_REVERSE
: /* s | ~d */
996 /* orc R, A, B computes R = A | ~B */
997 spe_orc(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
999 case PIPE_LOGICOP_OR
: /* s | d */
1000 spe_or(f
, fragRGBA_reg
, fragRGBA_reg
, fbRGBA_reg
);
1002 case PIPE_LOGICOP_SET
: /* 1 */
1003 spe_load_int(f
, fragRGBA_reg
, 0xffffffff);
1012 * Generate code to pack a quad of float colors into four 32-bit integers.
1014 * \param f SPE function to append instruction onto.
1015 * \param color_format the dest color packing format
1016 * \param r_reg register containing four red values (in/clobbered)
1017 * \param g_reg register containing four green values (in/clobbered)
1018 * \param b_reg register containing four blue values (in/clobbered)
1019 * \param a_reg register containing four alpha values (in/clobbered)
1020 * \param rgba_reg register to store the packed RGBA colors (out)
1023 gen_pack_colors(struct spe_function
*f
,
1024 enum pipe_format color_format
,
1025 int r_reg
, int g_reg
, int b_reg
, int a_reg
,
1028 int rg_reg
= spe_allocate_available_register(f
);
1029 int ba_reg
= spe_allocate_available_register(f
);
1031 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1032 spe_cfltu(f
, r_reg
, r_reg
, 32);
1033 spe_cfltu(f
, g_reg
, g_reg
, 32);
1034 spe_cfltu(f
, b_reg
, b_reg
, 32);
1035 spe_cfltu(f
, a_reg
, a_reg
, 32);
1037 /* Shift the most significant bytes to the least significant positions.
1038 * I.e.: reg = reg >> 24
1040 spe_rotmi(f
, r_reg
, r_reg
, -24);
1041 spe_rotmi(f
, g_reg
, g_reg
, -24);
1042 spe_rotmi(f
, b_reg
, b_reg
, -24);
1043 spe_rotmi(f
, a_reg
, a_reg
, -24);
1045 /* Shift the color bytes according to the surface format */
1046 if (color_format
== PIPE_FORMAT_B8G8R8A8_UNORM
) {
1047 spe_roti(f
, g_reg
, g_reg
, 8); /* green <<= 8 */
1048 spe_roti(f
, r_reg
, r_reg
, 16); /* red <<= 16 */
1049 spe_roti(f
, a_reg
, a_reg
, 24); /* alpha <<= 24 */
1051 else if (color_format
== PIPE_FORMAT_A8R8G8B8_UNORM
) {
1052 spe_roti(f
, r_reg
, r_reg
, 8); /* red <<= 8 */
1053 spe_roti(f
, g_reg
, g_reg
, 16); /* green <<= 16 */
1054 spe_roti(f
, b_reg
, b_reg
, 24); /* blue <<= 24 */
1060 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1061 * Eg: after shifting according to color_format we might have:
1062 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1063 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1064 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1065 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1066 * OR-ing all those together gives us four packed colors:
1067 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1069 spe_or(f
, rg_reg
, r_reg
, g_reg
);
1070 spe_or(f
, ba_reg
, a_reg
, b_reg
);
1071 spe_or(f
, rgba_reg
, rg_reg
, ba_reg
);
1073 spe_release_register(f
, rg_reg
);
1074 spe_release_register(f
, ba_reg
);
1079 gen_colormask(struct spe_function
*f
,
1081 enum pipe_format color_format
,
1082 int fragRGBA_reg
, int fbRGBA_reg
)
1084 /* We've got four 32-bit RGBA packed pixels in each of
1085 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1086 * reds, greens, blues, and alphas. Further, the pixels
1087 * are packed according to the given color format, not
1088 * necessarily RGBA...
1095 /* Calculate exactly where the bits for any particular color
1096 * end up, so we can mask them correctly.
1098 switch(color_format
) {
1099 case PIPE_FORMAT_B8G8R8A8_UNORM
:
1101 a_mask
= 0xff000000;
1102 r_mask
= 0x00ff0000;
1103 g_mask
= 0x0000ff00;
1104 b_mask
= 0x000000ff;
1106 case PIPE_FORMAT_A8R8G8B8_UNORM
:
1108 b_mask
= 0xff000000;
1109 g_mask
= 0x00ff0000;
1110 r_mask
= 0x0000ff00;
1111 a_mask
= 0x000000ff;
1117 /* For each R, G, B, and A component we're supposed to mask out,
1118 * clear its bits. Then our mask operation later will work
1121 if (!(colormask
& PIPE_MASK_R
)) {
1124 if (!(colormask
& PIPE_MASK_G
)) {
1127 if (!(colormask
& PIPE_MASK_B
)) {
1130 if (!(colormask
& PIPE_MASK_A
)) {
1134 /* Get a temporary register to hold the mask that will be applied
1137 int colormask_reg
= spe_allocate_available_register(f
);
1139 /* The actual mask we're going to use is an OR of the remaining R, G, B,
1140 * and A masks. Load the result value into our temporary register.
1142 spe_load_uint(f
, colormask_reg
, r_mask
| g_mask
| b_mask
| a_mask
);
1144 /* Use the mask register to select between the fragment color
1145 * values and the frame buffer color values. Wherever the
1146 * mask has a 0 bit, the current frame buffer color should override
1147 * the fragment color. Wherever the mask has a 1 bit, the
1148 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1149 * instruction will select bits from its first operand rA wherever the
1150 * the mask bits rM are 0, and from its second operand rB wherever the
1151 * mask bits rM are 1. That means that the frame buffer color is the
1152 * first operand, and the fragment color the second.
1154 spe_selb(f
, fragRGBA_reg
, fbRGBA_reg
, fragRGBA_reg
, colormask_reg
);
1156 /* Release the temporary register and we're done */
1157 spe_release_register(f
, colormask_reg
);
1162 * This function is annoyingly similar to gen_depth_test(), above, except
1163 * that instead of comparing two varying values (i.e. fragment and buffer),
1164 * we're comparing a varying value with a static value. As such, we have
1165 * access to the Compare Immediate instructions where we don't in
1166 * gen_depth_test(), which is what makes us very different.
1168 * There's some added complexity if there's a non-trivial state->mask
1169 * value; then stencil and reference both must be masked
1171 * The return value in the stencil_pass_reg is a bitmask of valid
1172 * fragments that also passed the stencil test. The bitmask of valid
1173 * fragments that failed would be found in
1174 * (fragment_mask_reg & ~stencil_pass_reg).
1177 gen_stencil_test(struct spe_function
*f
,
1178 const struct pipe_stencil_state
*state
,
1179 const unsigned ref_value
,
1180 uint stencil_max_value
,
1181 int fragment_mask_reg
,
1183 int stencil_pass_reg
)
1185 /* Generate code that puts the set of passing fragments into the
1186 * stencil_pass_reg register, taking into account whether each fragment
1187 * was active to begin with.
1189 switch (state
->func
) {
1190 case PIPE_FUNC_EQUAL
:
1191 if (state
->valuemask
== stencil_max_value
) {
1192 /* stencil_pass = fragment_mask & (s == reference) */
1193 spe_compare_equal_uint(f
, stencil_pass_reg
, fbS_reg
, ref_value
);
1194 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1197 /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
1198 uint tmp_masked_stencil
= spe_allocate_available_register(f
);
1199 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1200 spe_compare_equal_uint(f
, stencil_pass_reg
, tmp_masked_stencil
,
1201 state
->valuemask
& ref_value
);
1202 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1203 spe_release_register(f
, tmp_masked_stencil
);
1207 case PIPE_FUNC_NOTEQUAL
:
1208 if (state
->valuemask
== stencil_max_value
) {
1209 /* stencil_pass = fragment_mask & ~(s == reference) */
1210 spe_compare_equal_uint(f
, stencil_pass_reg
, fbS_reg
, ref_value
);
1211 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1214 /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
1215 int tmp_masked_stencil
= spe_allocate_available_register(f
);
1216 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1217 spe_compare_equal_uint(f
, stencil_pass_reg
, tmp_masked_stencil
,
1218 state
->valuemask
& ref_value
);
1219 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1220 spe_release_register(f
, tmp_masked_stencil
);
1224 case PIPE_FUNC_LESS
:
1225 if (state
->valuemask
== stencil_max_value
) {
1226 /* stencil_pass = fragment_mask & (reference < s) */
1227 spe_compare_greater_uint(f
, stencil_pass_reg
, fbS_reg
, ref_value
);
1228 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1231 /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
1232 int tmp_masked_stencil
= spe_allocate_available_register(f
);
1233 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1234 spe_compare_greater_uint(f
, stencil_pass_reg
, tmp_masked_stencil
,
1235 state
->valuemask
& ref_value
);
1236 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1237 spe_release_register(f
, tmp_masked_stencil
);
1241 case PIPE_FUNC_GREATER
:
1242 if (state
->valuemask
== stencil_max_value
) {
1243 /* stencil_pass = fragment_mask & (reference > s) */
1244 /* There's no convenient Compare Less Than Immediate instruction, so
1245 * we'll have to do this one the harder way, by loading a register and
1246 * comparing directly. Compare Logical Greater Than Word (clgt)
1247 * treats its operands as unsigned - no sign extension.
1249 int tmp_reg
= spe_allocate_available_register(f
);
1250 spe_load_uint(f
, tmp_reg
, ref_value
);
1251 spe_clgt(f
, stencil_pass_reg
, tmp_reg
, fbS_reg
);
1252 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1253 spe_release_register(f
, tmp_reg
);
1256 /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
1257 int tmp_reg
= spe_allocate_available_register(f
);
1258 int tmp_masked_stencil
= spe_allocate_available_register(f
);
1259 spe_load_uint(f
, tmp_reg
, state
->valuemask
& ref_value
);
1260 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1261 spe_clgt(f
, stencil_pass_reg
, tmp_reg
, tmp_masked_stencil
);
1262 spe_and(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1263 spe_release_register(f
, tmp_reg
);
1264 spe_release_register(f
, tmp_masked_stencil
);
1268 case PIPE_FUNC_GEQUAL
:
1269 if (state
->valuemask
== stencil_max_value
) {
1270 /* stencil_pass = fragment_mask & (reference >= s)
1271 * = fragment_mask & ~(s > reference) */
1272 spe_compare_greater_uint(f
, stencil_pass_reg
, fbS_reg
,
1274 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1277 /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
1278 int tmp_masked_stencil
= spe_allocate_available_register(f
);
1279 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1280 spe_compare_greater_uint(f
, stencil_pass_reg
, tmp_masked_stencil
,
1281 state
->valuemask
& ref_value
);
1282 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1283 spe_release_register(f
, tmp_masked_stencil
);
1287 case PIPE_FUNC_LEQUAL
:
1288 if (state
->valuemask
== stencil_max_value
) {
1289 /* stencil_pass = fragment_mask & (reference <= s) ]
1290 * = fragment_mask & ~(reference > s) */
1291 /* As above, we have to do this by loading a register */
1292 int tmp_reg
= spe_allocate_available_register(f
);
1293 spe_load_uint(f
, tmp_reg
, ref_value
);
1294 spe_clgt(f
, stencil_pass_reg
, tmp_reg
, fbS_reg
);
1295 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1296 spe_release_register(f
, tmp_reg
);
1299 /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
1300 int tmp_reg
= spe_allocate_available_register(f
);
1301 int tmp_masked_stencil
= spe_allocate_available_register(f
);
1302 spe_load_uint(f
, tmp_reg
, ref_value
& state
->valuemask
);
1303 spe_and_uint(f
, tmp_masked_stencil
, fbS_reg
, state
->valuemask
);
1304 spe_clgt(f
, stencil_pass_reg
, tmp_reg
, tmp_masked_stencil
);
1305 spe_andc(f
, stencil_pass_reg
, fragment_mask_reg
, stencil_pass_reg
);
1306 spe_release_register(f
, tmp_reg
);
1307 spe_release_register(f
, tmp_masked_stencil
);
1311 case PIPE_FUNC_NEVER
:
1312 /* stencil_pass = fragment_mask & 0 = 0 */
1313 spe_load_uint(f
, stencil_pass_reg
, 0);
1316 case PIPE_FUNC_ALWAYS
:
1317 /* stencil_pass = fragment_mask & 1 = fragment_mask */
1318 spe_move(f
, stencil_pass_reg
, fragment_mask_reg
);
1322 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1323 * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
1329 * This function generates code that calculates a set of new stencil values
1330 * given the earlier values and the operation to apply. It does not
1331 * apply any tests. It is intended to be called up to 3 times
1332 * (for the stencil fail operation, for the stencil pass-z fail operation,
1333 * and for the stencil pass-z pass operation) to collect up to three
1334 * possible sets of values, and for the caller to combine them based
1335 * on the result of the tests.
1337 * stencil_max_value should be (2^n - 1) where n is the number of bits
1338 * in the stencil buffer - in other words, it should be usable as a mask.
1341 gen_stencil_values(struct spe_function
*f
,
1343 uint stencil_ref_value
,
1344 uint stencil_max_value
,
1348 /* The code below assumes that newS_reg and fbS_reg are not the same
1349 * register; if they can be, the calculations below will have to use
1350 * an additional temporary register. For now, mark the assumption
1351 * with an assertion that will fail if they are the same.
1353 ASSERT(fbS_reg
!= newS_reg
);
1355 /* The code also assumes that the stencil_max_value is of the form
1356 * 2^n-1 and can therefore be used as a mask for the valid bits in
1357 * addition to a maximum. Make sure this is the case as well.
1358 * The clever math below exploits the fact that incrementing a
1359 * binary number serves to flip all the bits of a number starting at
1360 * the LSB and continuing to (and including) the first zero bit
1361 * found. That means that a number and its increment will always
1362 * have at least one bit in common (the high order bit, if nothing
1363 * else) *unless* the number is zero, *or* the number is of a form
1364 * consisting of some number of 1s in the low-order bits followed
1365 * by nothing but 0s in the high-order bits. The latter case
1366 * implies it's of the form 2^n-1.
1368 ASSERT(stencil_max_value
> 0 && ((stencil_max_value
+ 1) & stencil_max_value
) == 0);
1370 switch(stencil_op
) {
1371 case PIPE_STENCIL_OP_KEEP
:
1373 spe_move(f
, newS_reg
, fbS_reg
);
1376 case PIPE_STENCIL_OP_ZERO
:
1378 spe_zero(f
, newS_reg
);
1381 case PIPE_STENCIL_OP_REPLACE
:
1382 /* newS = stencil reference value */
1383 spe_load_uint(f
, newS_reg
, stencil_ref_value
);
1386 case PIPE_STENCIL_OP_INCR
: {
1387 /* newS = (s == max ? max : s + 1) */
1388 int equals_reg
= spe_allocate_available_register(f
);
1390 spe_compare_equal_uint(f
, equals_reg
, fbS_reg
, stencil_max_value
);
1391 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1392 spe_ai(f
, newS_reg
, fbS_reg
, 1);
1393 /* Select from the current value or the new value based on the equality test */
1394 spe_selb(f
, newS_reg
, newS_reg
, fbS_reg
, equals_reg
);
1396 spe_release_register(f
, equals_reg
);
1399 case PIPE_STENCIL_OP_DECR
: {
1400 /* newS = (s == 0 ? 0 : s - 1) */
1401 int equals_reg
= spe_allocate_available_register(f
);
1403 spe_compare_equal_uint(f
, equals_reg
, fbS_reg
, 0);
1404 /* Add Word Immediate with a (-1) value works */
1405 spe_ai(f
, newS_reg
, fbS_reg
, -1);
1406 /* Select from the current value or the new value based on the equality test */
1407 spe_selb(f
, newS_reg
, newS_reg
, fbS_reg
, equals_reg
);
1409 spe_release_register(f
, equals_reg
);
1412 case PIPE_STENCIL_OP_INCR_WRAP
:
1413 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1414 * do a normal add and mask off the correct bits
1416 spe_ai(f
, newS_reg
, fbS_reg
, 1);
1417 spe_and_uint(f
, newS_reg
, newS_reg
, stencil_max_value
);
1420 case PIPE_STENCIL_OP_DECR_WRAP
:
1421 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1422 spe_ai(f
, newS_reg
, fbS_reg
, -1);
1423 spe_and_uint(f
, newS_reg
, newS_reg
, stencil_max_value
);
1426 case PIPE_STENCIL_OP_INVERT
:
1427 /* newS = ~s. We take advantage of the mask/max value to invert only
1428 * the valid bits for the field so we don't have to do an extra "and".
1430 spe_xor_uint(f
, newS_reg
, fbS_reg
, stencil_max_value
);
1440 * This function generates code to get all the necessary possible
1441 * stencil values. For each of the output registers (fail_reg,
1442 * zfail_reg, and zpass_reg), it either allocates a new register
1443 * and calculates a new set of values based on the stencil operation,
1444 * or it reuses a register allocation and calculation done for an
1445 * earlier (matching) operation, or it reuses the fbS_reg register
1446 * (if the stencil operation is KEEP, which doesn't change the
1449 * Since this function allocates a variable number of registers,
1450 * to avoid incurring complex logic to free them, they should
1451 * be allocated after a spe_allocate_register_set() call
1452 * and released by the corresponding spe_release_register_set() call.
1455 gen_get_stencil_values(struct spe_function
*f
,
1456 const struct pipe_stencil_state
*stencil
,
1457 const unsigned ref_value
,
1458 const uint depth_enabled
,
1466 /* Stenciling had better be enabled here */
1467 ASSERT(stencil
->enabled
);
1469 /* If the depth test is not enabled, it is treated as though it always
1470 * passes, which means that the zfail_op is not considered - a
1471 * failing stencil test triggers the fail_op, and a passing one
1472 * triggers the zpass_op
1474 * As an optimization, override calculation of the zfail_op values
1475 * if they aren't going to be used. By setting the value of
1476 * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
1477 * to match the incoming stencil values, and no calculation will
1480 if (depth_enabled
) {
1481 zfail_op
= stencil
->zfail_op
;
1484 zfail_op
= PIPE_STENCIL_OP_KEEP
;
1487 /* One-sided or front-facing stencil */
1488 if (stencil
->fail_op
== PIPE_STENCIL_OP_KEEP
) {
1489 *fail_reg
= fbS_reg
;
1492 *fail_reg
= spe_allocate_available_register(f
);
1493 gen_stencil_values(f
, stencil
->fail_op
, ref_value
,
1494 0xff, fbS_reg
, *fail_reg
);
1497 /* Check the possibly overridden value, not the structure value */
1498 if (zfail_op
== PIPE_STENCIL_OP_KEEP
) {
1499 *zfail_reg
= fbS_reg
;
1501 else if (zfail_op
== stencil
->fail_op
) {
1502 *zfail_reg
= *fail_reg
;
1505 *zfail_reg
= spe_allocate_available_register(f
);
1506 gen_stencil_values(f
, stencil
->zfail_op
, ref_value
,
1507 0xff, fbS_reg
, *zfail_reg
);
1510 if (stencil
->zpass_op
== PIPE_STENCIL_OP_KEEP
) {
1511 *zpass_reg
= fbS_reg
;
1513 else if (stencil
->zpass_op
== stencil
->fail_op
) {
1514 *zpass_reg
= *fail_reg
;
1516 else if (stencil
->zpass_op
== zfail_op
) {
1517 *zpass_reg
= *zfail_reg
;
1520 *zpass_reg
= spe_allocate_available_register(f
);
1521 gen_stencil_values(f
, stencil
->zpass_op
, ref_value
,
1522 0xff, fbS_reg
, *zpass_reg
);
1527 * Note that fbZ_reg may *not* be set on entry, if in fact
1528 * the depth test is not enabled. This function must not use
1529 * the register if depth is not enabled.
1532 gen_stencil_depth_test(struct spe_function
*f
,
1533 const struct pipe_depth_stencil_alpha_state
*dsa
,
1534 const struct pipe_stencil_ref
*stencil_ref
,
1536 const int mask_reg
, const int fragZ_reg
,
1537 const int fbZ_reg
, const int fbS_reg
)
1539 /* True if we've generated code that could require writeback to the
1540 * depth and/or stencil buffers
1542 boolean modified_buffers
= FALSE
;
1544 boolean need_to_calculate_stencil_values
;
1545 boolean need_to_writemask_stencil_values
;
1547 struct pipe_stencil_state
*stencil
;
1549 /* Registers. We may or may not actually allocate these, depending
1550 * on whether the state values indicate that we need them.
1552 int stencil_pass_reg
, stencil_fail_reg
;
1553 int stencil_fail_values
, stencil_pass_depth_fail_values
, stencil_pass_depth_pass_values
;
1554 int stencil_writemask_reg
;
1559 /* Stenciling is quite complex: up to six different configurable stencil
1560 * operations/calculations can be required (three each for front-facing
1561 * and back-facing fragments). Many of those operations will likely
1562 * be identical, so there's good reason to try to avoid calculating
1563 * the same values more than once (which unfortunately makes the code less
1566 * To make register management easier, we start a new
1567 * register set; we can release all the registers in the set at
1568 * once, and avoid having to keep track of exactly which registers
1569 * we allocate. We can still allocate and free registers as
1570 * desired (if we know we no longer need a register), but we don't
1571 * have to spend the complexity to track the more difficult variant
1572 * register usage scenarios.
1574 spe_comment(f
, 0, "Allocating stencil register set");
1575 spe_allocate_register_set(f
);
1577 /* The facing we're given is the fragment facing; it doesn't
1578 * exactly match the stencil facing. If stencil is enabled,
1579 * but two-sided stencil is *not* enabled, we use the same
1580 * stencil settings for both front- and back-facing fragments.
1581 * We only use the "back-facing" stencil for backfacing fragments
1582 * if two-sided stenciling is enabled.
1584 if (facing
== CELL_FACING_BACK
&& dsa
->stencil
[1].enabled
) {
1585 stencil
= &dsa
->stencil
[1];
1586 ref_value
= stencil_ref
->ref_value
[1];
1589 stencil
= &dsa
->stencil
[0];
1590 ref_value
= stencil_ref
->ref_value
[0];
1593 /* Calculate the writemask. If the writemask is trivial (either
1594 * all 0s, meaning that we don't need to calculate any stencil values
1595 * because they're not going to change the stencil anyway, or all 1s,
1596 * meaning that we have to calculate the stencil values but do not
1597 * need to mask them), we can avoid generating code. Don't forget
1598 * that we need to consider backfacing stencil, if enabled.
1600 * Note that if the backface stencil is *not* enabled, the backface
1601 * stencil will have the same values as the frontface stencil.
1603 if (stencil
->fail_op
== PIPE_STENCIL_OP_KEEP
&&
1604 stencil
->zfail_op
== PIPE_STENCIL_OP_KEEP
&&
1605 stencil
->zpass_op
== PIPE_STENCIL_OP_KEEP
) {
1606 need_to_calculate_stencil_values
= FALSE
;
1607 need_to_writemask_stencil_values
= FALSE
;
1609 else if (stencil
->writemask
== 0x0) {
1610 /* All changes are writemasked out, so no need to calculate
1611 * what those changes might be, and no need to write anything back.
1613 need_to_calculate_stencil_values
= FALSE
;
1614 need_to_writemask_stencil_values
= FALSE
;
1616 else if (stencil
->writemask
== 0xff) {
1617 /* Still trivial, but a little less so. We need to write the stencil
1618 * values, but we don't need to mask them.
1620 need_to_calculate_stencil_values
= TRUE
;
1621 need_to_writemask_stencil_values
= FALSE
;
1624 /* The general case: calculate, mask, and write */
1625 need_to_calculate_stencil_values
= TRUE
;
1626 need_to_writemask_stencil_values
= TRUE
;
1628 /* While we're here, generate code that calculates what the
1629 * writemask should be. If backface stenciling is enabled,
1630 * and the backface writemask is not the same as the frontface
1631 * writemask, we'll have to generate code that merges the
1632 * two masks into a single effective mask based on fragment facing.
1634 spe_comment(f
, 0, "Computing stencil writemask");
1635 stencil_writemask_reg
= spe_allocate_available_register(f
);
1636 spe_load_uint(f
, stencil_writemask_reg
, dsa
->stencil
[facing
].writemask
);
1639 /* At least one-sided stenciling must be on. Generate code that
1640 * runs the stencil test on the basic/front-facing stencil, leaving
1641 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1642 * be used both to mask the set of active pixels, and also to
1643 * determine how the stencil buffer changes.
1645 * This test will *not* change the value in mask_reg (because we don't
1646 * yet know whether to apply the two-sided stencil or one-sided stencil).
1648 spe_comment(f
, 0, "Running basic stencil test");
1649 stencil_pass_reg
= spe_allocate_available_register(f
);
1650 gen_stencil_test(f
, stencil
, ref_value
, 0xff, mask_reg
, fbS_reg
, stencil_pass_reg
);
1652 /* Generate code that, given the mask of valid fragments and the
1653 * mask of valid fragments that passed the stencil test, computes
1654 * the mask of valid fragments that failed the stencil test. We
1655 * have to do this before we run a depth test (because the
1656 * depth test should not be performed on fragments that failed the
1657 * stencil test, and because the depth test will update the
1658 * mask of valid fragments based on the results of the depth test).
1660 spe_comment(f
, 0, "Computing stencil fail mask and updating fragment mask");
1661 stencil_fail_reg
= spe_allocate_available_register(f
);
1662 spe_andc(f
, stencil_fail_reg
, mask_reg
, stencil_pass_reg
);
1663 /* Now remove the stenciled-out pixels from the valid fragment mask,
1664 * so we can later use the valid fragment mask in the depth test.
1666 spe_and(f
, mask_reg
, mask_reg
, stencil_pass_reg
);
1668 /* We may not need to calculate stencil values, if the writemask is off */
1669 if (need_to_calculate_stencil_values
) {
1670 /* Generate code that calculates exactly which stencil values we need,
1671 * without calculating the same value twice (say, if two different
1672 * stencil ops have the same value). This code will work for one-sided
1673 * and two-sided stenciling (so that we take into account that operations
1674 * may match between front and back stencils), and will also take into
1675 * account whether the depth test is enabled (if the depth test is off,
1676 * we don't need any of the zfail results, because the depth test always
1677 * is considered to pass if it is disabled). Any register value that
1678 * does not need to be calculated will come back with the same value
1679 * that's in fbS_reg.
1681 * This function will allocate a variant number of registers that
1682 * will be released as part of the register set.
1684 spe_comment(f
, 0, facing
== CELL_FACING_FRONT
1685 ? "Computing front-facing stencil values"
1686 : "Computing back-facing stencil values");
1687 gen_get_stencil_values(f
, stencil
, ref_value
, dsa
->depth
.enabled
, fbS_reg
,
1688 &stencil_fail_values
, &stencil_pass_depth_fail_values
,
1689 &stencil_pass_depth_pass_values
);
1692 /* We now have all the stencil values we need. We also need
1693 * the results of the depth test to figure out which
1694 * stencil values will become the new stencil values. (Even if
1695 * we aren't actually calculating stencil values, we need to apply
1696 * the depth test if it's enabled.)
1698 * The code generated by gen_depth_test() returns the results of the
1699 * test in the given register, but also alters the mask_reg based
1700 * on the results of the test.
1702 if (dsa
->depth
.enabled
) {
1703 spe_comment(f
, 0, "Running stencil depth test");
1704 zmask_reg
= spe_allocate_available_register(f
);
1705 modified_buffers
|= gen_depth_test(f
, dsa
, mask_reg
, fragZ_reg
,
1706 fbZ_reg
, zmask_reg
);
1709 if (need_to_calculate_stencil_values
) {
1711 /* If we need to writemask the stencil values before going into
1712 * the stencil buffer, we'll have to use a new register to
1713 * hold the new values. If not, we can just keep using the
1716 if (need_to_writemask_stencil_values
) {
1717 newS_reg
= spe_allocate_available_register(f
);
1718 spe_comment(f
, 0, "Saving current stencil values for writemasking");
1719 spe_move(f
, newS_reg
, fbS_reg
);
1725 /* Merge in the selected stencil fail values */
1726 if (stencil_fail_values
!= fbS_reg
) {
1727 spe_comment(f
, 0, "Loading stencil fail values");
1728 spe_selb(f
, newS_reg
, newS_reg
, stencil_fail_values
, stencil_fail_reg
);
1729 modified_buffers
= TRUE
;
1732 /* Same for the stencil pass/depth fail values. If this calculation
1733 * is not needed (say, if depth test is off), then the
1734 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1735 * and we'll skip the calculation.
1737 if (stencil_pass_depth_fail_values
!= fbS_reg
) {
1738 /* We don't actually have a stencil pass/depth fail mask yet.
1739 * Calculate it here from the stencil passing mask and the
1740 * depth passing mask. Note that zmask_reg *must* have been
1741 * set above if we're here.
1743 uint stencil_pass_depth_fail_mask
=
1744 spe_allocate_available_register(f
);
1746 spe_comment(f
, 0, "Loading stencil pass/depth fail values");
1747 spe_andc(f
, stencil_pass_depth_fail_mask
, stencil_pass_reg
, zmask_reg
);
1749 spe_selb(f
, newS_reg
, newS_reg
, stencil_pass_depth_fail_values
,
1750 stencil_pass_depth_fail_mask
);
1752 spe_release_register(f
, stencil_pass_depth_fail_mask
);
1753 modified_buffers
= TRUE
;
1756 /* Same for the stencil pass/depth pass mask. Note that we
1757 * *can* get here with zmask_reg being unset (if the depth
1758 * test is off but the stencil test is on). In this case,
1759 * we assume the depth test passes, and don't need to mask
1760 * the stencil pass mask with the Z mask.
1762 if (stencil_pass_depth_pass_values
!= fbS_reg
) {
1763 if (dsa
->depth
.enabled
) {
1764 uint stencil_pass_depth_pass_mask
= spe_allocate_available_register(f
);
1765 /* We'll need a separate register */
1766 spe_comment(f
, 0, "Loading stencil pass/depth pass values");
1767 spe_and(f
, stencil_pass_depth_pass_mask
, stencil_pass_reg
, zmask_reg
);
1768 spe_selb(f
, newS_reg
, newS_reg
, stencil_pass_depth_pass_values
, stencil_pass_depth_pass_mask
);
1769 spe_release_register(f
, stencil_pass_depth_pass_mask
);
1772 /* We can use the same stencil-pass register */
1773 spe_comment(f
, 0, "Loading stencil pass values");
1774 spe_selb(f
, newS_reg
, newS_reg
, stencil_pass_depth_pass_values
, stencil_pass_reg
);
1776 modified_buffers
= TRUE
;
1779 /* Almost done. If we need to writemask, do it now, leaving the
1780 * results in the fbS_reg register passed in. If we don't need
1781 * to writemask, then the results are *already* in the fbS_reg,
1782 * so there's nothing more to do.
1785 if (need_to_writemask_stencil_values
&& modified_buffers
) {
1786 /* The Select Bytes command makes a fine writemask. Where
1787 * the mask is 0, the first (original) values are retained,
1788 * effectively masking out changes. Where the mask is 1, the
1789 * second (new) values are retained, incorporating changes.
1791 spe_comment(f
, 0, "Writemasking new stencil values");
1792 spe_selb(f
, fbS_reg
, fbS_reg
, newS_reg
, stencil_writemask_reg
);
1795 } /* done calculating stencil values */
1797 /* The stencil and/or depth values have been applied, and the
1798 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1799 * We're all done, except that we've allocated a fair number
1800 * of registers that we didn't bother tracking. Release all
1801 * those registers as part of the register set, and go home.
1803 spe_comment(f
, 0, "Releasing stencil register set");
1804 spe_release_register_set(f
);
1806 /* Return TRUE if we could have modified the stencil and/or
1809 return modified_buffers
;
1814 * Generate depth and/or stencil test code.
1815 * \param cell context
1816 * \param dsa depth/stencil/alpha state
1817 * \param f spe function to emit
1818 * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK
1819 * \param mask_reg register containing the pixel alive/dead mask
1820 * \param depth_tile_reg register containing address of z/stencil tile
1821 * \param quad_offset_reg offset to quad from start of tile
1822 * \param fragZ_reg register containg fragment Z values
1825 gen_depth_stencil(struct cell_context
*cell
,
1826 const struct pipe_depth_stencil_alpha_state
*dsa
,
1827 const struct pipe_stencil_ref
*stencil_ref
,
1828 struct spe_function
*f
,
1832 int quad_offset_reg
,
1836 const enum pipe_format zs_format
= cell
->framebuffer
.zsbuf
->format
;
1837 boolean write_depth_stencil
;
1839 /* framebuffer's combined z/stencil values register */
1840 int fbZS_reg
= spe_allocate_available_register(f
);
1842 /* Framebufer Z values register */
1843 int fbZ_reg
= spe_allocate_available_register(f
);
1845 /* Framebuffer stencil values register (may not be used) */
1846 int fbS_reg
= spe_allocate_available_register(f
);
1848 /* 24-bit mask register (may not be used) */
1849 int zmask_reg
= spe_allocate_available_register(f
);
1852 * The following code:
1853 * 1. fetch quad of packed Z/S values from the framebuffer tile.
1854 * 2. extract the separate the Z and S values from packed values
1855 * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
1857 * The instructions for doing this are interleaved for better performance.
1859 spe_comment(f
, 0, "Fetch Z/stencil quad from tile");
1862 case PIPE_FORMAT_Z24S8_UNORM
: /* fall through */
1863 case PIPE_FORMAT_Z24X8_UNORM
:
1864 /* prepare mask to extract Z vals from ZS vals */
1865 spe_load_uint(f
, zmask_reg
, 0x00ffffff);
1867 /* convert fragment Z from [0,1] to 32-bit ints */
1868 spe_cfltu(f
, fragZ_reg
, fragZ_reg
, 32);
1870 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1871 spe_lqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
1873 /* right shift 32-bit fragment Z to 24 bits */
1874 spe_rotmi(f
, fragZ_reg
, fragZ_reg
, -8);
1876 /* extract 24-bit Z values from ZS values by masking */
1877 spe_and(f
, fbZ_reg
, fbZS_reg
, zmask_reg
);
1879 /* extract 8-bit stencil values by shifting */
1880 spe_rotmi(f
, fbS_reg
, fbZS_reg
, -24);
1883 case PIPE_FORMAT_S8Z24_UNORM
: /* fall through */
1884 case PIPE_FORMAT_X8Z24_UNORM
:
1885 /* convert fragment Z from [0,1] to 32-bit ints */
1886 spe_cfltu(f
, fragZ_reg
, fragZ_reg
, 32);
1888 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1889 spe_lqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
1891 /* right shift 32-bit fragment Z to 24 bits */
1892 spe_rotmi(f
, fragZ_reg
, fragZ_reg
, -8);
1894 /* extract 24-bit Z values from ZS values by shifting */
1895 spe_rotmi(f
, fbZ_reg
, fbZS_reg
, -8);
1897 /* extract 8-bit stencil values by masking */
1898 spe_and_uint(f
, fbS_reg
, fbZS_reg
, 0x000000ff);
1901 case PIPE_FORMAT_Z32_UNORM
:
1902 /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
1903 spe_lqx(f
, fbZ_reg
, depth_tile_reg
, quad_offset_reg
);
1905 /* convert fragment Z from [0,1] to 32-bit ints */
1906 spe_cfltu(f
, fragZ_reg
, fragZ_reg
, 32);
1908 /* No stencil, so can't do anything there */
1911 case PIPE_FORMAT_Z16_UNORM
:
1912 /* XXX This code for 16bpp Z is broken! */
1914 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1915 spe_lqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
1917 /* Copy over 4 32-bit values */
1918 spe_move(f
, fbZ_reg
, fbZS_reg
);
1920 /* convert Z from [0,1] to 16-bit ints */
1921 spe_cfltu(f
, fragZ_reg
, fragZ_reg
, 32);
1922 spe_rotmi(f
, fragZ_reg
, fragZ_reg
, -16);
1927 ASSERT(0); /* invalid format */
1930 /* If stencil is enabled, use the stencil-specific code
1931 * generator to generate both the stencil and depth (if needed)
1932 * tests. Otherwise, if only depth is enabled, generate
1933 * a quick depth test. The test generators themselves will
1934 * report back whether the depth/stencil buffer has to be
1937 if (dsa
->stencil
[0].enabled
) {
1938 /* This will perform the stencil and depth tests, and update
1939 * the mask_reg, fbZ_reg, and fbS_reg as required by the
1942 ASSERT(fbS_reg
>= 0);
1943 spe_comment(f
, 0, "Perform stencil test");
1945 /* Note that fbZ_reg may not be set on entry, if stenciling
1946 * is enabled but there's no Z-buffer. The
1947 * gen_stencil_depth_test() function must ignore the
1948 * fbZ_reg register if depth is not enabled.
1950 write_depth_stencil
= gen_stencil_depth_test(f
, dsa
, stencil_ref
, facing
,
1951 mask_reg
, fragZ_reg
,
1954 else if (dsa
->depth
.enabled
) {
1955 int zmask_reg
= spe_allocate_available_register(f
);
1956 ASSERT(fbZ_reg
>= 0);
1957 spe_comment(f
, 0, "Perform depth test");
1958 write_depth_stencil
= gen_depth_test(f
, dsa
, mask_reg
, fragZ_reg
,
1959 fbZ_reg
, zmask_reg
);
1960 spe_release_register(f
, zmask_reg
);
1963 write_depth_stencil
= FALSE
;
1966 if (write_depth_stencil
) {
1967 /* Merge latest Z and Stencil values into fbZS_reg.
1968 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
1969 * fbS_reg has four 8-bit Z values in bits [7..0].
1971 spe_comment(f
, 0, "Store quad's depth/stencil values in tile");
1972 if (zs_format
== PIPE_FORMAT_Z24S8_UNORM
||
1973 zs_format
== PIPE_FORMAT_Z24X8_UNORM
) {
1974 spe_shli(f
, fbS_reg
, fbS_reg
, 24); /* fbS = fbS << 24 */
1975 spe_or(f
, fbZS_reg
, fbS_reg
, fbZ_reg
); /* fbZS = fbS | fbZ */
1977 else if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
1978 zs_format
== PIPE_FORMAT_X8Z24_UNORM
) {
1979 spe_shli(f
, fbZ_reg
, fbZ_reg
, 8); /* fbZ = fbZ << 8 */
1980 spe_or(f
, fbZS_reg
, fbS_reg
, fbZ_reg
); /* fbZS = fbS | fbZ */
1982 else if (zs_format
== PIPE_FORMAT_Z32_UNORM
) {
1983 spe_move(f
, fbZS_reg
, fbZ_reg
); /* fbZS = fbZ */
1985 else if (zs_format
== PIPE_FORMAT_Z16_UNORM
) {
1986 spe_move(f
, fbZS_reg
, fbZ_reg
); /* fbZS = fbZ */
1988 else if (zs_format
== PIPE_FORMAT_S8_UNORM
) {
1989 ASSERT(0); /* XXX to do */
1992 ASSERT(0); /* bad zs_format */
1995 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
1996 spe_stqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
1999 /* Don't need these any more */
2000 spe_release_register(f
, fbZS_reg
);
2001 spe_release_register(f
, fbZ_reg
);
2002 spe_release_register(f
, fbS_reg
);
2003 spe_release_register(f
, zmask_reg
);
2009 * Generate SPE code to implement the fragment operations (alpha test,
2010 * depth test, stencil test, blending, colormask, and final
2011 * framebuffer write) as specified by the current context state.
2013 * Logically, this code will be called after running the fragment
2014 * shader. But under some circumstances we could run some of this
2015 * code before the fragment shader to cull fragments/quads that are
2016 * totally occluded/discarded.
2018 * XXX we only support PIPE_FORMAT_S8Z24_UNORM z/stencil buffer right now.
2020 * See the spu_default_fragment_ops() function to see how the per-fragment
2021 * operations would be done with ordinary C code.
2022 * The code we generate here though has no branches, is SIMD, etc and
2023 * should be much faster.
2025 * \param cell the rendering context (in)
2026 * \param facing whether the generated code is for front-facing or
2027 * back-facing fragments
2028 * \param f the generated function (in/out); on input, the function
2029 * must already have been initialized. On exit, whatever
2030 * instructions within the generated function have had
2031 * the fragment ops appended.
2034 cell_gen_fragment_function(struct cell_context
*cell
,
2036 struct spe_function
*f
)
2038 const struct pipe_depth_stencil_alpha_state
*dsa
= cell
->depth_stencil
;
2039 const struct pipe_stencil_ref
*stencil_ref
= &cell
->stencil_ref
;
2040 const struct pipe_blend_state
*blend
= cell
->blend
;
2041 const struct pipe_blend_color
*blend_color
= &cell
->blend_color
;
2042 const enum pipe_format color_format
= cell
->framebuffer
.cbufs
[0]->format
;
2044 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
2045 const int x_reg
= 3; /* uint */
2046 const int y_reg
= 4; /* uint */
2047 const int color_tile_reg
= 5; /* tile_t * */
2048 const int depth_tile_reg
= 6; /* tile_t * */
2049 const int fragZ_reg
= 7; /* vector float */
2050 const int fragR_reg
= 8; /* vector float */
2051 const int fragG_reg
= 9; /* vector float */
2052 const int fragB_reg
= 10; /* vector float */
2053 const int fragA_reg
= 11; /* vector float */
2054 const int mask_reg
= 12; /* vector uint */
2056 ASSERT(facing
== CELL_FACING_FRONT
|| facing
== CELL_FACING_BACK
);
2058 /* offset of quad from start of tile
2059 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
2061 int quad_offset_reg
;
2063 int fbRGBA_reg
; /**< framebuffer's RGBA colors for quad */
2065 if (cell
->debug_flags
& CELL_DEBUG_ASM
) {
2066 spe_print_code(f
, TRUE
);
2068 spe_comment(f
, -4, facing
== CELL_FACING_FRONT
2069 ? "Begin front-facing per-fragment ops"
2070 : "Begin back-facing per-fragment ops");
2073 spe_allocate_register(f
, x_reg
);
2074 spe_allocate_register(f
, y_reg
);
2075 spe_allocate_register(f
, color_tile_reg
);
2076 spe_allocate_register(f
, depth_tile_reg
);
2077 spe_allocate_register(f
, fragZ_reg
);
2078 spe_allocate_register(f
, fragR_reg
);
2079 spe_allocate_register(f
, fragG_reg
);
2080 spe_allocate_register(f
, fragB_reg
);
2081 spe_allocate_register(f
, fragA_reg
);
2082 spe_allocate_register(f
, mask_reg
);
2084 quad_offset_reg
= spe_allocate_available_register(f
);
2085 fbRGBA_reg
= spe_allocate_available_register(f
);
2087 /* compute offset of quad from start of tile, in bytes */
2089 int x2_reg
= spe_allocate_available_register(f
);
2090 int y2_reg
= spe_allocate_available_register(f
);
2092 ASSERT(TILE_SIZE
== 32);
2094 spe_comment(f
, 0, "Compute quad offset within tile");
2095 spe_rotmi(f
, y2_reg
, y_reg
, -1); /* y2 = y / 2 */
2096 spe_rotmi(f
, x2_reg
, x_reg
, -1); /* x2 = x / 2 */
2097 spe_shli(f
, y2_reg
, y2_reg
, 4); /* y2 *= 16 */
2098 spe_a(f
, quad_offset_reg
, y2_reg
, x2_reg
); /* offset = y2 + x2 */
2099 spe_shli(f
, quad_offset_reg
, quad_offset_reg
, 4); /* offset *= 16 */
2101 spe_release_register(f
, x2_reg
);
2102 spe_release_register(f
, y2_reg
);
2105 /* Generate the alpha test, if needed. */
2106 if (dsa
->alpha
.enabled
) {
2107 gen_alpha_test(dsa
, f
, mask_reg
, fragA_reg
);
2110 /* generate depth and/or stencil test code */
2111 if (dsa
->depth
.enabled
|| dsa
->stencil
[0].enabled
) {
2112 gen_depth_stencil(cell
, dsa
, stencil_ref
, f
,
2120 /* Get framebuffer quad/colors. We'll need these for blending,
2121 * color masking, and to obey the quad/pixel mask.
2122 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2123 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2124 * we could skip this load.
2126 spe_comment(f
, 0, "Fetch quad colors from tile");
2127 spe_lqx(f
, fbRGBA_reg
, color_tile_reg
, quad_offset_reg
);
2129 if (blend
->rt
[0].blend_enable
) {
2130 spe_comment(f
, 0, "Perform blending");
2131 gen_blend(blend
, blend_color
, f
, color_format
,
2132 fragR_reg
, fragG_reg
, fragB_reg
, fragA_reg
, fbRGBA_reg
);
2136 * Write fragment colors to framebuffer/tile.
2137 * This involves converting the fragment colors from float[4] to the
2138 * tile's specific format and obeying the quad/pixel mask.
2141 int rgba_reg
= spe_allocate_available_register(f
);
2143 /* Pack four float colors as four 32-bit int colors */
2144 spe_comment(f
, 0, "Convert float quad colors to packed int framebuffer colors");
2145 gen_pack_colors(f
, color_format
,
2146 fragR_reg
, fragG_reg
, fragB_reg
, fragA_reg
,
2149 if (blend
->logicop_enable
) {
2150 spe_comment(f
, 0, "Compute logic op");
2151 gen_logicop(blend
, f
, rgba_reg
, fbRGBA_reg
);
2154 if (blend
->rt
[0].colormask
!= PIPE_MASK_RGBA
) {
2155 spe_comment(f
, 0, "Compute color mask");
2156 gen_colormask(f
, blend
->rt
[0].colormask
, color_format
, rgba_reg
, fbRGBA_reg
);
2159 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2161 * rgba[i] = rgba[i];
2163 * rgba[i] = framebuffer[i];
2165 spe_selb(f
, rgba_reg
, fbRGBA_reg
, rgba_reg
, mask_reg
);
2167 /* Store updated quad in tile:
2168 * memory[color_tile + quad_offset] = rgba_reg;
2170 spe_comment(f
, 0, "Store quad colors into color tile");
2171 spe_stqx(f
, rgba_reg
, color_tile_reg
, quad_offset_reg
);
2173 spe_release_register(f
, rgba_reg
);
2176 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2178 spe_bi(f
, SPE_REG_RA
, 0, 0); /* return from function call */
2180 spe_release_register(f
, fbRGBA_reg
);
2181 spe_release_register(f
, quad_offset_reg
);
2183 if (cell
->debug_flags
& CELL_DEBUG_ASM
) {
2185 sprintf(buffer
, "End %s-facing per-fragment ops: %d instructions",
2186 facing
== CELL_FACING_FRONT
? "front" : "back", f
->num_inst
);
2187 spe_comment(f
, -4, buffer
);