Merge branch 'fixes' into main/rendor-staging
[ryzomcore.git] / nel / src / 3d / mesh_mrm_skinned_template.cpp
blobc6d25810e19209ddc7452d573339736f51f90cc6
1 /**
2 * File not compiled. Included from mesh_mrm_skinned.cpp. It is an "old school" template.
3 */
5 // NeL - MMORPG Framework <http://dev.ryzom.com/projects/nel/>
6 // Copyright (C) 2010 Winch Gate Property Limited
7 //
8 // This program is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Affero General Public License as
10 // published by the Free Software Foundation, either version 3 of the
11 // License, or (at your option) any later version.
13 // This program is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Affero General Public License for more details.
18 // You should have received a copy of the GNU Affero General Public License
19 // along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "std3d.h"
23 #ifdef DEBUG_NEW
24 #define new DEBUG_NEW
25 #endif
27 // ***************************************************************************
28 // ***************************************************************************
29 // Raw "Vertex/Normal only" ApplySkin methods.
30 // ***************************************************************************
31 // ***************************************************************************
33 // ace: #define ADD_MESH_MRM_SKINNED_TEMPLATE before including this file to enable its content.
34 // The goal is to be able to compile every .cpp file with no
35 // special case (needed for GNU/Linux builds).
36 #ifdef ADD_MESH_MRM_SKINNED_TEMPLATE
39 #define NL3D_RAWSKIN_NORMAL_OFF 12
40 #define NL3D_RAWSKIN_UV_OFF 24
41 #define NL3D_RAWSKIN_VERTEX_SIZE 32
44 /* Speed feature test.
45 Precaching is not used for now, because it seems to be slower on some configs (P4-2.4GHz),
46 although it may be faster on others (P3-800).
47 On a P4-2.4GHz, for 40000 skinned vertices, the combination of no precaching and ASM
48 saves 27% of the execution time in the applyRawSkinNormal*() loop (i.e. 1 ms).
50 #if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
51 //#define NL3D_RAWSKIN_PRECACHE
52 #define NL3D_RAWSKIN_ASM
53 #endif
56 // ***************************************************************************
// Skins nInf vertices that are each influenced by exactly one bone matrix.
// For every source vertex: the position is transformed by boneMat3x4[ src->MatrixId[0] ] (mulSetPoint),
// the normal by the same matrix (mulSetVector), and the UV is copied unchanged.
// Output layout per vertex (NL3D_RAWSKIN_VERTEX_SIZE bytes): position at offset 0,
// normal at NL3D_RAWSKIN_NORMAL_OFF, UV at NL3D_RAWSKIN_UV_OFF.
// src: first source vertex. destVertexPtr: first output vertex (may be AGP memory according to the comments below).
// boneMat3x4: matrix array indexed by MatrixId. nInf: number of vertices to process.
// NOTE(review): the ASM path tests its counter only after the first iteration (dec/jnz), so it must not be
// entered with a count of 0; the caller visible in this file only calls with nInf>0.
57 void CMeshMRMSkinnedGeom::applyArrayRawSkinNormal1(CRawVertexNormalSkinned1 *src, uint8 *destVertexPtr,
58 CMatrix3x4 *boneMat3x4, uint nInf)
60 // must write contiguously in AGP, and ASM is hardcoded...
61 nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12);
62 nlctassert(NL3D_RAWSKIN_UV_OFF==24);
64 /*extern uint TESTYOYO_NumRawSkinVertices1;
65 TESTYOYO_NumRawSkinVertices1+= nInf;
66 H_AUTO( TestYoyo_RawSkin1 );*/
// With NL3D_RAWSKIN_PRECACHE the vertices are processed in blocks of at most NumCacheVertexNormal1
// (declared outside this chunk), each block being precached first; otherwise a single block of nInf is processed.
68 #ifdef NL3D_RAWSKIN_PRECACHE
69 for(;nInf>0;)
71 // number of vertices to process for this block.
72 uint nBlockInf= min(NumCacheVertexNormal1, nInf);
73 // next block.
74 nInf-= nBlockInf;
76 // cache the data in L1 cache.
77 CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkinned1));
78 #else
80 uint nBlockInf= nInf;
81 #endif
84 #ifndef NL3D_RAWSKIN_ASM
85 // for all InfluencedVertices only.
86 for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
// NOTE(review): dstVertex and dstNormal are not used below; the stores re-cast destVertexPtr instead.
88 CVector *dstVertex= (CVector*)(destVertexPtr);
89 CVector *dstNormal= (CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF);
91 // For 1 matrix, can write directly to AGP (if destVertexPtr is AGP...)
92 // Vertex.
93 boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, *(CVector*)(destVertexPtr) );
94 // Normal.
95 boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) );
96 // UV copy.
97 *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
99 #else
100 // ASM hard coded for 36
101 nlctassert(sizeof(CRawVertexNormalSkinned1)==36);
103 /* 116 cycles / loop typical
104 58 cycles / loop in theory (no memory problem)
// Register use in the loop: ecx= remaining vertices, esi= src, edi= destVertexPtr, edx= boneMat3x4,
// eax= address of the matrix of the current vertex (reused as scratch for the UV copy).
106 __asm
108 mov ecx, nBlockInf
109 mov esi, src
110 mov edi, destVertexPtr
111 mov edx, boneMat3x4
112 theLoop:
113 // Vertex.
114 // **** boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, *(CVector*)(destVertexPtr) );
116 // eax= matrix
// eax= MatrixId*3*16 + edx, i.e. a stride of 48 bytes per matrix.
// NOTE(review): assumes sizeof(CMatrix3x4)==48 and a 32 bit MatrixId; neither is asserted here - confirm.
117 mov eax, [esi]src.MatrixId // uop: 0/1
118 lea eax, [eax*2+eax]
119 shl eax, 4
120 add eax, edx // uop: 1/0
122 // load x y z
// After the 3 loads the FPU stack is st(0)=z, st(1)=y, st(2)=x. Every fld of a matrix term pushes one
// more entry, which is why x, y and z are addressed as st(3), st(3) and st(2) in the rows below.
123 fld [esi]src.Vertex.x // uop: 0/1
124 fld [esi]src.Vertex.y // uop: 0/1
125 fld [esi]src.Vertex.z // uop: 0/1
126 // vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14);
127 fld [eax]CMatrix3x4.a11 // uop: 0/1
128 fmul st, st(3) // uop: 1/0 (5)
129 fld [eax]CMatrix3x4.a12 // uop: 0/1
130 fmul st, st(3) // uop: 1/0 (5)
131 faddp st(1), st // uop: 1/0 (3)
132 fld [eax]CMatrix3x4.a13 // uop: 0/1
133 fmul st, st(2) // uop: 1/0 (5)
134 faddp st(1), st // uop: 1/0 (3)
135 fld [eax]CMatrix3x4.a14 // uop: 0/1
136 faddp st(1), st // uop: 1/0 (3)
137 fstp dword ptr[edi] // uop: 0/0/1/1
138 // vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24);
139 fld [eax]CMatrix3x4.a21
140 fmul st, st(3)
141 fld [eax]CMatrix3x4.a22
142 fmul st, st(3)
143 faddp st(1), st
144 fld [eax]CMatrix3x4.a23
145 fmul st, st(2)
146 faddp st(1), st
147 fld [eax]CMatrix3x4.a24
148 faddp st(1), st
149 fstp dword ptr[edi+4]
150 // vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34);
151 fld [eax]CMatrix3x4.a31
152 fmul st, st(3)
153 fld [eax]CMatrix3x4.a32
154 fmul st, st(3)
155 faddp st(1), st
156 fld [eax]CMatrix3x4.a33
157 fmul st, st(2)
158 faddp st(1), st
159 fld [eax]CMatrix3x4.a34
160 faddp st(1), st
161 fstp dword ptr[edi+8]
162 // free x y z
163 fstp st // uop: 1/0
164 fstp st // uop: 1/0
165 fstp st // uop: 1/0
168 // Normal
169 // **** boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) );
171 // load x y z
172 fld [esi]src.Normal.x
173 fld [esi]src.Normal.y
174 fld [esi]src.Normal.z
175 // vout.x= (a11*vin.x + a12*vin.y + a13*vin.z); the a14 translation term is not added for a normal
176 fld [eax]CMatrix3x4.a11 // uop: 0/1
177 fmul st, st(3) // uop: 1/0 (5)
178 fld [eax]CMatrix3x4.a12 // uop: 0/1
179 fmul st, st(3) // uop: 1/0 (5)
180 faddp st(1), st // uop: 1/0 (3)
181 fld [eax]CMatrix3x4.a13 // uop: 0/1
182 fmul st, st(2) // uop: 1/0 (5)
183 faddp st(1), st // uop: 1/0 (3)
184 fstp dword ptr[edi+12] // uop: 0/0/1/1
185 // vout.y= (a21*vin.x + a22*vin.y + a23*vin.z); the a24 translation term is not added for a normal
186 fld [eax]CMatrix3x4.a21
187 fmul st, st(3)
188 fld [eax]CMatrix3x4.a22
189 fmul st, st(3)
190 faddp st(1), st
191 fld [eax]CMatrix3x4.a23
192 fmul st, st(2)
193 faddp st(1), st
194 fstp dword ptr[edi+16]
195 // vout.z= (a31*vin.x + a32*vin.y + a33*vin.z); the a34 translation term is not added for a normal
196 fld [eax]CMatrix3x4.a31
197 fmul st, st(3)
198 fld [eax]CMatrix3x4.a32
199 fmul st, st(3)
200 faddp st(1), st
201 fld [eax]CMatrix3x4.a33
202 fmul st, st(2)
203 faddp st(1), st
204 fstp dword ptr[edi+20]
205 // free x y z
206 fstp st
207 fstp st
208 fstp st
211 // UV copy.
212 // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
// U and V are copied as raw 32 bit values through eax (the matrix address in eax is no longer needed).
213 mov eax, [esi]src.UV.U // uop: 0/1
214 mov dword ptr[edi+24], eax // uop: 0/0/1/1
215 mov eax, [esi]src.UV.V // uop: 0/1
216 mov dword ptr[edi+28], eax // uop: 0/0/1/1
219 // **** next
// 36 is sizeof(CRawVertexNormalSkinned1), as checked by the nlctassert above.
220 add esi, 36 // uop: 1/0
221 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0
222 dec ecx // uop: 1/0
223 jnz theLoop // uop: 1/1 (p1)
// write the final counter and the advanced pointers back to the C++ variables
// (presumably so that a following precache block continues where this one stopped).
225 mov nBlockInf, ecx
226 mov src, esi
227 mov destVertexPtr, edi
229 #endif
235 // ***************************************************************************
// Skins nInf vertices that are each influenced by two bone matrices.
// For every source vertex the position and the normal are the weighted sum of their transform by
// boneMat3x4[ src->MatrixId[0] ] (weight src->Weights[0]) and boneMat3x4[ src->MatrixId[1] ] (weight src->Weights[1]);
// the UV is copied unchanged.
// Output layout per vertex (NL3D_RAWSKIN_VERTEX_SIZE bytes): position at offset 0,
// normal at NL3D_RAWSKIN_NORMAL_OFF, UV at NL3D_RAWSKIN_UV_OFF.
// src: first source vertex. destVertexPtr: first output vertex (may be AGP memory, see the comment on tmpVert).
// boneMat3x4: matrix array indexed by MatrixId. nInf: number of vertices to process.
// NOTE(review): the ASM path tests its counter only after the first iteration (dec/jnz), so it must not be
// entered with a count of 0; the caller visible in this file only calls with nInf>0.
236 void CMeshMRMSkinnedGeom::applyArrayRawSkinNormal2(CRawVertexNormalSkinned2 *src, uint8 *destVertexPtr,
237 CMatrix3x4 *boneMat3x4, uint nInf)
239 // must write contiguously in AGP, and ASM is hardcoded...
240 nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12);
241 nlctassert(NL3D_RAWSKIN_UV_OFF==24);
243 /*extern uint TESTYOYO_NumRawSkinVertices2;
244 TESTYOYO_NumRawSkinVertices2+= nInf;
245 H_AUTO( TestYoyo_RawSkin2 );*/
247 // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!)
// tmpVert is the accumulator of the C++ path; the blended result is copied to the destination in one store.
248 CVector tmpVert;
// With NL3D_RAWSKIN_PRECACHE the vertices are processed in blocks of at most NumCacheVertexNormal2
// (declared outside this chunk), each block being precached first; otherwise a single block of nInf is processed.
250 #ifdef NL3D_RAWSKIN_PRECACHE
251 for(;nInf>0;)
253 // number of vertices to process for this block.
254 uint nBlockInf= min(NumCacheVertexNormal2, nInf);
255 // next block.
256 nInf-= nBlockInf;
258 // cache the data in L1 cache.
259 CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkinned2));
260 #else
262 uint nBlockInf= nInf;
263 #endif
266 #ifndef NL3D_RAWSKIN_ASM
267 // for all InfluencedVertices only.
268 for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
270 // Vertex.
271 boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, src->Weights[0], tmpVert);
272 boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
273 *(CVector*)(destVertexPtr)= tmpVert;
274 // Normal.
275 boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
276 boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
277 *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
278 // UV copy.
279 *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
281 #else
282 // ASM hardcoded for 48
283 nlctassert(sizeof(CRawVertexNormalSkinned2)==48);
285 /* 154 cycles / loop typical
286 124 cycles / loop in theory (no memory problem)
// Register use in the loop: ecx= remaining vertices, esi= src, edi= destVertexPtr, edx= boneMat3x4,
// eax= address of matrix 0, ebx= address of matrix 1 (eax is reused as scratch for the UV copy).
288 __asm
290 mov ecx, nBlockInf
291 mov esi, src
292 mov edi, destVertexPtr
293 mov edx, boneMat3x4
294 theLoop:
295 // Vertex.
296 // **** weighted blend of the 2 matrices: same result as the mulSetPoint() / mulAddPoint() pair of the C++ path.
// Matrix addresses are MatrixId*3*16 + edx, i.e. a stride of 48 bytes per matrix.
// NOTE(review): assumes sizeof(CMatrix3x4)==48 and 4 byte MatrixId / Weights entries ([esi+0], [esi+4]);
// none of this is asserted here - confirm.
298 // eax= matrix0
299 mov eax, [esi+0]src.MatrixId // uop: 0/1
300 lea eax, [eax*2+eax]
301 shl eax, 4
302 add eax, edx // uop: 1/0
303 // ebx= matrix1
304 mov ebx, [esi+4]src.MatrixId // uop: 0/1
305 lea ebx, [ebx*2+ebx]
306 shl ebx, 4
307 add ebx, edx // uop: 1/0
309 // load x y z
// After the 3 loads the FPU stack is st(0)=z, st(1)=y, st(2)=x. For the 1st matrix x, y, z are addressed as
// st(3), st(3), st(2). For the 2nd matrix the weighted result of the 1st one stays on the stack above them,
// so they are addressed one slot deeper: st(4), st(4), st(3).
310 fld [esi]src.Vertex.x // uop: 0/1
311 fld [esi]src.Vertex.y // uop: 0/1
312 fld [esi]src.Vertex.z // uop: 0/1
314 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14);
315 // 1st Matrix
316 fld [eax]CMatrix3x4.a11 // uop: 0/1
317 fmul st, st(3) // uop: 1/0 (5)
318 fld [eax]CMatrix3x4.a12 // uop: 0/1
319 fmul st, st(3) // uop: 1/0 (5)
320 faddp st(1), st // uop: 1/0 (3)
321 fld [eax]CMatrix3x4.a13 // uop: 0/1
322 fmul st, st(2) // uop: 1/0 (5)
323 faddp st(1), st // uop: 1/0 (3)
324 fld [eax]CMatrix3x4.a14 // uop: 0/1
325 faddp st(1), st // uop: 1/0 (3)
326 // mul by scale
327 fmul [esi+0]src.Weights
329 // 2nd matrix
330 fld [ebx]CMatrix3x4.a11
331 fmul st, st(4)
332 fld [ebx]CMatrix3x4.a12
333 fmul st, st(4)
334 faddp st(1), st
335 fld [ebx]CMatrix3x4.a13
336 fmul st, st(3)
337 faddp st(1), st
338 fld [ebx]CMatrix3x4.a14
339 faddp st(1), st
340 // mul by scale, and append
341 fmul [esi+4]src.Weights
342 faddp st(1), st
344 // store
345 fstp dword ptr[edi] // uop: 0/0/1/1
347 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24);
348 fld [eax]CMatrix3x4.a21
349 fmul st, st(3)
350 fld [eax]CMatrix3x4.a22
351 fmul st, st(3)
352 faddp st(1), st
353 fld [eax]CMatrix3x4.a23
354 fmul st, st(2)
355 faddp st(1), st
356 fld [eax]CMatrix3x4.a24
357 faddp st(1), st
358 // mul by scale
359 fmul [esi+0]src.Weights
361 // 2nd matrix
362 fld [ebx]CMatrix3x4.a21
363 fmul st, st(4)
364 fld [ebx]CMatrix3x4.a22
365 fmul st, st(4)
366 faddp st(1), st
367 fld [ebx]CMatrix3x4.a23
368 fmul st, st(3)
369 faddp st(1), st
370 fld [ebx]CMatrix3x4.a24
371 faddp st(1), st
372 // mul by scale, and append
373 fmul [esi+4]src.Weights
374 faddp st(1), st
376 // store
377 fstp dword ptr[edi+4]
379 // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34);
380 fld [eax]CMatrix3x4.a31
381 fmul st, st(3)
382 fld [eax]CMatrix3x4.a32
383 fmul st, st(3)
384 faddp st(1), st
385 fld [eax]CMatrix3x4.a33
386 fmul st, st(2)
387 faddp st(1), st
388 fld [eax]CMatrix3x4.a34
389 faddp st(1), st
390 // mul by scale
391 fmul [esi+0]src.Weights
393 // 2nd matrix
394 fld [ebx]CMatrix3x4.a31
395 fmul st, st(4)
396 fld [ebx]CMatrix3x4.a32
397 fmul st, st(4)
398 faddp st(1), st
399 fld [ebx]CMatrix3x4.a33
400 fmul st, st(3)
401 faddp st(1), st
402 fld [ebx]CMatrix3x4.a34
403 faddp st(1), st
404 // mul by scale, and append
405 fmul [esi+4]src.Weights
406 faddp st(1), st
408 // store
409 fstp dword ptr[edi+8]
411 // free x y z
412 fstp st // uop: 1/0
413 fstp st // uop: 1/0
414 fstp st // uop: 1/0
417 // Normal
418 // **** weighted blend of the 2 matrices: same result as the mulSetVector() / mulAddVector() pair of the C++ path.
420 // load x y z
421 fld [esi]src.Normal.x
422 fld [esi]src.Normal.y
423 fld [esi]src.Normal.z
425 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z); the a14 translation term is not added for a normal
426 fld [eax]CMatrix3x4.a11 // uop: 0/1
427 fmul st, st(3) // uop: 1/0 (5)
428 fld [eax]CMatrix3x4.a12 // uop: 0/1
429 fmul st, st(3) // uop: 1/0 (5)
430 faddp st(1), st // uop: 1/0 (3)
431 fld [eax]CMatrix3x4.a13 // uop: 0/1
432 fmul st, st(2) // uop: 1/0 (5)
433 faddp st(1), st // uop: 1/0 (3)
434 // mul by scale
435 fmul [esi+0]src.Weights
437 // 2nd matrix
438 fld [ebx]CMatrix3x4.a11
439 fmul st, st(4)
440 fld [ebx]CMatrix3x4.a12
441 fmul st, st(4)
442 faddp st(1), st
443 fld [ebx]CMatrix3x4.a13
444 fmul st, st(3)
445 faddp st(1), st
446 // mul by scale, and append
447 fmul [esi+4]src.Weights
448 faddp st(1), st
450 // store
451 fstp dword ptr[edi+12] // uop: 0/0/1/1
453 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z); the a24 translation term is not added for a normal
454 fld [eax]CMatrix3x4.a21
455 fmul st, st(3)
456 fld [eax]CMatrix3x4.a22
457 fmul st, st(3)
458 faddp st(1), st
459 fld [eax]CMatrix3x4.a23
460 fmul st, st(2)
461 faddp st(1), st
462 // mul by scale
463 fmul [esi+0]src.Weights
465 // 2nd matrix
466 fld [ebx]CMatrix3x4.a21
467 fmul st, st(4)
468 fld [ebx]CMatrix3x4.a22
469 fmul st, st(4)
470 faddp st(1), st
471 fld [ebx]CMatrix3x4.a23
472 fmul st, st(3)
473 faddp st(1), st
474 // mul by scale, and append
475 fmul [esi+4]src.Weights
476 faddp st(1), st
478 // store
479 fstp dword ptr[edi+16]
481 // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z); the a34 translation term is not added for a normal
482 fld [eax]CMatrix3x4.a31
483 fmul st, st(3)
484 fld [eax]CMatrix3x4.a32
485 fmul st, st(3)
486 faddp st(1), st
487 fld [eax]CMatrix3x4.a33
488 fmul st, st(2)
489 faddp st(1), st
490 // mul by scale
491 fmul [esi+0]src.Weights
493 // 2nd matrix
494 fld [ebx]CMatrix3x4.a31
495 fmul st, st(4)
496 fld [ebx]CMatrix3x4.a32
497 fmul st, st(4)
498 faddp st(1), st
499 fld [ebx]CMatrix3x4.a33
500 fmul st, st(3)
501 faddp st(1), st
502 // mul by scale, and append
503 fmul [esi+4]src.Weights
504 faddp st(1), st
506 // store
507 fstp dword ptr[edi+20]
509 // free x y z
510 fstp st
511 fstp st
512 fstp st
515 // UV copy.
516 // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
// U and V are copied as raw 32 bit values through eax (the matrix 0 address in eax is no longer needed).
517 mov eax, [esi]src.UV.U // uop: 0/1
518 mov dword ptr[edi+24], eax // uop: 0/0/1/1
519 mov eax, [esi]src.UV.V // uop: 0/1
520 mov dword ptr[edi+28], eax // uop: 0/0/1/1
523 // **** next
// 48 is sizeof(CRawVertexNormalSkinned2), as checked by the nlctassert above.
524 add esi, 48 // uop: 1/0
525 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0
526 dec ecx // uop: 1/0
527 jnz theLoop // uop: 1/1 (p1)
// write the final counter and the advanced pointers back to the C++ variables
// (presumably so that a following precache block continues where this one stopped).
529 mov nBlockInf, ecx
530 mov src, esi
531 mov destVertexPtr, edi
533 #endif
538 // ***************************************************************************
// Skins nInf vertices that are each influenced by three bone matrices.
// For every source vertex the position and the normal are the weighted sum of their transform by
// boneMat3x4[ src->MatrixId[i] ] with weight src->Weights[i], for i= 0..2; the UV is copied unchanged.
// Output layout per vertex (NL3D_RAWSKIN_VERTEX_SIZE bytes): position at offset 0,
// normal at NL3D_RAWSKIN_NORMAL_OFF, UV at NL3D_RAWSKIN_UV_OFF.
// src: first source vertex. destVertexPtr: first output vertex (may be AGP memory, see the comment on tmpVert).
// boneMat3x4: matrix array indexed by MatrixId. nInf: number of vertices to process.
// NOTE(review): the ASM path tests its counter only after the first iteration (dec/jnz), so it must not be
// entered with a count of 0; the caller visible in this file only calls with nInf>0.
539 void CMeshMRMSkinnedGeom::applyArrayRawSkinNormal3(CRawVertexNormalSkinned3 *src, uint8 *destVertexPtr,
540 CMatrix3x4 *boneMat3x4, uint nInf)
542 // must write contiguously in AGP, and ASM is hardcoded...
543 nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12);
544 nlctassert(NL3D_RAWSKIN_UV_OFF==24);
546 /*extern uint TESTYOYO_NumRawSkinVertices3;
547 TESTYOYO_NumRawSkinVertices3+= nInf;
548 H_AUTO( TestYoyo_RawSkin3 );*/
550 // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!)
// tmpVert is the accumulator of the C++ path; the blended result is copied to the destination in one store.
551 CVector tmpVert;
// With NL3D_RAWSKIN_PRECACHE the vertices are processed in blocks of at most NumCacheVertexNormal3
// (declared outside this chunk), each block being precached first; otherwise a single block of nInf is processed.
553 #ifdef NL3D_RAWSKIN_PRECACHE
554 for(;nInf>0;)
556 // number of vertices to process for this block.
557 uint nBlockInf= min(NumCacheVertexNormal3, nInf);
558 // next block.
559 nInf-= nBlockInf;
561 // cache the data in L1 cache.
562 CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkinned3));
563 #else
565 uint nBlockInf= nInf;
566 #endif
569 #ifndef NL3D_RAWSKIN_ASM
570 // for all InfluencedVertices only.
571 for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
573 // Vertex.
574 boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, src->Weights[0], tmpVert);
575 boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
576 boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex, src->Weights[2], tmpVert);
577 *(CVector*)(destVertexPtr)= tmpVert;
578 // Normal.
579 boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
580 boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
581 boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Normal, src->Weights[2], tmpVert);
582 *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
583 // UV copy.
584 *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
586 #else
587 // ASM hard coded for 56
588 nlctassert(sizeof(CRawVertexNormalSkinned3)==56);
591 /* 226 cycles / loop typical
592 192 cycles / loop in theory (no memory problem)
593 148 optimal
// Register use in the loop: ecx= remaining vertices, esi= src, edi= destVertexPtr,
// eax= address of matrix 0, ebx= address of matrix 1, edx= address of matrix 2.
// Since edx holds a matrix address here, boneMat3x4 is added from its C++ variable instead of a register.
595 __asm
597 mov ecx, nBlockInf
598 mov esi, src
599 mov edi, destVertexPtr
600 theLoop:
601 // Vertex.
602 // **** weighted blend of the 3 matrices: same result as the mulSetPoint() / mulAddPoint() calls of the C++ path.
// Matrix addresses are MatrixId*3*16 + boneMat3x4, i.e. a stride of 48 bytes per matrix.
// NOTE(review): assumes sizeof(CMatrix3x4)==48 and 4 byte MatrixId / Weights entries ([esi+0], [esi+4], [esi+8]);
// none of this is asserted here - confirm.
604 // eax= matrix0
605 mov eax, [esi+0]src.MatrixId // uop: 0/1
606 lea eax, [eax*2+eax]
607 shl eax, 4
608 add eax, boneMat3x4 // uop: 1/0
609 // ebx= matrix1
610 mov ebx, [esi+4]src.MatrixId // uop: 0/1
611 lea ebx, [ebx*2+ebx]
612 shl ebx, 4
613 add ebx, boneMat3x4 // uop: 1/0
614 // edx= matrix2
615 mov edx, [esi+8]src.MatrixId // uop: 0/1
616 lea edx, [edx*2+edx]
617 shl edx, 4
618 add edx, boneMat3x4 // uop: 1/0
620 // load x y z
// After the 3 loads the FPU stack is st(0)=z, st(1)=y, st(2)=x. For the 1st matrix x, y, z are addressed as
// st(3), st(3), st(2). For the 2nd and 3rd matrices the accumulated result stays on the stack above them,
// so they are addressed one slot deeper: st(4), st(4), st(3).
621 fld [esi]src.Vertex.x // uop: 0/1
622 fld [esi]src.Vertex.y // uop: 0/1
623 fld [esi]src.Vertex.z // uop: 0/1
625 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14);
626 // 1st Matrix
627 fld [eax]CMatrix3x4.a11 // uop: 0/1
628 fmul st, st(3) // uop: 1/0 (5)
629 fld [eax]CMatrix3x4.a12 // uop: 0/1
630 fmul st, st(3) // uop: 1/0 (5)
631 faddp st(1), st // uop: 1/0 (3)
632 fld [eax]CMatrix3x4.a13 // uop: 0/1
633 fmul st, st(2) // uop: 1/0 (5)
634 faddp st(1), st // uop: 1/0 (3)
635 fld [eax]CMatrix3x4.a14 // uop: 0/1
636 faddp st(1), st // uop: 1/0 (3)
637 // mul by scale
638 fmul [esi+0]src.Weights
640 // 2nd matrix
641 fld [ebx]CMatrix3x4.a11
642 fmul st, st(4)
643 fld [ebx]CMatrix3x4.a12
644 fmul st, st(4)
645 faddp st(1), st
646 fld [ebx]CMatrix3x4.a13
647 fmul st, st(3)
648 faddp st(1), st
649 fld [ebx]CMatrix3x4.a14
650 faddp st(1), st
651 // mul by scale, and append
652 fmul [esi+4]src.Weights
653 faddp st(1), st
655 // 3rd matrix
656 fld [edx]CMatrix3x4.a11
657 fmul st, st(4)
658 fld [edx]CMatrix3x4.a12
659 fmul st, st(4)
660 faddp st(1), st
661 fld [edx]CMatrix3x4.a13
662 fmul st, st(3)
663 faddp st(1), st
664 fld [edx]CMatrix3x4.a14
665 faddp st(1), st
666 // mul by scale, and append
667 fmul [esi+8]src.Weights
668 faddp st(1), st
670 // store
671 fstp dword ptr[edi] // uop: 0/0/1/1
673 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24);
674 fld [eax]CMatrix3x4.a21
675 fmul st, st(3)
676 fld [eax]CMatrix3x4.a22
677 fmul st, st(3)
678 faddp st(1), st
679 fld [eax]CMatrix3x4.a23
680 fmul st, st(2)
681 faddp st(1), st
682 fld [eax]CMatrix3x4.a24
683 faddp st(1), st
684 // mul by scale
685 fmul [esi+0]src.Weights
687 // 2nd matrix
688 fld [ebx]CMatrix3x4.a21
689 fmul st, st(4)
690 fld [ebx]CMatrix3x4.a22
691 fmul st, st(4)
692 faddp st(1), st
693 fld [ebx]CMatrix3x4.a23
694 fmul st, st(3)
695 faddp st(1), st
696 fld [ebx]CMatrix3x4.a24
697 faddp st(1), st
698 // mul by scale, and append
699 fmul [esi+4]src.Weights
700 faddp st(1), st
702 // 3rd matrix
703 fld [edx]CMatrix3x4.a21
704 fmul st, st(4)
705 fld [edx]CMatrix3x4.a22
706 fmul st, st(4)
707 faddp st(1), st
708 fld [edx]CMatrix3x4.a23
709 fmul st, st(3)
710 faddp st(1), st
711 fld [edx]CMatrix3x4.a24
712 faddp st(1), st
713 // mul by scale, and append
714 fmul [esi+8]src.Weights
715 faddp st(1), st
717 // store
718 fstp dword ptr[edi+4]
720 // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34);
721 fld [eax]CMatrix3x4.a31
722 fmul st, st(3)
723 fld [eax]CMatrix3x4.a32
724 fmul st, st(3)
725 faddp st(1), st
726 fld [eax]CMatrix3x4.a33
727 fmul st, st(2)
728 faddp st(1), st
729 fld [eax]CMatrix3x4.a34
730 faddp st(1), st
731 // mul by scale
732 fmul [esi+0]src.Weights
734 // 2nd matrix
735 fld [ebx]CMatrix3x4.a31
736 fmul st, st(4)
737 fld [ebx]CMatrix3x4.a32
738 fmul st, st(4)
739 faddp st(1), st
740 fld [ebx]CMatrix3x4.a33
741 fmul st, st(3)
742 faddp st(1), st
743 fld [ebx]CMatrix3x4.a34
744 faddp st(1), st
745 // mul by scale, and append
746 fmul [esi+4]src.Weights
747 faddp st(1), st
749 // 3rd matrix
750 fld [edx]CMatrix3x4.a31
751 fmul st, st(4)
752 fld [edx]CMatrix3x4.a32
753 fmul st, st(4)
754 faddp st(1), st
755 fld [edx]CMatrix3x4.a33
756 fmul st, st(3)
757 faddp st(1), st
758 fld [edx]CMatrix3x4.a34
759 faddp st(1), st
760 // mul by scale, and append
761 fmul [esi+8]src.Weights
762 faddp st(1), st
764 // store
765 fstp dword ptr[edi+8]
767 // free x y z
768 fstp st // uop: 1/0
769 fstp st // uop: 1/0
770 fstp st // uop: 1/0
773 // Normal
774 // **** weighted blend of the 3 matrices: same result as the mulSetVector() / mulAddVector() calls of the C++ path.
776 // load x y z
777 fld [esi]src.Normal.x
778 fld [esi]src.Normal.y
779 fld [esi]src.Normal.z
780 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z); the a14 translation term is not added for a normal
781 fld [eax]CMatrix3x4.a11 // uop: 0/1
782 fmul st, st(3) // uop: 1/0 (5)
783 fld [eax]CMatrix3x4.a12 // uop: 0/1
784 fmul st, st(3) // uop: 1/0 (5)
785 faddp st(1), st // uop: 1/0 (3)
786 fld [eax]CMatrix3x4.a13 // uop: 0/1
787 fmul st, st(2) // uop: 1/0 (5)
788 faddp st(1), st // uop: 1/0 (3)
789 // mul by scale
790 fmul [esi+0]src.Weights
792 // 2nd matrix
793 fld [ebx]CMatrix3x4.a11
794 fmul st, st(4)
795 fld [ebx]CMatrix3x4.a12
796 fmul st, st(4)
797 faddp st(1), st
798 fld [ebx]CMatrix3x4.a13
799 fmul st, st(3)
800 faddp st(1), st
801 // mul by scale, and append
802 fmul [esi+4]src.Weights
803 faddp st(1), st
805 // 3rd matrix
806 fld [edx]CMatrix3x4.a11
807 fmul st, st(4)
808 fld [edx]CMatrix3x4.a12
809 fmul st, st(4)
810 faddp st(1), st
811 fld [edx]CMatrix3x4.a13
812 fmul st, st(3)
813 faddp st(1), st
814 // mul by scale, and append
815 fmul [esi+8]src.Weights
816 faddp st(1), st
818 // store
819 fstp dword ptr[edi+12] // uop: 0/0/1/1
821 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z); the a24 translation term is not added for a normal
822 fld [eax]CMatrix3x4.a21
823 fmul st, st(3)
824 fld [eax]CMatrix3x4.a22
825 fmul st, st(3)
826 faddp st(1), st
827 fld [eax]CMatrix3x4.a23
828 fmul st, st(2)
829 faddp st(1), st
830 // mul by scale
831 fmul [esi+0]src.Weights
833 // 2nd matrix
834 fld [ebx]CMatrix3x4.a21
835 fmul st, st(4)
836 fld [ebx]CMatrix3x4.a22
837 fmul st, st(4)
838 faddp st(1), st
839 fld [ebx]CMatrix3x4.a23
840 fmul st, st(3)
841 faddp st(1), st
842 // mul by scale, and append
843 fmul [esi+4]src.Weights
844 faddp st(1), st
846 // 3rd matrix
847 fld [edx]CMatrix3x4.a21
848 fmul st, st(4)
849 fld [edx]CMatrix3x4.a22
850 fmul st, st(4)
851 faddp st(1), st
852 fld [edx]CMatrix3x4.a23
853 fmul st, st(3)
854 faddp st(1), st
855 // mul by scale, and append
856 fmul [esi+8]src.Weights
857 faddp st(1), st
859 // store
860 fstp dword ptr[edi+16]
862 // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z); the a34 translation term is not added for a normal
863 fld [eax]CMatrix3x4.a31
864 fmul st, st(3)
865 fld [eax]CMatrix3x4.a32
866 fmul st, st(3)
867 faddp st(1), st
868 fld [eax]CMatrix3x4.a33
869 fmul st, st(2)
870 faddp st(1), st
871 // mul by scale
872 fmul [esi+0]src.Weights
874 // 2nd matrix
875 fld [ebx]CMatrix3x4.a31
876 fmul st, st(4)
877 fld [ebx]CMatrix3x4.a32
878 fmul st, st(4)
879 faddp st(1), st
880 fld [ebx]CMatrix3x4.a33
881 fmul st, st(3)
882 faddp st(1), st
883 // mul by scale, and append
884 fmul [esi+4]src.Weights
885 faddp st(1), st
887 // 3rd matrix
888 fld [edx]CMatrix3x4.a31
889 fmul st, st(4)
890 fld [edx]CMatrix3x4.a32
891 fmul st, st(4)
892 faddp st(1), st
893 fld [edx]CMatrix3x4.a33
894 fmul st, st(3)
895 faddp st(1), st
896 // mul by scale, and append
897 fmul [esi+8]src.Weights
898 faddp st(1), st
900 // store
901 fstp dword ptr[edi+20]
903 // free x y z
904 fstp st
905 fstp st
906 fstp st
909 // UV copy.
910 // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
// U and V are copied as raw 32 bit values through eax (the matrix 0 address in eax is no longer needed).
911 mov eax, [esi]src.UV.U // uop: 0/1
912 mov dword ptr[edi+24], eax // uop: 0/0/1/1
913 mov eax, [esi]src.UV.V // uop: 0/1
914 mov dword ptr[edi+28], eax // uop: 0/0/1/1
917 // **** next
// 56 is sizeof(CRawVertexNormalSkinned3), as checked by the nlctassert above.
918 add esi, 56 // uop: 1/0
919 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0
920 dec ecx // uop: 1/0
921 jnz theLoop // uop: 1/1 (p1)
// write the final counter and the advanced pointers back to the C++ variables
// (presumably so that a following precache block continues where this one stopped).
923 mov nBlockInf, ecx
924 mov src, esi
925 mov destVertexPtr, edi
927 #endif
932 // ***************************************************************************
// Skins nInf vertices that are each influenced by four bone matrices (C++ only, there is no ASM variant).
// For every source vertex the position and the normal are the weighted sum of their transform by
// boneMat3x4[ src->MatrixId[i] ] with weight src->Weights[i], for i= 0..3; the UV is copied unchanged.
// Output layout per vertex (NL3D_RAWSKIN_VERTEX_SIZE bytes): position at offset 0,
// normal at NL3D_RAWSKIN_NORMAL_OFF, UV at NL3D_RAWSKIN_UV_OFF.
// src: first source vertex. destVertexPtr: first output vertex (may be AGP memory, see the comment on tmpVert).
// boneMat3x4: matrix array indexed by MatrixId. nInf: number of vertices to process.
933 void CMeshMRMSkinnedGeom::applyArrayRawSkinNormal4(CRawVertexNormalSkinned4 *src, uint8 *destVertexPtr,
934 CMatrix3x4 *boneMat3x4, uint nInf)
936 // must write contiguously in AGP, and ASM is hardcoded...
937 nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12);
938 nlctassert(NL3D_RAWSKIN_UV_OFF==24);
940 /*extern uint TESTYOYO_NumRawSkinVertices4;
941 TESTYOYO_NumRawSkinVertices4+= nInf;
942 H_AUTO( TestYoyo_RawSkin4 );*/
944 // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!)
// tmpVert is the accumulator; the blended result is copied to the destination in one store.
945 CVector tmpVert;
// With NL3D_RAWSKIN_PRECACHE the vertices are processed in blocks of at most NumCacheVertexNormal4
// (declared outside this chunk), each block being precached first; otherwise a single block of nInf is processed.
947 #ifdef NL3D_RAWSKIN_PRECACHE
948 for(;nInf>0;)
950 // number of vertices to process for this block.
951 uint nBlockInf= min(NumCacheVertexNormal4, nInf);
952 // next block.
953 nInf-= nBlockInf;
955 // cache the data in L1 cache.
956 CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkinned4));
957 #else
959 uint nBlockInf= nInf;
960 #endif
962 // for all InfluencedVertices only.
963 for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
965 // Vertex.
// the first call sets tmpVert, the following ones accumulate into it.
966 boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, src->Weights[0], tmpVert);
967 boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
968 boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex, src->Weights[2], tmpVert);
969 boneMat3x4[ src->MatrixId[3] ].mulAddPoint( src->Vertex, src->Weights[3], tmpVert);
970 *(CVector*)(destVertexPtr)= tmpVert;
971 // Normal.
972 boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
973 boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
974 boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Normal, src->Weights[2], tmpVert);
975 boneMat3x4[ src->MatrixId[3] ].mulAddVector( src->Normal, src->Weights[3], tmpVert);
976 *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
977 // UV copy.
978 *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
981 // NB: ASM not done for 4 matrices per vertex, because this case is very rare and negligible ...
986 // ***************************************************************************
// Skins one lod from its raw-skin cache and writes position/normal/UV vertices into vbHard.
// lod: supplies MatrixInfluences for the bone matrix computation.
// rawSkinLod: cache holding Vertices1..Vertices4 (vertices grouped by number of influencing matrices),
// the per-group SoftVertices / HardVertices counts, their totals, and the Geomorphs.
// skeleton: passed to computeBoneMatrixes3x4().
// vbHard: destination buffer. As addressed below, it holds Geomorphs.size() vertices first, then the
// TotalSoftVertices "soft" vertices, then the "hard" vertices, NL3D_RAWSKIN_VERTEX_SIZE bytes each.
// alphaLod: geomorph blend factor, clamped to [0,1] before use.
// NOTE(review): boneMat3x4 and tempSkin are function-local statics, so calls share them; looks intended to
// avoid reallocations, but it also means the function is not safe to run concurrently - confirm.
987 void CMeshMRMSkinnedGeom::applyRawSkinWithNormal(CLod &lod, CRawSkinnedNormalCache &rawSkinLod, const CSkeletonModel *skeleton, uint8 *vbHard, float alphaLod)
989 // Some assert
990 //===========================
992 // assert, code below is written especially for 4 per vertex.
993 nlassert( NL3D_MESH_MRM_SKINNED_MAX_MATRIX==4 );
996 // Compute useful Matrix for this lod.
997 //===========================
998 // Those arrays map the array of bones in skeleton.
999 static vector<CMatrix3x4> boneMat3x4;
1000 computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton);
1003 // TestYoyo
1004 /*extern uint TESTYOYO_NumRawSkinVertices;
1005 TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices1.size();
1006 TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices2.size();
1007 TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices3.size();
1008 TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices4.size();*/
1011 uint nInf;
1013 // Manage "SoftVertices"
1014 if(rawSkinLod.TotalSoftVertices)
1016 // apply skinning into Temp RAM for vertices that are Src of Geomorph
1017 //===========================
// the temp buffer only grows: it is resized when too small and never shrunk.
1018 static vector<uint8> tempSkin;
1019 uint tempVbSize= rawSkinLod.TotalSoftVertices*NL3D_RAWSKIN_VERTEX_SIZE;
1020 if(tempSkin.size() < tempVbSize)
1021 tempSkin.resize(tempVbSize);
1022 uint8 *destVertexPtr= &tempSkin[0];
// The 4 groups are skinned one after the other into the temp buffer; the soft vertices of each group
// are the first SoftVertices[i] entries of VerticesN.
1024 // 1 Matrix
1025 nInf= rawSkinLod.SoftVertices[0];
1026 if(nInf>0)
1028 applyArrayRawSkinNormal1(&rawSkinLod.Vertices1[0], destVertexPtr, &boneMat3x4[0], nInf);
1029 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1031 // 2 Matrix
1032 nInf= rawSkinLod.SoftVertices[1];
1033 if(nInf>0)
1035 applyArrayRawSkinNormal2(&rawSkinLod.Vertices2[0], destVertexPtr, &boneMat3x4[0], nInf);
1036 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1038 // 3 Matrix
1039 nInf= rawSkinLod.SoftVertices[2];
1040 if(nInf>0)
1042 applyArrayRawSkinNormal3(&rawSkinLod.Vertices3[0], destVertexPtr, &boneMat3x4[0], nInf);
1043 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1045 // 4 Matrix
1046 nInf= rawSkinLod.SoftVertices[3];
1047 if(nInf>0)
1049 applyArrayRawSkinNormal4(&rawSkinLod.Vertices4[0], destVertexPtr, &boneMat3x4[0], nInf);
1050 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1053 // Fast Copy this into AGP Ram. NB: done before Geomorphs, because ensure some precaching this way!!
1054 //===========================
1055 // Skip geomorphs: the soft vertices are stored right after them in vbHard.
1056 uint8 *vbHardStart= vbHard + rawSkinLod.Geomorphs.size()*NL3D_RAWSKIN_VERTEX_SIZE;
1058 // fast copy
1059 CFastMem::memcpy(vbHardStart, &tempSkin[0], tempVbSize);
1061 // Geomorphs directly into AGP Ram
1062 //===========================
// NOTE(review): the result of clamp() is not assigned, so it is presumably modifying alphaLod in place - confirm.
1063 clamp(alphaLod, 0.f, 1.f);
1064 float a= alphaLod;
1065 float a1= 1 - alphaLod;
1067 // Fast Geomorph
// the geomorph sources are read from the temp buffer (system RAM), the results are written at the start of vbHard.
1068 applyGeomorphPosNormalUV0(rawSkinLod.Geomorphs, &tempSkin[0], vbHard, NL3D_RAWSKIN_VERTEX_SIZE, a, a1);
1071 // Manage HardVertices
1072 if(rawSkinLod.TotalHardVertices)
1074 // apply skinning directly into AGP RAM for vertices that are not Src of Geomorph
1075 //===========================
1076 uint startId;
1078 // Skip Geomorphs and SoftVertices.
1079 uint8 *destVertexPtr= vbHard + (rawSkinLod.Geomorphs.size()+rawSkinLod.TotalSoftVertices)*NL3D_RAWSKIN_VERTEX_SIZE;
// In each VerticesN array the hard vertices follow the soft ones, hence startId= SoftVertices[i].
1081 // 1 Matrix
1082 nInf= rawSkinLod.HardVertices[0];
1083 startId= rawSkinLod.SoftVertices[0];
1084 if(nInf>0)
1086 applyArrayRawSkinNormal1(&rawSkinLod.Vertices1[startId], destVertexPtr, &boneMat3x4[0], nInf);
1087 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1089 // 2 Matrix
1090 nInf= rawSkinLod.HardVertices[1];
1091 startId= rawSkinLod.SoftVertices[1];
1092 if(nInf>0)
1094 applyArrayRawSkinNormal2(&rawSkinLod.Vertices2[startId], destVertexPtr, &boneMat3x4[0], nInf);
1095 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1097 // 3 Matrix
1098 nInf= rawSkinLod.HardVertices[2];
1099 startId= rawSkinLod.SoftVertices[2];
1100 if(nInf>0)
1102 applyArrayRawSkinNormal3(&rawSkinLod.Vertices3[startId], destVertexPtr, &boneMat3x4[0], nInf);
1103 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1105 // 4 Matrix
1106 nInf= rawSkinLod.HardVertices[3];
1107 startId= rawSkinLod.SoftVertices[3];
1108 if(nInf>0)
1110 applyArrayRawSkinNormal4(&rawSkinLod.Vertices4[startId], destVertexPtr, &boneMat3x4[0], nInf);
1111 destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE;
1117 #endif // ADD_MESH_MRM_SKINNED_TEMPLATE