- added instructions how to update the online documentation
[bochs-mirror.git] / cpu / sse_string.cc
blob1232e7c8c315326e8e45d677957481ca5748e8bf
1 /////////////////////////////////////////////////////////////////////////
2 // $Id: sse_string.cc,v 1.11 2008/09/06 18:21:29 sshwarts Exp $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2007 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 /////////////////////////////////////////////////////////////////////////
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
29 // Make code more tidy with a few macros.
30 #if BX_SUPPORT_X86_64==0
31 #define RCX ECX
32 #endif
34 #if (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
36 // Compare all pairs of Ai, Bj according to imm8 control
37 static void compare_strings(Bit8u BoolRes[16][16], BxPackedXmmRegister op1, BxPackedXmmRegister op2, Bit8u imm)
39 unsigned i, j;
40 unsigned aggregation_operation = (imm >> 2) & 3;
42 // All possible comparisons are performed, the individual boolean
43 // results of those comparisons are referred by
44 // BoolRes[op2 element index, op1 element index]
46 switch (imm & 3) {
47 case 0: /* unsigned bytes compare */
48 for (i=0;i<16;i++) {
49 for (j=0;j<16;j++) {
50 switch (aggregation_operation) {
51 case 0: /* 'equal' comparison */
52 case 2:
53 case 3:
54 BoolRes[j][i] = (op1.xmmubyte(i) == op2.xmmubyte(j));
55 break;
56 case 1: /* 'ranges' comparison */
57 if ((i % 2) == 0)
58 BoolRes[j][i] = (op1.xmmubyte(i) <= op2.xmmubyte(j));
59 else
60 BoolRes[j][i] = (op1.xmmubyte(i) >= op2.xmmubyte(j));
61 break;
65 break;
67 case 1: /* unsigned words compare */
68 for (i=0;i<8;i++) {
69 for (j=0;j<8;j++) {
70 switch (aggregation_operation) {
71 case 0: /* 'equal' comparison */
72 case 2:
73 case 3:
74 BoolRes[j][i] = (op1.xmm16u(i) == op2.xmm16u(j));
75 break;
76 case 1: /* 'ranges' comparison */
77 if ((i % 2) == 0)
78 BoolRes[j][i] = (op1.xmm16u(i) <= op2.xmm16u(j));
79 else
80 BoolRes[j][i] = (op1.xmm16u(i) >= op2.xmm16u(j));
81 break;
85 break;
87 case 2: /* signed bytes compare */
88 for (i=0;i<16;i++) {
89 for (j=0;j<16;j++) {
90 switch (aggregation_operation) {
91 case 0: /* 'equal' comparison */
92 case 2:
93 case 3:
94 BoolRes[j][i] = (op1.xmmsbyte(i) == op2.xmmsbyte(j));
95 break;
96 case 1: /* 'ranges' comparison */
97 if ((i % 2) == 0)
98 BoolRes[j][i] = (op1.xmmsbyte(i) <= op2.xmmsbyte(j));
99 else
100 BoolRes[j][i] = (op1.xmmsbyte(i) >= op2.xmmsbyte(j));
101 break;
105 break;
107 case 3: /* signed words compare */
108 for (i=0;i<8;i++) {
109 for (j=0;j<8;j++) {
110 switch (aggregation_operation) {
111 case 0: /* 'equal' comparison */
112 case 2:
113 case 3:
114 BoolRes[j][i] = (op1.xmm16s(i) == op2.xmm16s(j));
115 break;
116 case 1: /* 'ranges' comparison */
117 if ((i % 2) == 0)
118 BoolRes[j][i] = (op1.xmm16s(i) <= op2.xmm16s(j));
119 else
120 BoolRes[j][i] = (op1.xmm16s(i) >= op2.xmm16s(j));
121 break;
125 break;
129 static unsigned find_eos32(Bit32s reg32, Bit8u imm)
131 if (imm & 0x1) { // 8 elements
132 if (reg32 > 8 || reg32 < -8) return 8;
133 else return abs(reg32);
135 else { // 16 elements
136 if (reg32 > 16 || reg32 < -16) return 16;
137 else return abs(reg32);
#if BX_SUPPORT_X86_64
// Explicit-length helper (64-bit length register): returns |reg64|
// saturated to the number of elements selected by imm[0]
// (8 when the bit is set, 16 otherwise).
static unsigned find_eos64(Bit64s reg64, Bit8u imm)
{
  const Bit64s limit = (imm & 0x1) ? 8 : 16;

  // Anything outside [-limit, limit] saturates to the element count.
  if (reg64 > limit || reg64 < -limit)
    return (unsigned) limit;

  // In range, so the negation and the narrowing below are both lossless.
  return (unsigned) ((reg64 < 0) ? -reg64 : reg64);
}
#endif
155 static unsigned find_eos(BxPackedXmmRegister op, Bit8u imm)
157 unsigned i = 0;
159 if (imm & 0x1) { // 8 elements
160 for(i=0;i<8;i++)
161 if (op.xmm16u(i) == 0) break;
163 else { // 16 elements
164 for(i=0;i<16;i++)
165 if (op.xmmubyte(i) == 0) break;
168 return i;
171 static bx_bool override_if_data_invalid(bx_bool val, bx_bool i_valid, bx_bool j_valid, Bit8u imm)
173 unsigned aggregation_operation = (imm >> 2) & 3;
175 switch(aggregation_operation) {
176 case 0: // 'equal any'
177 case 1: // 'ranges'
178 if (! i_valid || ! j_valid) // one of the elements is invalid
179 return 0;
180 break;
182 case 2: // 'equal each'
183 if (! i_valid) {
184 if (! j_valid) return 1; // both elements are invalid
185 else return 0; // only i is invalid
187 else {
188 if (! j_valid) return 0; // only j is invalid
190 break;
192 case 3: // 'equal ordered'
193 if (! i_valid) { // element i is invalid
194 return 1;
196 else {
197 if (! j_valid) { // only j is invalid
198 return 0;
201 break;
204 return val;
207 static Bit16u aggregate(Bit8u BoolRes[16][16], unsigned len1, unsigned len2, Bit8u imm)
209 unsigned aggregation_operation = (imm >> 2) & 3;
210 unsigned num_elements = (imm & 0x1) ? 8 : 16;
211 unsigned polarity = (imm >> 4) & 3;
212 unsigned i,j,k;
214 Bit16u result = 0;
216 switch(aggregation_operation) {
217 case 0: // 'equal any'
218 for(j=0; j<num_elements; j++) {
219 bx_bool res = 0;
220 for(i=0; i<num_elements; i++) {
221 if (override_if_data_invalid(BoolRes[j][i], (i < len1), (j < len2), imm)) {
222 res = 1;
223 break;
227 if (res)
228 result |= (1<<j);
230 break;
232 case 1: // 'ranges'
233 for(j=0; j<num_elements; j++) {
234 bx_bool res = 0;
235 for(i=0; i<num_elements; i+=2) {
236 if (override_if_data_invalid(BoolRes[j][i], (i < len1), (j < len2), imm) &&
237 override_if_data_invalid(BoolRes[j][i+1], (i+1 < len1), (j < len2), imm)) {
238 res = 1;
239 break;
243 if (res)
244 result |= (1<<j);
246 break;
248 case 2: // 'equal each'
249 for(j=0; j<num_elements; j++) {
250 if (override_if_data_invalid(BoolRes[j][j], (j < len1), (j < len2), imm))
251 result |= (1<<j);
253 break;
255 case 3: // 'equal ordered'
256 for(j=0; j<num_elements; j++) {
257 bx_bool res = 1;
258 for (i=0, k=j; (i < num_elements-j) && (k < num_elements); i++, k++) {
259 if (! override_if_data_invalid(BoolRes[k][i], (i < len1), (k < len2), imm)) {
260 res = 0;
261 break;
265 if (res)
266 result |= (1<<j);
268 break;
271 switch(polarity) {
272 case 0:
273 case 2:
274 break; // do nothing
276 case 1:
277 result ^= (num_elements == 8) ? 0xFF : 0xFFFF;
278 break;
280 case 3:
281 for (j=0;j<num_elements;j++)
282 if (j < len2) result ^= (1<<j); // flip the bit
283 break;
286 return result;
289 #endif // (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
291 // for 3-byte opcodes
292 #if (BX_SUPPORT_SSE >= 4) || (BX_SUPPORT_SSE >= 3 && BX_SUPPORT_SSE_EXTENSION > 0)
294 /* 66 0F 3A 60 */
295 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPESTRM_VdqWdqIb(bxInstruction_c *i)
297 #if (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
298 BX_CPU_THIS_PTR prepareSSE();
300 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->nnn()), op2, result;
301 Bit8u imm8 = i->Ib();
303 /* op2 is a register or memory reference */
304 if (i->modC0()) {
305 op2 = BX_READ_XMM_REG(i->rm());
307 else {
308 bx_address eaddr = BX_CPU_CALL_METHODR(i->ResolveModrm, (i));
309 /* pointer, segment address pair */
310 readVirtualDQwordAligned(i->seg(), eaddr, (Bit8u *) &op2);
313 // compare all pairs of Ai, Bj
314 Bit8u BoolRes[16][16];
315 compare_strings(BoolRes, op1, op2, imm8);
316 unsigned len1, len2, num_elements = (imm8 & 0x1) ? 8 : 16;
318 #if BX_SUPPORT_X86_64
319 if (i->os64L()) {
320 len1 = find_eos64(RAX, imm8);
321 len2 = find_eos64(RDX, imm8);
323 else
324 #endif
326 len1 = find_eos32(EAX, imm8);
327 len2 = find_eos32(EDX, imm8);
329 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
331 // As defined by imm8[6], result2 is then either stored to the least
332 // significant bits of XMM0 (zero extended to 128 bits) or expanded
333 // into a byte/word-mask and then stored to XMM0
334 if (imm8 & 0x40) {
335 if (num_elements == 8) {
336 for (int index = 0; index < 8; index++)
337 result.xmm16u(index) = (result2 & (1<<index)) ? 0xffff : 0;
339 else { // num_elements = 16
340 for (int index = 0; index < 16; index++)
341 result.xmmubyte(index) = (result2 & (1<<index)) ? 0xff : 0;
344 else {
345 result.xmm64u(1) = 0;
346 result.xmm64u(0) = (Bit64u) result2;
349 Bit32u flags = 0;
350 if (result2 != 0) flags |= EFlagsCFMask;
351 if (len1 < num_elements) flags |= EFlagsSFMask;
352 if (len2 < num_elements) flags |= EFlagsZFMask;
353 if (result2 & 0x1)
354 flags |= EFlagsOFMask;
355 setEFlagsOSZAPC(flags);
357 BX_WRITE_XMM_REG(0, result); /* store result XMM0 */
358 #else
359 BX_INFO(("PCMPESTRM_VdqWdqIb: required SSE4.2, use --enable-sse and --enable-sse-extension options"));
360 exception(BX_UD_EXCEPTION, 0, 0);
361 #endif
364 /* 66 0F 3A 61 */
365 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPESTRI_VdqWdqIb(bxInstruction_c *i)
367 #if (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
368 BX_CPU_THIS_PTR prepareSSE();
370 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->nnn()), op2;
371 Bit8u imm8 = i->Ib();
373 /* op2 is a register or memory reference */
374 if (i->modC0()) {
375 op2 = BX_READ_XMM_REG(i->rm());
377 else {
378 bx_address eaddr = BX_CPU_CALL_METHODR(i->ResolveModrm, (i));
379 /* pointer, segment address pair */
380 readVirtualDQwordAligned(i->seg(), eaddr, (Bit8u *) &op2);
383 // compare all pairs of Ai, Bj
384 Bit8u BoolRes[16][16];
385 compare_strings(BoolRes, op1, op2, imm8);
386 unsigned len1, len2, num_elements = (imm8 & 0x1) ? 8 : 16;
387 int index;
389 #if BX_SUPPORT_X86_64
390 if (i->os64L()) {
391 len1 = find_eos64(RAX, imm8);
392 len2 = find_eos64(RDX, imm8);
394 else
395 #endif
397 len1 = find_eos32(EAX, imm8);
398 len2 = find_eos32(EDX, imm8);
400 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
402 // The index of the first (or last, according to imm8[6]) set bit of result2
403 // is returned to ECX. If no bits are set in IntRes2, ECX is set to 16 (8)
404 if (imm8 & 0x40) {
405 // The index returned to ECX is of the MSB in result2
406 for (index=num_elements-1; index>=0; index--)
407 if (result2 & (1<<index)) break;
408 if (index < 0) index = num_elements;
410 else {
411 // The index returned to ECX is of the LSB in result2
412 for (index=0; index<(int)num_elements; index++)
413 if (result2 & (1<<index)) break;
415 RCX = index;
417 Bit32u flags = 0;
418 if (result2 != 0) flags |= EFlagsCFMask;
419 if (len1 < num_elements) flags |= EFlagsSFMask;
420 if (len2 < num_elements) flags |= EFlagsZFMask;
421 if (result2 & 0x1)
422 flags |= EFlagsOFMask;
423 setEFlagsOSZAPC(flags);
425 #else
426 BX_INFO(("PCMPESTRI_VdqWdqIb: required SSE4.2, use --enable-sse and --enable-sse-extension options"));
427 exception(BX_UD_EXCEPTION, 0, 0);
428 #endif
431 /* 66 0F 3A 62 */
432 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPISTRM_VdqWdqIb(bxInstruction_c *i)
434 #if (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
435 BX_CPU_THIS_PTR prepareSSE();
437 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->nnn()), op2, result;
438 Bit8u imm8 = i->Ib();
440 /* op2 is a register or memory reference */
441 if (i->modC0()) {
442 op2 = BX_READ_XMM_REG(i->rm());
444 else {
445 bx_address eaddr = BX_CPU_CALL_METHODR(i->ResolveModrm, (i));
446 /* pointer, segment address pair */
447 readVirtualDQwordAligned(i->seg(), eaddr, (Bit8u *) &op2);
450 // compare all pairs of Ai, Bj
451 Bit8u BoolRes[16][16];
452 compare_strings(BoolRes, op1, op2, imm8);
454 unsigned num_elements = (imm8 & 0x1) ? 8 : 16;
455 unsigned len1 = find_eos(op1, imm8);
456 unsigned len2 = find_eos(op2, imm8);
457 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
459 // As defined by imm8[6], result2 is then either stored to the least
460 // significant bits of XMM0 (zero extended to 128 bits) or expanded
461 // into a byte/word-mask and then stored to XMM0
462 if (imm8 & 0x40) {
463 if (num_elements == 8) {
464 for (int index = 0; index < 8; index++)
465 result.xmm16u(index) = (result2 & (1<<index)) ? 0xffff : 0;
467 else { // num_elements = 16
468 for (int index = 0; index < 16; index++)
469 result.xmmubyte(index) = (result2 & (1<<index)) ? 0xff : 0;
472 else {
473 result.xmm64u(1) = 0;
474 result.xmm64u(0) = (Bit64u) result2;
477 Bit32u flags = 0;
478 if (result2 != 0) flags |= EFlagsCFMask;
479 if (len1 < num_elements) flags |= EFlagsSFMask;
480 if (len2 < num_elements) flags |= EFlagsZFMask;
481 if (result2 & 0x1)
482 flags |= EFlagsOFMask;
483 setEFlagsOSZAPC(flags);
485 BX_WRITE_XMM_REG(0, result); /* store result XMM0 */
486 #else
487 BX_INFO(("PCMPISTRM_VdqWdqIb: required SSE4.2, use --enable-sse and --enable-sse-extension options"));
488 exception(BX_UD_EXCEPTION, 0, 0);
489 #endif
492 /* 66 0F 3A 63 */
493 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPISTRI_VdqWdqIb(bxInstruction_c *i)
495 #if (BX_SUPPORT_SSE > 4) || (BX_SUPPORT_SSE >= 4 && BX_SUPPORT_SSE_EXTENSION > 0)
496 BX_CPU_THIS_PTR prepareSSE();
498 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->nnn()), op2;
499 Bit8u imm8 = i->Ib();
501 /* op2 is a register or memory reference */
502 if (i->modC0()) {
503 op2 = BX_READ_XMM_REG(i->rm());
505 else {
506 bx_address eaddr = BX_CPU_CALL_METHODR(i->ResolveModrm, (i));
507 /* pointer, segment address pair */
508 readVirtualDQwordAligned(i->seg(), eaddr, (Bit8u *) &op2);
511 // compare all pairs of Ai, Bj
512 Bit8u BoolRes[16][16];
513 compare_strings(BoolRes, op1, op2, imm8);
514 unsigned num_elements = (imm8 & 0x1) ? 8 : 16;
515 int index;
517 unsigned len1 = find_eos(op1, imm8);
518 unsigned len2 = find_eos(op2, imm8);
519 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
521 // The index of the first (or last, according to imm8[6]) set bit of result2
522 // is returned to ECX. If no bits are set in IntRes2, ECX is set to 16 (8)
523 if (imm8 & 0x40) {
524 // The index returned to ECX is of the MSB in result2
525 for (index=num_elements-1; index>=0; index--)
526 if (result2 & (1<<index)) break;
527 if (index < 0) index = num_elements;
529 else {
530 // The index returned to ECX is of the LSB in result2
531 for (index=0; index<(int)num_elements; index++)
532 if (result2 & (1<<index)) break;
534 RCX = index;
536 Bit32u flags = 0;
537 if (result2 != 0) flags |= EFlagsCFMask;
538 if (len1 < num_elements) flags |= EFlagsSFMask;
539 if (len2 < num_elements) flags |= EFlagsZFMask;
540 if (result2 & 0x1)
541 flags |= EFlagsOFMask;
542 setEFlagsOSZAPC(flags);
544 #else
545 BX_INFO(("PCMPISTRI_VdqWdqIb: required SSE4.2, use --enable-sse and --enable-sse-extension options"));
546 exception(BX_UD_EXCEPTION, 0, 0);
547 #endif
550 #endif // (BX_SUPPORT_SSE >= 4) || (BX_SUPPORT_SSE >= 3 && BX_SUPPORT_SSE_EXTENSION > 0)