Reestructured format strings
[x-diff.git] / C++ / XDiff.cpp
blobe29a1fbfbe36cf053472dcadb053de13bba14a0c
1 /**
2 * Copyright (c) 2001 - 2005
3 * Yuan Wang. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Redistributions in any form must be accompanied by information on
14 * how to obtain complete source code for the X-Diff software and any
15 * accompanying software that uses the X-Diff software. The source code
16 * must either be included in the distribution or be available for no
17 * more than the cost of distribution plus a nominal fee, and must be
18 * freely redistributable under reasonable conditions. For an executable
19 * file, complete source code means the source code for all modules it
20 * contains. It does not include source code for modules or files that
21 * typically accompany the major components of the operating system on
22 * which the executable file runs.
24 * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
25 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
27 * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
28 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
38 #include "XDiff.hpp"
40 #include <xercesc/sax/InputSource.hpp>
41 #include <xercesc/framework/LocalFileInputSource.hpp>
43 using namespace xercesc;
45 using std::vector;
46 using std::string;
47 using std::ofstream;
48 using std::ifstream;
49 using std::ios;
50 using std::cin;
51 using std::cout;
52 using std::cerr;
53 using std::endl;
55 const int XDiff::_CIRCUIT_SIZE = 2048;
56 const int XDiff::_MATRIX_SIZE = 1024;
57 const int XDiff::_ATTRIBUTE_SIZE = 1024;
58 const int XDiff::_TEXT_SIZE = 1024;
60 XDiff::XDiff(const char* input1, const char* input2,
61 bool optimal, bool greedy, double no_match_threshold, int sample_count)
63 // open the first file.
64 XMLCh *wstr1 = XMLString::transcode(input1);
65 InputSource *source1 = new LocalFileInputSource(wstr1);
66 source1->setPublicId(wstr1);
67 if (wstr1) XMLString::release(&wstr1);
69 // open the second file.
70 XMLCh *wstr2 = XMLString::transcode(input2);
71 InputSource *source2 = new LocalFileInputSource(wstr2);
72 source2->setPublicId(wstr2);
73 if (wstr2) XMLString::release(&wstr2);
75 init(*source1, *source2, optimal, greedy, no_match_threshold, sample_count);
77 if (source2) delete source2;
78 if (source1) delete source1;
82 void XDiff::init(const InputSource &source1, const InputSource &source2,
83 bool optimal, bool greedy, double no_match_threshold, int sample_count)
85 struct timeval *tv0 = new struct timeval;
86 struct timeval *tv1 = new struct timeval;
87 struct timeval *tv2 = new struct timeval;
88 struct timeval *tv3 = new struct timeval;
89 struct timezone *tz = new struct timezone;
91 _xlut = NULL;
92 _attrList1 = NULL;
93 _attrList2 = NULL;
94 _attrMatch = NULL;
95 _attrHash = NULL;
96 _attrTag = NULL;
97 _circuit = NULL;
98 _leastCostMatrix = NULL;
99 _pathMatrix = NULL;
101 _sampleCount = sample_count;
102 _NO_MATCH_THRESHOLD = no_match_threshold;
103 _oFlag = optimal;
104 _gFlag = greedy;
106 // starting time.
107 gettimeofday(tv0, tz);
109 // parse the first file.
110 XParser *parser1 = new XParser();
111 _xtree1 = parser1->parse(source1);
112 delete parser1;
113 gettimeofday(tv1, tz);
115 // parse the second file.
116 XParser *parser2 = new XParser();
117 _xtree2 = parser2->parse(source2);
118 delete parser2;
119 gettimeofday(tv2, tz);
121 // check both root nodes.
122 int root1 = _xtree1->getRoot();
123 int root2 = _xtree2->getRoot();
124 if (_xtree1->getHashValue(root1) == _xtree2->getHashValue(root2))
126 cerr << "No difference!" << endl;
127 cerr << "Execution time:\t\t" << _diffTime(tv0, tv2) << " s\n";
128 cerr << "Parsing " << XMLString::transcode(source1.getPublicId())
129 << " ( " << XMLString::transcode(source1.getSystemId()) << " )"
130 << ":\t" << _diffTime(tv0, tv1) << " s\n";
131 cerr << "Parsing " << XMLString::transcode(source2.getPublicId())
132 << " ( " << XMLString::transcode(source2.getSystemId()) << " )"
133 << ":\t" << _diffTime(tv1, tv2) << " s\n";
135 else
137 if (_xtree1->getTag(root1).compare(_xtree2->getValue(root2)) != 0)
139 cerr << "The root is changed!" << endl;
140 _xtree1->addMatching(root1, XTree::DELETE, root2);
141 _xtree2->addMatching(root2, XTree::INSERT, root1);
143 else
145 _attrList1 = new int[_ATTRIBUTE_SIZE];
146 _attrList2 = new int[_ATTRIBUTE_SIZE];
147 _textList1 = new int[_TEXT_SIZE];
148 _textList2 = new int[_TEXT_SIZE];
149 _attrMatch = new bool[_ATTRIBUTE_SIZE];
150 _textMatch1 = new bool[_TEXT_SIZE];
151 _textMatch2 = new bool[_TEXT_SIZE];
152 _attrHash = new unsigned long long[_ATTRIBUTE_SIZE];
153 _textHash = new unsigned long long[_TEXT_SIZE];
154 _attrTag = new string[_ATTRIBUTE_SIZE];
155 _leastCostMatrix = new int*[_MATRIX_SIZE];
156 _pathMatrix = new int*[_MATRIX_SIZE];
157 _circuit = new int[_CIRCUIT_SIZE];
158 for (int i = 0; i < _MATRIX_SIZE; i++)
160 _leastCostMatrix[i] = new int[_MATRIX_SIZE];
161 _pathMatrix[i] = new int[_MATRIX_SIZE];
163 _xtree1->addMatching(root1, XTree::CHANGE, root2);
164 _xtree2->addMatching(root2, XTree::CHANGE, root1);
165 _seed = (unsigned int)(tv0->tv_usec & 0xffffffff);
166 _xlut = new XLut((_xtree1->getNodeCount() > 0xffff) ||
167 (_xtree2->getNodeCount() > 0xffff));
168 xdiff(root1, root2, false);
171 gettimeofday(tv3, tz);
173 cerr << "Difference detected!" << endl;
174 cerr << "Parsing " << XMLString::transcode(source1.getPublicId())
175 << " ( " << XMLString::transcode(source1.getSystemId()) << " )"
176 << ":\t" << _diffTime(tv0, tv1) << " s\n";
177 cerr << "Parsing " << XMLString::transcode(source2.getPublicId())
178 << " ( " << XMLString::transcode(source2.getSystemId()) << " )"
179 << ":\t" << _diffTime(tv1, tv2) << " s\n";
180 cerr << "Diffing:\t\t" << _diffTime(tv2, tv3) << " s\n";
182 delete _xlut; _xlut = NULL;
183 delete[] _attrList1; _attrList1 = NULL;
184 delete[] _attrList2; _attrList2 = NULL;
185 delete[] _attrMatch; _attrMatch = NULL;
186 delete[] _attrHash; _attrHash = NULL;
187 delete[] _attrTag; _attrTag = NULL;
188 delete[] _circuit; _circuit = NULL;
189 for (int i = 0; i < _MATRIX_SIZE; i++)
191 if (_leastCostMatrix) delete[] _leastCostMatrix[i];
192 if (_pathMatrix) delete[] _pathMatrix[i];
194 delete[] _leastCostMatrix; _leastCostMatrix = NULL;
195 delete[] _pathMatrix; _pathMatrix = NULL;
198 delete tv0;
199 delete tv1;
200 delete tv2;
201 delete tv3;
202 delete tz;
205 XDiff::~XDiff()
207 delete _xtree1;
208 delete _xtree2;
211 void XDiff::xdiff(int pid1, int pid2, bool matchFlag)
213 // diff attributes.
214 int attrCount1 = 0, attrCount2 = 0;
215 int attr1 = _xtree1->getFirstAttribute(pid1);
216 while (attr1 != XTree::NULL_NODE)
218 _attrList1[attrCount1++] = attr1;
219 attr1 = _xtree1->getNextAttribute(attr1);
221 int attr2 = _xtree2->getFirstAttribute(pid2);
222 while (attr2 != XTree::NULL_NODE)
224 _attrList2[attrCount2++] = attr2;
225 attr2 = _xtree2->getNextAttribute(attr2);
227 if (attrCount1 > 0)
229 if (attrCount2 > 0)
230 diffAttributes(attrCount1, attrCount2);
231 else
233 for (int i = 0; i < attrCount1; i++)
234 _xtree1->addMatching(_attrList1[i],
235 XTree::NO_MATCH);
238 else if (attrCount2 > 0) // attrCount1 == 0
240 for (int i = 0; i < attrCount2; i++)
241 _xtree2->addMatching(_attrList2[i], XTree::NO_MATCH);
244 // Match element nodes.
245 int count1 = _xtree1->getChildrenCount(pid1) - attrCount1;
246 int count2 = _xtree2->getChildrenCount(pid2) - attrCount2;
248 if ((count1 == 0) && (count2 == 0))
251 else if (count1 == 0)
253 int node2 = _xtree2->getFirstChild(pid2);
254 _xtree2->addMatching(node2, XTree::NO_MATCH);
255 for (int i = 1; i < count2; i++)
257 node2 = _xtree2->getNextSibling(node2);
258 _xtree2->addMatching(attr2, XTree::NO_MATCH);
261 else if (count2 == 0)
263 int node1 = _xtree1->getFirstChild(pid1);
264 _xtree1->addMatching(node1, XTree::NO_MATCH);
265 for (int i = 0; i < count1; i++)
267 node1 = _xtree1->getNextSibling(node1);
268 _xtree1->addMatching(node1, XTree::NO_MATCH);
271 else if ((count1 == 1) && (count2 == 1))
273 int node1 = _xtree1->getFirstChild(pid1);
274 int node2 = _xtree2->getFirstChild(pid2);
276 if (_xtree1->getHashValue(node1) == _xtree2->getHashValue(node2))
277 return;
279 bool isE1 = _xtree1->isElement(node1);
280 bool isE2 = _xtree2->isElement(node2);
282 if (isE1 && isE2)
284 if (_xtree1->getTag(node1).compare(_xtree2->getTag(node2)) == 0)
286 _xtree1->addMatching(node1, XTree::CHANGE, node2);
287 _xtree2->addMatching(node2, XTree::CHANGE, node1);
288 xdiff(node1, node2, matchFlag);
290 else
292 _xtree1->addMatching(node1, XTree::NO_MATCH);
293 _xtree2->addMatching(node2, XTree::NO_MATCH);
296 else if (!isE1 && !isE2)
298 _xtree1->addMatching(node1, XTree::CHANGE, node2);
299 _xtree2->addMatching(node2, XTree::CHANGE, node1);
301 else
303 _xtree1->addMatching(node1, XTree::NO_MATCH);
304 _xtree2->addMatching(node2, XTree::NO_MATCH);
307 else
309 int *elements1 = new int[count1];
310 int *elements2 = new int[count2];
311 int elementCount1 = 0, textCount1 = 0;
312 int elementCount2 = 0, textCount2 = 0;
314 int child1 = _xtree1->getFirstChild(pid1);
315 if (_xtree1->isElement(child1))
316 elements1[elementCount1++] = child1;
317 else
318 _textList1[textCount1++] = child1;
319 for (int i = 1; i < count1; i++)
321 child1 = _xtree1->getNextSibling(child1);
322 if (_xtree1->isElement(child1))
323 elements1[elementCount1++] = child1;
324 else
325 _textList1[textCount1++] = child1;
328 int child2 = _xtree2->getFirstChild(pid2);
329 if (_xtree2->isElement(child2))
330 elements2[elementCount2++] = child2;
331 else
332 _textList2[textCount2++] = child2;
333 for (int i = 1; i < count2; i++)
335 child2 = _xtree2->getNextSibling(child2);
336 if (_xtree2->isElement(child2))
337 elements2[elementCount2++] = child2;
338 else
339 _textList2[textCount2++] = child2;
342 // Match text nodes.
343 if (textCount1 > 0)
345 if (textCount2 > 0)
346 diffText(textCount1, textCount2);
347 else
349 for (int i = 0; i < textCount1; i++)
350 _xtree1->addMatching(_textList1[i],
351 XTree::NO_MATCH);
354 else if (textCount2 > 0) // textCount1 == 0
356 for (int i = 0; i < textCount2; i++)
357 _xtree2->addMatching(_textList2[i],
358 XTree::NO_MATCH);
361 bool *matched1 = new bool[elementCount1];
362 bool *matched2 = new bool[elementCount2];
363 int mcount = _matchFilter(elements1, elements2,
364 matched1, matched2,
365 elementCount1, elementCount2);
367 if ((elementCount1 == mcount) &&
368 (elementCount2 == mcount))
371 else if (elementCount1 == mcount)
373 for (int i = 0; i < elementCount2; i++)
375 if (!matched2[i])
376 _xtree2->addMatching(elements2[i],
377 XTree::NO_MATCH);
380 else if (elementCount2 == mcount)
382 for (int i = 0; i < elementCount1; i++)
384 if (!matched1[i])
385 _xtree1->addMatching(elements1[i],
386 XTree::NO_MATCH);
389 else
391 // Write the list of unmatched nodes.
392 int ucount1 = elementCount1 - mcount;
393 int ucount2 = elementCount2 - mcount;
394 int *unmatched1 = new int[ucount1];
395 int *unmatched2 = new int[ucount2];
396 int muc1 = 0, muc2 = 0;
397 int start = 0;
399 while ((muc1 < ucount1) && (muc2 < ucount2))
401 for (; (start < elementCount1) && matched1[start]; start++);
402 string startTag = _xtree1->getTag(elements1[start]);
403 int uele1 = 0, uele2 = 0;
404 muc1++;
405 unmatched1[uele1++] = elements1[start];
406 matched1[start++] = true;
408 for (int i = start; (i < elementCount1) && (muc1 < ucount1); i++)
410 if (!matched1[i] && (startTag.compare(_xtree1->getTag(elements1[i])) == 0))
412 matched1[i] = true;
413 muc1++;
414 unmatched1[uele1++] = elements1[i];
418 for (int i = 0; (i < elementCount2) && (muc2 < ucount2); i++)
420 if (!matched2[i] && (startTag.compare(_xtree2->getTag(elements2[i])) == 0))
422 matched2[i] = true;
423 muc2++;
424 unmatched2[uele2++] = elements2[i];
428 if (uele2 == 0)
430 for (int i = 0; i < uele1; i++)
431 _xtree1->addMatching(unmatched1[i], XTree::NO_MATCH);
433 else
435 if ((uele1 == 1) && (uele2 == 1))
437 _xtree1->addMatching(unmatched1[0], XTree::CHANGE, unmatched2[0]);
438 _xtree2->addMatching(unmatched2[0], XTree::CHANGE, unmatched1[0]);
439 xdiff(unmatched1[0],
440 unmatched2[0], matchFlag);
442 // To find minimal-cost matching between those unmatched elements (with the same tag name.
443 else if (uele1 >= uele2)
445 if ((uele2 <= _sampleCount) ||
446 !_gFlag)
447 matchListO(unmatched1, unmatched2, uele1, uele2, true, matchFlag);
448 else
449 matchList(unmatched1, unmatched2, uele1, uele2, true, matchFlag);
451 else
453 if ((uele1 <= _sampleCount) ||
454 !_gFlag)
455 matchListO(unmatched2, unmatched1, uele2, uele1, false, matchFlag);
456 else
457 matchList(unmatched2, unmatched1, uele2, uele1, false, matchFlag);
462 if (muc1 < ucount1)
464 for (int i = start; i < elementCount1; i++)
466 if (!matched1[i])
467 _xtree1->addMatching(elements1[i], XTree::NO_MATCH);
470 else if (muc2 < ucount2)
472 for (int i = 0; i < elementCount2; i++)
474 if (!matched2[i])
475 _xtree2->addMatching(elements2[i], XTree::NO_MATCH);
479 delete[] unmatched1;
480 delete[] unmatched2;
483 delete[] elements1;
484 delete[] elements2;
485 delete[] matched1;
486 delete[] matched2;
490 void XDiff::diffAttributes(int attrCount1, int attrCount2)
492 if ((attrCount1 == 1) && (attrCount2 == 1))
494 int a1 = _attrList1[0];
495 int a2 = _attrList2[0];
496 if (_xtree1->getHashValue(a1) == _xtree2->getHashValue(a2))
497 return;
499 if (_xtree1->getTag(a1).compare(_xtree2->getTag(a2)) == 0)
501 _xtree1->addMatching(a1, XTree::CHANGE, a2);
502 _xtree2->addMatching(a2, XTree::CHANGE, a1);
503 _xtree1->addMatching(_xtree1->getFirstChild(a1),
504 XTree::CHANGE,
505 _xtree2->getFirstChild(a2));
506 _xtree2->addMatching(_xtree2->getFirstChild(a2),
507 XTree::CHANGE,
508 _xtree1->getFirstChild(a1));
510 else
512 _xtree1->addMatching(a1, XTree::NO_MATCH);
513 _xtree2->addMatching(a2, XTree::NO_MATCH);
515 return;
518 for (int i = 0; i < attrCount2; i++)
520 _attrHash[i] = _xtree2->getHashValue(_attrList2[i]);
521 _attrTag[i] = _xtree2->getTag(_attrList2[i]);
522 _attrMatch[i] = false;
525 int matchCount = 0;
526 for (int i = 0; i < attrCount1; i++)
528 int attr1 = _attrList1[i];
529 unsigned long long ah1 = _xtree1->getHashValue(attr1);
530 string tag1 = _xtree1->getTag(attr1);
532 bool found = false;
533 for (int j = 0; j < attrCount2; j++)
535 int attr2 = _attrList2[j];
536 if (_attrMatch[j])
537 continue;
538 else if (ah1 == _attrHash[j])
540 _attrMatch[j] = true;
541 matchCount++;
542 found = true;
543 break;
545 else if (tag1.compare(_attrTag[j]) == 0)
547 _attrMatch[j] = true;
548 matchCount++;
550 _xtree1->addMatching(attr1, XTree::CHANGE, attr2);
551 _xtree2->addMatching(attr2, XTree::CHANGE, attr1);
552 _xtree1->addMatching(_xtree1->getFirstChild(attr1), XTree::CHANGE, _xtree2->getFirstChild(attr2));
553 _xtree2->addMatching(_xtree2->getFirstChild(attr2), XTree::CHANGE, _xtree1->getFirstChild(attr1));
555 found = true;
556 break;
560 if (!found)
561 _xtree1->addMatching(attr1, XTree::NO_MATCH);
564 if (matchCount != attrCount2)
566 for (int i = 0; i < attrCount2; i++)
568 if (!_attrMatch[i])
569 _xtree2->addMatching(_attrList2[i], XTree::NO_MATCH);
574 void XDiff::diffText(int textCount1, int textCount2)
576 for (int i = 0; i < textCount1; i++)
577 _textMatch1[i] = false;
578 for (int i = 0; i < textCount2; i++)
580 _textMatch2[i] = false;
581 _textHash[i] = _xtree2->getHashValue(_textList2[i]);
584 int mcount = 0;
585 for (int i = 0; i < textCount1; i++)
587 unsigned long long hash1 = _xtree1->getHashValue(_textList1[i]);
588 for (int j = 0; j < textCount2; j++)
590 if (!_textMatch2[j] && (hash1 == _textHash[j]))
592 _textMatch1[i] = true;
593 _textMatch2[j] = true;
594 mcount++;
595 break;
599 if (mcount == textCount2)
600 break;
603 if ((mcount < textCount1) && (textCount1 <= textCount2))
605 for (int i = 0, j = 0;
606 (i < textCount1) && (mcount < textCount1); i++)
608 if (_textMatch1[i])
609 continue;
610 for (; _textMatch2[j]; j++);
611 _xtree1->addMatching(_textList1[i], XTree::CHANGE,
612 _textList2[j]);
613 _textMatch1[i] = true;
614 _xtree2->addMatching(_textList2[j], XTree::CHANGE,
615 _textList1[i]);
616 _textMatch2[j] = true;
617 mcount++;
620 else if ((mcount < textCount2) && (textCount2 < textCount1))
622 for (int i = 0, j = 0;
623 (i < textCount2) && (mcount < textCount2); i++)
625 if (_textMatch2[i])
626 continue;
627 for (; _textMatch1[j]; j++);
628 _xtree2->addMatching(_textList2[i], XTree::CHANGE,
629 _textList1[j]);
630 _textMatch2[i] = true;
631 _xtree1->addMatching(_textList1[j], XTree::CHANGE,
632 _textList2[i]);
633 _textMatch1[j] = true;
634 mcount++;
638 if (mcount < textCount1)
640 for (int i = 0; i < textCount1; i++)
642 if (!_textMatch1[i])
643 _xtree1->addMatching(_textList1[i],
644 XTree::NO_MATCH);
647 else if (mcount < textCount2)
649 for (int i = 0; i < textCount2; i++)
651 if (!_textMatch2[i])
652 _xtree2->addMatching(_textList2[i],
653 XTree::NO_MATCH);
658 int XDiff::_matchFilter(int *elements1, int *elements2, bool *matched1,
659 bool *matched2, int count1, int count2)
661 unsigned long long *value1 = new unsigned long long[count1];
662 unsigned long long *value2 = new unsigned long long[count2];
664 for (int i = 0; i < count1; i++)
666 value1[i] = _xtree1->getHashValue(elements1[i]);
667 matched1[i] = false;
669 for (int i = 0; i < count2; i++)
671 value2[i] = _xtree2->getHashValue(elements2[i]);
672 matched2[i] = false;
675 int mcount = 0;
676 for (int i = 0; i < count2; i++)
677 for (int j = 0; j < count1; j++)
679 if (!matched1[j] && !matched2[i] &&
680 (value1[j] == value2[i]))
682 matched1[j] = true;
683 matched2[i] = true;
684 mcount++;
685 break;
689 delete[] value1;
690 delete[] value2;
692 return mcount;
695 void XDiff::matchListO(int *nodes1, int *nodes2, int count1, int count2,
696 bool treeOrder, bool matchFlag)
698 int **distanceMatrix = new int*[count1+1];
699 int *matching1 = new int[count1];
700 int *matching2 = new int[count2];
702 // insert cost.
703 distanceMatrix[count1] = new int[count2+1];
704 for (int i = 0; i < count2; i++)
705 distanceMatrix[count1][i] = (treeOrder ? _xtree2->getDecendentsCount(nodes2[i]) : _xtree1->getDecendentsCount(nodes2[i])) + 1;
707 for (int i = 0; i < count1; i++)
709 distanceMatrix[i] = new int[count2+1];
710 int deleteCost = (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
711 for (int j = 0; j < count2; j++)
713 int dist;
714 if (matchFlag)
715 dist = treeOrder ? _xlut->get(nodes1[i], nodes2[j]) : _xlut->get(nodes2[j], nodes1[i]);
716 else
718 dist = treeOrder ? distance(nodes1[i], nodes2[j], true, XTree::NO_CONNECTION) : distance(nodes2[j], nodes1[i], true, XTree::NO_CONNECTION);
720 // the default mode.
721 if (!_oFlag && (dist > 1) && (dist >= _NO_MATCH_THRESHOLD * (deleteCost + distanceMatrix[count1][j])))
722 dist = XTree::NO_CONNECTION;
723 if (dist < XTree::NO_CONNECTION)
725 if (treeOrder)
726 _xlut->add(nodes1[i],
727 nodes2[j],
728 dist);
729 else
730 _xlut->add(nodes2[j],
731 nodes1[i],
732 dist);
735 distanceMatrix[i][j] = dist;
737 // delete cost.
738 distanceMatrix[i][count2] = deleteCost;
741 // compute the minimal cost matching.
742 findMatching(count1, count2, distanceMatrix, matching1, matching2);
744 for (int i = 0; i < count1; i++)
746 if (matching1[i] == XTree::NO_MATCH)
748 if (treeOrder)
749 _xtree1->addMatching(nodes1[i], XTree::NO_MATCH);
750 else
751 _xtree2->addMatching(nodes1[i], XTree::NO_MATCH);
753 else
755 if (treeOrder)
756 _xtree1->addMatching(nodes1[i], XTree::CHANGE,
757 nodes2[matching1[i]]);
758 else
759 _xtree2->addMatching(nodes1[i], XTree::CHANGE,
760 nodes2[matching1[i]]);
764 for (int i = 0; i < count2; i++)
766 if (matching2[i] == XTree::NO_MATCH)
768 if (treeOrder)
769 _xtree2->addMatching(nodes2[i], XTree::NO_MATCH);
770 else
771 _xtree1->addMatching(nodes2[i], XTree::NO_MATCH);
773 else
775 if (treeOrder)
776 _xtree2->addMatching(nodes2[i], XTree::CHANGE,
777 nodes1[matching2[i]]);
778 else
779 _xtree1->addMatching(nodes2[i], XTree::CHANGE,
780 nodes1[matching2[i]]);
784 for (int i = 0; i < count1; i++)
786 if (matching1[i] != XTree::NO_MATCH)
788 int todo1 = nodes1[i];
789 int todo2 = nodes2[matching1[i]];
790 if (treeOrder)
792 if (_xtree1->isElement(todo1) &&
793 _xtree2->isElement(todo2))
794 xdiff(todo1, todo2, true);
796 else
798 if (_xtree1->isElement(todo2) &&
799 _xtree2->isElement(todo1))
800 xdiff(todo2, todo1, true);
805 delete[] matching1;
806 delete[] matching2;
807 for (int i = 0; i <= count1; i++)
808 delete[] distanceMatrix[i];
809 delete[] distanceMatrix;
812 void XDiff::matchList(int *nodes1, int *nodes2, int count1, int count2,
813 bool treeOrder, bool matchFlag)
815 int *matching1 = new int[count1];
816 int *matching2 = new int[count2];
817 for (int i = 0; i < count1; i++)
818 matching1[i] = XTree::NO_MATCH;
819 for (int i = 0; i < count2; i++)
820 matching2[i] = XTree::NO_MATCH;
822 if (matchFlag)
824 for (int i = 0; i < count1; i++)
826 for (int j = 0; j < count2; j++)
828 int d = treeOrder ? _xlut->get(nodes1[i], nodes2[j]) : _xlut->get(nodes2[j], nodes1[i]);
829 if (d != XTree::NO_CONNECTION)
831 matching1[i] = j;
832 matching2[j] = i;
833 break;
838 else
840 int scount1 = 0;
841 int scount2 = 0;
842 int matchingThreshold = 0;
843 for (int i = 0; ((i < _sampleCount) && (scount2 < count2)); scount2++)
845 srand(_seed);
846 _seed = rand();
847 int snode = (int)((long long)_seed * (count2 - scount2) / (RAND_MAX + 1)) + scount2;
848 int dist = XTree::NO_CONNECTION;
849 int bestmatch = XTree::NULL_NODE;
850 for (int j = scount1; j < count1; j++)
852 int d = treeOrder ? distance(nodes1[j], nodes2[snode], false, dist) : distance(nodes2[snode], nodes1[j], false, dist);
853 if (d < dist)
855 dist = d;
856 bestmatch = j;
857 if (d == 1)
858 break;
862 int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[snode]) : _xtree1->getDecendentsCount(nodes2[snode])) + 1;
863 if ((dist > 1) &&
864 (dist > _NO_MATCH_THRESHOLD * deleteCost))
866 int tmp = nodes2[snode];
867 nodes2[snode] = nodes2[scount2];
868 nodes2[scount2] = tmp;
870 else
872 int tmp = nodes1[bestmatch];
873 nodes1[bestmatch] = nodes1[scount1];
874 nodes1[scount1] = tmp;
875 tmp = nodes2[snode];
876 nodes2[snode] = nodes2[scount2];
877 nodes2[scount2] = tmp;
879 if (treeOrder)
880 _xlut->add(nodes1[scount1], nodes2[scount2], dist);
881 else
882 _xlut->add(nodes2[scount2], nodes1[scount1], dist);
883 matching1[scount1] = scount2;
884 matching2[scount2] = scount1;
886 i++;
887 scount1++;
888 if (matchingThreshold < dist)
889 matchingThreshold = dist;
893 for (;scount2 < count2; scount2++)
895 int dist = XTree::NO_CONNECTION;
896 int bestmatch = XTree::NO_MATCH;
897 for (int i = scount1; i < count1; i++)
899 int d = treeOrder ? distance(nodes1[i], nodes2[scount2], false, dist) : distance(nodes2[scount2], nodes1[i], false, dist);
900 if (d <= matchingThreshold)
902 dist = d;
903 bestmatch = i;
904 break;
906 else if (d < dist)
908 dist = d;
909 bestmatch = i;
913 if (bestmatch != XTree::NO_MATCH)
915 int tmp = nodes1[bestmatch];
916 nodes1[bestmatch] = nodes1[scount1];
917 nodes1[scount1] = tmp;
919 if (treeOrder)
920 _xlut->add(nodes1[scount1], nodes2[scount2], dist);
921 else
922 _xlut->add(nodes2[scount2], nodes1[scount1], dist);
923 matching1[scount1] = scount2;
924 matching2[scount2] = scount1;
925 scount1++;
930 // Record matching
931 for (int i = 0; i < count1; i++)
933 if (matching1[i] == XTree::NO_MATCH)
935 if (treeOrder)
936 _xtree1->addMatching(nodes1[i], XTree::NO_MATCH);
937 else
938 _xtree2->addMatching(nodes1[i], XTree::NO_MATCH);
940 else
942 if (treeOrder)
943 _xtree1->addMatching(nodes1[i], XTree::CHANGE,
944 nodes2[matching1[i]]);
945 else
946 _xtree2->addMatching(nodes1[i], XTree::CHANGE,
947 nodes2[matching1[i]]);
951 for (int i = 0; i < count2; i++)
953 if (matching2[i] == XTree::NO_MATCH)
955 if (treeOrder)
956 _xtree2->addMatching(nodes2[i], XTree::NO_MATCH);
957 else
958 _xtree1->addMatching(nodes2[i], XTree::NO_MATCH);
960 else
962 if (treeOrder)
963 _xtree2->addMatching(nodes2[i], XTree::CHANGE,
964 nodes1[matching2[i]]);
965 else
966 _xtree1->addMatching(nodes2[i], XTree::CHANGE,
967 nodes1[matching2[i]]);
971 for (int i = 0; i < count1; i++)
973 if (matching1[i] != XTree::NO_MATCH)
975 int todo1 = nodes1[i];
976 int todo2 = nodes2[matching1[i]];
977 if (treeOrder)
979 if (_xtree1->isElement(todo1) &&
980 _xtree2->isElement(todo2))
981 xdiff(todo1, todo2, true);
983 else
985 if (_xtree1->isElement(todo2) &&
986 _xtree2->isElement(todo1))
987 xdiff(todo2, todo1, true);
992 delete[] matching1;
993 delete[] matching2;
996 int XDiff::distance(int eid1, int eid2, bool toRecord, int threshold)
998 bool isE1 = _xtree1->isElement(eid1);
999 bool isE2 = _xtree2->isElement(eid2);
1000 if (isE1 && isE2)
1002 if (_xtree1->getTag(eid1).compare(_xtree2->getTag(eid2)) != 0)
1003 return XTree::NO_CONNECTION;
1004 else
1006 int dist = _xdiff(eid1, eid2, threshold);
1007 if (toRecord && (dist < XTree::NO_CONNECTION))
1008 _xlut->add(eid1, eid2, dist);
1009 return dist;
1012 else if (!isE1 && !isE2)
1013 return 1;
1014 else
1015 return XTree::NO_CONNECTION;
1018 int XDiff::_xdiff(int pid1, int pid2, int threshold)
1020 int dist = 0;
1022 // diff attributes.
1023 int attrCount1 = 0, attrCount2 = 0;
1024 int attr1 = _xtree1->getFirstAttribute(pid1);
1025 while (attr1 != XTree::NULL_NODE)
1027 _attrList1[attrCount1++] = attr1;
1028 attr1 = _xtree1->getNextAttribute(attr1);
1030 int attr2 = _xtree2->getFirstAttribute(pid2);
1031 while (attr2 != XTree::NULL_NODE)
1033 _attrList2[attrCount2++] = attr2;
1034 attr2 = _xtree2->getNextAttribute(attr2);
1037 if (attrCount1 == 0)
1038 dist = attrCount2 * 2;
1039 else if (attrCount2 == 0)
1040 dist = attrCount1 * 2;
1041 else
1042 dist = _diffAttributes(attrCount1, attrCount2);
1043 if (!_gFlag && (dist >= threshold))
1044 return XTree::NO_CONNECTION;
1046 // Match element nodes.
1047 int count1 = _xtree1->getChildrenCount(pid1) - attrCount1;
1048 int count2 = _xtree2->getChildrenCount(pid2) - attrCount2;
1050 if (count1 == 0)
1052 int node2 = _xtree2->getFirstChild(pid2);
1053 while (node2 != XTree::NULL_NODE)
1055 dist += _xtree2->getDecendentsCount(node2) + 1;
1056 if (!_gFlag && (dist >= threshold))
1057 return XTree::NO_CONNECTION;
1058 node2 = _xtree2->getNextSibling(node2);
1061 else if (count2 == 0)
1063 int node1 = _xtree1->getFirstChild(pid1);
1064 while (node1 != XTree::NULL_NODE)
1066 dist += _xtree1->getDecendentsCount(node1) + 1;
1067 if (!_gFlag && (dist >= threshold))
1068 return XTree::NO_CONNECTION;
1069 node1 = _xtree1->getNextSibling(node1);
1072 else if ((count1 == 1) && (count2 == 1))
1074 int node1 = _xtree1->getFirstChild(pid1);
1075 int node2 = _xtree2->getFirstChild(pid2);
1077 if (_xtree1->getHashValue(node1) == _xtree2->getHashValue(node2))
1078 return dist;
1080 bool isE1 = _xtree1->isElement(node1);
1081 bool isE2 = _xtree2->isElement(node2);
1083 if (isE1 && isE2)
1085 if (_xtree1->getTag(node1).compare(_xtree2->getTag(node2)) == 0)
1086 dist += _xdiff(node1, node2, threshold - dist);
1087 else
1088 dist += _xtree1->getDecendentsCount(node1) +
1089 _xtree2->getDecendentsCount(node2) + 2;
1091 else if (!isE1 && !isE2)
1092 dist++;
1093 else
1094 dist += _xtree1->getDecendentsCount(node1) +
1095 _xtree2->getDecendentsCount(node2) + 2;
1097 else
1099 int *elements1 = new int[count1];
1100 int *elements2 = new int[count2];
1101 int elementCount1 = 0, textCount1 = 0;
1102 int elementCount2 = 0, textCount2 = 0;
1104 int child1 = _xtree1->getFirstChild(pid1);
1105 if (_xtree1->isElement(child1))
1106 elements1[elementCount1++] = child1;
1107 else
1108 _textList1[textCount1++] = child1;
1109 for (int i = 1; i < count1; i++)
1111 child1 = _xtree1->getNextSibling(child1);
1112 if (_xtree1->isElement(child1))
1113 elements1[elementCount1++] = child1;
1114 else
1115 _textList1[textCount1++] = child1;
1118 int child2 = _xtree2->getFirstChild(pid2);
1119 if (_xtree2->isElement(child2))
1120 elements2[elementCount2++] = child2;
1121 else
1122 _textList2[textCount2++] = child2;
1123 for (int i = 1; i < count2; i++)
1125 child2 = _xtree2->getNextSibling(child2);
1126 if (_xtree2->isElement(child2))
1127 elements2[elementCount2++] = child2;
1128 else
1129 _textList2[textCount2++] = child2;
1132 // Match text nodes.
1133 if (textCount1 == 0)
1135 dist += textCount2;
1137 else if (textCount2 == 0)
1139 dist += textCount1;
1141 else
1142 dist += _diffText(textCount1, textCount2);
1144 if (_gFlag && (dist >= threshold))
1145 return XTree::NO_CONNECTION;
1147 bool *matched1 = new bool[elementCount1];
1148 bool *matched2 = new bool[elementCount2];
1149 int mcount = _matchFilter(elements1, elements2,
1150 matched1, matched2,
1151 elementCount1, elementCount2);
1153 if ((elementCount1 == mcount) &&
1154 (elementCount2 == mcount))
1157 else if (elementCount1 == mcount)
1159 for (int i = 0; i < elementCount2; i++)
1161 if (!matched2[i])
1163 dist += _xtree2->getDecendentsCount(elements2[i]) + 1;
1164 if (_gFlag && (dist >= threshold))
1166 dist = XTree::NO_CONNECTION;
1167 break;
1172 else if (elementCount2 == mcount)
1174 for (int i = 0; i < elementCount1; i++)
1176 if (!matched1[i])
1178 dist += _xtree1->getDecendentsCount(elements1[i]) + 1;
1179 if (_gFlag && (dist >= threshold))
1181 dist = XTree::NO_CONNECTION;
1182 break;
1187 else //if ((count1 > mcount) && (count2 > mcount))
1189 // Write the list of unmatched nodes.
1190 int ucount1 = elementCount1 - mcount;
1191 int ucount2 = elementCount2 - mcount;
1192 int *unmatched1 = new int[ucount1];
1193 int *unmatched2 = new int[ucount2];
1194 int muc1 = 0, muc2 = 0, start = 0;
1196 while ((muc1 < ucount1) && (muc2 < ucount2))
1198 for (; (start < elementCount1) && matched1[start]; start++);
1199 string startTag = _xtree1->getTag(elements1[start]);
1200 int uele1 = 0, uele2 = 0;
1201 muc1++;
1202 unmatched1[uele1++] = elements1[start];
1203 matched1[start++] = true;
1205 for (int i = start; (i < elementCount1) && (muc1 < ucount1); i++)
1207 if (!matched1[i] && (startTag.compare(_xtree1->getTag(elements1[i])) == 0))
1209 matched1[i] = true;
1210 muc1++;
1211 unmatched1[uele1++] = elements1[i];
1215 for (int i = 0; (i < elementCount2) && (muc2 < ucount2); i++)
1217 if (!matched2[i] && (startTag.compare(_xtree2->getTag(elements2[i])) == 0))
1219 matched2[i] = true;
1220 muc2++;
1221 unmatched2[uele2++] = elements2[i];
1225 if (uele2 == 0)
1227 for (int i = 0; i < uele1; i++)
1228 dist += _xtree1->getDecendentsCount(unmatched1[i]);
1230 else
1233 if ((uele1 == 1) && (uele2 == 1))
1235 dist += _xdiff(unmatched1[0],
1236 unmatched2[0],
1237 threshold-dist);
1239 // To find minimal-cost matching between those unmatched elements (with the same tag name.
1240 else if (uele1 >= uele2)
1242 if (uele1 >= uele2)
1244 if ((uele2 <= _sampleCount) ||
1245 !_gFlag)
1246 dist += _matchListO(unmatched1, unmatched2, uele1, uele2, true);
1247 else
1248 dist += _matchList(unmatched1, unmatched2, uele1, uele2, true, threshold - dist);
1250 else
1252 if ((uele1 <= _sampleCount) ||
1253 !_gFlag)
1254 dist += _matchListO(unmatched2, unmatched1, uele2, uele1, false);
1255 else
1256 dist += _matchList(unmatched2, unmatched1, uele2, uele1, false, threshold - dist);
1260 if (_gFlag && (dist >= threshold))
1262 dist = XTree::NO_CONNECTION;
1263 break;
1267 if (dist < XTree::NO_CONNECTION)
1269 if (muc1 < ucount1)
1271 for (int i = start; i < elementCount1; i++)
1273 if (!matched1[i])
1275 dist =+ _xtree1->getDecendentsCount(elements1[i]);
1276 if (_gFlag && (dist >= threshold))
1278 dist = XTree::NO_CONNECTION;
1279 break;
1284 else if (muc2 < ucount2)
1286 for (int i = 0; i < elementCount2; i++)
1288 if (!matched2[i])
1290 dist += _xtree2->getDecendentsCount(elements2[i]);
1291 if (_gFlag && (dist >= threshold))
1293 dist = XTree::NO_CONNECTION;
1294 break;
1301 delete[] unmatched1;
1302 delete[] unmatched2;
1305 delete[] elements1;
1306 delete[] elements2;
1307 delete[] matched1;
1308 delete[] matched2;
1311 if (!_gFlag || (dist < threshold))
1312 return dist;
1313 else
1314 return XTree::NO_CONNECTION;
1317 int XDiff::_diffAttributes(int attrCount1, int attrCount2)
1319 if ((attrCount1 == 1) && (attrCount2 == 1))
1321 int a1 = _attrList1[0];
1322 int a2 = _attrList2[0];
1323 if (_xtree1->getHashValue(a1) == _xtree2->getHashValue(a2))
1324 return 0;
1326 if (_xtree1->getTag(a1).compare(_xtree2->getTag(a2)) == 0)
1327 return 1;
1328 else
1329 return 2;
1332 int dist = 0;
1333 for (int i = 0; i < attrCount2; i++)
1335 _attrHash[i] = _xtree2->getHashValue(_attrList2[i]);
1336 _attrTag[i] = _xtree2->getTag(_attrList2[i]);
1337 _attrMatch[i] = false;
1340 int matchCount = 0;
1341 for (int i = 0; i < attrCount1; i++)
1343 int attr1 = _attrList1[i];
1344 unsigned long long ah1 = _xtree1->getHashValue(attr1);
1345 string tag1 = _xtree1->getTag(attr1);
1347 bool found = false;
1348 for (int j = 0; j < attrCount2; j++)
1350 if (_attrMatch[j])
1351 continue;
1352 else if (ah1 == _attrHash[j])
1354 _attrMatch[j] = true;
1355 matchCount++;
1356 found = true;
1357 break;
1359 else if (tag1.compare(_attrTag[j]) == 0)
1361 _attrMatch[j] = true;
1362 matchCount++;
1363 dist++;
1364 found = true;
1365 break;
1369 if (!found)
1370 dist += 2;
1373 dist += (attrCount2 - matchCount) * 2;
1374 return dist;
1377 int XDiff::_diffText(int textCount1, int textCount2)
1379 for (int i = 0; i < textCount2; i++)
1381 _textMatch2[i] = false;
1382 _textHash[i] = _xtree2->getHashValue(_textList2[i]);
1385 int mcount = 0;
1386 for (int i = 0; i < textCount1; i++)
1388 unsigned long long hash1 = _xtree1->getHashValue(_textList1[i]);
1389 for (int j = 0; j < textCount2; j++)
1391 if (!_textMatch2[j] && (hash1 == _textHash[j]))
1393 _textMatch2[j] = true;
1394 mcount++;
1395 break;
1399 if (mcount == textCount2)
1400 break;
1403 if (textCount1 >= textCount2)
1404 return textCount1 - mcount;
1405 else
1406 return textCount2 - mcount;
1409 int XDiff::_matchListO(int *nodes1, int *nodes2, int count1, int count2,
1410 bool treeOrder)
1412 int **distanceMatrix = new int*[count1+1];
1413 int *matching1 = new int[count1];
1414 int *matching2 = new int[count2];
1416 // insert cost.
1417 distanceMatrix[count1] = new int[count2+1];
1418 for (int i = 0; i < count2; i++)
1419 distanceMatrix[count1][i] = (treeOrder ? _xtree2->getDecendentsCount(nodes2[i]) : _xtree1->getDecendentsCount(nodes2[i])) + 1;
1421 for (int i = 0; i < count1; i++)
1423 distanceMatrix[i] = new int[count2+1];
1424 int deleteCost = (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
1425 for (int j = 0; j < count2; j++)
1427 int dist = treeOrder ? distance(nodes1[i], nodes2[j], true, XTree::NO_CONNECTION) : distance(nodes2[j], nodes1[i], true, XTree::NO_CONNECTION);
1429 // the default mode.
1430 if (!_oFlag && (dist > 1) &&
1431 (dist < XTree::NO_CONNECTION) &&
1432 (dist >= _NO_MATCH_THRESHOLD *
1433 (deleteCost + distanceMatrix[count1][j])))
1434 dist = XTree::NO_CONNECTION;
1436 if (dist < XTree::NO_CONNECTION)
1438 if (treeOrder)
1439 _xlut->add(nodes1[i], nodes2[j], dist);
1440 else
1441 _xlut->add(nodes2[j], nodes1[i], dist);
1443 distanceMatrix[i][j] = dist;
1445 // delete cost.
1446 distanceMatrix[i][count2] = deleteCost;
1449 // compute the minimal cost matching.
1450 int dist_value = findMatching(count1, count2, distanceMatrix,
1451 matching1, matching2);
1453 delete[] matching1;
1454 delete[] matching2;
1455 for (int i = 0; i <= count1; i++)
1456 delete[] distanceMatrix[i];
1457 delete[] distanceMatrix;
1459 return dist_value;
1462 int XDiff::_matchList(int *nodes1, int *nodes2, int count1, int count2,
1463 bool treeOrder, int threshold)
1465 int *matching1 = new int[count1];
1466 int *matching2 = new int[count2];
1467 for (int i = 0; i < count1; i++)
1468 matching1[i] = XTree::NULL_NODE;
1469 for (int i = 0; i < count2; i++)
1470 matching2[i] = XTree::NULL_NODE;
1472 int Distance = 0;
1473 int scount1 = 0;
1474 int scount2 = 0;
1475 int matchingThreshold = 0;
1477 for (int i = 0; ((i < _sampleCount) && (scount2 < count2)); scount2++)
1479 int snode = rand() % (count2 - scount2) + scount2;
1480 int dist = XTree::NO_CONNECTION;
1481 int bestmatch = XTree::NULL_NODE;
1482 for (int j = scount1; j < count1; j++)
1484 int d = treeOrder ? distance(nodes1[j], nodes2[snode], false, threshold - Distance) : distance(nodes2[snode], nodes1[j], false, threshold - Distance);
1485 if (d < dist)
1487 dist = d;
1488 bestmatch = j;
1489 if (d == 1)
1490 break;
1494 int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[snode]) : _xtree1->getDecendentsCount(nodes2[snode])) + 1;
1496 if ((dist > 1) && (dist > _NO_MATCH_THRESHOLD * deleteCost))
1498 int tmp = nodes2[snode];
1499 nodes2[snode] = nodes2[scount2];
1500 nodes2[scount2] = tmp;
1501 Distance += deleteCost;
1503 else
1505 int tmp = nodes1[bestmatch];
1506 nodes1[bestmatch] = nodes1[scount1];
1507 nodes1[scount1] = tmp;
1508 tmp = nodes2[snode];
1509 nodes2[snode] = nodes2[scount2];
1510 nodes2[scount2] = tmp;
1512 if (treeOrder)
1513 _xlut->add(nodes1[scount1], nodes2[scount2], dist);
1514 else
1515 _xlut->add(nodes2[scount2], nodes1[scount1], dist);
1516 matching1[scount1] = scount2;
1517 matching2[scount2] = scount1;
1519 i++;
1520 scount1++;
1521 if (matchingThreshold < dist)
1522 matchingThreshold = dist;
1523 Distance += dist;
1526 if (Distance >= threshold)
1528 delete[] matching1;
1529 delete[] matching2;
1530 return XTree::NO_CONNECTION;
1534 for (;scount2 < count2; scount2++)
1536 int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[scount2]) : _xtree1->getDecendentsCount(nodes2[scount2])) + 1;
1537 int dist = XTree::NO_CONNECTION;
1538 int bestmatch = XTree::NULL_NODE;
1539 for (int i = scount1; i < count1; i++)
1541 int d = treeOrder ? distance(nodes1[i], nodes2[scount2], false, threshold - Distance) : distance(nodes2[scount2], nodes1[i], false, threshold - Distance);
1542 if (d <= matchingThreshold)
1544 dist = d;
1545 bestmatch = i;
1546 break;
1548 else if ((d == 1) || (d < _NO_MATCH_THRESHOLD * dist))
1550 dist = d;
1551 bestmatch = i;
1555 if (bestmatch == XTree::NO_MATCH)
1557 Distance += deleteCost;
1559 else
1561 int tmp = nodes1[bestmatch];
1562 nodes1[bestmatch] = nodes1[scount1];
1563 nodes1[scount1] = tmp;
1565 if (treeOrder)
1566 _xlut->add(nodes1[scount1], nodes2[scount2], dist);
1567 else
1568 _xlut->add(nodes2[scount2], nodes1[scount1], dist);
1570 matching1[scount1] = scount2;
1571 matching2[scount2] = scount1;
1572 scount1++;
1573 Distance += dist;
1576 if (Distance >= threshold)
1578 delete[] matching1;
1579 delete[] matching2;
1580 return XTree::NO_CONNECTION;
1584 for (int i = 0; i < count1; i++)
1586 if (matching1[i] == XTree::NO_MATCH)
1588 Distance += (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
1589 if (Distance >= threshold)
1591 delete[] matching1;
1592 delete[] matching2;
1593 return XTree::NO_CONNECTION;
1598 delete[] matching1;
1599 delete[] matching2;
1600 return Distance;
1603 int XDiff::findMatching(int count1, int count2, int** dist,
1604 int* matching1, int* matching2)
1606 if (count1 == 1)
1608 // count2 == 1
1609 if (dist[0][0] < XTree::NO_CONNECTION)
1611 matching1[0] = 0;
1612 matching2[0] = 0;
1614 else
1616 matching1[0] = XTree::DELETE;
1617 matching2[0] = XTree::DELETE;
1620 return dist[0][0];
1622 else if (count2 == 1)
1624 int distance = 0;
1625 int mate = 0;
1626 int mindist = XTree::NO_CONNECTION;
1627 matching2[0] = XTree::DELETE;
1629 for (int i = 0; i < count1; i++)
1631 matching1[i] = XTree::DELETE;
1632 if (mindist > dist[i][0])
1634 mindist = dist[i][0];
1635 mate = i;
1638 // Suppose we delete every node on list1.
1639 distance += dist[i][1];
1642 if (mindist < XTree::NO_CONNECTION)
1644 matching1[mate] = 0;
1645 matching2[0] = mate;
1646 distance += mindist - dist[mate][1];
1648 else
1650 // Add the delete cost of the single node on list2.
1651 distance += dist[count1][0];
1654 return distance;
1656 else if ((count1 == 2) && (count2 == 2))
1658 int distance1 = dist[0][0] + dist[1][1];
1659 int distance2 = dist[0][1] + dist[1][0];
1660 if (distance1 < distance2)
1662 if (dist[0][0] < XTree::NO_CONNECTION)
1664 matching1[0] = 0;
1665 matching2[0] = 0;
1666 distance1 = dist[0][0];
1668 else
1670 matching1[0] = XTree::DELETE;
1671 matching2[0] = XTree::DELETE;
1672 distance1 = dist[0][2] + dist[2][0];
1675 if (dist[1][1] < XTree::NO_CONNECTION)
1677 matching1[1] = 1;
1678 matching2[1] = 1;
1679 distance1 += dist[1][1];
1681 else
1683 matching1[1] = XTree::DELETE;
1684 matching2[1] = XTree::DELETE;
1685 distance1 += dist[1][2] + dist[2][1];
1688 return distance1;
1690 else
1692 if (dist[0][1] < XTree::NO_CONNECTION)
1694 matching1[0] = 1;
1695 matching2[1] = 0;
1696 distance2 = dist[0][1];
1698 else
1700 matching1[0] = XTree::DELETE;
1701 matching2[1] = XTree::DELETE;
1702 distance2 = dist[0][2] + dist[2][1];
1705 if (dist[1][0] < XTree::NO_CONNECTION)
1707 matching1[1] = 0;
1708 matching2[0] = 1;
1709 distance2 += dist[1][0];
1711 else
1713 matching1[1] = XTree::DELETE;
1714 matching2[0] = XTree::DELETE;
1715 distance2 += dist[1][2] + dist[2][0];
1718 return distance2;
1721 else
1723 return optimalMatching(count1, count2, dist,
1724 matching1, matching2);
1728 int XDiff::optimalMatching(int count1, int count2, int** dist,
1729 int* matching1, int* matching2)
1731 // Initialize matching.
1732 // Initial guess will be pair-matching between two lists.
1733 // Others will be insertion or deletion
1734 for (int i = 0; i < count2; i++)
1735 matching1[i] = i;
1736 for (int i = count2; i < count1; i++)
1737 matching1[i] = XTree::DELETE;
1739 // Three artificial nodes: "start", "end" and "delete".
1740 int count = count1 + count2 + 3;
1742 // Initialize least cost matrix and path matrix.
1743 // Both have been initialized at the very beginning.
1745 // Start algorithm.
1746 while (true)
1748 // Construct least cost matrix.
1749 constructLCM(dist, matching1, count1, count2);
1751 // Initialize path matrix.
1752 for (int i = 0; i < count; i++)
1753 for (int j = 0; j < count; j++)
1754 _pathMatrix[i][j] = i;
1756 // Search negative cost circuit.
1757 int clen = searchNCC(count);
1758 if (clen > 0)
1760 // Modify matching.
1761 for (int i = 0, next = 0; i < clen - 1; i++)
1763 int n1 = _circuit[next];
1764 next = _circuit[next+1];
1765 // Node in node list 1.
1766 if ((n1 > 0) && (n1 <= count1))
1768 int nid1 = n1 - 1;
1769 int nid2 = _circuit[next] - count1 - 1;
1770 if (nid2 == count2)
1771 nid2 = XTree::DELETE;
1773 matching1[nid1] = nid2;
1777 else // Stop.
1778 break;
1781 int distance = 0;
1782 // Suppose all insertion on list2
1783 for (int i = 0; i < count2; i++)
1785 matching2[i] = XTree::INSERT;
1786 distance += dist[count1][i];
1789 // update distance by looking at matching pairs.
1790 for (int i = 0; i < count1; i++)
1792 int mmm = matching1[i];
1793 if (mmm == XTree::DELETE)
1794 distance += dist[i][count2];
1795 else
1797 matching2[mmm] = i;
1798 distance += dist[i][mmm] -
1799 dist[count1][mmm];
1803 return distance;
1806 void XDiff::constructLCM(int** costMatrix, int* matching,
1807 int nodeCount1, int nodeCount2)
1809 // Three artificial nodes: "start", "end" and "delete".
1810 int nodeCount = nodeCount1 + nodeCount2 + 3;
1812 // Initialize.
1813 for (int i = 0; i < nodeCount; i++)
1815 for (int j = 0; j < nodeCount; j++)
1816 _leastCostMatrix[i][j] = XTree::NO_CONNECTION;
1818 // self.
1819 _leastCostMatrix[i][i] = 0;
1822 // Between start node and nodes in list 1.
1823 // Start -> node1 = Infinity; node1 -> Start = -0.
1824 for (int i = 0; i < nodeCount1; i++)
1825 _leastCostMatrix[i+1][0] = 0;
1827 // Between nodes in list2 and the end node.
1828 // Unless matched (later), node2 -> end = 0;
1829 // end -> node2 = Infinity.
1830 for (int i = 0; i < nodeCount2; i++)
1831 _leastCostMatrix[i+nodeCount1+1][nodeCount-1] = 0;
1833 int deleteCount = 0;
1835 // Between nodes in list1 and nodes in list2.
1836 // For matched, node1 -> node2 = Infinity;
1837 // node2 -> node1 = -1 * distance
1838 // For unmatched, node1 -> node2 = distance;
1839 // node2 -> node1 = Infinity
1840 for (int i = 0; i < nodeCount1; i++)
1842 int node1 = i + 1;
1843 int node2;
1845 // According to cost matrix.
1846 for (int j = 0; j < nodeCount2; j++)
1848 node2 = j + nodeCount1 + 1;
1849 _leastCostMatrix[node1][node2] = costMatrix[i][j];
1852 // According to matching.
1853 if (matching[i] == XTree::DELETE)
1855 deleteCount++;
1857 // node1 -> Delete = Infinity;
1858 // Delete -> node1 = -1 * DELETE_COST
1859 _leastCostMatrix[nodeCount-2][node1] = -1 * costMatrix[i][nodeCount2];
1861 else
1863 node2 = matching[i] + nodeCount1 + 1;
1865 // Between node1 and node2.
1866 _leastCostMatrix[node1][node2] = XTree::NO_CONNECTION;
1867 _leastCostMatrix[node2][node1] = costMatrix[i][matching[i]] * -1;
1869 // Between node1 and delete.
1870 _leastCostMatrix[node1][nodeCount-2] = costMatrix[i][nodeCount2];
1872 // Between node2 and end.
1873 _leastCostMatrix[node2][nodeCount-1] = XTree::NO_CONNECTION;
1874 _leastCostMatrix[nodeCount-1][node2] = costMatrix[nodeCount1][matching[i]];
1878 // Between the "Delete" and the "End".
1879 // If delete all, delete -> end = Infinity; end -> delete = 0.
1880 if (deleteCount == nodeCount1)
1881 _leastCostMatrix[nodeCount-1][nodeCount-2] = 0;
1882 // if no delete, delete -> end = 0; end -> delete = Infinity.
1883 else if (deleteCount == 0)
1884 _leastCostMatrix[nodeCount-2][nodeCount-1] = 0;
1885 // else, both 0;
1886 else
1888 _leastCostMatrix[nodeCount-2][nodeCount-1] = 0;
1889 _leastCostMatrix[nodeCount-1][nodeCount-2] = 0;
1893 int XDiff::searchNCC(int nodeCount)
1895 for (int k = 0; k < nodeCount; k++)
1897 for (int i = 0; i < nodeCount; i++)
1899 if ((i != k) &&
1900 (_leastCostMatrix[i][k] != XTree::NO_CONNECTION))
1902 for (int j = 0; j < nodeCount; j++)
1904 if ((j != k) &&
1905 (_leastCostMatrix[k][j] != XTree::NO_CONNECTION))
1907 int less = _leastCostMatrix[i][k] +
1908 _leastCostMatrix[k][j];
1909 if (less < _leastCostMatrix[i][j])
1911 _leastCostMatrix[i][j] = less;
1912 _pathMatrix[i][j] = k;
1914 // Found!
1915 if ((i == j) && (less < 0))
1917 int clen = 0; // the length of the circuit.
1919 // Locate the circuit.
1920 //circuit.addElement(new Integer(i));
1921 _circuit[0] = i;
1922 _circuit[1] = 2;
1924 //circuit.addElement(new Integer(pathMatrix[i][i]));
1925 _circuit[2] = _pathMatrix[i][i];
1926 _circuit[3] = 4;
1928 //circuit.addElement(new Integer(i));
1929 _circuit[4] = i;
1930 _circuit[5] = -1;
1932 clen = 3;
1934 bool finish;
1938 finish = true;
1939 for (int cit = 0, n = 0; cit < clen - 1; cit++)
1941 int left = _circuit[n];
1942 int next = _circuit[n+1];
1943 int right = (next == -1)? -1 : _circuit[next];
1945 //int middle = pathMatrix[circuit[n-1]][circuit[n]];
1946 int middle = _pathMatrix[left][right];
1948 if (middle != left)
1950 //circuit.insert( cit, middle );
1951 _circuit[clen*2] = middle;
1952 _circuit[clen*2+1] = next;
1953 _circuit[n+1] = clen*2;
1954 clen++;
1956 finish = false;
1957 break;
1959 n = next;
1961 } while (!finish);
1963 return clen;
1971 return 0;
1974 double XDiff::_diffTime(const struct timeval *time1,
1975 const struct timeval *time2)
1977 long sec = time2->tv_sec - time1->tv_sec;
1978 long usec = time2->tv_usec - time1->tv_usec;
1979 if (usec < 0)
1981 sec--;
1982 usec += 1000000;
1985 return 0.000001 * usec + sec;