--- /dev/null
+#
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+#
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+##############################################################################
+#
+# Makefile to build X-Diff
+#
+# make: builds all
+#
+##############################################################################
+
+# Replace it with your own $XERCESROOT
+XERCESROOT = /p/niagara/s/xerces-c-1.4.0
+
+CC = g++
+CFLAGS = -DDEBUG -DLINUX -D_REENTRANT -g -fpic -Wall
+LFLAGS = -DLINUX -fpic
+
+INCLUDES = -I. -I$(XERCESROOT)/include
+LIBS = -L$(XERCESROOT)/lib -static
+LINKS = -lxerces-c1_4 -lc
+
+SOURCES = XTree.cpp \
+ XHash.cpp \
+ XLut.cpp \
+ XParser.cpp \
+ XDiff.cpp
+
+HEADERS = $(SOURCES:.cpp=.hpp)
+OBJECTS = $(SOURCES:.cpp=.o)
+
+.SUFFIXES: .cpp .o
+
+.cpp.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c $<
+
+xdiff: $(OBJECTS)
+ $(CC) $(LFLAGS) $(LIBS) -o xdiff $(OBJECTS) $(LINKS)
+
+clean:
+ rm -rf *.o core
--- /dev/null
+
+ X-Diff Version 0.9.6
+
+What is it?
+-----------
+
+X-Diff is a tool for detecting difference between two XML documents.
+
+
+Documentation
+-------------
+
+The core algorithm of X-Diff is described in the paper, "X-Diff: An
+Efficient Change Detection Algorithm for XML Documents", ICDE 2003.
+(http://www.cs.wisc.edu/~yuanwang/research/xdiff.pdf).
+
+
+Installation
+------------
+
+-- Java version:
+
+Required: Xerces Java v1.4.0+.
+
+On UNIX/LINUX,
+
+ tar zxf xdiff-j-0.9.6.tar.gz
+ cd X-Diff
+ make
+
+On Windows,
+
+ unzip xdiff-j-0.9.6.zip
+ cd X-Diff
+ make.bat
+
+-- C++ version
+
+Required: Xerces C++ v1.4.0, g++ v2.9.*
+
+ tar zxf xdiff-j-0.9.6.tar.gz
+ cd X-Diff
+ make
+
+
+Running X-Diff
+--------------
+
+java XDiff [-o|-g] [-p percent] xml_file1 xml_file2 result_file
+
+or,
+
+xdiff [-o|-g] [-p percent] xml_file1 xml_file2 result_file
+
+Options:
+
+ The default mode is "-o -p 0.3".
+
+ -o The optimal mode, to get the diff result with minimum editing
+ distance.
+
+ -g The greedy mode, to get a diff result quickly, the result may not
+ be "optimal".
+
+ -p The maximum change percentage allowed. X-Diff will not try to match
+ nodes that are much different from each other.
+
+
+Bug Reporting
+-------------
+
+Please report any bugs or send your comments to Yuan Wang at
+
+
+Copyright Information
+---------------------
+
+X-Diff is an open source product, which open source license permits you to
+use X-Diff at no charge under the condition that if you use the software in
+an application you redistribute, the complete source code for your application
+must be available and freely redistributable under reasonable conditions.
+If you do not want to release the source code for your application, you may
+purchase a license from me.
+
+
+
+Copyright (c) 2001 - 2005
+ Yuan Wang. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+3. Redistributions in any form must be accompanied by information on
+how to obtain complete source code for the X-Diff software and any
+accompanying software that uses the X-Diff software. The source code
+must either be included in the distribution or be available for no
+more than the cost of distribution plus a nominal fee, and must be
+freely redistributable under reasonable conditions. For an executable
+file, complete source code means the source code for all modules it
+contains. It does not include source code for modules or files that
+typically accompany the major components of the operating system on
+which the executable file runs.
+
+THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "XDiff.hpp"
+
+const string XDiff::USAGE = "xdiff [-o|-g] [-p percent] input_xml1 input_xml2 diff_result\nOptions:\n The default mode is \"-o -p 0.3\"\n -o\tThe optimal mode, to get the minimum editing distance.\n -g\tThe greedy mode, to find a difference quickly.\n -p\tThe maximum change percentage allowed.\n\tDefault value: 1.0 for -o mode; 0.3 for -g mode.";
+
+const int XDiff::_CIRCUIT_SIZE = 2048;
+const int XDiff::_MATRIX_SIZE = 1024;
+const int XDiff::_ATTRIBUTE_SIZE = 1024;
+const int XDiff::_TEXT_SIZE = 1024;
+
+/* You can use a different number for sampling here, as sqrt(n) is a safe
+ one, though 3 works pretty well. */
+const int XDiff::_sampleCount = 3;
+double XDiff::_NO_MATCH_THRESHOLD = 0.3;
+bool XDiff::_oFlag = false;
+bool XDiff::_gFlag = false;
+
+XDiff::XDiff(const char* input1, const char* input2, const char* output)
+{
+ struct timeval *tv0 = new struct timeval;
+ struct timeval *tv1 = new struct timeval;
+ struct timeval *tv2 = new struct timeval;
+ struct timeval *tv3 = new struct timeval;
+ struct timeval *tv4 = new struct timeval;
+ struct timezone *tz = new struct timezone;
+
+ // starting time.
+ gettimeofday(tv0, tz);
+
+ // parse the first file.
+ XParser *parser1 = new XParser();
+ _xtree1 = parser1->parse(input1);
+ delete parser1;
+ gettimeofday(tv1, tz);
+
+ // parse the second file.
+ XParser *parser2 = new XParser();
+ _xtree2 = parser2->parse(input2);
+ delete parser2;
+ gettimeofday(tv2, tz);
+
+ // check both root nodes.
+ int root1 = _xtree1->getRoot();
+ int root2 = _xtree2->getRoot();
+ if (_xtree1->getHashValue(root1) == _xtree2->getHashValue(root2))
+ {
+ cout << "No difference!" << endl;
+ cout << "Execution time:\t\t" << _diffTime(tv0, tv2) << " s\n";
+ cout << "Parsing " << input1 << ":\t"
+ << _diffTime(tv0, tv1) << " s\n";
+ cout << "Parsing " << input2 << ":\t"
+ << _diffTime(tv1, tv2) << " s\n";
+ }
+ else
+ {
+ if (_xtree1->getTag(root1).compare(_xtree2->getValue(root2)) != 0)
+ {
+ cout << "The root is changed!" << endl;
+ _xtree1->addMatching(root1, XTree::DELETE, root2);
+ _xtree2->addMatching(root2, XTree::INSERT, root1);
+ }
+ else
+ {
+ _attrList1 = new int[_ATTRIBUTE_SIZE];
+ _attrList2 = new int[_ATTRIBUTE_SIZE];
+ _textList1 = new int[_TEXT_SIZE];
+ _textList2 = new int[_TEXT_SIZE];
+ _attrMatch = new bool[_ATTRIBUTE_SIZE];
+ _textMatch1 = new bool[_TEXT_SIZE];
+ _textMatch2 = new bool[_TEXT_SIZE];
+ _attrHash = new unsigned long long[_ATTRIBUTE_SIZE];
+ _textHash = new unsigned long long[_TEXT_SIZE];
+ _attrTag = new string[_ATTRIBUTE_SIZE];
+ _leastCostMatrix = new int*[_MATRIX_SIZE];
+ _pathMatrix = new int*[_MATRIX_SIZE];
+ _circuit = new int[_CIRCUIT_SIZE];
+ for (int i = 0; i < _MATRIX_SIZE; i++)
+ {
+ _leastCostMatrix[i] = new int[_MATRIX_SIZE];
+ _pathMatrix[i] = new int[_MATRIX_SIZE];
+ }
+ _xtree1->addMatching(root1, XTree::CHANGE, root2);
+ _xtree2->addMatching(root2, XTree::CHANGE, root1);
+ _seed = (unsigned int)(tv0->tv_usec & 0xffffffff);
+ _xlut = new XLut((_xtree1->getNodeCount() > 0xffff) ||
+ (_xtree2->getNodeCount() > 0xffff));
+ xdiff(root1, root2, false);
+ }
+
+ // after diffing.
+ gettimeofday(tv3, tz);
+ writeDiff(input1, output);
+ gettimeofday(tv4, tz);
+
+ cout << "Difference detected!" << endl;
+ cout << "Execution time:\t\t" << _diffTime(tv0, tv4) << " s\n";
+ cout << "Parsing " << input1 << ":\t"
+ << _diffTime(tv0, tv1) << " s\n";
+ cout << "Parsing " << input2 << ":\t"
+ << _diffTime(tv1, tv2) << " s\n";
+ cout << "Diffing:\t\t" << _diffTime(tv2, tv3) << " s\n";
+ cout << "Writing diff:\t\t" << _diffTime(tv3, tv4) << " s\n";
+
+ delete _xlut;
+ delete[] _attrList1;
+ delete[] _attrList2;
+ delete[] _attrMatch;
+ delete[] _attrHash;
+ delete[] _attrTag;
+ delete[] _circuit;
+ for (int i = 0; i < _MATRIX_SIZE; i++)
+ {
+ delete[] _leastCostMatrix[i];
+ delete[] _pathMatrix[i];
+ }
+ delete[] _leastCostMatrix;
+ delete[] _pathMatrix;
+ }
+
+ delete tv0;
+ delete tv1;
+ delete tv2;
+ delete tv3;
+ delete tv4;
+ delete tz;
+}
+
+XDiff::~XDiff()
+{
+ delete _xtree1;
+ delete _xtree2;
+}
+
+void XDiff::xdiff(int pid1, int pid2, bool matchFlag)
+{
+ // diff attributes.
+ int attrCount1 = 0, attrCount2 = 0;
+ int attr1 = _xtree1->getFirstAttribute(pid1);
+ while (attr1 != XTree::NULL_NODE)
+ {
+ _attrList1[attrCount1++] = attr1;
+ attr1 = _xtree1->getNextAttribute(attr1);
+ }
+ int attr2 = _xtree2->getFirstAttribute(pid2);
+ while (attr2 != XTree::NULL_NODE)
+ {
+ _attrList2[attrCount2++] = attr2;
+ attr2 = _xtree2->getNextAttribute(attr2);
+ }
+ if (attrCount1 > 0)
+ {
+ if (attrCount2 > 0)
+ diffAttributes(attrCount1, attrCount2);
+ else
+ {
+ for (int i = 0; i < attrCount1; i++)
+ _xtree1->addMatching(_attrList1[i],
+ XTree::NO_MATCH);
+ }
+ }
+ else if (attrCount2 > 0) // attrCount1 == 0
+ {
+ for (int i = 0; i < attrCount2; i++)
+ _xtree2->addMatching(_attrList2[i], XTree::NO_MATCH);
+ }
+
+ // Match element nodes.
+ int count1 = _xtree1->getChildrenCount(pid1) - attrCount1;
+ int count2 = _xtree2->getChildrenCount(pid2) - attrCount2;
+
+ if ((count1 == 0) && (count2 == 0))
+ {
+ }
+ else if (count1 == 0)
+ {
+ int node2 = _xtree2->getFirstChild(pid2);
+ _xtree2->addMatching(node2, XTree::NO_MATCH);
+ for (int i = 1; i < count2; i++)
+ {
+ node2 = _xtree2->getNextSibling(node2);
+ _xtree2->addMatching(attr2, XTree::NO_MATCH);
+ }
+ }
+ else if (count2 == 0)
+ {
+ int node1 = _xtree1->getFirstChild(pid1);
+ _xtree1->addMatching(node1, XTree::NO_MATCH);
+ for (int i = 0; i < count1; i++)
+ {
+ node1 = _xtree1->getNextSibling(node1);
+ _xtree1->addMatching(node1, XTree::NO_MATCH);
+ }
+ }
+ else if ((count1 == 1) && (count2 == 1))
+ {
+ int node1 = _xtree1->getFirstChild(pid1);
+ int node2 = _xtree2->getFirstChild(pid2);
+
+ if (_xtree1->getHashValue(node1) == _xtree2->getHashValue(node2))
+ return;
+
+ bool isE1 = _xtree1->isElement(node1);
+ bool isE2 = _xtree2->isElement(node2);
+
+ if (isE1 && isE2)
+ {
+ if (_xtree1->getTag(node1).compare(_xtree2->getTag(node2)) == 0)
+ {
+ _xtree1->addMatching(node1, XTree::CHANGE, node2);
+ _xtree2->addMatching(node2, XTree::CHANGE, node1);
+ xdiff(node1, node2, matchFlag);
+ }
+ else
+ {
+ _xtree1->addMatching(node1, XTree::NO_MATCH);
+ _xtree2->addMatching(node2, XTree::NO_MATCH);
+ }
+ }
+ else if (!isE1 && !isE2)
+ {
+ _xtree1->addMatching(node1, XTree::CHANGE, node2);
+ _xtree2->addMatching(node2, XTree::CHANGE, node1);
+ }
+ else
+ {
+ _xtree1->addMatching(node1, XTree::NO_MATCH);
+ _xtree2->addMatching(node2, XTree::NO_MATCH);
+ }
+ }
+ else
+ {
+ int *elements1 = new int[count1];
+ int *elements2 = new int[count2];
+ int elementCount1 = 0, textCount1 = 0;
+ int elementCount2 = 0, textCount2 = 0;
+
+ int child1 = _xtree1->getFirstChild(pid1);
+ if (_xtree1->isElement(child1))
+ elements1[elementCount1++] = child1;
+ else
+ _textList1[textCount1++] = child1;
+ for (int i = 1; i < count1; i++)
+ {
+ child1 = _xtree1->getNextSibling(child1);
+ if (_xtree1->isElement(child1))
+ elements1[elementCount1++] = child1;
+ else
+ _textList1[textCount1++] = child1;
+ }
+
+ int child2 = _xtree2->getFirstChild(pid2);
+ if (_xtree2->isElement(child2))
+ elements2[elementCount2++] = child2;
+ else
+ _textList2[textCount2++] = child2;
+ for (int i = 1; i < count2; i++)
+ {
+ child2 = _xtree2->getNextSibling(child2);
+ if (_xtree2->isElement(child2))
+ elements2[elementCount2++] = child2;
+ else
+ _textList2[textCount2++] = child2;
+ }
+
+ // Match text nodes.
+ if (textCount1 > 0)
+ {
+ if (textCount2 > 0)
+ diffText(textCount1, textCount2);
+ else
+ {
+ for (int i = 0; i < textCount1; i++)
+ _xtree1->addMatching(_textList1[i],
+ XTree::NO_MATCH);
+ }
+ }
+ else if (textCount2 > 0) // textCount1 == 0
+ {
+ for (int i = 0; i < textCount2; i++)
+ _xtree2->addMatching(_textList2[i],
+ XTree::NO_MATCH);
+ }
+
+ bool *matched1 = new bool[elementCount1];
+ bool *matched2 = new bool[elementCount2];
+ int mcount = _matchFilter(elements1, elements2,
+ matched1, matched2,
+ elementCount1, elementCount2);
+
+ if ((elementCount1 == mcount) &&
+ (elementCount2 == mcount))
+ {
+ }
+ else if (elementCount1 == mcount)
+ {
+ for (int i = 0; i < elementCount2; i++)
+ {
+ if (!matched2[i])
+ _xtree2->addMatching(elements2[i],
+ XTree::NO_MATCH);
+ }
+ }
+ else if (elementCount2 == mcount)
+ {
+ for (int i = 0; i < elementCount1; i++)
+ {
+ if (!matched1[i])
+ _xtree1->addMatching(elements1[i],
+ XTree::NO_MATCH);
+ }
+ }
+ else
+ {
+ // Write the list of unmatched nodes.
+ int ucount1 = elementCount1 - mcount;
+ int ucount2 = elementCount2 - mcount;
+ int *unmatched1 = new int[ucount1];
+ int *unmatched2 = new int[ucount2];
+ int muc1 = 0, muc2 = 0;
+ int start = 0;
+
+ while ((muc1 < ucount1) && (muc2 < ucount2))
+ {
+ for (; (start < elementCount1) && matched1[start]; start++);
+ string startTag = _xtree1->getTag(elements1[start]);
+ int uele1 = 0, uele2 = 0;
+ muc1++;
+ unmatched1[uele1++] = elements1[start];
+ matched1[start++] = true;
+
+ for (int i = start; (i < elementCount1) && (muc1 < ucount1); i++)
+ {
+ if (!matched1[i] && (startTag.compare(_xtree1->getTag(elements1[i])) == 0))
+ {
+ matched1[i] = true;
+ muc1++;
+ unmatched1[uele1++] = elements1[i];
+ }
+ }
+
+ for (int i = 0; (i < elementCount2) && (muc2 < ucount2); i++)
+ {
+ if (!matched2[i] && (startTag.compare(_xtree2->getTag(elements2[i])) == 0))
+ {
+ matched2[i] = true;
+ muc2++;
+ unmatched2[uele2++] = elements2[i];
+ }
+ }
+
+ if (uele2 == 0)
+ {
+ for (int i = 0; i < uele1; i++)
+ _xtree1->addMatching(unmatched1[i], XTree::NO_MATCH);
+ }
+ else
+ {
+ if ((uele1 == 1) && (uele2 == 1))
+ {
+ _xtree1->addMatching(unmatched1[0], XTree::CHANGE, unmatched2[0]);
+ _xtree2->addMatching(unmatched2[0], XTree::CHANGE, unmatched1[0]);
+ xdiff(unmatched1[0],
+ unmatched2[0], matchFlag);
+ }
+ // To find minimal-cost matching between those unmatched elements (with the same tag name.
+ else if (uele1 >= uele2)
+ {
+ if ((uele2 <= _sampleCount) ||
+ !_gFlag)
+ matchListO(unmatched1, unmatched2, uele1, uele2, true, matchFlag);
+ else
+ matchList(unmatched1, unmatched2, uele1, uele2, true, matchFlag);
+ }
+ else
+ {
+ if ((uele1 <= _sampleCount) ||
+ !_gFlag)
+ matchListO(unmatched2, unmatched1, uele2, uele1, false, matchFlag);
+ else
+ matchList(unmatched2, unmatched1, uele2, uele1, false, matchFlag);
+ }
+ }
+ }
+
+ if (muc1 < ucount1)
+ {
+ for (int i = start; i < elementCount1; i++)
+ {
+ if (!matched1[i])
+ _xtree1->addMatching(elements1[i], XTree::NO_MATCH);
+ }
+ }
+ else if (muc2 < ucount2)
+ {
+ for (int i = 0; i < elementCount2; i++)
+ {
+ if (!matched2[i])
+ _xtree2->addMatching(elements2[i], XTree::NO_MATCH);
+ }
+ }
+
+ delete[] unmatched1;
+ delete[] unmatched2;
+ }
+
+ delete[] elements1;
+ delete[] elements2;
+ delete[] matched1;
+ delete[] matched2;
+ }
+}
+
+void XDiff::diffAttributes(int attrCount1, int attrCount2)
+{
+ if ((attrCount1 == 1) && (attrCount2 == 1))
+ {
+ int a1 = _attrList1[0];
+ int a2 = _attrList2[0];
+ if (_xtree1->getHashValue(a1) == _xtree2->getHashValue(a2))
+ return;
+
+ if (_xtree1->getTag(a1).compare(_xtree2->getTag(a2)) == 0)
+ {
+ _xtree1->addMatching(a1, XTree::CHANGE, a2);
+ _xtree2->addMatching(a2, XTree::CHANGE, a1);
+ _xtree1->addMatching(_xtree1->getFirstChild(a1),
+ XTree::CHANGE,
+ _xtree2->getFirstChild(a2));
+ _xtree2->addMatching(_xtree2->getFirstChild(a2),
+ XTree::CHANGE,
+ _xtree1->getFirstChild(a1));
+ }
+ else
+ {
+ _xtree1->addMatching(a1, XTree::NO_MATCH);
+ _xtree2->addMatching(a2, XTree::NO_MATCH);
+ }
+ return;
+ }
+
+ for (int i = 0; i < attrCount2; i++)
+ {
+ _attrHash[i] = _xtree2->getHashValue(_attrList2[i]);
+ _attrTag[i] = _xtree2->getTag(_attrList2[i]);
+ _attrMatch[i] = false;
+ }
+
+ int matchCount = 0;
+ for (int i = 0; i < attrCount1; i++)
+ {
+ int attr1 = _attrList1[i];
+ unsigned long long ah1 = _xtree1->getHashValue(attr1);
+ string tag1 = _xtree1->getTag(attr1);
+
+ bool found = false;
+ for (int j = 0; j < attrCount2; j++)
+ {
+ int attr2 = _attrList2[j];
+ if (_attrMatch[j])
+ continue;
+ else if (ah1 == _attrHash[j])
+ {
+ _attrMatch[j] = true;
+ matchCount++;
+ found = true;
+ break;
+ }
+ else if (tag1.compare(_attrTag[j]) == 0)
+ {
+ _attrMatch[j] = true;
+ matchCount++;
+
+ _xtree1->addMatching(attr1, XTree::CHANGE, attr2);
+ _xtree2->addMatching(attr2, XTree::CHANGE, attr1);
+ _xtree1->addMatching(_xtree1->getFirstChild(attr1), XTree::CHANGE, _xtree2->getFirstChild(attr2));
+ _xtree2->addMatching(_xtree2->getFirstChild(attr2), XTree::CHANGE, _xtree1->getFirstChild(attr1));
+
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ _xtree1->addMatching(attr1, XTree::NO_MATCH);
+ }
+
+ if (matchCount != attrCount2)
+ {
+ for (int i = 0; i < attrCount2; i++)
+ {
+ if (!_attrMatch[i])
+ _xtree2->addMatching(_attrList2[i], XTree::NO_MATCH);
+ }
+ }
+}
+
+void XDiff::diffText(int textCount1, int textCount2)
+{
+ for (int i = 0; i < textCount1; i++)
+ _textMatch1[i] = false;
+ for (int i = 0; i < textCount2; i++)
+ {
+ _textMatch2[i] = false;
+ _textHash[i] = _xtree2->getHashValue(_textList2[i]);
+ }
+
+ int mcount = 0;
+ for (int i = 0; i < textCount1; i++)
+ {
+ unsigned long long hash1 = _xtree1->getHashValue(_textList1[i]);
+ for (int j = 0; j < textCount2; j++)
+ {
+ if (!_textMatch2[j] && (hash1 == _textHash[j]))
+ {
+ _textMatch1[i] = true;
+ _textMatch2[j] = true;
+ mcount++;
+ break;
+ }
+ }
+
+ if (mcount == textCount2)
+ break;
+ }
+
+ if ((mcount < textCount1) && (textCount1 <= textCount2))
+ {
+ for (int i = 0, j = 0;
+ (i < textCount1) && (mcount < textCount1); i++)
+ {
+ if (_textMatch1[i])
+ continue;
+ for (; _textMatch2[j]; j++);
+ _xtree1->addMatching(_textList1[i], XTree::CHANGE,
+ _textList2[j]);
+ _textMatch1[i] = true;
+ _xtree2->addMatching(_textList2[j], XTree::CHANGE,
+ _textList1[i]);
+ _textMatch2[j] = true;
+ mcount++;
+ }
+ }
+ else if ((mcount < textCount2) && (textCount2 < textCount1))
+ {
+ for (int i = 0, j = 0;
+ (i < textCount2) && (mcount < textCount2); i++)
+ {
+ if (_textMatch2[i])
+ continue;
+ for (; _textMatch1[j]; j++);
+ _xtree2->addMatching(_textList2[i], XTree::CHANGE,
+ _textList1[j]);
+ _textMatch2[i] = true;
+ _xtree1->addMatching(_textList1[j], XTree::CHANGE,
+ _textList2[i]);
+ _textMatch1[j] = true;
+ mcount++;
+ }
+ }
+
+ if (mcount < textCount1)
+ {
+ for (int i = 0; i < textCount1; i++)
+ {
+ if (!_textMatch1[i])
+ _xtree1->addMatching(_textList1[i],
+ XTree::NO_MATCH);
+ }
+ }
+ else if (mcount < textCount2)
+ {
+ for (int i = 0; i < textCount2; i++)
+ {
+ if (!_textMatch2[i])
+ _xtree2->addMatching(_textList2[i],
+ XTree::NO_MATCH);
+ }
+ }
+}
+
+int XDiff::_matchFilter(int *elements1, int *elements2, bool *matched1,
+ bool *matched2, int count1, int count2)
+{
+ unsigned long long *value1 = new unsigned long long[count1];
+ unsigned long long *value2 = new unsigned long long[count2];
+
+ for (int i = 0; i < count1; i++)
+ {
+ value1[i] = _xtree1->getHashValue(elements1[i]);
+ matched1[i] = false;
+ }
+ for (int i = 0; i < count2; i++)
+ {
+ value2[i] = _xtree2->getHashValue(elements2[i]);
+ matched2[i] = false;
+ }
+
+ int mcount = 0;
+ for (int i = 0; i < count2; i++)
+ for (int j = 0; j < count1; j++)
+ {
+ if (!matched1[j] && !matched2[i] &&
+ (value1[j] == value2[i]))
+ {
+ matched1[j] = true;
+ matched2[i] = true;
+ mcount++;
+ break;
+ }
+ }
+
+ delete[] value1;
+ delete[] value2;
+
+ return mcount;
+}
+
+void XDiff::matchListO(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, bool matchFlag)
+{
+ int **distanceMatrix = new int*[count1+1];
+ int *matching1 = new int[count1];
+ int *matching2 = new int[count2];
+
+ // insert cost.
+ distanceMatrix[count1] = new int[count2+1];
+ for (int i = 0; i < count2; i++)
+ distanceMatrix[count1][i] = (treeOrder ? _xtree2->getDecendentsCount(nodes2[i]) : _xtree1->getDecendentsCount(nodes2[i])) + 1;
+
+ for (int i = 0; i < count1; i++)
+ {
+ distanceMatrix[i] = new int[count2+1];
+ int deleteCost = (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
+ for (int j = 0; j < count2; j++)
+ {
+ int dist;
+ if (matchFlag)
+ dist = treeOrder ? _xlut->get(nodes1[i], nodes2[j]) : _xlut->get(nodes2[j], nodes1[i]);
+ else
+ {
+ dist = treeOrder ? distance(nodes1[i], nodes2[j], true, XTree::NO_CONNECTION) : distance(nodes2[j], nodes1[i], true, XTree::NO_CONNECTION);
+
+ // the default mode.
+ if (!_oFlag && (dist > 1) && (dist >= _NO_MATCH_THRESHOLD * (deleteCost + distanceMatrix[count1][j])))
+ dist = XTree::NO_CONNECTION;
+ if (dist < XTree::NO_CONNECTION)
+ {
+ if (treeOrder)
+ _xlut->add(nodes1[i],
+ nodes2[j],
+ dist);
+ else
+ _xlut->add(nodes2[j],
+ nodes1[i],
+ dist);
+ }
+ }
+ distanceMatrix[i][j] = dist;
+ }
+ // delete cost.
+ distanceMatrix[i][count2] = deleteCost;
+ }
+
+ // compute the minimal cost matching.
+ findMatching(count1, count2, distanceMatrix, matching1, matching2);
+
+ for (int i = 0; i < count1; i++)
+ {
+ if (matching1[i] == XTree::NO_MATCH)
+ {
+ if (treeOrder)
+ _xtree1->addMatching(nodes1[i], XTree::NO_MATCH);
+ else
+ _xtree2->addMatching(nodes1[i], XTree::NO_MATCH);
+ }
+ else
+ {
+ if (treeOrder)
+ _xtree1->addMatching(nodes1[i], XTree::CHANGE,
+ nodes2[matching1[i]]);
+ else
+ _xtree2->addMatching(nodes1[i], XTree::CHANGE,
+ nodes2[matching1[i]]);
+ }
+ }
+
+ for (int i = 0; i < count2; i++)
+ {
+ if (matching2[i] == XTree::NO_MATCH)
+ {
+ if (treeOrder)
+ _xtree2->addMatching(nodes2[i], XTree::NO_MATCH);
+ else
+ _xtree1->addMatching(nodes2[i], XTree::NO_MATCH);
+ }
+ else
+ {
+ if (treeOrder)
+ _xtree2->addMatching(nodes2[i], XTree::CHANGE,
+ nodes1[matching2[i]]);
+ else
+ _xtree1->addMatching(nodes2[i], XTree::CHANGE,
+ nodes1[matching2[i]]);
+ }
+ }
+
+ for (int i = 0; i < count1; i++)
+ {
+ if (matching1[i] != XTree::NO_MATCH)
+ {
+ int todo1 = nodes1[i];
+ int todo2 = nodes2[matching1[i]];
+ if (treeOrder)
+ {
+ if (_xtree1->isElement(todo1) &&
+ _xtree2->isElement(todo2))
+ xdiff(todo1, todo2, true);
+ }
+ else
+ {
+ if (_xtree1->isElement(todo2) &&
+ _xtree2->isElement(todo1))
+ xdiff(todo2, todo1, true);
+ }
+ }
+ }
+
+ delete[] matching1;
+ delete[] matching2;
+ for (int i = 0; i <= count1; i++)
+ delete[] distanceMatrix[i];
+ delete[] distanceMatrix;
+}
+
+void XDiff::matchList(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, bool matchFlag)
+{
+ int *matching1 = new int[count1];
+ int *matching2 = new int[count2];
+ for (int i = 0; i < count1; i++)
+ matching1[i] = XTree::NO_MATCH;
+ for (int i = 0; i < count2; i++)
+ matching2[i] = XTree::NO_MATCH;
+
+ if (matchFlag)
+ {
+ for (int i = 0; i < count1; i++)
+ {
+ for (int j = 0; j < count2; j++)
+ {
+ int d = treeOrder ? _xlut->get(nodes1[i], nodes2[j]) : _xlut->get(nodes2[j], nodes1[i]);
+ if (d != XTree::NO_CONNECTION)
+ {
+ matching1[i] = j;
+ matching2[j] = i;
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ int scount1 = 0;
+ int scount2 = 0;
+ int matchingThreshold = 0;
+ for (int i = 0; ((i < _sampleCount) && (scount2 < count2)); scount2++)
+ {
+ srand(_seed);
+ _seed = rand();
+ int snode = (int)((long long)_seed * (count2 - scount2) / (RAND_MAX + 1)) + scount2;
+ int dist = XTree::NO_CONNECTION;
+ int bestmatch = XTree::NULL_NODE;
+ for (int j = scount1; j < count1; j++)
+ {
+ int d = treeOrder ? distance(nodes1[j], nodes2[snode], false, dist) : distance(nodes2[snode], nodes1[j], false, dist);
+ if (d < dist)
+ {
+ dist = d;
+ bestmatch = j;
+ if (d == 1)
+ break;
+ }
+ }
+
+ int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[snode]) : _xtree1->getDecendentsCount(nodes2[snode])) + 1;
+ if ((dist > 1) &&
+ (dist > _NO_MATCH_THRESHOLD * deleteCost))
+ {
+ int tmp = nodes2[snode];
+ nodes2[snode] = nodes2[scount2];
+ nodes2[scount2] = tmp;
+ }
+ else
+ {
+ int tmp = nodes1[bestmatch];
+ nodes1[bestmatch] = nodes1[scount1];
+ nodes1[scount1] = tmp;
+ tmp = nodes2[snode];
+ nodes2[snode] = nodes2[scount2];
+ nodes2[scount2] = tmp;
+
+ if (treeOrder)
+ _xlut->add(nodes1[scount1], nodes2[scount2], dist);
+ else
+ _xlut->add(nodes2[scount2], nodes1[scount1], dist);
+ matching1[scount1] = scount2;
+ matching2[scount2] = scount1;
+
+ i++;
+ scount1++;
+ if (matchingThreshold < dist)
+ matchingThreshold = dist;
+ }
+ }
+
+ for (;scount2 < count2; scount2++)
+ {
+ int dist = XTree::NO_CONNECTION;
+ int bestmatch = XTree::NO_MATCH;
+ for (int i = scount1; i < count1; i++)
+ {
+ int d = treeOrder ? distance(nodes1[i], nodes2[scount2], false, dist) : distance(nodes2[scount2], nodes1[i], false, dist);
+ if (d <= matchingThreshold)
+ {
+ dist = d;
+ bestmatch = i;
+ break;
+ }
+ else if (d < dist)
+ {
+ dist = d;
+ bestmatch = i;
+ }
+ }
+
+ if (bestmatch != XTree::NO_MATCH)
+ {
+ int tmp = nodes1[bestmatch];
+ nodes1[bestmatch] = nodes1[scount1];
+ nodes1[scount1] = tmp;
+
+ if (treeOrder)
+ _xlut->add(nodes1[scount1], nodes2[scount2], dist);
+ else
+ _xlut->add(nodes2[scount2], nodes1[scount1], dist);
+ matching1[scount1] = scount2;
+ matching2[scount2] = scount1;
+ scount1++;
+ }
+ }
+ }
+
+ // Record matching
+ for (int i = 0; i < count1; i++)
+ {
+ if (matching1[i] == XTree::NO_MATCH)
+ {
+ if (treeOrder)
+ _xtree1->addMatching(nodes1[i], XTree::NO_MATCH);
+ else
+ _xtree2->addMatching(nodes1[i], XTree::NO_MATCH);
+ }
+ else
+ {
+ if (treeOrder)
+ _xtree1->addMatching(nodes1[i], XTree::CHANGE,
+ nodes2[matching1[i]]);
+ else
+ _xtree2->addMatching(nodes1[i], XTree::CHANGE,
+ nodes2[matching1[i]]);
+ }
+ }
+
+ for (int i = 0; i < count2; i++)
+ {
+ if (matching2[i] == XTree::NO_MATCH)
+ {
+ if (treeOrder)
+ _xtree2->addMatching(nodes2[i], XTree::NO_MATCH);
+ else
+ _xtree1->addMatching(nodes2[i], XTree::NO_MATCH);
+ }
+ else
+ {
+ if (treeOrder)
+ _xtree2->addMatching(nodes2[i], XTree::CHANGE,
+ nodes1[matching2[i]]);
+ else
+ _xtree1->addMatching(nodes2[i], XTree::CHANGE,
+ nodes1[matching2[i]]);
+ }
+ }
+
+ for (int i = 0; i < count1; i++)
+ {
+ if (matching1[i] != XTree::NO_MATCH)
+ {
+ int todo1 = nodes1[i];
+ int todo2 = nodes2[matching1[i]];
+ if (treeOrder)
+ {
+ if (_xtree1->isElement(todo1) &&
+ _xtree2->isElement(todo2))
+ xdiff(todo1, todo2, true);
+ }
+ else
+ {
+ if (_xtree1->isElement(todo2) &&
+ _xtree2->isElement(todo1))
+ xdiff(todo2, todo1, true);
+ }
+ }
+ }
+
+ delete[] matching1;
+ delete[] matching2;
+}
+
+int XDiff::distance(int eid1, int eid2, bool toRecord, int threshold)
+{
+ bool isE1 = _xtree1->isElement(eid1);
+ bool isE2 = _xtree2->isElement(eid2);
+ if (isE1 && isE2)
+ {
+ if (_xtree1->getTag(eid1).compare(_xtree2->getTag(eid2)) != 0)
+ return XTree::NO_CONNECTION;
+ else
+ {
+ int dist = _xdiff(eid1, eid2, threshold);
+ if (toRecord && (dist < XTree::NO_CONNECTION))
+ _xlut->add(eid1, eid2, dist);
+ return dist;
+ }
+ }
+ else if (!isE1 && !isE2)
+ return 1;
+ else
+ return XTree::NO_CONNECTION;
+}
+
+int XDiff::_xdiff(int pid1, int pid2, int threshold = XTree::NO_CONNECTION)
+{
+ int dist = 0;
+
+ // diff attributes.
+ int attrCount1 = 0, attrCount2 = 0;
+ int attr1 = _xtree1->getFirstAttribute(pid1);
+ while (attr1 != XTree::NULL_NODE)
+ {
+ _attrList1[attrCount1++] = attr1;
+ attr1 = _xtree1->getNextAttribute(attr1);
+ }
+ int attr2 = _xtree2->getFirstAttribute(pid2);
+ while (attr2 != XTree::NULL_NODE)
+ {
+ _attrList2[attrCount2++] = attr2;
+ attr2 = _xtree2->getNextAttribute(attr2);
+ }
+
+ if (attrCount1 == 0)
+ dist = attrCount2 * 2;
+ else if (attrCount2 == 0)
+ dist = attrCount1 * 2;
+ else
+ dist = _diffAttributes(attrCount1, attrCount2);
+ if (!_gFlag && (dist >= threshold))
+ return XTree::NO_CONNECTION;
+
+ // Match element nodes.
+ int count1 = _xtree1->getChildrenCount(pid1) - attrCount1;
+ int count2 = _xtree2->getChildrenCount(pid2) - attrCount2;
+
+ if (count1 == 0)
+ {
+ int node2 = _xtree2->getFirstChild(pid2);
+ while (node2 != XTree::NULL_NODE)
+ {
+ dist += _xtree2->getDecendentsCount(node2) + 1;
+ if (!_gFlag && (dist >= threshold))
+ return XTree::NO_CONNECTION;
+ node2 = _xtree2->getNextSibling(node2);
+ }
+ }
+ else if (count2 == 0)
+ {
+ int node1 = _xtree1->getFirstChild(pid1);
+ while (node1 != XTree::NULL_NODE)
+ {
+ dist += _xtree1->getDecendentsCount(node1) + 1;
+ if (!_gFlag && (dist >= threshold))
+ return XTree::NO_CONNECTION;
+ node1 = _xtree1->getNextSibling(node1);
+ }
+ }
+ else if ((count1 == 1) && (count2 == 1))
+ {
+ int node1 = _xtree1->getFirstChild(pid1);
+ int node2 = _xtree2->getFirstChild(pid2);
+
+ if (_xtree1->getHashValue(node1) == _xtree2->getHashValue(node2))
+ return dist;
+
+ bool isE1 = _xtree1->isElement(node1);
+ bool isE2 = _xtree2->isElement(node2);
+
+ if (isE1 && isE2)
+ {
+ if (_xtree1->getTag(node1).compare(_xtree2->getTag(node2)) == 0)
+ dist += _xdiff(node1, node2, threshold - dist);
+ else
+ dist += _xtree1->getDecendentsCount(node1) +
+ _xtree2->getDecendentsCount(node2) + 2;
+ }
+ else if (!isE1 && !isE2)
+ dist++;
+ else
+ dist += _xtree1->getDecendentsCount(node1) +
+ _xtree2->getDecendentsCount(node2) + 2;
+ }
+ else
+ {
+ int *elements1 = new int[count1];
+ int *elements2 = new int[count2];
+ int elementCount1 = 0, textCount1 = 0;
+ int elementCount2 = 0, textCount2 = 0;
+
+ int child1 = _xtree1->getFirstChild(pid1);
+ if (_xtree1->isElement(child1))
+ elements1[elementCount1++] = child1;
+ else
+ _textList1[textCount1++] = child1;
+ for (int i = 1; i < count1; i++)
+ {
+ child1 = _xtree1->getNextSibling(child1);
+ if (_xtree1->isElement(child1))
+ elements1[elementCount1++] = child1;
+ else
+ _textList1[textCount1++] = child1;
+ }
+
+ int child2 = _xtree2->getFirstChild(pid2);
+ if (_xtree2->isElement(child2))
+ elements2[elementCount2++] = child2;
+ else
+ _textList2[textCount2++] = child2;
+ for (int i = 1; i < count2; i++)
+ {
+ child2 = _xtree2->getNextSibling(child2);
+ if (_xtree2->isElement(child2))
+ elements2[elementCount2++] = child2;
+ else
+ _textList2[textCount2++] = child2;
+ }
+
+ // Match text nodes.
+ if (textCount1 == 0)
+ {
+ dist += textCount2;
+ }
+ else if (textCount2 == 0)
+ {
+ dist += textCount1;
+ }
+ else
+ dist += _diffText(textCount1, textCount2);
+
+ if (_gFlag && (dist >= threshold))
+ return XTree::NO_CONNECTION;
+
+ bool *matched1 = new bool[elementCount1];
+ bool *matched2 = new bool[elementCount2];
+ int mcount = _matchFilter(elements1, elements2,
+ matched1, matched2,
+ elementCount1, elementCount2);
+
+ if ((elementCount1 == mcount) &&
+ (elementCount2 == mcount))
+ {
+ }
+ else if (elementCount1 == mcount)
+ {
+ for (int i = 0; i < elementCount2; i++)
+ {
+ if (!matched2[i])
+ {
+ dist += _xtree2->getDecendentsCount(elements2[i]) + 1;
+ if (_gFlag && (dist >= threshold))
+ {
+ dist = XTree::NO_CONNECTION;
+ break;
+ }
+ }
+ }
+ }
+ else if (elementCount2 == mcount)
+ {
+ for (int i = 0; i < elementCount1; i++)
+ {
+ if (!matched1[i])
+ {
+ dist += _xtree1->getDecendentsCount(elements1[i]) + 1;
+ if (_gFlag && (dist >= threshold))
+ {
+ dist = XTree::NO_CONNECTION;
+ break;
+ }
+ }
+ }
+ }
+ else //if ((count1 > mcount) && (count2 > mcount))
+ {
+ // Write the list of unmatched nodes.
+ int ucount1 = elementCount1 - mcount;
+ int ucount2 = elementCount2 - mcount;
+ int *unmatched1 = new int[ucount1];
+ int *unmatched2 = new int[ucount2];
+ int muc1 = 0, muc2 = 0, start = 0;
+
+ while ((muc1 < ucount1) && (muc2 < ucount2))
+ {
+ for (; (start < elementCount1) && matched1[start]; start++);
+ string startTag = _xtree1->getTag(elements1[start]);
+ int uele1 = 0, uele2 = 0;
+ muc1++;
+ unmatched1[uele1++] = elements1[start];
+ matched1[start++] = true;
+
+ for (int i = start; (i < elementCount1) && (muc1 < ucount1); i++)
+ {
+ if (!matched1[i] && (startTag.compare(_xtree1->getTag(elements1[i])) == 0))
+ {
+ matched1[i] = true;
+ muc1++;
+ unmatched1[uele1++] = elements1[i];
+ }
+ }
+
+ for (int i = 0; (i < elementCount2) && (muc2 < ucount2); i++)
+ {
+ if (!matched2[i] && (startTag.compare(_xtree2->getTag(elements2[i])) == 0))
+ {
+ matched2[i] = true;
+ muc2++;
+ unmatched2[uele2++] = elements2[i];
+ }
+ }
+
+ if (uele2 == 0)
+ {
+ for (int i = 0; i < uele1; i++)
+ dist += _xtree1->getDecendentsCount(unmatched1[i]);
+ }
+ else
+ {
+/*
+ if ((uele1 == 1) && (uele2 == 1))
+ {
+ dist += _xdiff(unmatched1[0],
+ unmatched2[0],
+ threshold-dist);
+ }
+ // To find minimal-cost matching between those unmatched elements (with the same tag name.
+ else if (uele1 >= uele2)
+ */
+ if (uele1 >= uele2)
+ {
+ if ((uele2 <= _sampleCount) ||
+ !_gFlag)
+ dist += _matchListO(unmatched1, unmatched2, uele1, uele2, true);
+ else
+ dist += _matchList(unmatched1, unmatched2, uele1, uele2, true, threshold - dist);
+ }
+ else
+ {
+ if ((uele1 <= _sampleCount) ||
+ !_gFlag)
+ dist += _matchListO(unmatched2, unmatched1, uele2, uele1, false);
+ else
+ dist += _matchList(unmatched2, unmatched1, uele2, uele1, false, threshold - dist);
+ }
+ }
+
+ if (_gFlag && (dist >= threshold))
+ {
+ dist = XTree::NO_CONNECTION;
+ break;
+ }
+ }
+
+ if (dist < XTree::NO_CONNECTION)
+ {
+ if (muc1 < ucount1)
+ {
+ for (int i = start; i < elementCount1; i++)
+ {
+ if (!matched1[i])
+ {
+ dist =+ _xtree1->getDecendentsCount(elements1[i]);
+ if (_gFlag && (dist >= threshold))
+ {
+ dist = XTree::NO_CONNECTION;
+ break;
+ }
+ }
+ }
+ }
+ else if (muc2 < ucount2)
+ {
+ for (int i = 0; i < elementCount2; i++)
+ {
+ if (!matched2[i])
+ {
+ dist += _xtree2->getDecendentsCount(elements2[i]);
+ if (_gFlag && (dist >= threshold))
+ {
+ dist = XTree::NO_CONNECTION;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ delete[] unmatched1;
+ delete[] unmatched2;
+ }
+
+ delete[] elements1;
+ delete[] elements2;
+ delete[] matched1;
+ delete[] matched2;
+ }
+
+ if (!_gFlag || (dist < threshold))
+ return dist;
+ else
+ return XTree::NO_CONNECTION;
+}
+
+int XDiff::_diffAttributes(int attrCount1, int attrCount2)
+{
+ if ((attrCount1 == 1) && (attrCount2 == 1))
+ {
+ int a1 = _attrList1[0];
+ int a2 = _attrList2[0];
+ if (_xtree1->getHashValue(a1) == _xtree2->getHashValue(a2))
+ return 0;
+
+ if (_xtree1->getTag(a1).compare(_xtree2->getTag(a2)) == 0)
+ return 1;
+ else
+ return 2;
+ }
+
+ int dist = 0;
+ for (int i = 0; i < attrCount2; i++)
+ {
+ _attrHash[i] = _xtree2->getHashValue(_attrList2[i]);
+ _attrTag[i] = _xtree2->getTag(_attrList2[i]);
+ _attrMatch[i] = false;
+ }
+
+ int matchCount = 0;
+ for (int i = 0; i < attrCount1; i++)
+ {
+ int attr1 = _attrList1[i];
+ unsigned long long ah1 = _xtree1->getHashValue(attr1);
+ string tag1 = _xtree1->getTag(attr1);
+
+ bool found = false;
+ for (int j = 0; j < attrCount2; j++)
+ {
+ if (_attrMatch[j])
+ continue;
+ else if (ah1 == _attrHash[j])
+ {
+ _attrMatch[j] = true;
+ matchCount++;
+ found = true;
+ break;
+ }
+ else if (tag1.compare(_attrTag[j]) == 0)
+ {
+ _attrMatch[j] = true;
+ matchCount++;
+ dist++;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ dist += 2;
+ }
+
+ dist += (attrCount2 - matchCount) * 2;
+ return dist;
+}
+
+int XDiff::_diffText(int textCount1, int textCount2)
+{
+ for (int i = 0; i < textCount2; i++)
+ {
+ _textMatch2[i] = false;
+ _textHash[i] = _xtree2->getHashValue(_textList2[i]);
+ }
+
+ int mcount = 0;
+ for (int i = 0; i < textCount1; i++)
+ {
+ unsigned long long hash1 = _xtree1->getHashValue(_textList1[i]);
+ for (int j = 0; j < textCount2; j++)
+ {
+ if (!_textMatch2[j] && (hash1 == _textHash[j]))
+ {
+ _textMatch2[j] = true;
+ mcount++;
+ break;
+ }
+ }
+
+ if (mcount == textCount2)
+ break;
+ }
+
+ if (textCount1 >= textCount2)
+ return textCount1 - mcount;
+ else
+ return textCount2 - mcount;
+}
+
+int XDiff::_matchListO(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder)
+{
+ int **distanceMatrix = new int*[count1+1];
+ int *matching1 = new int[count1];
+ int *matching2 = new int[count2];
+
+ // insert cost.
+ distanceMatrix[count1] = new int[count2+1];
+ for (int i = 0; i < count2; i++)
+ distanceMatrix[count1][i] = (treeOrder ? _xtree2->getDecendentsCount(nodes2[i]) : _xtree1->getDecendentsCount(nodes2[i])) + 1;
+
+ for (int i = 0; i < count1; i++)
+ {
+ distanceMatrix[i] = new int[count2+1];
+ int deleteCost = (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
+ for (int j = 0; j < count2; j++)
+ {
+ int dist = treeOrder ? distance(nodes1[i], nodes2[j], true, XTree::NO_CONNECTION) : distance(nodes2[j], nodes1[i], true, XTree::NO_CONNECTION);
+
+ // the default mode.
+ if (!_oFlag && (dist > 1) &&
+ (dist < XTree::NO_CONNECTION) &&
+ (dist >= _NO_MATCH_THRESHOLD *
+ (deleteCost + distanceMatrix[count1][j])))
+ dist = XTree::NO_CONNECTION;
+
+ if (dist < XTree::NO_CONNECTION)
+ {
+ if (treeOrder)
+ _xlut->add(nodes1[i], nodes2[j], dist);
+ else
+ _xlut->add(nodes2[j], nodes1[i], dist);
+ }
+ distanceMatrix[i][j] = dist;
+ }
+ // delete cost.
+ distanceMatrix[i][count2] = deleteCost;
+ }
+
+ // compute the minimal cost matching.
+ int dist_value = findMatching(count1, count2, distanceMatrix,
+ matching1, matching2);
+
+ delete[] matching1;
+ delete[] matching2;
+ for (int i = 0; i <= count1; i++)
+ delete[] distanceMatrix[i];
+ delete[] distanceMatrix;
+
+ return dist_value;
+}
+
+int XDiff::_matchList(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, int threshold)
+{
+ int *matching1 = new int[count1];
+ int *matching2 = new int[count2];
+ for (int i = 0; i < count1; i++)
+ matching1[i] = XTree::NULL_NODE;
+ for (int i = 0; i < count2; i++)
+ matching2[i] = XTree::NULL_NODE;
+
+ int Distance = 0;
+ int scount1 = 0;
+ int scount2 = 0;
+ int matchingThreshold = 0;
+
+ for (int i = 0; ((i < _sampleCount) && (scount2 < count2)); scount2++)
+ {
+ int snode = rand() % (count2 - scount2) + scount2;
+ int dist = XTree::NO_CONNECTION;
+ int bestmatch = XTree::NULL_NODE;
+ for (int j = scount1; j < count1; j++)
+ {
+ int d = treeOrder ? distance(nodes1[j], nodes2[snode], false, threshold - Distance) : distance(nodes2[snode], nodes1[j], false, threshold - Distance);
+ if (d < dist)
+ {
+ dist = d;
+ bestmatch = j;
+ if (d == 1)
+ break;
+ }
+ }
+
+ int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[snode]) : _xtree1->getDecendentsCount(nodes2[snode])) + 1;
+
+ if ((dist > 1) && (dist > _NO_MATCH_THRESHOLD * deleteCost))
+ {
+ int tmp = nodes2[snode];
+ nodes2[snode] = nodes2[scount2];
+ nodes2[scount2] = tmp;
+ Distance += deleteCost;
+ }
+ else
+ {
+ int tmp = nodes1[bestmatch];
+ nodes1[bestmatch] = nodes1[scount1];
+ nodes1[scount1] = tmp;
+ tmp = nodes2[snode];
+ nodes2[snode] = nodes2[scount2];
+ nodes2[scount2] = tmp;
+
+ if (treeOrder)
+ _xlut->add(nodes1[scount1], nodes2[scount2], dist);
+ else
+ _xlut->add(nodes2[scount2], nodes1[scount1], dist);
+ matching1[scount1] = scount2;
+ matching2[scount2] = scount1;
+
+ i++;
+ scount1++;
+ if (matchingThreshold < dist)
+ matchingThreshold = dist;
+ Distance += dist;
+ }
+
+ if (Distance >= threshold)
+ {
+ delete[] matching1;
+ delete[] matching2;
+ return XTree::NO_CONNECTION;
+ }
+ }
+
+ for (;scount2 < count2; scount2++)
+ {
+ int deleteCost = (treeOrder ? _xtree2->getDecendentsCount(nodes2[scount2]) : _xtree1->getDecendentsCount(nodes2[scount2])) + 1;
+ int dist = XTree::NO_CONNECTION;
+ int bestmatch = XTree::NULL_NODE;
+ for (int i = scount1; i < count1; i++)
+ {
+ int d = treeOrder ? distance(nodes1[i], nodes2[scount2], false, threshold - Distance) : distance(nodes2[scount2], nodes1[i], false, threshold - Distance);
+ if (d <= matchingThreshold)
+ {
+ dist = d;
+ bestmatch = i;
+ break;
+ }
+ else if ((d == 1) || (d < _NO_MATCH_THRESHOLD * dist))
+ {
+ dist = d;
+ bestmatch = i;
+ }
+ }
+
+ if (bestmatch == XTree::NO_MATCH)
+ {
+ Distance += deleteCost;
+ }
+ else
+ {
+ int tmp = nodes1[bestmatch];
+ nodes1[bestmatch] = nodes1[scount1];
+ nodes1[scount1] = tmp;
+
+ if (treeOrder)
+ _xlut->add(nodes1[scount1], nodes2[scount2], dist);
+ else
+ _xlut->add(nodes2[scount2], nodes1[scount1], dist);
+
+ matching1[scount1] = scount2;
+ matching2[scount2] = scount1;
+ scount1++;
+ Distance += dist;
+ }
+
+ if (Distance >= threshold)
+ {
+ delete[] matching1;
+ delete[] matching2;
+ return XTree::NO_CONNECTION;
+ }
+ }
+
+ for (int i = 0; i < count1; i++)
+ {
+ if (matching1[i] == XTree::NO_MATCH)
+ {
+ Distance += (treeOrder ? _xtree1->getDecendentsCount(nodes1[i]) : _xtree2->getDecendentsCount(nodes1[i])) + 1;
+ if (Distance >= threshold)
+ {
+ delete[] matching1;
+ delete[] matching2;
+ return XTree::NO_CONNECTION;
+ }
+ }
+ }
+
+ delete[] matching1;
+ delete[] matching2;
+ return Distance;
+}
+
+int XDiff::findMatching(int count1, int count2, int** dist,
+ int* matching1, int* matching2)
+{
+ if (count1 == 1)
+ {
+ // count2 == 1
+ if (dist[0][0] < XTree::NO_CONNECTION)
+ {
+ matching1[0] = 0;
+ matching2[0] = 0;
+ }
+ else
+ {
+ matching1[0] = XTree::DELETE;
+ matching2[0] = XTree::DELETE;
+ }
+
+ return dist[0][0];
+ }
+ else if (count2 == 1)
+ {
+ int distance = 0;
+ int mate = 0;
+ int mindist = XTree::NO_CONNECTION;
+ matching2[0] = XTree::DELETE;
+
+ for (int i = 0; i < count1; i++)
+ {
+ matching1[i] = XTree::DELETE;
+ if (mindist > dist[i][0])
+ {
+ mindist = dist[i][0];
+ mate = i;
+ }
+
+ // Suppose we delete every node on list1.
+ distance += dist[i][1];
+ }
+
+ if (mindist < XTree::NO_CONNECTION)
+ {
+ matching1[mate] = 0;
+ matching2[0] = mate;
+ distance += mindist - dist[mate][1];
+ }
+ else
+ {
+ // Add the delete cost of the single node on list2.
+ distance += dist[count1][0];
+ }
+
+ return distance;
+ }
+ else if ((count1 == 2) && (count2 == 2))
+ {
+ int distance1 = dist[0][0] + dist[1][1];
+ int distance2 = dist[0][1] + dist[1][0];
+ if (distance1 < distance2)
+ {
+ if (dist[0][0] < XTree::NO_CONNECTION)
+ {
+ matching1[0] = 0;
+ matching2[0] = 0;
+ distance1 = dist[0][0];
+ }
+ else
+ {
+ matching1[0] = XTree::DELETE;
+ matching2[0] = XTree::DELETE;
+ distance1 = dist[0][2] + dist[2][0];
+ }
+
+ if (dist[1][1] < XTree::NO_CONNECTION)
+ {
+ matching1[1] = 1;
+ matching2[1] = 1;
+ distance1 += dist[1][1];
+ }
+ else
+ {
+ matching1[1] = XTree::DELETE;
+ matching2[1] = XTree::DELETE;
+ distance1 += dist[1][2] + dist[2][1];
+ }
+
+ return distance1;
+ }
+ else
+ {
+ if (dist[0][1] < XTree::NO_CONNECTION)
+ {
+ matching1[0] = 1;
+ matching2[1] = 0;
+ distance2 = dist[0][1];
+ }
+ else
+ {
+ matching1[0] = XTree::DELETE;
+ matching2[1] = XTree::DELETE;
+ distance2 = dist[0][2] + dist[2][1];
+ }
+
+ if (dist[1][0] < XTree::NO_CONNECTION)
+ {
+ matching1[1] = 0;
+ matching2[0] = 1;
+ distance2 += dist[1][0];
+ }
+ else
+ {
+ matching1[1] = XTree::DELETE;
+ matching2[0] = XTree::DELETE;
+ distance2 += dist[1][2] + dist[2][0];
+ }
+
+ return distance2;
+ }
+ }
+ else
+ {
+ return optimalMatching(count1, count2, dist,
+ matching1, matching2);
+ }
+}
+
+int XDiff::optimalMatching(int count1, int count2, int** dist,
+ int* matching1, int* matching2)
+{
+ // Initialize matching.
+ // Initial guess will be pair-matching between two lists.
+ // Others will be insertion or deletion
+ for (int i = 0; i < count2; i++)
+ matching1[i] = i;
+ for (int i = count2; i < count1; i++)
+ matching1[i] = XTree::DELETE;
+
+ // Three artificial nodes: "start", "end" and "delete".
+ int count = count1 + count2 + 3;
+
+ // Initialize least cost matrix and path matrix.
+ // Both have been initialized at the very beginning.
+
+ // Start algorithm.
+ while (true)
+ {
+ // Construct least cost matrix.
+ constructLCM(dist, matching1, count1, count2);
+
+ // Initialize path matrix.
+ for (int i = 0; i < count; i++)
+ for (int j = 0; j < count; j++)
+ _pathMatrix[i][j] = i;
+
+ // Search negative cost circuit.
+ int clen = searchNCC(count);
+ if (clen > 0)
+ {
+ // Modify matching.
+ for (int i = 0, next = 0; i < clen - 1; i++)
+ {
+ int n1 = _circuit[next];
+ next = _circuit[next+1];
+ // Node in node list 1.
+ if ((n1 > 0) && (n1 <= count1))
+ {
+ int nid1 = n1 - 1;
+ int nid2 = _circuit[next] - count1 - 1;
+ if (nid2 == count2)
+ nid2 = XTree::DELETE;
+
+ matching1[nid1] = nid2;
+ }
+ }
+ }
+ else // Stop.
+ break;
+ }
+
+ int distance = 0;
+ // Suppose all insertion on list2
+ for (int i = 0; i < count2; i++)
+ {
+ matching2[i] = XTree::INSERT;
+ distance += dist[count1][i];
+ }
+
+ // update distance by looking at matching pairs.
+ for (int i = 0; i < count1; i++)
+ {
+ int mmm = matching1[i];
+ if (mmm == XTree::DELETE)
+ distance += dist[i][count2];
+ else
+ {
+ matching2[mmm] = i;
+ distance += dist[i][mmm] -
+ dist[count1][mmm];
+ }
+ }
+
+ return distance;
+}
+
+void XDiff::constructLCM(int** costMatrix, int* matching,
+ int nodeCount1, int nodeCount2)
+{
+ // Three artificial nodes: "start", "end" and "delete".
+ int nodeCount = nodeCount1 + nodeCount2 + 3;
+
+ // Initialize.
+ for (int i = 0; i < nodeCount; i++)
+ {
+ for (int j = 0; j < nodeCount; j++)
+ _leastCostMatrix[i][j] = XTree::NO_CONNECTION;
+
+ // self.
+ _leastCostMatrix[i][i] = 0;
+ }
+
+ // Between start node and nodes in list 1.
+ // Start -> node1 = Infinity; node1 -> Start = -0.
+ for (int i = 0; i < nodeCount1; i++)
+ _leastCostMatrix[i+1][0] = 0;
+
+ // Between nodes in list2 and the end node.
+ // Unless matched (later), node2 -> end = 0;
+ // end -> node2 = Infinity.
+ for (int i = 0; i < nodeCount2; i++)
+ _leastCostMatrix[i+nodeCount1+1][nodeCount-1] = 0;
+
+ int deleteCount = 0;
+
+ // Between nodes in list1 and nodes in list2.
+ // For matched, node1 -> node2 = Infinity;
+ // node2 -> node1 = -1 * distance
+ // For unmatched, node1 -> node2 = distance;
+ // node2 -> node1 = Infinity
+ for (int i = 0; i < nodeCount1; i++)
+ {
+ int node1 = i + 1;
+ int node2;
+
+ // According to cost matrix.
+ for (int j = 0; j < nodeCount2; j++)
+ {
+ node2 = j + nodeCount1 + 1;
+ _leastCostMatrix[node1][node2] = costMatrix[i][j];
+ }
+
+ // According to matching.
+ if (matching[i] == XTree::DELETE)
+ {
+ deleteCount++;
+
+ // node1 -> Delete = Infinity;
+ // Delete -> node1 = -1 * DELETE_COST
+ _leastCostMatrix[nodeCount-2][node1] = -1 * costMatrix[i][nodeCount2];
+ }
+ else
+ {
+ node2 = matching[i] + nodeCount1 + 1;
+
+ // Between node1 and node2.
+ _leastCostMatrix[node1][node2] = XTree::NO_CONNECTION;
+ _leastCostMatrix[node2][node1] = costMatrix[i][matching[i]] * -1;
+
+ // Between node1 and delete.
+ _leastCostMatrix[node1][nodeCount-2] = costMatrix[i][nodeCount2];
+
+ // Between node2 and end.
+ _leastCostMatrix[node2][nodeCount-1] = XTree::NO_CONNECTION;
+ _leastCostMatrix[nodeCount-1][node2] = costMatrix[nodeCount1][matching[i]];
+ }
+ }
+
+ // Between the "Delete" and the "End".
+ // If delete all, delete -> end = Infinity; end -> delete = 0.
+ if (deleteCount == nodeCount1)
+ _leastCostMatrix[nodeCount-1][nodeCount-2] = 0;
+ // if no delete, delete -> end = 0; end -> delete = Infinity.
+ else if (deleteCount == 0)
+ _leastCostMatrix[nodeCount-2][nodeCount-1] = 0;
+ // else, both 0;
+ else
+ {
+ _leastCostMatrix[nodeCount-2][nodeCount-1] = 0;
+ _leastCostMatrix[nodeCount-1][nodeCount-2] = 0;
+ }
+}
+
+int XDiff::searchNCC(int nodeCount)
+{
+ for (int k = 0; k < nodeCount; k++)
+ {
+ for (int i = 0; i < nodeCount; i++)
+ {
+ if ((i != k) &&
+ (_leastCostMatrix[i][k] != XTree::NO_CONNECTION))
+ {
+ for (int j = 0; j < nodeCount; j++)
+ {
+ if ((j != k) &&
+ (_leastCostMatrix[k][j] != XTree::NO_CONNECTION))
+ {
+ int less = _leastCostMatrix[i][k] +
+ _leastCostMatrix[k][j];
+ if (less < _leastCostMatrix[i][j])
+ {
+ _leastCostMatrix[i][j] = less;
+ _pathMatrix[i][j] = k;
+
+ // Found!
+ if ((i == j) && (less < 0))
+ {
+ int clen = 0; // the length of the circuit.
+
+ // Locate the circuit.
+ //circuit.addElement(new Integer(i));
+ _circuit[0] = i;
+ _circuit[1] = 2;
+
+ //circuit.addElement(new Integer(pathMatrix[i][i]));
+ _circuit[2] = _pathMatrix[i][i];
+ _circuit[3] = 4;
+
+ //circuit.addElement(new Integer(i));
+ _circuit[4] = i;
+ _circuit[5] = -1;
+
+ clen = 3;
+
+ bool finish;
+
+ do
+ {
+ finish = true;
+ for (int cit = 0, n = 0; cit < clen - 1; cit++)
+ {
+ int left = _circuit[n];
+ int next = _circuit[n+1];
+ int right = (next == -1)? -1 : _circuit[next];
+
+ //int middle = pathMatrix[circuit[n-1]][circuit[n]];
+ int middle = _pathMatrix[left][right];
+
+ if (middle != left)
+ {
+ //circuit.insert( cit, middle );
+ _circuit[clen*2] = middle;
+ _circuit[clen*2+1] = next;
+ _circuit[n+1] = clen*2;
+ clen++;
+
+ finish = false;
+ break;
+ }
+ n = next;
+ }
+ } while (!finish);
+
+ return clen;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+void XDiff::writeDiff(const char* input, const char* output)
+{
+ ifstream in(input);
+ ofstream out(output, ios::out|ios::trunc);
+
+ int root1 = _xtree1->getRoot();
+ int root2 = _xtree2->getRoot();
+ // the header
+ // XXX <root > is valid and should be treated as <root>;
+ // < root> is NOT valid, so use <root as the comparison key.
+ string rootTag = "<" + _xtree1->getTag(root1);
+ string line;
+ while (getline(in, line))
+ {
+ if (line.find(rootTag) != string::npos)
+ break;
+ out << line << endl;
+ }
+ in.close();
+
+ int matchType, matchNode;
+ _xtree1->getMatching(root1, matchType, matchNode);
+ if (matchType == XTree::DELETE)
+ {
+ writeDeleteNode(out, root1);
+ writeInsertNode(out, root2);
+ }
+ else
+ writeDiffNode(out, root1, root2);
+
+ out.close();
+}
+
+void XDiff::writeDeleteNode(ofstream &out, int node)
+{
+ if (_xtree1->isElement(node))
+ {
+ string tag = _xtree1->getTag(node);
+ out << "<" << tag;
+
+ // Attributes.
+ int attr = _xtree1->getFirstAttribute(node);
+ while (attr > 0)
+ {
+ string atag = _xtree1->getTag(attr);
+ string value = _xtree1->getAttributeValue(attr);
+ out << " " << atag << "=\"" << value << "\"";
+ attr = _xtree1->getNextAttribute(attr);
+ }
+
+ // Child nodes.
+ int child = _xtree1->getFirstChild(node);
+
+ if (child < 0)
+ {
+ out << "/><?DELETE " << tag << "?>" << endl;
+ _needNewLine = false;
+ return;
+ }
+
+ out << "><?DELETE " << tag << "?>" << endl;
+ _needNewLine = false;
+
+ while (child > 0)
+ {
+ writeMatchNode(out, _xtree1, child);
+ child = _xtree1->getNextSibling(child);
+ }
+
+ if (_needNewLine)
+ {
+ out << endl;
+ _needNewLine = false;
+ }
+
+ out << "</" << tag << ">" << endl;
+ }
+ else
+ {
+ out << "<?DELETE \"" << constructText(_xtree1, node)
+ << "\"?>" << endl;
+ _needNewLine = false;
+ }
+}
+
+void XDiff::writeInsertNode(ofstream &out, int node)
+{
+ if (_xtree2->isElement(node))
+ {
+ string tag = _xtree2->getTag(node);
+ out << "<" << tag;
+
+ // Attributes.
+ int attr = _xtree2->getFirstAttribute(node);
+ while (attr > 0)
+ {
+ string atag = _xtree2->getTag(attr);
+ string value = _xtree2->getAttributeValue(attr);
+ out << " " << atag << "=\"" << value << "\"";
+ attr = _xtree2->getNextAttribute(attr);
+ }
+
+ // Child nodes.
+ int child = _xtree2->getFirstChild(node);
+ if (child < 0)
+ {
+ out << "/><?INSERT " << tag << "?>" << endl;
+ _needNewLine = false;
+ return;
+ }
+
+ out << "><?INSERT " << tag << "?>" << endl;
+ _needNewLine = false;
+
+ while (child > 0)
+ {
+ writeMatchNode(out, _xtree2, child);
+ child = _xtree2->getNextSibling(child);
+ }
+
+ if (_needNewLine)
+ {
+ out << endl;
+ _needNewLine = false;
+ }
+
+ out << "</" << tag << ">" << endl;
+ }
+ else
+ {
+ out << constructText(_xtree2, node) << "<?INSERT?>" << endl;
+ _needNewLine = false;
+ }
+}
+
+void XDiff::writeMatchNode(ofstream &out, XTree *xtree, int node)
+{
+ if (xtree->isElement(node))
+ {
+ string tag = xtree->getTag(node);
+ if (_needNewLine)
+ out << endl;
+
+ out << "<" << tag;
+
+ // Attributes.
+ int attr = xtree->getFirstAttribute(node);
+ while (attr > 0)
+ {
+ string atag = xtree->getTag(attr);
+ string value = xtree->getAttributeValue(attr);
+ out << " " << atag << "=\"" << value << "\"";
+ attr = xtree->getNextAttribute(attr);
+ }
+
+ // Child nodes.
+ int child = xtree->getFirstChild(node);
+ if (child < 0)
+ {
+ out << "/>" << endl;
+ _needNewLine = false;
+ return;
+ }
+
+ out << ">";
+ _needNewLine = true;
+
+ while (child > 0)
+ {
+ writeMatchNode(out, xtree, child);
+ child = xtree->getNextSibling(child);
+ }
+
+ if (_needNewLine)
+ {
+ out << endl;
+ _needNewLine = false;
+ }
+
+ out << "</" << tag << ">" << endl;
+ }
+ else
+ {
+ out << constructText(xtree, node);
+ _needNewLine = false;
+ }
+}
+
+void XDiff::writeDiffNode(ofstream &out, int node1, int node2)
+{
+ if (_xtree1->isElement(node1))
+ {
+ string tag = _xtree1->getTag(node1);
+ if (_needNewLine)
+ out << endl;
+ out << "<" << tag;
+
+ // Attributes.
+ int attr1 = _xtree1->getFirstAttribute(node1);
+ string diffff = "";
+ while (attr1 > 0)
+ {
+ string atag = _xtree1->getTag(attr1);
+ string value = _xtree1->getAttributeValue(attr1);
+ int matchType, matchNode;
+ _xtree1->getMatching(attr1, matchType, matchNode);
+ if (matchType == XTree::MATCH)
+ out << " " << atag << "=\"" << value << "\"";
+ else if (matchType == XTree::DELETE)
+ {
+ out << " " << atag << "=\"" << value << "\"";
+ diffff += "<?DELETE " + atag + "?>";
+ }
+ else
+ {
+ string value2 = _xtree2->getAttributeValue(matchNode);
+ out << " " << atag << "=\"" << value2 << "\"";
+ diffff += "<?UPDATE " + atag +
+ " FROM \"" + value + "\"?>";
+ }
+
+ attr1 = _xtree1->getNextAttribute(attr1);
+ }
+
+ int attr2 = _xtree2->getFirstAttribute(node2);
+ while (attr2 > 0)
+ {
+ int matchType, matchNode;
+ _xtree2->getMatching(attr2, matchType, matchNode);
+ if (matchType == XTree::INSERT)
+ {
+ string atag = _xtree2->getTag(attr2);
+ string value = _xtree2->getAttributeValue(attr2);
+ out << " " << atag << "=\"" << value << "\"";
+ diffff += "<?INSERT " + atag + "?>";
+ }
+
+ attr2 = _xtree2->getNextAttribute(attr2);
+ }
+
+ // Child nodes.
+ int child1 = _xtree1->getFirstChild(node1);
+ if (child1 < 0)
+ {
+ out << "/>" << diffff << endl;
+ _needNewLine = false;
+ return;
+ }
+
+ out << ">" << diffff;
+ _needNewLine = true;
+
+ while (child1 > 0)
+ {
+ int matchType, matchNode;
+ _xtree1->getMatching(child1, matchType, matchNode);
+ if (matchType == XTree::MATCH)
+ writeMatchNode(out, _xtree1, child1);
+ else if (matchType == XTree::DELETE)
+ writeDeleteNode(out, child1);
+ else
+ writeDiffNode(out, child1, matchNode);
+
+ child1 = _xtree1->getNextSibling(child1);
+ }
+
+ int child2 = _xtree2->getFirstChild(node2);
+ while (child2 > 0)
+ {
+ int matchType, matchNode;
+ _xtree2->getMatching(child2, matchType, matchNode);
+ if (matchType == XTree::INSERT)
+ writeInsertNode(out, child2);
+
+ child2 = _xtree2->getNextSibling(child2);
+ }
+
+ if (_needNewLine)
+ {
+ out << endl;
+ _needNewLine = false;
+ }
+
+ out << "</" << tag << ">" << endl;
+ }
+ else
+ {
+ out << constructText(_xtree2, node2) << "<?UPDATE FROM \""
+ << constructText(_xtree1, node1) << "\"?>";
+ _needNewLine = false;
+ }
+}
+
+string XDiff::constructText(XTree *xtree, int eid)
+{
+ string text = xtree->getText(eid);
+ vector<size_t> cdatalist = xtree->getCDATA(eid);
+ if (cdatalist.empty())
+ return text;
+
+ string buf = "";
+ int count = cdatalist.size();
+ size_t lastEnd = 0;
+ for (int i = 0; i < count; i += 2)
+ {
+ size_t cdataStart = cdatalist[i];
+ size_t cdataEnd = cdatalist[i+1];
+
+ if (cdataStart > lastEnd)
+ buf += text.substr(lastEnd, cdataStart - lastEnd);
+ buf += "<![CDATA[" +
+ text.substr(cdataStart, cdataEnd - cdataStart) +
+ "]]>";
+ lastEnd = cdataEnd;
+ }
+
+ if (lastEnd < text.length())
+ buf += text.substr(lastEnd);
+
+ return buf;
+}
+
+double XDiff::_diffTime(const struct timeval *time1,
+ const struct timeval *time2)
+{
+ long sec = time2->tv_sec - time1->tv_sec;
+ long usec = time2->tv_usec - time1->tv_usec;
+ if (usec < 0)
+ {
+ sec--;
+ usec += 1000000;
+ }
+
+ return 0.000001 * usec + sec;
+}
+
+int main(int argc, char* args[])
+{
+ try
+ {
+ XMLPlatformUtils::Initialize();
+ }
+ catch (const XMLException& e)
+ {
+ cerr << "Error during initialization! :\n"
+ << e.getMessage() << endl;
+ exit(1);
+ }
+
+ int opid = 1;
+ if (argc < 4)
+ {
+ cout << XDiff::USAGE << endl;
+ exit(1);
+ }
+ else if (strcmp(args[1], "-o") == 0)
+ {
+ XDiff::_oFlag = true;
+ opid++;
+ }
+ else if (strcmp(args[1], "-g") == 0)
+ {
+ XDiff::_gFlag = true;
+ opid++;
+ }
+
+ if (strcmp(args[opid], "-p") == 0)
+ {
+ opid++;
+ double p = strtod(args[opid], NULL);
+ if ((p <= 0) || (p > 1))
+ {
+ cout << XDiff::USAGE << endl;
+ exit(1);
+ }
+ XDiff::_NO_MATCH_THRESHOLD = p;
+ opid++;
+ }
+
+ if ((argc - opid) != 3)
+ {
+ cout << XDiff::USAGE << endl;
+ exit(1);
+ }
+
+ XDiff xdiff(args[opid], args[opid+1], args[opid+2]);
+}
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __XDIFF__
+#define __XDIFF__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <fstream>
+#include <math.h>
+#include <algorithm>
+
+#include "XTree.hpp"
+#include "XParser.hpp"
+#include "XLut.hpp"
+
+/**
+ * XDiff computes the difference between two input XML documents.
+ */
+class XDiff
+{
+public:
+ static const string USAGE;
+ static bool _oFlag, _gFlag;
+ static double _NO_MATCH_THRESHOLD;
+ /**
+ * @param input1 input file #1
+ * @param input2 input file #2
+ * @param output output file
+ */
+ XDiff(const char* input1, const char* input2, const char* output);
+ ~XDiff();
+
+private:
+ static const int _CIRCUIT_SIZE, _MATRIX_SIZE, _ATTRIBUTE_SIZE;
+ static const int _TEXT_SIZE, _sampleCount;
+
+ int *_attrList1, *_attrList2, *_textList1;
+ int *_textList2, *_circuit, _seed;
+ int **_leastCostMatrix, **_pathMatrix;
+ bool *_attrMatch, *_textMatch1, *_textMatch2;
+ bool _needNewLine;
+ unsigned long long *_attrHash, *_textHash;
+ string *_attrTag;
+ XTree *_xtree1, *_xtree2;
+ XLut *_xlut;
+
+ void xdiff(int pid1, int pid2, bool matchFlag);
+ void diffAttributes(int attrCount1, int attrCount2);
+ void diffText(int textCount1, int textCount2);
+ int _matchFilter(int *elements1, int *elements2, bool *matched1,
+ bool *matched2, int count1, int count2);
+ void matchListO(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, bool matchFlag);
+ void matchList(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, bool matchFlag);
+ int distance(int eid1, int eid2, bool toRecord, int threshold);
+ int _xdiff(int pid1, int pid2, int threshold = XTree::NO_CONNECTION);
+ int _diffAttributes(int attrCount1, int attrCount2);
+ int _diffText(int textCount1, int textCount2);
+ int _matchListO(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder);
+ int _matchList(int *nodes1, int *nodes2, int count1, int count2,
+ bool treeOrder, int threshold);
+ int findMatching(int count1, int count2, int** dist,
+ int* matching1, int* matching2);
+ int optimalMatching(int count1, int count2, int** dist,
+ int* matching1, int* matching2);
+ void constructLCM(int** costMatrix, int* matching,
+ int nodeCount1, int nodeCount2);
+ int searchNCC(int nodeCount);
+ double _diffTime(const struct timeval *time1,
+ const struct timeval *time2);
+
+ void writeDiff(const char* input, const char* output);
+ void writeDeleteNode(ofstream &out, int node);
+ void writeInsertNode(ofstream &out, int node);
+ void writeMatchNode(ofstream &out, XTree *xtree, int node);
+ void writeDiffNode(ofstream &out, int node1, int node2);
+ string constructText(XTree *xtree, int eid);
+};
+#endif
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "XHash.hpp"
+
+const unsigned int XHash::_PC_1[56] =
+{
+ 60, 52, 44, 36, 59, 51, 43, 35, 27, 19, 11, 3, 58, 50,
+ 42, 34, 26, 18, 10, 2, 57, 49, 41, 33, 25, 17, 9, 1,
+ 28, 20, 12, 4, 61, 53, 45, 37, 29, 21, 13, 5, 62, 54,
+ 46, 38, 30, 22, 14, 6, 63, 55, 47, 39, 31, 23, 15, 7
+};
+
+const unsigned int XHash::_PC_2[48] =
+{
+ 24, 27, 20, 6, 14, 10, 3, 22, 0, 17, 7, 12,
+ 8, 23, 11, 5, 16, 26, 1, 9, 19, 25, 4, 15,
+ 54, 43 ,36, 29, 49, 40, 48, 30, 52, 44, 37, 33,
+ 46, 35, 50, 41, 28, 53, 51, 55, 32, 45, 39, 42
+};
+
+const unsigned int XHash::_subKeyRotation[16] =
+{
+ 1, 2, 4, 6, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25, 27, 28
+};
+
+const unsigned int XHash::_IP[64] =
+{
+ 57, 49, 41, 33, 25, 17, 9, 1, 59, 51, 43, 35, 27, 19, 11, 3,
+ 61, 53, 45, 37, 29, 21, 13, 5, 63, 55, 47, 39, 31, 23, 15, 7,
+ 56, 48, 40, 32 ,24, 16, 8, 0, 58, 50, 42, 34, 26, 18, 10, 2,
+ 60, 52, 44, 36, 28, 20, 12, 4, 62, 54, 46, 38, 30, 22, 14, 6
+};
+
+const unsigned int XHash::_FP[64] =
+{
+ 39, 7, 47, 15, 55, 23, 63, 31, 38, 6, 46, 14, 54, 22, 62, 30,
+ 37, 5, 45, 13, 53, 21, 61, 29, 36, 4, 44, 12, 52, 20, 60, 28,
+ 35, 3, 43, 11, 51, 19, 59, 27, 34, 2, 42, 10, 50, 18, 58, 26,
+ 33, 1, 41, 9, 49, 17, 57, 25, 32, 0, 40, 8, 48, 16, 56, 24
+};
+
+const unsigned int XHash::_sBox[8][64] =
+{
+ {
+ 0x00808200, 0x00000000, 0x00008000, 0x00808202,
+ 0x00808002, 0x00008202, 0x00000002, 0x00008000,
+ 0x00000200, 0x00808200, 0x00808202, 0x00000200,
+ 0x00800202, 0x00808002, 0x00800000, 0x00000002,
+ 0x00000202, 0x00800200, 0x00800200, 0x00008200,
+ 0x00008200, 0x00808000, 0x00808000, 0x00800202,
+ 0x00008002, 0x00800002, 0x00800002, 0x00008002,
+ 0x00000000, 0x00000202, 0x00008202, 0x00800000,
+ 0x00008000, 0x00808202, 0x00000002, 0x00808000,
+ 0x00808200, 0x00800000, 0x00800000, 0x00000200,
+ 0x00808002, 0x00008000, 0x00008200, 0x00800002,
+ 0x00000200, 0x00000002, 0x00800202, 0x00008202,
+ 0x00808202, 0x00008002, 0x00808000, 0x00800202,
+ 0x00800002, 0x00000202, 0x00008202, 0x00808200,
+ 0x00000202, 0x00800200, 0x00800200, 0x00000000,
+ 0x00008002, 0x00008200, 0x00000000, 0x00808002
+ },
+ {
+ 0x40084010, 0x40004000, 0x00004000, 0x00084010,
+ 0x00080000, 0x00000010, 0x40080010, 0x40004010,
+ 0x40000010, 0x40084010, 0x40084000, 0x40000000,
+ 0x40004000, 0x00080000, 0x00000010, 0x40080010,
+ 0x00084000, 0x00080010, 0x40004010, 0x00000000,
+ 0x40000000, 0x00004000, 0x00084010, 0x40080000,
+ 0x00080010, 0x40000010, 0x00000000, 0x00084000,
+ 0x00004010, 0x40084000, 0x40080000, 0x00004010,
+ 0x00000000, 0x00084010, 0x40080010, 0x00080000,
+ 0x40004010, 0x40080000, 0x40084000, 0x00004000,
+ 0x40080000, 0x40004000, 0x00000010, 0x40084010,
+ 0x00084010, 0x00000010, 0x00004000, 0x40000000,
+ 0x00004010, 0x40084000, 0x00080000, 0x40000010,
+ 0x00080010, 0x40004010, 0x40000010, 0x00080010,
+ 0x00084000, 0x00000000, 0x40004000, 0x00004010,
+ 0x40000000, 0x40080010, 0x40084010, 0x00084000
+ },
+ {
+ 0x00000104, 0x04010100, 0x00000000, 0x04010004,
+ 0x04000100, 0x00000000, 0x00010104, 0x04000100,
+ 0x00010004, 0x04000004, 0x04000004, 0x00010000,
+ 0x04010104, 0x00010004, 0x04010000, 0x00000104,
+ 0x04000000, 0x00000004, 0x04010100, 0x00000100,
+ 0x00010100, 0x04010000, 0x04010004, 0x00010104,
+ 0x04000104, 0x00010100, 0x00010000, 0x04000104,
+ 0x00000004, 0x04010104, 0x00000100, 0x04000000,
+ 0x04010100, 0x04000000, 0x00010004, 0x00000104,
+ 0x00010000, 0x04010100, 0x04000100, 0x00000000,
+ 0x00000100, 0x00010004, 0x04010104, 0x04000100,
+ 0x04000004, 0x00000100, 0x00000000, 0x04010004,
+ 0x04000104, 0x00010000, 0x04000000, 0x04010104,
+ 0x00000004, 0x00010104, 0x00010100, 0x04000004,
+ 0x04010000, 0x04000104, 0x00000104, 0x04010000,
+ 0x00010104, 0x00000004, 0x04010004, 0x00010100
+ },
+ {
+ 0x80401000, 0x80001040, 0x80001040, 0x00000040,
+ 0x00401040, 0x80400040, 0x80400000, 0x80001000,
+ 0x00000000, 0x00401000, 0x00401000, 0x80401040,
+ 0x80000040, 0x00000000, 0x00400040, 0x80400000,
+ 0x80000000, 0x00001000, 0x00400000, 0x80401000,
+ 0x00000040, 0x00400000, 0x80001000, 0x00001040,
+ 0x80400040, 0x80000000, 0x00001040, 0x00400040,
+ 0x00001000, 0x00401040, 0x80401040, 0x80000040,
+ 0x00400040, 0x80400000, 0x00401000, 0x80401040,
+ 0x80000040, 0x00000000, 0x00000000, 0x00401000,
+ 0x00001040, 0x00400040, 0x80400040, 0x80000000,
+ 0x80401000, 0x80001040, 0x80001040, 0x00000040,
+ 0x80401040, 0x80000040, 0x80000000, 0x00001000,
+ 0x80400000, 0x80001000, 0x00401040, 0x80400040,
+ 0x80001000, 0x00001040, 0x00400000, 0x80401000,
+ 0x00000040, 0x00400000, 0x00001000, 0x00401040
+ },
+ {
+ 0x00000080, 0x01040080, 0x01040000, 0x21000080,
+ 0x00040000, 0x00000080, 0x20000000, 0x01040000,
+ 0x20040080, 0x00040000, 0x01000080, 0x20040080,
+ 0x21000080, 0x21040000, 0x00040080, 0x20000000,
+ 0x01000000, 0x20040000, 0x20040000, 0x00000000,
+ 0x20000080, 0x21040080, 0x21040080, 0x01000080,
+ 0x21040000, 0x20000080, 0x00000000, 0x21000000,
+ 0x01040080, 0x01000000, 0x21000000, 0x00040080,
+ 0x00040000, 0x21000080, 0x00000080, 0x01000000,
+ 0x20000000, 0x01040000, 0x21000080, 0x20040080,
+ 0x01000080, 0x20000000, 0x21040000, 0x01040080,
+ 0x20040080, 0x00000080, 0x01000000, 0x21040000,
+ 0x21040080, 0x00040080, 0x21000000, 0x21040080,
+ 0x01040000, 0x00000000, 0x20040000, 0x21000000,
+ 0x00040080, 0x01000080, 0x20000080, 0x00040000,
+ 0x00000000, 0x20040000, 0x01040080, 0x20000080
+ },
+ {
+ 0x10000008, 0x10200000, 0x00002000, 0x10202008,
+ 0x10200000, 0x00000008, 0x10202008, 0x00200000,
+ 0x10002000, 0x00202008, 0x00200000, 0x10000008,
+ 0x00200008, 0x10002000, 0x10000000, 0x00002008,
+ 0x00000000, 0x00200008, 0x10002008, 0x00002000,
+ 0x00202000, 0x10002008, 0x00000008, 0x10200008,
+ 0x10200008, 0x00000000, 0x00202008, 0x10202000,
+ 0x00002008, 0x00202000, 0x10202000, 0x10000000,
+ 0x10002000, 0x00000008, 0x10200008, 0x00202000,
+ 0x10202008, 0x00200000, 0x00002008, 0x10000008,
+ 0x00200000, 0x10002000, 0x10000000, 0x00002008,
+ 0x10000008, 0x10202008, 0x00202000, 0x10200000,
+ 0x00202008, 0x10202000, 0x00000000, 0x10200008,
+ 0x00000008, 0x00002000, 0x10200000, 0x00202008,
+ 0x00002000, 0x00200008, 0x10002008, 0x00000000,
+ 0x10202000, 0x10000000, 0x00200008, 0x10002008
+ },
+ {
+ 0x00100000, 0x02100001, 0x02000401, 0x00000000,
+ 0x00000400, 0x02000401, 0x00100401, 0x02100400,
+ 0x02100401, 0x00100000, 0x00000000, 0x02000001,
+ 0x00000001, 0x02000000, 0x02100001, 0x00000401,
+ 0x02000400, 0x00100401, 0x00100001, 0x02000400,
+ 0x02000001, 0x02100000, 0x02100400, 0x00100001,
+ 0x02100000, 0x00000400, 0x00000401, 0x02100401,
+ 0x00100400, 0x00000001, 0x02000000, 0x00100400,
+ 0x02000000, 0x00100400, 0x00100000, 0x02000401,
+ 0x02000401, 0x02100001, 0x02100001, 0x00000001,
+ 0x00100001, 0x02000000, 0x02000400, 0x00100000,
+ 0x02100400, 0x00000401, 0x00100401, 0x02100400,
+ 0x00000401, 0x02000001, 0x02100401, 0x02100000,
+ 0x00100400, 0x00000000, 0x00000001, 0x02100401,
+ 0x00000000, 0x00100401, 0x02100000, 0x00000400,
+ 0x02000001, 0x02000400, 0x00000400, 0x00100001
+ },
+ {
+ 0x08000820, 0x00000800, 0x00020000, 0x08020820,
+ 0x08000000, 0x08000820, 0x00000020, 0x08000000,
+ 0x00020020, 0x08020000, 0x08020820, 0x00020800,
+ 0x08020800, 0x00020820, 0x00000800, 0x00000020,
+ 0x08020000, 0x08000020, 0x08000800, 0x00000820,
+ 0x00020800, 0x00020020, 0x08020020, 0x08020800,
+ 0x00000820, 0x00000000, 0x00000000, 0x08020020,
+ 0x08000020, 0x08000800, 0x00020820, 0x00020000,
+ 0x00020820, 0x00020000, 0x08020800, 0x00000800,
+ 0x00000020, 0x08020020, 0x00000800, 0x00020820,
+ 0x08000800, 0x00000020, 0x08000020, 0x08020000,
+ 0x08020020, 0x08000000, 0x00020000, 0x08000820,
+ 0x00000000, 0x08020820, 0x00020020, 0x08000020,
+ 0x08020000, 0x08000800, 0x08000820, 0x00000000,
+ 0x08020820, 0x00020800, 0x00020800, 0x00000820,
+ 0x00000820, 0x00020020, 0x08000000, 0x08020800
+ }
+};
+
+// From 0x00b5a69788796a5bL
+const unsigned long long XHash::_initialKey = 0xb5d3a4f186cba8b6ULL;
+unsigned long long XHash::_keys[16];
+
+void XHash::initialize()
+{
+ makeKeys(_initialKey);
+}
+
+void XHash::initialize(unsigned long long key)
+{
+ makeKeys(key);
+}
+
+unsigned long long XHash::hash(string text)
+{
+ int len = text.length();
+ unsigned long long value = 0;
+ for (int pos = 0; pos < len; pos += 8)
+ {
+ int piecelen = (len - pos >= 8) ? 8 : (len - pos);
+ unsigned long long piece = (unsigned long long)text.at(pos);
+ for (int i = 1; i < piecelen; i++)
+ piece = (piece << 8) | text.at(pos + i);
+ piece <<= ((8 - piecelen) * 8);
+ value += (des(piece) & 0xffffffff);
+ }
+ return value;
+}
+
+void XHash::makeKeys(unsigned long long key)
+{
+ unsigned long long reduced = permutate(key, _PC_1, 56);
+ unsigned int l = (unsigned int)(reduced >> 28);
+ unsigned int r = (unsigned int)(reduced & 0xfffffff);
+ for (unsigned int i = 0; i < 16; i++)
+ _keys[i] = permutate(rotate(l, r, _subKeyRotation[i]),
+ _PC_2, 48);
+}
+
+unsigned long long XHash::permutate(unsigned long long k,
+ const unsigned int p[],
+ unsigned int plen)
+{
+ unsigned long long s = 0;
+ for (unsigned int i = 0; i < plen; i++)
+ {
+ if ((k & (1LL << p[i])) != 0)
+ s |= 1LL << i;
+ }
+
+ return s;
+}
+
+unsigned long long XHash::rotate(unsigned int l, unsigned int r, unsigned int s)
+{
+ return ((unsigned long long)(((l << s) & 0xfffffff) | (l >> (28 - s))) << 28) | ((r << s) & 0xfffffff) | (r >> (28 - s));
+}
+
+unsigned long long XHash::des(unsigned long long data)
+{
+ unsigned long long x = permutate(data, _IP, 64);
+ unsigned int l = (unsigned int)(x >> 32);
+ unsigned int r = (unsigned int)(x & 0xffffffff);
+ for (int i = 0; i < 16; i++)
+ {
+ unsigned int tmp = desCore(r, _keys[i]) ^ l;
+ l = r;
+ r = tmp;
+ }
+ unsigned long long y = ((unsigned long long)r << 32) |
+ ((unsigned long long)l & 0xffffffffL);
+ return permutate(y, _FP, 64);
+}
+
+// E-Bit Selection Table is hardcoded in this function.
+int XHash::desCore(unsigned int x, unsigned long long k)
+{
+ unsigned int p = x >> 27;
+ unsigned int q = (p & 3) << 4;
+ unsigned int r = x << 5;
+ p |= r;
+ r = _sBox[0][(unsigned int)((k >> 42) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[7][(unsigned int)((k >> 0) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[6][(unsigned int)((k >> 6) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[5][(unsigned int)((k >> 12) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[4][(unsigned int)((k >> 18) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[3][(unsigned int)((k >> 24) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[2][(unsigned int)((k >> 30) ^ p) & 0x3f];
+ p >>= 4;
+ r |= _sBox[1][(unsigned int)((k >> 36) ^ (p | q)) & 0x3f];
+ return r;
+}
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __XHASH__
+#define __XHASH__
+
+#include <string>
+#include <iostream>
+
+/**
+ * XHash is an implementation of DES
+ */
+class XHash
+{
+public:
+ static void initialize();
+ static void initialize(unsigned long long key);
+ static void finish();
+
+ static unsigned long long hash(string text);
+
+private:
+ static const unsigned long long _initialKey;
+
+ // Permuted Choices #1 and #2
+ static const unsigned int _PC_1[56], _PC_2[48];
+ static const unsigned int _subKeyRotation[16];
+
+ // Initial Permutation and Final Permutation (Inverse IP, or IP^(-1))
+ static const unsigned int _IP[64],_FP[64];
+
+ static const unsigned int _sBox[8][64];
+
+ static unsigned long long _key;
+ static unsigned long long _keys[];
+
+ static void makeKeys(unsigned long long key);
+ static unsigned long long permutate(unsigned long long k,
+ const unsigned int p[],
+ unsigned int plen);
+ static unsigned long long rotate(unsigned int l, unsigned int r,
+ unsigned int s);
+
+ static unsigned long long des(unsigned long long data);
+ static int desCore(unsigned int x, unsigned long long k);
+};
+#endif
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "XLut.hpp"
+
+XLut::XLut(bool con)
+: _xTable(4096)
+{
+ _possibleConflict = con;
+ _conflict = false;
+}
+
+void XLut::add(int eid1, int eid2, int dist)
+{
+ unsigned int key = ((unsigned int)eid1 << 16) | (eid2 & 0xffff);
+ unsigned long long value = ((unsigned long long)eid1 << 40) | ((unsigned long long)(eid2 & 0xffffff) << 16) | dist;
+ if (_possibleConflict)
+ {
+ hash_map<unsigned int, unsigned long long>::const_iterator hit = _xTable.find(key);
+ if (hit == _xTable.end())
+ _xTable[key] = value;
+ else
+ {
+ do
+ {
+ key++;
+ hit = _xTable.find(key);
+ }
+ while (hit != _xTable.end());
+ _xTable[key] = value;
+ _conflict = true;
+ }
+ }
+ else
+ _xTable[key] = value;
+}
+
+int XLut::get(int eid1, int eid2)
+{
+ unsigned int key = ((unsigned int)eid1 << 16) | (eid2 & 0xffff);
+ hash_map<unsigned int, unsigned long long>::const_iterator hit = _xTable.find(key);
+ if (hit == _xTable.end())
+ return XTree::NO_CONNECTION;
+
+ if (!_conflict)
+ return (int)(hit->second & 0xffff);
+ else
+ {
+ unsigned long long partialValue = ((unsigned long long)eid1 << 40) | ((unsigned long long)(eid2 & 0xffffff) << 16);
+ unsigned long long bucket = hit->second;
+ if (((partialValue ^ bucket) >> 16) == 0)
+ return (int)(bucket & 0xffff);
+ else
+ {
+ do
+ {
+ key++;
+ hit = _xTable.find(key);
+ if (hit == _xTable.end())
+ return XTree::NO_CONNECTION;
+ bucket = hit->second;
+ }
+ while (((partialValue ^ bucket) >> 16) != 0);
+ return (int)(bucket & 0xffff);
+ }
+ }
+}
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __XLUT__
+#define __XLUT__
+
+#include <stdio.h>
+#include "XTree.hpp"
+
+/**
+ * XLut is the hash lookup table for distance between nodes. It can be
+ * extended to contain more buckets.
+ */
+class XLut
+{
+public:
+
+ XLut(bool con);
+ ~XLut(){};
+
+ /**
+ * Add a node pair and their distance to this table.
+ * @param eid1 element id #1
+ * @param eid2 element id #2
+ * @param dist distance
+ */
+ void add(int eid1, int eid2, int dist);
+
+ /**
+ * Get the distance of a node pair.
+ * @param eid1 element id #1
+ * @param eid2 element id #2
+ * @return distance or -1 if not found
+ */
+ int get(int eid1, int eid2);
+
+private:
+ bool _possibleConflict, _conflict;
+ hash_map<unsigned int, unsigned long long> _xTable;
+};
+#endif
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "XParser.hpp"
+
+const char* XParser::_feature_Validation = "http://xml.org/sax/features/validation";
+const char* XParser::_feature_NameSpaces = "http://xml.org/sax/features/namespaces";
+//const char* XParser::_feature_SchemaSupport = "http://apache.org/xml/features/validation/schema";
+//const char* XParser::_feature_SchemaFullSupport = "http://apache.org/xml/features/validation/schema-full-checking";
+const char* XParser::_feature_NameSpacePrefixes = "http://xml.org/sax/features/namespace-prefixes";
+bool XParser::_setValidation = false;
+bool XParser::_setNameSpaces = true;
+bool XParser::_setSchemaSupport = true;
+bool XParser::_setSchemaFullSupport = false;
+bool XParser::_setNameSpacePrefixes = true;
+
+const char* XParser::_trimReject = " \t\n";
+
+XParser::XParser()
+{
+ _parser = XMLReaderFactory::createXMLReader();
+ XMLCh* validation = XMLString::transcode(_feature_Validation);
+ _parser->setFeature((const XMLCh*)validation, _setValidation);
+ delete validation;
+ XMLCh* namespaces = XMLString::transcode(_feature_NameSpaces);
+ _parser->setFeature((const XMLCh*)namespaces, _setNameSpaces);
+ delete namespaces;
+/*
+ XMLCh* schemasupport = XMLString::transcode(_feature_SchemaSupport);
+ _parser->setFeature((const XMLCh*)schemasupport, _setSchemaSupport);
+ delete schemasupport;
+ XMLCh* schemafullsupport = XMLString::transcode(_feature_SchemaFullSupport);
+ _parser->setFeature((const XMLCh*)schemafullsupport, _setSchemaFullSupport);
+ delete schemafullsupport;
+*/
+ XMLCh* namespaceprefixes = XMLString::transcode(_feature_NameSpacePrefixes);
+ _parser->setFeature((const XMLCh*)namespaceprefixes, _setNameSpacePrefixes);
+ delete namespaceprefixes;
+
+ _parser->setContentHandler(this);
+ _parser->setErrorHandler(this);
+ _parser->setLexicalHandler(this);
+
+ for(int i = 0; i < _STACK_SIZE; i++)
+ {
+ _idStack[i] = 0;
+ _lsidStack[i] = 0;
+ _valueStack[i] = 0;
+ }
+ _stackTop = 0;
+ _currentNodeID = XTree::NULL_NODE;
+ _xtree = new XTree();
+ _elementBuffer = string("");
+}
+
+XParser::~XParser()
+{
+ delete _parser;
+}
+
+XTree* XParser::parse(const char* uri)
+{
+ _idStack[_stackTop] = XTree::NULL_NODE;
+ _readElement = false;
+
+ try
+ {
+ _parser->parse(uri);
+ }
+ catch (const XMLException& e)
+ {
+ cerr << "File not found:\t" << uri << "\nException message:\n"
+ << e.getMessage() << endl;
+ return NULL;
+ }
+
+ return _xtree;
+}
+
+void XParser::startElement(const XMLCh* const uri, const XMLCh* const local,
+ const XMLCh* const raw, const Attributes& attrs)
+{
+ // if text is mixed with elements.
+ if (_elementBuffer.length() > 0)
+ {
+ string text = _trim(_elementBuffer);
+ if (text.length() > 0)
+ {
+ unsigned long long value = XHash::hash(text);
+ int tid = _xtree->addText(_idStack[_stackTop],
+ _lsidStack[_stackTop],
+ text, value);
+ _lsidStack[_stackTop] = tid;
+ _currentNodeID = tid;
+ _valueStack[_stackTop] += value;
+ }
+ }
+
+ string local_s(XMLString::transcode(local));
+
+//cout << "Add element " << _idStack[_stackTop] << "\t" << _lsidStack[_stackTop] << "\t" << local_s << endl;
+ int eid = _xtree->addElement(_idStack[_stackTop],
+ _lsidStack[_stackTop], local_s);
+ // Update last sibling info.
+ _lsidStack[_stackTop] = eid;
+
+ // Push
+ _stackTop++;
+ _idStack[_stackTop] = eid;
+ _currentNodeID = eid;
+ _lsidStack[_stackTop] = XTree::NULL_NODE;
+ _valueStack[_stackTop] = XHash::hash(local_s);
+
+ // Take care of attributes
+ if (attrs.getLength() > 0)
+ {
+ for (unsigned int i = 0; i < attrs.getLength(); i++)
+ {
+ string name(XMLString::transcode(attrs.getQName(i)));
+ unsigned long long namehash = XHash::hash(name);
+
+ unsigned long long attrhash = namehash * namehash;
+ string value = "";
+ char *valueP = XMLString::transcode(attrs.getValue(i));
+ if (valueP != NULL)
+ {
+ value = string(valueP);
+ unsigned long long valuehash = XHash::hash(value);
+ attrhash += valuehash * valuehash;
+ }
+ int aid = _xtree->addAttribute(eid, _lsidStack[_stackTop], name, value, namehash, attrhash);
+
+ _lsidStack[_stackTop] = aid;
+ _currentNodeID = aid + 1;
+ _valueStack[_stackTop] += attrhash * attrhash;
+ }
+ }
+
+ _readElement = true;
+ _elementBuffer = string("");
+
+}
+
+void XParser::characters(const XMLCh* const ch, const unsigned int length)
+{
+ const char* str = XMLString::transcode(ch);
+ _elementBuffer.append(str);
+ delete[] str;
+}
+
+void XParser::endElement(const XMLCh* const uri, const XMLCh* const local,
+ const XMLCh* const raw)
+{
+ if (_readElement)
+ {
+ if (_elementBuffer.length() > 0)
+ {
+ unsigned long long value = XHash::hash(_elementBuffer);
+
+ _currentNodeID = _xtree->addText(_idStack[_stackTop],
+ _lsidStack[_stackTop],
+ _elementBuffer, value);
+
+ _valueStack[_stackTop] += value;
+ }
+ else // an empty element
+ {
+ _currentNodeID = _xtree->addText(_idStack[_stackTop],
+ _lsidStack[_stackTop],
+ "", 0);
+ }
+
+ _readElement = false;
+ }
+ else // mixed contents
+ {
+ if (_elementBuffer.length() > 0)
+ {
+ string text = _trim(_elementBuffer);
+ if (text.length() > 0)
+ {
+ unsigned long long value = XHash::hash(text);
+ _currentNodeID =
+ _xtree->addText(_idStack[_stackTop],
+ _lsidStack[_stackTop],
+ text, value);
+ _valueStack[_stackTop] += value;
+ }
+ }
+ }
+
+ _elementBuffer = string("");
+ _xtree->addHashValue(_idStack[_stackTop], _valueStack[_stackTop]);
+ _valueStack[_stackTop-1] += _valueStack[_stackTop] *
+ _valueStack[_stackTop];
+ _lsidStack[_stackTop-1] = _idStack[_stackTop];
+
+ // Pop
+ _stackTop--;
+}
+
+// The lexical handler methods.
+void XParser::startCDATA()
+{
+ int textid = _currentNodeID + 1;
+ string text = _elementBuffer;
+ _xtree->addCDATA(textid, text.length());
+}
+
+void XParser::endCDATA()
+{
+ int textid = _currentNodeID + 1;
+ string text = _elementBuffer;
+ _xtree->addCDATA(textid, text.length());
+}
+
+string XParser::_trim(const char* input)
+{
+ string base(input);
+ if (base.length() == 0)
+ return string("");
+
+ int start = base.find_first_not_of(_trimReject);
+ if (start < 0)
+ return string("");
+
+ int end = base.find_last_not_of(_trimReject);
+ return base.substr(start, end - start + 1);
+}
+
+string XParser::_trim(string input)
+{
+ if (input.length() == 0)
+ return string("");
+
+ int start = input.find_first_not_of(_trimReject);
+ if (start < 0)
+ return string("");
+
+ int end = input.find_last_not_of(_trimReject);
+ return input.substr(start, end - start + 1);
+}
+
+// For testing purpose.
+/*
+int main(int argc, char* args[])
+{
+ try
+ {
+ XMLPlatformUtils::Initialize();
+ }
+ catch (const XMLException& e)
+ {
+ cerr << "Error during initialization! :\n"
+ << e.getMessage() << endl;
+ exit(1);
+ }
+
+ XParser *parser = new XParser();
+ XTree *xtree = parser->parse(args[1]);
+
+ xtree->dump();
+
+ delete parser;
+ delete xtree;
+}
+*/
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __XPARSER__
+#define __XPARSER__
+
+#include <string>
+
+#include <sax2/Attributes.hpp>
+#include <sax2/DefaultHandler.hpp>
+#include <sax2/SAX2XMLReader.hpp>
+#include <sax2/XMLReaderFactory.hpp>
+#include <util/PlatformUtils.hpp>
+#include <util/XMLString.hpp>
+
+#include "XTree.hpp"
+#include "XHash.hpp"
+
+#define _STACK_SIZE 64
+
+
+/**
+ * XParser parses an input XML document and constructs an XTree.
+ * Note: this parser may generate incorrect result on characters whose
+ * ASCII value is beyond 127.
+ */
+class XParser : public DefaultHandler
+{
+public:
+
+ XParser();
+ ~XParser();
+
+ /**
+ * Parse an XML document
+ * @param uri input XML document
+ * @return the created XTree
+ */
+ XTree* parse(const char* uri);
+
+ // Document handler methods
+
+ void startElement(const XMLCh* const uri, const XMLCh* const local,
+ const XMLCh* const raw, const Attributes& attrs);
+
+ void characters(const XMLCh* const ch, const unsigned int length);
+
+ void endElement(const XMLCh* const uri, const XMLCh* const local,
+ const XMLCh* const raw);
+
+ void startCDATA();
+
+ void endCDATA();
+
+private:
+
+ static const char* _feature_Validation;
+ static const char* _feature_NameSpaces;
+ static const char* _feature_SchemaSupport;
+ static const char* _feature_SchemaFullSupport;
+ static const char* _feature_NameSpacePrefixes;
+ static bool _setValidation, _setNameSpaces;
+ static bool _setSchemaSupport, _setSchemaFullSupport;
+ static bool _setNameSpacePrefixes;
+ static const char* _trimReject;
+
+ SAX2XMLReader *_parser;
+ XTree *_xtree;
+ int _idStack[_STACK_SIZE];
+ int _lsidStack[_STACK_SIZE]; // id and left sibling
+ unsigned long long _valueStack[_STACK_SIZE];
+ int _stackTop, _currentNodeID;
+ bool _readElement;
+ string _elementBuffer;
+
+ /**
+ * Trim a string.
+ * @param input input string
+ * @return a trimmed string
+ */
+ string _trim(const char* input);
+ string _trim(string input);
+};
+
+#endif
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "XTree.hpp"
+
+const int XTree::MATCH = 0;
+const int XTree::NO_MATCH = -1;
+const int XTree::INSERT = -1;
+const int XTree::DELETE = -1;
+const int XTree::CHANGE = 1;
+const int XTree::NULL_NODE = -1;
+const int XTree::NO_CONNECTION = 1048576;
+
+const int XTree::_TOP_LEVEL_CAPACITY = 16384;
+const int XTree::_BOT_LEVEL_CAPACITY = 4096;
+const int XTree::_ROOT = 0;
+
+
+XTree::XTree()
+: _tagNames(256),
+ _cdataTable(256)
+{
+ _topCap = _TOP_LEVEL_CAPACITY;
+ _botCap = _BOT_LEVEL_CAPACITY;
+ _initialize();
+}
+
+XTree::XTree(int topcap, int botcap)
+: _tagNames(256),
+ _cdataTable(256)
+{
+ _topCap = topcap;
+ _botCap = botcap;
+ _initialize();
+}
+
+XTree::~XTree()
+{
+ int size = _elementIndex / _botCap;
+ if (size >= 0)
+ {
+ for (int i = 0; i <= size; i++)
+ {
+ delete[] _valueIndex[i];
+ delete[] _firstChild[i];
+ delete[] _nextSibling[i];
+ delete[] _childrenCount[i];
+ delete[] _matching[i];
+ delete[] _isAttribute[i];
+ delete[] _hashValue[i];
+ }
+ }
+ delete[] _firstChild;
+ delete[] _nextSibling;
+ delete[] _childrenCount;
+ delete[] _valueIndex;
+ delete[] _matching;
+ delete[] _isAttribute;
+ delete[] _hashValue;
+
+ size = _valueCount / _botCap;
+ for (int i = 0; i <= size; i++)
+ delete[] _value[i];
+ delete[] _value;
+}
+
+void XTree::_initialize()
+{
+ _firstChild = new int*[_topCap];
+ _nextSibling = new int*[_topCap];
+ _childrenCount = new int*[_topCap];
+ _valueIndex = new int*[_topCap];
+ _matching = new int*[_topCap];
+ _isAttribute = new bool*[_topCap];
+ _hashValue = new unsigned long long*[_topCap];
+ _value = new string*[_topCap];
+
+ _value[0] = new string[_botCap];
+ _elementIndex = -1;
+ _tagIndex = -1;
+ _valueCount = _botCap - 1;
+}
+
+void XTree::_expand(int topid)
+{
+ _firstChild[topid] = new int[_botCap];
+ _nextSibling[topid] = new int[_botCap];
+ _childrenCount[topid] = new int[_botCap];
+ _valueIndex[topid] = new int[_botCap];
+ _matching[topid] = new int[_botCap];
+ _isAttribute[topid] = new bool[_botCap];
+ _hashValue[topid] = new unsigned long long[_botCap];
+
+ for (int i = 0; i < _botCap; i++)
+ {
+ _firstChild[topid][i] = NULL_NODE;
+ _nextSibling[topid][i] = NULL_NODE;
+ _childrenCount[topid][i]= 0;
+ _matching[topid][i] = MATCH;
+ _valueIndex[topid][i] = -1;
+ _isAttribute[topid][i] = false;
+ }
+}
+
+int XTree::addElement(int pid, int lsid, string tagName)
+{
+ _elementIndex++;
+
+ int topid = _elementIndex / _botCap;
+ int botid = _elementIndex % _botCap;
+ if (botid == 0)
+ _expand(topid);
+
+ // Check if we've got the element name
+ hash_map <string, int, HashString>::const_iterator
+ hit = _tagNames.find(tagName);
+ if (hit != _tagNames.end())
+ {
+ int id = hit->second;
+ _valueIndex[topid][botid] = id;
+ }
+ else
+ {
+ _tagIndex++;
+ _value[0][_tagIndex] = tagName;
+ _tagNames[tagName] = _tagIndex;
+ _valueIndex[topid][botid] = _tagIndex;
+ }
+
+ if (pid == NULL_NODE)
+ return _elementIndex;
+
+ int ptopid = pid / _botCap;
+ int pbotid = pid % _botCap;
+ // parent-child relation or sibling-sibling ralation
+ if (lsid == NULL_NODE)
+ _firstChild[ptopid][pbotid] = _elementIndex;
+ else
+ _nextSibling[lsid/_botCap][lsid%_botCap] = _elementIndex;
+
+ // update children count
+ _childrenCount[ptopid][pbotid]++;
+
+ return _elementIndex;
+}
+
+int XTree::addText(int eid, int lsid, string text, unsigned long long value)
+{
+ _elementIndex++;
+
+ int topid = _elementIndex / _botCap;
+ int botid = _elementIndex % _botCap;
+ if (botid == 0)
+ _expand(topid);
+
+ int etopid = eid / _botCap;
+ int ebotid = eid % _botCap;
+ if (lsid == NULL_NODE)
+ _firstChild[etopid][ebotid] = _elementIndex;
+ else
+ _nextSibling[lsid/_botCap][lsid%_botCap] = _elementIndex;
+
+ _childrenCount[etopid][ebotid]++;
+ _hashValue[topid][botid] = value;
+
+ _valueCount++;
+ int vtopid = _valueCount / _botCap;
+ int vbotid = _valueCount % _botCap;
+ if (vbotid == 0)
+ _value[vtopid] = new string[_botCap];
+
+ _value[vtopid][vbotid] = text;
+ _valueIndex[topid][botid] = _valueCount;
+
+ return _elementIndex;
+}
+
+int XTree::addAttribute(int eid, int lsid, string name, string value,
+ unsigned long long valuehash,
+ unsigned long long attrhash)
+{
+ // attribute name first.
+ int aid = addElement(eid, lsid, name);
+
+ // attribute value second.
+ addText(aid, NULL_NODE, value, valuehash);
+
+ // hash value third
+ int atopid = aid / _botCap;
+ int abotid = aid % _botCap;
+ _isAttribute[atopid][abotid] = true;
+ _hashValue[atopid][abotid] = attrhash;
+
+ return aid;
+}
+
+void XTree::addHashValue(int eid, unsigned long long value)
+{
+ _hashValue[eid/_botCap][eid%_botCap] = value;
+}
+
+void XTree::addCDATA(int eid, size_t position)
+{
+ hash_map<int, vector<size_t> >::const_iterator
+ hit = _cdataTable.find(eid);
+ if (hit != _cdataTable.end())
+ {
+ vector<size_t> poslist = hit->second;
+ poslist.push_back(position);
+ _cdataTable[eid] = poslist;
+ }
+ else
+ {
+ vector<size_t> poslist;
+ poslist.push_back(position);
+ _cdataTable[eid] = poslist;
+ }
+}
+
+void XTree::addMatching(int eid, int matchType, int matchNode = -1)
+{
+ if (matchType == NO_MATCH)
+ _matching[eid/_botCap][eid%_botCap] = NO_MATCH;
+ else if (matchType == MATCH)
+ _matching[eid/_botCap][eid%_botCap] = MATCH;
+ else
+ _matching[eid/_botCap][eid%_botCap] = matchNode + 1;
+}
+
+void XTree::getMatching(int eid, int &matchType, int &matchNode)
+{
+ int mid = _matching[eid/_botCap][eid%_botCap];
+ if (mid == NO_MATCH)
+ matchType = NO_MATCH;
+ else if (mid == MATCH)
+ matchType = MATCH;
+ else
+ {
+ matchType = CHANGE;
+ matchNode = mid - 1;
+ }
+}
+
+int XTree::getRoot()
+{
+ return _ROOT;
+}
+
+int XTree::getFirstChild(int eid)
+{
+ int cid = _firstChild[eid/_botCap][eid%_botCap];
+ while (cid > _ROOT)
+ {
+ int ctopid = cid / _botCap;
+ int cbotid = cid % _botCap;
+ if (_isAttribute[ctopid][cbotid])
+ cid = _nextSibling[ctopid][cbotid];
+ else
+ return cid;
+ }
+
+ return NULL_NODE;
+}
+
+int XTree::getNextSibling(int eid)
+{
+ return _nextSibling[eid/_botCap][eid%_botCap];
+}
+
+int XTree::getFirstAttribute(int eid)
+{
+ int aid = _firstChild[eid/_botCap][eid%_botCap];
+ if ((aid > _ROOT) && (_isAttribute[aid/_botCap][aid%_botCap]))
+ return aid;
+ else
+ return NULL_NODE;
+}
+
+int XTree::getNextAttribute(int aid)
+{
+ int aid1 = _nextSibling[aid/_botCap][aid%_botCap];
+ if ((aid1 > _ROOT) && (_isAttribute[aid1/_botCap][aid1%_botCap]))
+ return aid1;
+ else
+ return NULL_NODE;
+}
+
+string XTree::getAttributeValue(int aid)
+{
+ int cid = _firstChild[aid/_botCap][aid%_botCap];
+ int index = _valueIndex[cid/_botCap][cid%_botCap];
+ if (index > _ROOT)
+ return _value[index/_botCap][index%_botCap];
+ else
+ return NULL;
+}
+
+unsigned long long XTree::getHashValue(int eid)
+{
+ return _hashValue[eid/_botCap][eid%_botCap];
+}
+
+vector<size_t> XTree::getCDATA(int eid)
+{
+ hash_map<int, vector<size_t> >::const_iterator
+ hit = _cdataTable.find(eid);
+ if (hit != _cdataTable.end())
+ return hit->second;
+ else
+ {
+ vector<size_t> a;
+ return a;
+ }
+}
+
+int XTree::getChildrenCount(int eid)
+{
+ return _childrenCount[eid/_botCap][eid%_botCap];
+}
+
+int XTree::getDecendentsCount(int eid)
+{
+ int topid = eid / _botCap;
+ int botid = eid % _botCap;
+ int count = _childrenCount[topid][botid];
+ if (count == 0)
+ return 0;
+
+ int cid = _firstChild[topid][botid];
+ while (cid > _ROOT)
+ {
+ count += getDecendentsCount(cid);
+ cid = _nextSibling[cid/_botCap][cid%_botCap];
+ }
+
+ return count;
+}
+
+int XTree::getValueIndex(int eid)
+{
+ return _valueIndex[eid/_botCap][eid%_botCap];
+}
+
+string XTree::getValue(int index)
+{
+ return _value[index/_botCap][index%_botCap];
+}
+
+string XTree::getTag(int eid)
+{
+ int index = _valueIndex[eid/_botCap][eid%_botCap];
+ return _value[0][index];
+}
+
+string XTree::getText(int eid)
+{
+ int index = _valueIndex[eid/_botCap][eid%_botCap];
+ return _value[index/_botCap][index%_botCap];
+}
+
+bool XTree::isElement(int eid)
+{
+ int index = _valueIndex[eid/_botCap][eid%_botCap];
+ if (index < _botCap)
+ return true;
+ else
+ return false;
+}
+
+bool XTree::isLeaf(int eid)
+{
+ int index = _valueIndex[eid/_botCap][eid%_botCap];
+ if (index < _botCap)
+ return false;
+ else
+ return true;
+}
+
+bool XTree::isAttribute(int eid)
+{
+ return _isAttribute[eid/_botCap][eid%_botCap];
+}
+
+int XTree::getNodeCount()
+{
+ return _elementIndex;
+}
+
+void XTree::dump()
+{
+ cout << "eid\tfirstC\tnextS\tattr?\tcCount\thash\tmatch\tvalue\n";
+ for (int i = _ROOT; i <= _elementIndex; i++)
+ {
+ int topid = i / _botCap;
+ int botid = i % _botCap;
+ int vid = _valueIndex[topid][botid];
+ int vtopid = vid / _botCap;
+ int vbotid = vid % _botCap;
+ cout << i << "\t" << _firstChild[topid][botid] << "\t"
+ << _nextSibling[topid][botid] << "\t"
+ << _isAttribute[topid][botid] << "\t"
+ << _childrenCount[topid][botid] << "\t"
+ << _hashValue[topid][botid] << "\t"
+ << _matching[topid][botid] << "\t"
+ << _value[vtopid][vbotid] << endl;
+ }
+}
+
+void XTree::dumpHash()
+{
+ cout << "hash table:" << _tagNames.size() << endl;
+ hash_map<string, int, HashString>::const_iterator
+ hit;// = _tagNames.begin();
+ for(hit=_tagNames.begin(); hit != _tagNames.end(); hit++)
+ {
+ cout << hit->first << "\t" << hit->second << endl;
+ //hit++;
+ }
+}
--- /dev/null
+/**
+ * Copyright (c) 2001 - 2005
+ * Yuan Wang. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Redistributions in any form must be accompanied by information on
+ * how to obtain complete source code for the X-Diff software and any
+ * accompanying software that uses the X-Diff software. The source code
+ * must either be included in the distribution or be available for no
+ * more than the cost of distribution plus a nominal fee, and must be
+ * freely redistributable under reasonable conditions. For an executable
+ * file, complete source code means the source code for all modules it
+ * contains. It does not include source code for modules or files that
+ * typically accompany the major components of the operating system on
+ * which the executable file runs.
+ *
+ * THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+ * ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __XTREE__
+#define __XTREE__
+
+#include <iostream>
+#include <string>
+#include <hash_map>
+#include <vector>
+
+class HashString
+{
+public:
+ bool operator()(string const &str) const
+ {
+ return hash<char const *>()(str.c_str());
+ }
+};
+
+
+/**
+ * XTree provides the storage for input XML documents. Basically, an XML
+ * document is parsed by a SAX parser and stored in an XTree.
+ */
+class XTree
+{
+public:
+
+ static const int MATCH, NO_MATCH, INSERT, DELETE, CHANGE;
+ static const int NULL_NODE, NO_CONNECTION;
+
+ XTree();
+ XTree(int topcap, int botcap);
+ ~XTree();
+
+ // Start -- methods for constructing a tree.
+
+ /**
+ * Add a new element to the tree.
+ * @param pid parent id
+ * @param lsid left-side sibling id
+ * @param tagName element name
+ * @return the element id in the tree.
+ */
+ int addElement(int pid, int lsid, string tagName);
+
+ // Add a text node.
+ int addText(int eid, int lsid, string text, unsigned long long value);
+
+ /**
+ * Add an attribute.
+ * @param eid element id
+ * @param lsid the sibling id on the left
+ * @param name attribute name
+ * @param value attribute value
+ * @param valuehash hash value of the value
+ * @param attrhash hash value of the entire attribute
+ * @return the element id of the attribute
+ */
+ int addAttribute(int eid, int lsid, string name, string value,
+ unsigned long long valuehash,
+ unsigned long long attrhash);
+
+ // Add more information (hash value) to the tree.
+ void addHashValue(int eid, unsigned long long value);
+
+ /**
+ * Add a CDATA section (either a start or an end) to the CDATA
+ * hashtable, in which each entry should have an even number of
+ * position slots. This value is interpreted as a string.
+ * @param eid The text node id
+ * @param position the section tag position
+ */
+ void addCDATA(int eid, size_t position);
+
+ /**
+ * Add matching information.
+ * @param eid element id
+ * @param matchType match type
+ * @param matchNode matched element id
+ */
+ void addMatching(int eid, int matchType, int matchNode = -1);
+ // End -- methods for constructing a tree.
+
+ // Start -- methods for accessing a tree.
+ // Get matching information.
+ void getMatching(int eid, int &matchType, int &matchNode);
+
+ int getRoot();
+
+ int getFirstChild(int eid);
+
+ int getNextSibling(int eid);
+
+ int getFirstAttribute(int eid);
+
+ int getNextAttribute(int aid);
+
+ string getAttributeValue(int aid);
+
+ unsigned long long getHashValue(int eid);
+
+ /**
+ * Get the CDATA position list for a text node.
+ * @param eid The text node id
+ * @return the position vector.
+ */
+ vector<unsigned int> getCDATA(int eid);
+
+ int getChildrenCount(int eid);
+
+ int getDecendentsCount(int eid);
+
+ int getValueIndex(int eid);
+
+ string getTag(int eid);
+
+ // Get the value of a leaf node using the value index
+ string getValue(int index);
+
+ // Get the value of a leaf node using the node id
+ string getText(int eid);
+
+ // Check if a node is an element node.
+ bool isElement(int eid);
+
+ // Check if a node is a leaf text node.
+ bool isLeaf(int eid);
+
+ // Check if a ndoe is an attribute node
+ bool isAttribute(int eid);
+
+ // Return the number of nodes in the tree.
+ int getNodeCount();
+
+ // End -- methods for accessing a tree.
+ // For testing purpose.
+ void dump();
+ void dumpHash();
+
+private:
+
+ static const int _TOP_LEVEL_CAPACITY, _BOT_LEVEL_CAPACITY;
+ static const int _ROOT;
+
+ int _topCap, _botCap, _elementIndex, _tagIndex, _valueCount;
+ int **_firstChild, **_nextSibling, **_childrenCount;
+ int **_valueIndex, **_matching;
+ bool **_isAttribute;
+ unsigned long long **_hashValue;
+ string **_value;
+ hash_map<string, int, HashString> _tagNames;
+ hash_map<int, vector<size_t> > _cdataTable;
+
+ void _initialize();
+ void _expand(int topid);
+};
+#endif