Merge branch 'master' into translations
[QuestHelper.git] / Development / xml.lua
blob0752a5496eacc8736481bbd1bb320e768ab49e65
1 -- Here we attempt to convert XML text into a Lua table.
2 -- This isn't an exact conversion, and is more special case
3 -- to suit my purposes, but it's good enough.
5 -- This codepage seems to work fine for English, German, French, and I assume Spanish.
6 -- I don't know how the other locales are encoded, and I'm sure it will make their already messed up text even more unreadable.
-- Maps a single byte value (0..255) to the Unicode code point it represents in
-- Windows-1252. Bytes 0x00-0x7F and 0xA0-0xFF map to the code point with the
-- same number; only 0x80-0x9F differ, and those are listed explicitly below.
-- 0xFFFD (the replacement character) marks bytes with no assigned character.
local codepage = {}
do
  local high_range = {
    -- 0x80 .. 0x8F
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    -- 0x90 .. 0x9F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178,
  }

  for byte = 0x00, 0xFF do
    codepage[byte] = byte
  end

  -- high_range[1] belongs to byte 0x80, hence the 0x7F offset.
  for offset, code in ipairs(high_range) do
    codepage[0x7F + offset] = code
  end
end
-- Lookup tables for the UTF-8 validator/encoder in correctText.

-- char_scale[k]: divisor that drops the k lowest 6-bit groups of a code point
-- (64^k for k = 0..5). Index 6 is not read by the encoder.
local char_scale = {
  [0] = 0x01,
  [1] = 0x40,
  [2] = 0x1000,
  [3] = 0x40000,
  [4] = 0x1000000,
  [5] = 0x40000000,
  [6] = 0x80000000,
}

-- char_max[n]: largest code point that is encoded with n bytes.
local char_max = {
  [0] = 0x00,
  [1] = 0x7f,
  [2] = 0x7ff,
  [3] = 0xffff,
  [4] = 0x1fffff,
  [5] = 0x3ffffff,
  [6] = 0x7fffffff,
}

-- char_base[n]: exclusive upper bound for the lead byte of an n-byte sequence,
-- and at the same time the marker bits added to the lead byte of an
-- (n+1)-byte sequence.
local char_base = {
  [0] = 0x00,
  [1] = 0xc0,
  [2] = 0xe0,
  [3] = 0xf0,
  [4] = 0xf8,
  [5] = 0xfc,
  [6] = 0xfe,
}
32 -- Converts non-UTF-8 characters to UTF-8, assuming they're encoded as Windows-1252.
33 -- No guarantees it works; after all, this was malformed text to begin with.
-- Encodes a single code point as a UTF-8 byte string, using the file-level
-- char_max / char_base / char_scale tables.
local function encodeCodepoint(char)
  for size = 1, 6 do
    if char <= char_max[size] then
      -- Lead byte: marker bits plus the highest-order payload bits.
      local s = string.char(char_base[size - 1] + math.floor(char / char_scale[size - 1]))

      -- Continuation bytes, most significant 6-bit group first.
      for o = size - 2, 0, -1 do
        s = s .. string.char(0x80 + (math.floor(char / char_scale[o]) % 0x40))
      end

      return s
    end
  end

  error("code point out of UTF-8 range: " .. tostring(char))
end

--- Repairs text that should be UTF-8 but contains stray single bytes.
-- Well-formed sequences (a lead byte followed by the right number of
-- 0x80-0xBF continuation bytes) are copied through unchanged. Any byte that
-- does not start such a sequence is looked up in codepage and replaced by the
-- UTF-8 encoding of the resulting code point.
--
-- The output is assembled from fragments and joined once, so the cost is
-- linear in the input length even when many bytes need replacing.
--
-- @param text the string to repair
-- @return the repaired string
function correctText(text)
  local length = string.len(text)
  local pieces, count = {}, 0 -- output fragments
  local run_start = 1         -- first byte of the not-yet-copied valid run
  local i = 1

  while i <= length do
    local lead = string.byte(text, i)
    local seq_len = nil

    -- Work out how long a sequence this lead byte announces. Bytes in
    -- 0x80-0xBF are continuation bytes and 0xFE/0xFF never occur in UTF-8,
    -- so neither can start a sequence.
    if lead < 0x80 then
      seq_len = 1
    elseif lead >= char_base[1] then
      for l = 2, 6 do
        if lead < char_base[l] then
          seq_len = l
          break
        end
      end
    end

    -- Every following byte of the sequence must be a continuation byte;
    -- running off the end of the text also invalidates it.
    if seq_len then
      for c = i + 1, i + seq_len - 1 do
        local b = string.byte(text, c)
        if not b or b < 0x80 or b >= 0xC0 then
          seq_len = nil
          break
        end
      end
    end

    if seq_len then
      i = i + seq_len
    else
      -- Flush the valid text seen so far, then emit the re-encoded byte.
      if run_start < i then
        count = count + 1
        pieces[count] = string.sub(text, run_start, i - 1)
      end

      count = count + 1
      pieces[count] = encodeCodepoint(codepage[lead])

      i = i + 1
      run_start = i
    end
  end

  -- Nothing needed repairing: hand back the input untouched.
  if count == 0 then
    return text
  end

  if run_start <= length then
    count = count + 1
    pieces[count] = string.sub(text, run_start)
  end

  return table.concat(pieces)
end
--- Creates the container used for a parsed XML element.
-- @return a new, empty table
local function createObj()
  local fresh = {}
  return fresh
end
--- Reads a name made of alphabetic characters from the stream.
-- Whitespace before and after the name is skipped. Reading stops at the first
-- non-alphabetic character or at end of input.
-- @param data stream object providing skipws(), peek() and get()
-- @return the name, or nil if no alphabetic character was found
local function readVar(data)
  data:skipws()

  local letters = {}
  while true do
    local c = data:peek()
    -- peek() yields nil at end of input; string.find would raise on that.
    if not c or not string.find(c, "%a") then
      break
    end
    letters[#letters + 1] = data:get()
  end

  data:skipws()

  if #letters == 0 then
    return nil
  end
  return table.concat(letters)
end
--- Reads an attribute value from the stream.
-- A value that starts with a double quote is read up to the next unescaped
-- double quote; quotes that came from an entity (second result of get() is
-- true) are kept as part of the value. If the input ends before the closing
-- quote, the characters collected so far are returned. Anything that does not
-- start with a quote is read as a bare name via readVar.
-- @param data stream object providing skipws(), peek() and get()
-- @return the value as a string, or whatever readVar returns for bare values
local function readString(data)
  data:skipws()

  if data:peek() ~= "\"" then
    return readVar(data)
  end

  data:get() -- consume the opening quote

  local parts = {}
  while true do
    local c, escaped = data:get()
    -- Stop at the closing quote, or at end of input for unterminated values.
    if c == nil or (c == "\"" and not escaped) then
      break
    end
    parts[#parts + 1] = c
  end

  return table.concat(parts)
end
--- Parses the content of one XML element into obj.
-- Reads from data until the element's closing tag (or end of input). Each
-- child element becomes obj[childName]; a later child with the same name
-- replaces an earlier one. Attributes of a child are stored as string fields
-- on the child's table. Processing instructions (<? ... ?>) are skipped.
--
-- Text content is trimmed and converted with tonumber when possible. If the
-- element has text and nothing else, the text itself is returned; if it has
-- text plus attributes/children, the text is stored in obj.value.
--
-- @param obj table to fill (may already hold the element's attributes)
-- @param data stream object providing get(), peek(), skipws() and skipto()
-- @return obj, or the element's text value when obj would otherwise be empty
local function loadObj(obj, data)
  local buffer = CreateBuffer()
  data:skipws()

  while true do
    local c, escaped = data:get()
    if not c then break end

    if c ~= "<" or escaped then
      -- Ordinary character (or an escaped '<'): part of the text content.
      buffer:add(c)
    else
      data:skipws()
      local p = data:peek()

      if p == "?" then
        data:skipto("?>")
      elseif p == "/" then
        -- Closing tag of the element being parsed.
        data:skipto(">")
        break
      else
        local name = readVar(data)
        if name then
          local child = createObj()
          obj[name] = child

          -- Attributes: name[=value] pairs up to '/' or '>'.
          while true do
            local nxt = data:peek()
            -- peek() yields nil at end of input; string.find would raise.
            if not nxt or string.find(nxt, "[/>]") then break end

            local varname = readVar(data)
            if not varname then break end

            if data:peek() == "=" then
              data:get()
              local attr = readString(data)
              if attr then
                child[varname] = attr
              end
            end
          end

          if data:peek() == ">" then
            data:get()
            obj[name] = loadObj(child, data)
          else
            -- assuming got "/>", closing the tag.
            data:skipto(">")
          end
        end
      end
    end
  end

  -- Trim surrounding whitespace from the collected text.
  local value = string.match(buffer:dump(), "^%s*(.-)%s*$")
  if value == "" then
    value = nil
  else
    value = tonumber(value) or value
  end

  if value then
    if not next(obj) then
      return value
    end
    obj.value = value
  end

  return obj
end
-- Named entities understood by readText, mapped to their replacement text.
local xml_entities = {
  amp = "&",
  lt = "<",
  gt = ">",
  quot = "\"",
  apos = "'",
  nbsp = " ",
}

--- Consumes and returns the next character of the stream.
-- self[1] is the text, self[2] the read position, and self[3]/self[4] hold a
-- character (and its escaped flag) previously fetched by peekText.
--
-- "&name;" sequences for the entities in xml_entities are returned as their
-- replacement with the second result set to true. A '&' with no ';' after it,
-- or whose would-be entity name contains a space, is returned as a literal
-- "&" (with no second result). Any other entity name raises an error.
--
-- @param self the stream table
-- @return the character (nil at end of input), and whether it came from an
--         entity
local function readText(self)
  -- Hand out the character cached by a previous peek first.
  if self[3] then
    local c, e = self[3], self[4]
    self[3] = nil
    return c, e
  end

  local p = self[2]
  local c = string.sub(self[1], p, p)

  if c == "" then
    return nil, false
  elseif c ~= "&" then
    self[2] = p + 1
    return c, false
  end

  -- Everything between the '&' and the next ';' is the candidate entity name.
  local _, last, code = string.find(self[1], "^(.-);", p + 1)

  if not code then
    --print("EOF while reading escape sequence, assuming literal '&' intended.")
    self[2] = p + 1
    return "&"
  end

  local replacement = xml_entities[code]
  if replacement then
    self[2] = last + 1
    return replacement, true
  end

  if string.find(code, " ") then
    --print("Escape sequence contains spaces, assuming literal '&' intended.")
    self[2] = p + 1
    return "&"
  end

  self[2] = last + 1
  assert(false, "Unknown entity code: "..code)
end
--- Returns the next character of the stream without consuming it.
-- The character and its escaped flag are cached in self[3] and self[4]; a
-- cached character is returned as-is without reading again.
-- @param self the stream table
-- @return the character and its escaped flag, as produced by readText
local function peekText(self)
  if not self[3] then
    self[3], self[4] = readText(self)
  end
  return self[3], self[4]
end
--- Consumes whitespace characters from the stream.
-- Stops at the first non-whitespace character or at end of input.
-- @param self the stream table
local function skipSpaces(self)
  while true do
    local c = peekText(self)
    -- peekText yields nil at end of input; string.find would raise on that.
    if not c or not string.find(c, "%s") then
      return
    end
    readText(self)
  end
end
--- Moves the read position to just after the next match of a Lua pattern.
-- The search starts at the current position. If the pattern does not match,
-- the position advances by one character instead. Any character cached by a
-- previous peek is discarded.
-- @param self the stream table
-- @param pattern Lua pattern to search for
local function skipTo(self, pattern)
  local _, match_end = string.find(self[1], pattern, self[2])
  self[2] = (match_end or self[2]) + 1
  self[3] = nil
end
--- Loads an XML file and converts it into a Lua table.
-- The path is resolved through FileUtil.fileName, the file contents are passed
-- through correctText, and the result is parsed with loadObj.
-- @param filename name of the XML file to read
-- @return the value parsed for the top-level "xml" element, or nil if the
--         file could not be opened
function XMLtoLUA(filename)
  local stream = io.open(FileUtil.fileName(filename), "r")
  if not stream then
    return nil
  end

  -- Stream object: [1] text, [2] read position, plus the reader methods.
  local data = {
    correctText(stream:read("*a")),
    1,
    get = readText,
    peek = peekText,
    skipws = skipSpaces,
    skipto = skipTo,
  }
  stream:close()

  local root = createObj()
  loadObj(root, data)
  return root.xml
end
271 --loadfile("dump.lua")()
272 --print(ScanAndDumpVariable(XMLtoLUA("test.xml"), "XML", true))