tools/i-pi/ipi/utils/io/io_xml.py

   1 """Contains the functions used to read the input file and print the checkpoint
   2 files with xml formatting.
   3
   4 Copyright (C) 2013, Joshua More and Michele Ceriotti
   5
   6 This program is free software: you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation, either version 3 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program. If not, see <http://www.gnu.org/licenses/>.
  18
  19
  20 Functions:
  21    xml_node: Class to handle a particular xml tag.
  22    xml_handler: Class giving general xml data reading methods.
  23    xml_parse_string: Parses a string made from a section of a xml input file.
  24    xml_parse_file: Parses an entire xml input file.
  25    read_type: Reads a string and outputs data of a specified type.
  26    read_float: Reads a string and outputs a float.
  27    read_int: Reads a string and outputs an integer.
  28    read_bool: Reads a string and outputs a boolean.
  29    read_list: Reads a string and outputs a list.
  30    read_array: Reads a string and outputs an array.
  31    read_tuple: Reads a string and outputs a tuple.
  32    read_dict: Reads a string and outputs a dictionary.
  33    write_type: Writes a string from data of a specified type.
  34    write_list: Writes a string from a list.
  35    write_tuple: Writes a string from a tuple.
  36    write_float: Writes a string from a float.
  37    write_bool: Writes a string from a boolean.
  38    write_dict: Writes a string from a dictionary.
  39 """
  40
# Public names exported by a star-import of this module. The dispatch tables
# readtype_funcs and writetype_funcs are not part of this list.
__all__ = ['xml_node', 'xml_handler', 'xml_parse_string', 'xml_parse_file',
           'read_type', 'read_float', 'read_int', 'read_bool', 'read_list',
           'read_array', 'read_tuple', 'read_dict', 'write_type', 'write_list',
           'write_tuple', 'write_float', 'write_bool', 'write_dict']
  45
  46 from xml.sax import parseString, parse
  47 from xml.sax.handler import ContentHandler
  48 import numpy as np
  49 import string
  50
  51 class xml_node(object):
  52    """Class to handle a particular xml tag.
  53
  54    Tags are generally written in the form
  55    <tag_name attribs="attrib_data"> main_data </tag_name>. This class holds
  56    tag_name, attrib_data and main_data separately so they can be used to
  57    create the objects with the appropriate names and data.
  58
  59    Attributes:
  60       attribs: The attribute data for the tag.
  61       fields: The rest of the data.
  62       name: The tag name.
  63    """
  64
  65    def __init__(self, attribs=None, name="", fields=None):
  66       """Initialises xml_node.
  67
  68       Args:
  69          attribs: An optional dictionary giving attribute data. Defaults to {}.
  70          fields: An optional dictionary holding all the data between the start
  71             and end tags, including information about other nodes.
  72             Defaults to {}.
  73          name: An optional string giving the tag name. Defaults to ''.
  74       """
  75
  76       if attribs is None:
  77          attribs = {}
  78       if fields is None:
  79          fields = []
  80
  81       self.attribs = attribs
  82       self.name = name
  83       self.fields = fields
  84
  85
  86 class xml_handler(ContentHandler):
  87    """Class giving general xml_reading methods.
  88
  89    Uses the standard python xml_reader to read the different kinds of data.
  90    Keeps track of the heirarchial nature of an xml file by recording the level
  91    of nesting, so that the correct data and attributes can be associated with
  92    the correct tag name.
  93
  94    Attributes:
  95       root: An xml_node object for the root node.
  96       open: The list of the tags that the parser is currently between the start
  97          and end tags of.
  98       level: The level of nesting that the parser is currently at.
  99       buffer: A list of the data found between the tags at the different levels
 100          of nesting.
 101    """
 102
 103    def __init__(self):
 104       """Initialises xml_handler."""
 105
 106       #root xml node with all the data
 107       self.root = xml_node(name="root", fields=[])
 108       self.open = [self.root]
 109       #current level of the hierarchy
 110       self.level = 0
 111       #Holds all the data between each of the tags.
 112       #If level = 1, then buffer[0] holds all the data collected between the
 113       #root tags, and buffer[1] holds all the data collected between the
 114       #first child tag.
 115       self.buffer = [[""]]
 116
 117    def startElement(self, name, attrs):
 118       """Reads an opening tag.
 119
 120       Adds the opening tag to the list of open tags, adds a new space in the
 121       buffer, reads the appropriate attributes and adds a new level to the
 122       heirarchy.
 123
 124       Args:
 125          name: The tag_name.
 126          attrs: The attribute data.
 127       """
 128
 129       #creates a new node
 130       newnode = xml_node(attribs=dict((k,attrs[k]) for k in attrs.keys()), name=name, fields=[])
 131       #adds it to the list of open nodes
 132       self.open.append(newnode)
 133       #adds it to the list of fields of the parent tag
 134       self.open[self.level].fields.append((name,newnode))
 135       #gets ready to read new data
 136       self.buffer.append([""])
 137       self.level += 1
 138
 139    def characters(self, data):
 140       """Reads data.
 141
 142       Adds the data to the buffer of the current level of the heirarchy.
 143       Data is read as a string, and needs to be converted to the required
 144       type later.
 145
 146       Args:
 147          data: The data to be read.
 148       """
 149
 150       self.buffer[self.level].append(data)
 151
 152    def endElement(self, name):
 153       """Reads a closing tag.
 154
 155       Once all the data has been read, and the closing tag found, the buffer
 156       is read into the appropriate field.
 157
 158       Args:
 159          name: The tag_name.
 160       """
 161
 162       #all the text found between the tags stored in the appropriate xml_node
 163       #object
 164       self.buffer[self.level] = ''.join(self.buffer[self.level])
 165       self.open[self.level].fields.append(("_text" , self.buffer[self.level]))
 166       #'closes' the xml_node object, as we are no longer within its tags, so
 167       #there is no more data to be added to it.
 168       #Note that the xml_node is still held within the parent tag, so we
 169       #no longer require this xml node object.
 170       self.buffer.pop(self.level)
 171       self.open.pop(self.level)
 172       self.level -= 1
 173
 174 def xml_parse_string(buf):
 175    """Parses a string made from a section of a xml input file.
 176
 177    Args:
 178       buf: A string in correct xml format.
 179
 180    Returns:
 181       A xml_node for the root node of the file.
 182    """
 183
 184    myhandle = xml_handler()
 185    parseString(buf, myhandle)
 186    return myhandle.root
 187
 188 def xml_parse_file(stream):
 189    """Parses an entire xml input file.
 190
 191    Args:
 192       stream: A string describing a xml formatted file.
 193
 194    Returns:
 195       A xml_node for the root node of the file.
 196    """
 197
 198    myhandle = xml_handler()
 199    parse(stream, myhandle)
 200    return myhandle.root
 201
 202 def read_type(type, data):
 203    """Reads a string and outputs data of a specified type.
 204
 205    Args:
 206       type: The data type of the target container.
 207       data: The string to be read in.
 208
 209    Raises:
 210       TypeError: Raised if it tries to read into a data type that has not been
 211          implemented.
 212
 213    Returns:
 214       An object of type type.
 215    """
 216
 217    if not type in readtype_funcs:
 218       raise TypeError("Conversion not available for given type")
 219    return type(readtype_funcs[type](data))
 220
 221 def read_float(data):
 222    """Reads a string and outputs a float.
 223
 224    Args:
 225       data: The string to be read in.
 226
 227    Raises:
 228       ValueError: Raised if the input data is not of the correct format.
 229
 230    Returns:
 231       A float.
 232    """
 233
 234    return float(data)
 235
 236 def read_int(data):
 237    """Reads a string and outputs a integer.
 238
 239    Args:
 240       data: The string to be read in.
 241
 242    Raises:
 243       ValueError: Raised if the input data is not of the correct format.
 244
 245    Returns:
 246       An integer.
 247    """
 248
 249    return int(data)
 250
 251 def read_bool(data):
 252    """Reads a string and outputs a boolean.
 253
 254    Takes a string of the form 'true' or 'false', and returns the appropriate
 255    boolean.
 256
 257    Args:
 258       data: The string to be read in.
 259
 260    Raises:
 261       ValueError: Raised if the string is not 'true' or 'false'.
 262
 263    Returns:
 264       A boolean.
 265    """
 266
 267
 268    if data.strip().upper() == "TRUE":
 269       return True
 270    elif data.strip().upper() == "FALSE":
 271       return False
 272    else:
 273       raise ValueError(data + " does not represent a bool value")
 274
 275 def read_list(data, delims="[]", split=",", strip=" \n\t'"):
 276    """Reads a formatted string and outputs a list.
 277
 278    The string must be formatted in the correct way.
 279    The start character must be delimiters[0], the end character
 280    must be delimiters[1] and each element must be split along
 281    the character split. Characters at the beginning or
 282    end of each element in strip are ignored. The standard list format is of the
 283    form '[array[0], array[1],..., array[n]]', which is used for actual lists.
 284    Other formats are used for tuples and dictionaries.
 285
 286    Args:
 287       data: The string to be read in. '[]' by default.
 288       delims: A string of two characters giving the first and last character of
 289          the list format. ',' by default.
 290       split: The character between different elements of the list format.
 291       strip: Characters to be removed from the beginning and end of each
 292          element. ' \n\t' by default.
 293
 294    Raises:
 295       ValueError: Raised if the input data is not of the correct format.
 296
 297    Returns:
 298       A list of strings.
 299    """
 300
 301    try:
 302       begin = data.index(delims[0])
 303       end = data.index(delims[1])
 304    except ValueError:
 305       raise ValueError("Error in list syntax: could not locate delimiters")
 306
 307    rlist = data[begin+1:end].split(split)
 308    for i in range(len(rlist)):
 309       rlist[i] = rlist[i].strip(strip)
 310
 311    # handles empty lists correctly
 312    if len(rlist) == 1 and rlist[0] == "":
 313       rlist = []
 314
 315    return rlist
 316
 317 def read_array(dtype, data):
 318    """Reads a formatted string and outputs an array.
 319
 320    The format is as for standard python arrays, which is
 321    [array[0], array[1], ... , array[n]]. Note the use of comma separators, and
 322    the use of square brackets.
 323
 324    Args:
 325       data: The string to be read in.
 326       dtype: The data type of the elements of the target array.
 327
 328    Raises:
 329       ValueError: Raised if the input data is not of the correct format.
 330
 331    Returns:
 332       An array of data type dtype.
 333    """
 334
 335    rlist = read_list(data)
 336    for i in range(len(rlist)):
 337       rlist[i] = read_type(dtype,rlist[i])
 338
 339    return np.array(rlist, dtype)
 340
 341 def read_tuple(data, delims="()", split=",", strip=" \n\t'", arg_type=int):
 342    """Reads a formatted string and outputs a tuple.
 343
 344    The format is as for standard python tuples, which is
 345    (tuple[0], tuple[1], ... , tuple[n]). Note the comma
 346    separators, and the use of brackets.
 347
 348    Args:
 349       data: The string to be read in.
 350       delims: A string of two characters giving the first and last character of
 351          the list format. ',' by default.
 352       split: The character between different elements of the list format.
 353       strip: Characters to be removed from the beginning and end of each
 354          element. ' \n\t' by default.
 355       arg_type: The strings in the input will be converted, and a tuple
 356          of ar_type will be returned.
 357
 358    Raises:
 359       ValueError: Raised if the input data is not of the correct format.
 360
 361    Returns:
 362       A tuple of elements of the specified data type.
 363    """
 364
 365    rlist = read_list(data, delims=delims, split=split, strip=strip)
 366    return tuple([arg_type(i) for i in rlist])
 367
 368 def read_dict(data, delims="{}", split=",", key_split=":", strip=" \n\t"):
 369    """Reads a formatted string and outputs a dictionary.
 370
 371    The format is as for standard python dictionaries, which is
 372    {keyword[0]: arg[0], keyword[1]: arg[1], ... , keyword[n]: arg[n]}. Note the
 373    comma separators, and the use of curly brackets.
 374
 375    Args:
 376       data: The string to be read in.
 377       delims: A string of two characters giving the first and last character of
 378          the list format. ',' by default.
 379       split: The character between different elements of the list format.
 380       key_split: The character between the key word and the value.
 381       strip: Characters to be removed from the beginning and end of each
 382          element. ' \n\t' by default.
 383
 384    Raises:
 385       ValueError: Raised if the input data is not of the correct format.
 386
 387    Returns:
 388       A dictionary of strings.
 389    """
 390
 391    rlist = read_list(data, delims=delims, split=split, strip=strip)
 392    def mystrip(data):
 393       return data.strip(strip)
 394    rdict = {}
 395    for s in rlist:
 396       rtuple = map(mystrip,s.split(key_split))
 397       if not len(rtuple) == 2:
 398          raise ValueError("Format for a key:value format is wrong for item " + s)
 399       rdict[rtuple[0]] = rtuple[1]
 400
 401    return rdict
 402
 403 readtype_funcs = {np.ndarray: read_array, dict: read_dict, float: read_float, int: read_int, bool: read_bool, str: string.strip, tuple: read_tuple, np.uint : read_int}
 404
 405 def write_type(type, data):
 406    """Writes a formatted string from a value of a specified type.
 407
 408    Args:
 409       type: The data type of the value.
 410       data: The value to be read in.
 411
 412    Raises:
 413       TypeError: Raised if it tries to write from a data type that has not been
 414          implemented.
 415
 416    Returns:
 417       A formatted string.
 418    """
 419
 420    if not type in writetype_funcs:
 421       raise TypeError("Conversion not available for given type")
 422    return writetype_funcs[type](data)
 423
 424 def write_list(data, delims="[]"):
 425    """Writes a formatted string from a list.
 426
 427    The format of the output is as for a standard python list,
 428    [list[0], list[1],..., list[n]]. Note the space after the commas, and the
 429    use of square brackets.
 430
 431    Args:
 432       data: The value to be read in.
 433       delims: An optional string of two characters giving the first and last
 434          character to be printed. Defaults to "[]".
 435
 436    Returns:
 437       A formatted string.
 438    """
 439
 440    rstr = delims[0]
 441
 442    for v in data:
 443       rstr += str(v) + ", "
 444
 445    rstr = rstr.rstrip(", ")
 446    rstr += delims[1]
 447    return rstr
 448
 449 def write_tuple(data):
 450    """Writes a formatted string from a tuple.
 451
 452    The format of the output is as for a standard python tuple,
 453    (tuple[0], tuple[1],..., tuple[n]). Note the space after the commas, and the
 454    use of brackets.
 455
 456    Args:
 457       data: The value to be read in.
 458
 459    Returns:
 460       A formatted string.
 461    """
 462
 463    return write_list(data, delims="()")
 464
 465 def write_float(data):
 466    """Writes a formatted string from a float.
 467
 468    Floats are printed out in exponential format, to 8 decimal places and
 469    filling up any spaces under 16 not used with spaces.
 470
 471    For example 1.0 --> '  1.00000000e+00'
 472
 473    Args:
 474       data: The value to be read in.
 475
 476    Returns:
 477       A formatted string.
 478    """
 479
 480    return "%16.8e" % (data)
 481
 482 def write_bool(data):
 483    """Writes a formatted string from a float.
 484
 485    Booleans are printed as a string of either ' true' or 'false'. Note that
 486    both are printed out as exactly 5 characters.
 487
 488    Args:
 489       data: The value to be read in.
 490
 491    Returns:
 492       A formatted string.
 493    """
 494
 495    return "%5.5s" % (str(data))
 496
 497 def write_dict(data, delims="{}"):
 498    """Writes a formatted string from a dictionary.
 499
 500    The format of the output is as for a standard python dictionary,
 501    {keyword[0]: arg[0], keyword[1]: arg[1],..., keyword[n]: arg[n]}. Note the
 502    space after the commas, and the use of curly brackets.
 503
 504    Args:
 505       data: The value to be read in.
 506       delims: An optional string of two characters giving the first and last
 507          character to be printed. Defaults to "{}".
 508
 509    Returns:
 510       A formatted string.
 511    """
 512
 513    rstr = delims[0]
 514    for v in data:
 515       rstr += str(v) + ": " + str(data[v]) + ", "
 516    rstr = rstr.strip(", ")
 517    rstr += delims[1]
 518    return rstr
 519
 520 writetype_funcs = {float: write_float, dict: write_dict, int: str, bool: write_bool, str: string.strip, tuple: write_tuple, np.uint : str}