Source code for plsxml.plsxml

from __future__ import print_function
import os
import re
import ast
import zipfile
import pandas as pd
import xml.etree.cElementTree as et

__all__ = ['PLSXML']


[docs]class PLSXML(dict):
    """
    A class for parsing PLS-CADD XML files.

    Parameters
    ----------
    path : str or list, default is None
        A string or list of strings defining the ZIP or XML file path(s).
        If None, then no files will be loaded.
    tables : list, default is None
        A list of strings defining the table names to be loaded from the
        referenced XML files. If None, then all tables in the XML files
        will be parsed.
    verbose : bool, default is False
        If True, status messages will be printed during the parsing process.
        This can be useful to see the progress of long XML files.

    Examples
    --------
    >>> from plsxml import PLSXML
    >>> from plsxml.data import data_path

    To load data from the intializer:

    >>> path = data_path('galloping') # DATA_FOLDER/galloping.xml
    >>> xml = PLSXML(path)

    You can add files after the initialization via the `append` method:

    >>> xml.append(path)

    The class is a subclass of a dictionary. Once loaded, data can be accessed
    via table name > column name > row index:

    >>> xml['galloping_ellipses_summary']['minimum_clearance_galloping_ellipse_method'][0]
    'Single mid span'

    A summary of keys can be acquired via the `table_summary` method:

    >>> print(xml.table_summary())
    galloping_ellipses_summary
    	rowtext                                          None
    	structure                                        'TERM'
    	set                                              1
    	phase                                            1
    	ahead_span_length                                258.2
    	minimum_clearance_set                            1
    	minimum_clearance_phase                          2
    	minimum_clearance_galloping_ellipse_method       'Single mid span'
    	minimum_clearance_distance                       1.52
    	minimum_clearance_overlap                        0.0
    	minimum_clearance_wind_from                      'Left'
    	minimum_clearance_mid_span_sag                   12.15
    	minimum_clearance_insulator_swing_angle          0.0
    	minimum_clearance_span_swing_angle               63.1
    	minimum_clearance_major_axis_length              16.2
    	minimum_clearance_minor_axis_length              6.5
    	minimum_clearance_b_distance                     3.0

    """
    def __init__(self, path=None, tables=None, verbose=False):
        self.verbose = verbose

        if path is not None:
            if type(path) == str:
                path = [path]
            for x in path:
                self.append(x, tables)

[docs]    def append(self, path, tables=None):
        """
        Parses the input file into the class dictionary. If tables is None,
        all tables will be loaded. Otherwise, pass a list of the specific
        table names to be parsed.

        Parameters
        ----------
        path : str
            A string defining the XML file path.
        tables : list, default is None
            A list of strings defining the table names to be loaded from the
            referenced XML file. If None, then all tables in the XML file
            will be parsed.
        """
        def is_xml(p):
            fname, ext = os.path.splitext(p)
            return ext in valid_ext and not excl_ext.search(fname)

        _print = self._print_func()
        valid_ext = {'.xml'} # Valid extensions
        excl_ext = re.compile('__MACOSX|\.') # Excluded regex expressions

        if tables is not None:
            if isinstance(tables, str):
                tables = {tables}
            else:
                tables = set(tables)

        # Zipfile
        if zipfile.is_zipfile(path):
            with zipfile.ZipFile(path, 'r') as zf:
                for x in zf.namelist():
                    if is_xml(x):
                        with zf.open(x, 'r') as fh:
                            _print('Parsing:', path, x)
                            self._load_xml(fh, tables)

        # XML
        elif os.path.isfile(path) and is_xml(path):
            with open(path, 'rb') as fh:
                _print('Parsing:', path)
                self._load_xml(fh, tables)

        else:
            print('Append Skipped:: {!r} is not a valid path.'.format(path))

    def _print_func(self):
        """
        If the `verbose` property is True, returns the standard print function.
        Otherwise, returns a function that does nothing.
        """
        def no_print(*args):
            return

        if self.verbose:
            return print
        return no_print

    def _load_xml(self, source, tables):
        """
        Loads the input file into the class dictionary. If tables is None,
        all tables will be loaded. Otherwise, pass a list of the specific
        table names to be parsed.

        Parameters
        ----------
        source : file handle
            A file handle for the XML file.
        tables : list, default is None
            A list of strings defining the table names to be loaded from the
            referenced XML file. If None, then all tables in the XML file
            will be parsed.
        """
        def convert_type(data):
            """Converts data into appropriate type if it can."""
            try:
                return ast.literal_eval(data)
            except:
                return data

        tablesdict = {}
        table = obj = titledetail = None
        excl_tags = {'source_file'}
        _print = self._print_func()

        for event, e in et.iterparse(source, events=('start', 'end')):
            # Start event
            if event == 'start':

                if e.tag == 'table':
                    # Check if table is to be loaded and perform setup if it is
                    if (tables is None) or (e.attrib['tagname'] in tables):
                        table = e.attrib['tagname']
                        titledetail = e.attrib['titledetail']
                        _print('Loading:', table)

                        if table not in tablesdict:
                            tablesdict[table] = []

                elif (table is not None) and (obj is None) and (e.tag not in excl_tags):
                    # Create new dictionary
                    obj = e.tag
                    odict = {}

                    if titledetail not in {None, ''}:
                        # Title details are included in some POLE and TOWER reports
                        odict['titledetail'] = convert_type(titledetail)

            # End event
            elif event == 'end':

                if e.tag == 'table':
                    table = obj = titledetail = None

                elif (table is not None) and (e.tag == obj):
                    tablesdict[table].append(odict)
                    obj = None

                elif obj is not None:
                    odict[e.tag] = convert_type(e.text)

                e.clear()

        for k in list(tablesdict):
            d = tablesdict.pop(k)

            if k in self:
                self[k].append(d, sort=False)
                _print('Dropping Duplicates:', k)
                self[k].drop_duplicates(inplace=True)

            else:
                self[k] = pd.DataFrame.from_dict(d)
                # Create new dataframe with columns in order.
                self[k] = self[k][list(d[0])].copy()

[docs]    def table_summary(self):
        """
        Returns a string of all parsed tables, keys, and example values.
        """
        s = ''

        for table in sorted(self):
            s += '\n{:s}\n'.format(table)

            for key in self[table]:
                v = self[table][key][0]
                s += '\t{!s:60}\t{}\n'.format(key, v)

        return s
Quick search

Source code for plsxml.plsxml