Revisions to Parsing Geographic Data From XML Files in Python

Add some more explanation and fix typo

Source Link

edited Apr 24, 2017 at 18:53

4.3k
12
24
36

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tag's value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

Dict keys are mapped to same names used in Entry.__init__

The parameter names for __init__ match the keys used in the data_point dict. This mapping was done via the second element in the tuple in the converters dict. This makes it easy to map things cleanly. This allows return cls(**data_point) to directly create a class instance. Note the fact that you can specify non-keyword parameters with keywords, which is how the **data_point can expand into the 5 parameters to init the Entry class

Since I have completed removed the lists of 'lat', 'lng' etc, this is less relevant, but I thought I would show zip this way:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distance_from(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distance_from(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

    if 'n' !=== input('Parse another file? y/n: '):
        break

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tag's value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

Since I have completed removed the lists of 'lat', 'lng' etc this is less relevant, but I thought I would show zip this way:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distance_from(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distance_from(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

    if 'n' != input('Parse another file? y/n: '):
        break

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tag's value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

Dict keys are mapped to same names used in Entry.__init__

The parameter names for __init__ match the keys used in the data_point dict. This mapping was done via the second element in the tuple in the converters dict. This makes it easy to map things cleanly. This allows return cls(**data_point) to directly create a class instance. Note the fact that you can specify non-keyword parameters with keywords, which is how the **data_point can expand into the 5 parameters to init the Entry class

Since I have completed removed the lists of 'lat', 'lng' etc, this is less relevant, but I thought I would show zip this way:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distance_from(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distance_from(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

    if 'n' == input('Parse another file? y/n: '):
        break

Minor cleanup

Source Link

edited Apr 23, 2017 at 22:52

Stephen Rauch

4.3k
12
24
36

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tagstag's value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

If you are passing a class instance to a static method of that class, you are probably doing it wrong. static Static methods are intended for code which is closely aligned with a class but which it not directly associated with an instance.

def distancecalcdistance_from(self, other):

In distancecalcdistance_from I converted:

Since lat, lng as a pair is a position, making that relationship explicit by using a property, better documents the relationship.

By storing the timestamp in the class as a datetime valuesvalue, I was able to convert:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distancecalcdistance_from(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distancecalcdistance_from(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

        prev = entry

    if 'n' != input('Parse another file? y/n: '):
        break

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tags value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

If you are passing a class instance to a static method of that class, you are probably doing it wrong. static methods are intended for code which is closely aligned with a class but which it not directly associated with an instance.

def distancecalc(self, other):

In distancecalc I converted:

Since lat, lng as a pair is a position, making that relationship explicit by using a property better documents the relationship.

By storing the timestamp in the class as datetime values I was able to convert:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distancecalc(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distancecalc(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

        prev = entry

    if 'n' != input('Parse another file? y/n: '):
        break

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tag's value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

If you are passing a class instance to a static method of that class, you are probably doing it wrong. Static methods are intended for code which is closely aligned with a class but which it not directly associated with an instance.

def distance_from(self, other):

In distance_from I converted:

Since lat, lng as a pair is a position, making that relationship explicit by using a property, better documents the relationship.

By storing the timestamp in the class as a datetime value, I was able to convert:

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distance_from(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distance_from(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

    if 'n' != input('Parse another file? y/n: '):
        break

Source Link

answered Apr 23, 2017 at 20:58

Stephen Rauch

4.3k
12
24
36

I have recast your code, and the complete listing is below. I will discuss various elements of the changes here. If I missed explaining a change, or some concept, please leave a comment and I will see what I can do to better cover that.

Organization

First thing I did was move the parsing code to be a method of your Entry class. This parsing code seems to be very specific to this class, so let's make the relationship explicit. I also broke the parsing into parsing a single Entry Node, and the part which manages parsing the entire XML file.

Parsing

File Parse Code:

@classmethod
def from_xml(cls, xml_source):
    tree = etree.parse(xml_source)
    root = tree.getroot()
    return [cls.from_xml_node(child) for child in root[0][0][1][4]]

Some notes:

Don't prompt for user input in the parsing code.
Convert the data directly to the desired representation (a list of Entries).

Node Parse Code:

converters = dict(
    AltitudeMeters=(float, 'alt'),
    DistanceMeters=(float, 'dist'),
    LatitudeDegrees=(float, 'lat'),
    LongitudeDegrees=(float, 'lng'),
    Time=(dateutil.parser.parse, 'time'),
)

@classmethod
def from_xml_node(cls, node):
    data_point = {}
    for info in node.getiterator():
        tag = info.tag.split('}')[-1]
        if tag in cls.converters:
            converter, tag_text = cls.converters[tag]
            data_point[tag_text] = converter(info.text.strip())
    return cls(**data_point)

A few highlights:

Flattened nodes using getiterator

getiterator finds all of the nodes under a node, so allows the removal of the special if for the nested position data.
Removed need for stacked if/else by using converters dict

By using the node tag as a key into a dict, you can store the pieces needed to parse that tags value. In this case I stored a conversion function, and the desired name for the value.
Removed garbage string by more simply splitting on } and using text after the }
Convert the data directly to the desired representation.

The writing of the data into a secondary file during the XML tree traversal, added quite a bit of complexity, for no apparent gain. Instead this code stores each element of the current node into a dict. The dict is then used to init an Entry object.

Class Structure

static methods probably should not be passed a class instance

If you are passing a class instance to a static method of that class, you are probably doing it wrong. static methods are intended for code which is closely aligned with a class but which it not directly associated with an instance.

So I converted:

@staticmethod
def distancecalc(x1, x2):

To:

def distancecalc(self, other):

properties allow calculated fields to be easily treated

In distancecalc I converted:

coord_1 = (x1.pos[0], x1.pos[1])
coord_2 = (x2.pos[0], x2.pos[1])
return geopy.distance.vincenty(coord_1, coord_2).m

To:

return geopy.distance.vincenty(self.pos, other.pos).m

By providing:

@property
def pos(self):
    return self.lat, self.lng

Since lat, lng as a pair is a position, making that relationship explicit by using a property better documents the relationship.

Python features

Using python libraries

By storing the timestamp in the class as datetime values I was able to convert:

@staticmethod
def timecalc(x1, x2):
    a, b = x1.time, x2.time
    t1 = dt.datetime(a[0], a[1], a[2], a[3], a[4], int(a[5]))
    t2 = dt.datetime(b[0], b[1], b[2], b[3], b[4], int(b[5]))
    t1_decimal = float(a[5]-int(a[5]))
    t2_decimal = float(b[5]-int(b[5]))
    return (t2 - t1).total_seconds() + (t2_decimal - t1_decimal)

To:

def time_since(self, other):
    return (self.time - other.time).total_seconds()

This removed all of the date conversion and other math.

Use string formatting

By using python string formatting functionality I was able to reduce the __str__ method from 11 lines to 2

def __str__(self):
    return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
        self.time, self.lat, self.lng, self.alt, self.dist)

zip allows processing multiple lists at once

Since I have completed removed the lists of 'lat', 'lng' etc this is less relevant, but I thought I would show zip this way:

for prev, entry in zip(entries[:-1], entries[1:]):

this line recasts your print loop from:

prev = entries[0]
for entry in entries:
    ....
    prev = entry

This has the side benefit of not printing the first line of all zeros, but is mostly here to show how to use zip. zip grabs the first element of each of the lists passed to it, and presents them on the first iteration. On the second iteration it presents the second element of each list, etc. In this case I used all but the last element of the entries combined with all but the first element as two lists to get adjacent pairs for each iteration.

Recast Code

import geopy.distance
import dateutil.parser
import xml.etree.ElementTree as etree


class Entry(object):

    def __init__(self, time, lat, lng, alt, dist):
        self.time = time
        self.lat = lat
        self.lng = lng
        self.alt = alt
        self.dist = dist

    def __str__(self):
        return 'time: %s, lat: %.6f, lng: %.6f, alt: %.2f, dst:%.3f' % (
            self.time, self.lat, self.lng, self.alt, self.dist)

    def distancecalc(self, other):
        return geopy.distance.vincenty(self.pos, other.pos).m

    def time_since(self, other):
        return (self.time - other.time).total_seconds()

    @property
    def pos(self):
        return self.lat, self.lng

    converters = dict(
        AltitudeMeters=(float, 'alt'),
        DistanceMeters=(float, 'dist'),
        LatitudeDegrees=(float, 'lat'),
        LongitudeDegrees=(float, 'lng'),
        Time=(dateutil.parser.parse, 'time'),
    )

    @classmethod
    def from_xml_node(cls, node):
        data_point = {}
        for info in node.getiterator():
            tag = info.tag.split('}')[-1]
            if tag in cls.converters:
                converter, tag_text = cls.converters[tag]
                data_point[tag_text] = converter(info.text.strip())
        return cls(**data_point)

    @classmethod
    def from_xml(cls, xml_source):
        tree = etree.parse(xml_source)
        root = tree.getroot()
        return [cls.from_xml_node(child) for child in root[0][0][1][4]]


while True:
    filename = input('The file to parse: ')

    entries = Entry.from_xml(filename)

    total = 0.0
    for prev, entry in zip(entries[:-1], entries[1:]):
        dx = entry.distancecalc(prev)
        dt = entry.time_since(prev)
        total += dx
        print('Total: %.3f, Logged Dist: %.3f, dx: %.3f, dt: %.3f, '
              'Speed: %.1f' % (total, entry.dist, dx, dt, dx/dt if dt else 0))

        prev = entry

    if 'n' != input('Parse another file? y/n: '):
        break

Stack Exchange Network

Return to Answer

Organization

Parsing

Class Structure

Python features

Recast Code