import xml.etree.ElementTree as ET #Element Tree XML parsing library
import re #Regex library
import yaml
from yaml import load, dump
try:
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
from yaml import Loader, Dumper
def dfs(root, columns):
    """Collect every leaf element under ``root`` into ``columns``.

    Walks the tree depth-first in document order and appends each element
    that has no children. The Element objects themselves are appended, not
    their tag strings.

    Args:
        root: ``xml.etree.ElementTree.Element`` to start the walk from.
        columns: List that is extended in place with the leaf elements.

    Returns:
        None. Results are delivered through ``columns``.
    """
    # Element.getchildren() is deprecated and was removed in Python 3.9;
    # len(element) and direct iteration are the supported replacements.
    if len(root) == 0:
        columns.append(root)
        return
    for child in root:
        dfs(child, columns)
def findInstanceTag(root):
    """Return the first element whose tag contains ``'instance'``.

    Checks ``root`` itself, then searches its descendants depth-first in
    document order. This element marks the start of the data form.

    Args:
        root: ``xml.etree.ElementTree.Element`` to search from.

    Returns:
        The matching element, or ``None`` when no tag contains 'instance'.
    """
    # Substring test so namespaced tags such as '{uri}instance' match too.
    if 'instance' in root.tag:
        return root
    # Recurse into the child object itself rather than root.find(child.tag):
    # find() only ever returns the FIRST child with a given tag, so later
    # siblings sharing that tag would never be searched. Direct iteration
    # also avoids Element.getchildren(), which was removed in Python 3.9.
    for child in root:
        found = findInstanceTag(child)
        if found is not None:
            return found
    return None
def cleanColumnNames(columns):
    """Strip the ``{namespace}`` prefix from each element's tag.

    Args:
        columns: Iterable of objects exposing a string ``tag`` attribute
            (e.g. ElementTree elements).

    Returns:
        A new list of tag strings, in input order, with every
        ``{...}`` section removed.
    """
    namespace_pattern = re.compile("({.+})")
    stripped = []
    for element in columns:
        stripped.append(namespace_pattern.sub('', element.tag))
    return stripped
def xmlParse(fileName):
    """Extract the column names defined by an XML form file.

    Parses ``fileName``, locates the element whose tag contains 'instance'
    (the start of the data form), collects the leaf elements beneath it and
    returns their tags with any ``{namespace}`` prefix removed.

    Args:
        fileName: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        List of cleaned leaf tag names.

    Raises:
        ValueError: If the document contains no 'instance' element.
    """
    root = ET.parse(fileName).getroot()  # root of the tree, the <html> tag
    form = findInstanceTag(root)  # start of the data form
    if form is None:
        # Without this guard the failure surfaces as an opaque
        # AttributeError on None inside dfs().
        raise ValueError(f"No 'instance' element found in {fileName!r}")
    columns = []
    dfs(form, columns)  # fills `columns` with the leaf elements
    return cleanColumnNames(columns)
# Example run: extract the column names from the death report form.
# The return value is not stored.
xmlParse('xml-files/death_report.xml')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:17: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:6: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.
def yaml_creator(filename, path = "xml-files/", destination="yaml-files/"):
    """Write a YAML skeleton listing the columns of an XML form.

    Parses ``path + filename`` with ``xmlParse`` and writes one
    ``{'column': <name>, 'type': None}`` entry per column to
    ``destination + <filename without '.xml'> + '.yaml'``. An existing
    output file is never overwritten.

    Args:
        filename: Name of the XML file; must end in '.xml'.
        path: Prefix prepended verbatim to ``filename`` to locate the input.
        destination: Prefix prepended verbatim to the output file name.

    Returns:
        None.

    Raises:
        AssertionError: If ``filename`` does not end in '.xml'.
        MemoryError: If the output file already exists.
    """
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    # The exception type is unchanged for existing callers.
    if not filename.endswith('.xml'):
        raise AssertionError(f"Expected a '.xml' file name, got {filename!r}")

    keyvalue_pairs = [{'column': name, 'type': None} for name in xmlParse(path + filename)]
    output = yaml.dump(keyvalue_pairs, explicit_start=True, default_flow_style=False)

    target = destination + filename[:-4] + ".yaml"
    try:
        # Mode "x" creates the file atomically and fails if it already
        # exists, so there is no gap between the existence check and the
        # write; `with` closes the handle even if the write fails.
        with open(target, "x") as file_object:
            file_object.write(output)
    except FileExistsError as err:
        # NOTE(review): MemoryError is kept only for backward compatibility;
        # FileExistsError would be the conventional type here.
        raise MemoryError(
            f"File {target} already exists. To avoid overwriting, aborting process."
        ) from err
# Example run: generate the YAML skeleton for death_report.xml using the
# default "xml-files/" input and "yaml-files/" output prefixes.
yaml_creator("death_report.xml")
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:17: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:6: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: This method will be removed in future versions. Use 'list(elem)' or iteration over elem instead.