Restructure code: split convert() into extract_data() and create_tree(); write corpus to stdout
Change-Id: I430313b84efa873323f82756d53fe8b7a03ff9fe
diff --git a/bunc2tei.py b/bunc2tei.py
index 055f45c..c142567 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -1,127 +1,101 @@
-import os, sys
+import sys
import xml.etree.ElementTree as ET
-from xml.dom import minidom
-from lxml import etree
-from io import StringIO
def main():
- # Create corpus structure from string and save into file
- corpus = "<teiCorpus>\n</teiCorpus>"
- origRoot = ET.fromstring(corpus)
- corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
- if not os.path.exists('input'):
- os.makedirs("input")
- if not os.path.exists('output'):
- os.makedirs("output")
- with open("input/tree_structure.xml", "w") as f:
- f.write(corpusStr)
+ corpusRoot = ET.Element("teiCorpus")
- # Parse corpus tree
- corpusTree = ET.parse("input/tree_structure.xml")
- corpusRoot = corpusTree.getroot()
-
- # Process documents and append to corpus tree
for j in range(1, len(sys.argv)):
try:
- currentTree = convert(j-1, sys.argv[j])
- currentRoot = currentTree.getroot()
- #print(currentRoot.tag)
+ doc_data = extract_data(sys.argv[j])
+ doc_tree = create_tree(doc_data, j-1)
+ currentRoot = doc_tree.getroot()
corpusRoot.append(currentRoot)
except:
- print(sys.argv[j])
+ print("Warning: could not parse file: " + sys.argv[j], file=sys.stderr)
continue
- # Indent and save tree
+ corpusTree = ET.ElementTree(corpusRoot)
ET.indent(corpusTree, " ")
- corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
+ corpusTree.write(sys.stdout, encoding='unicode')
-def convert(j, file):
- # Parse document tree and get root
+def extract_data(file):
+ ''' Parses an xml file and saves the metadata and texts into a dictionary that is returned.
+ The dictionary is of the following form:
+ data = {textnumber: {'title': ..., 'url': ..., 'author': ..., 'date': ..., 'time': ..., 'text': [<p> elements]}}
+ '''
+
tree = ET.parse(file)
root = tree.getroot()
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
-
- # Store metadata and texts in lists
- titles = root.findall(".//*[@type='title']")
- #domains = root.findall(".//*[@type='domain']")
- pageURLs = root.findall(".//*[@type='pageURL']")
- #ids = root.findall(".//*[@type='id']")
- #mainImageURLs = root.findall(".//*[@type='mainImageURL']")
- #mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
- #mainImageSources = root.findall(".//*[@type='mainImageSources']")
- authors = root.findall(".//*[@type='authors']")
- #authorURLs = root.findall(".//*[@type='authorURLs']")
- #categories = root.findall(".//*[@type='category']")
- #subCategories = root.findall(".//*[@type='subCategory']")
- #tags = root.findall(".//*[@type='tags']")
- datesPublished = root.findall(".//*[@type='datePublished']")
- timesPublished = root.findall(".//*[@type='timePublished']")
- #datesModified = root.findall(".//*[@type='dateModified']")
- #timesModified = root.findall(".//*[@type='timeModified']")
- #mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
- #mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
- #mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
- texts = []
- # Count text elements and remove metadata
- number_of_texts = 0
- for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
- number_of_texts+=1
- for body in text:
- for div1 in body:
- for div2 in div1:
- for div3 in div2:
- if div3.get('type') == "metadata":
- div2.remove(div3)
+ data = {}
- texts.append(div2)
+ for i, text in enumerate(root.iter("{http://www.tei-c.org/ns/1.0}text")):
+ data[i] = {}
+ data[i]['title'] = text.get('title')
+ data[i]['url'] = text.get('url')
+ data[i]['author'] = text.get('author')
+ data[i]['date'] = text.get('date').split(' ')[0]
+ data[i]['time'] = text.get('date').split(' ')[1]
- # Remove all elements from root
- for elem in root.findall("*"):
- root.remove(elem)
+ textelem = text.find(".{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div/"\
+ "{http://www.tei-c.org/ns/1.0}div")
+
+ data[i]['text'] = []
+ for p in textelem.findall(".{http://www.tei-c.org/ns/1.0}p"):
+ data[i]['text'].append(p)
+
+ return data
- # Rename root
- root.tag = "teiDoc"
- # Create target structure
- for i in range(number_of_texts):
- tei = ET.SubElement(root, "TEI")
+def create_tree(data, filenumber):
+ ''' Receives a dictionary containing the data and returns an xml tree in the
+ desired format. Generates text sigles of the following format: BNC/filenumber.textnumber,
+ e.g. BNC/000.00000
+ '''
+ docRoot = ET.Element("teiDoc")
+
+ for i in range(len(data)):
+ tei = ET.SubElement(docRoot, "TEI")
teiHeader = ET.SubElement(tei, "teiHeader")
fileDesc = ET.SubElement(teiHeader, "fileDesc")
titleStmt = ET.SubElement(fileDesc, "titleStmt")
textSigle = ET.SubElement(titleStmt, "textSigle")
- textSigle.text = "BNC/" + f"{j:03}" + "." + f"{i:05}"
+ textSigle.text = "BNC/" + f"{filenumber:03}" + "." + f"{i:05}"
sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
analytic = ET.SubElement(sourceDesc, "analytic")
htitle = ET.SubElement(analytic, "h.title")
- htitle.text = titles[i].text
+ htitle.text = data[i]['title']
hauthor = ET.SubElement(analytic, "h.author")
- hauthor.text = authors[i].text
+ hauthor.text = data[i]['author']
imprint = ET.SubElement(sourceDesc, "imprint")
pubDateYear = ET.SubElement(imprint, "pubDate")
pubDateYear.set("type", "year")
- pubDateYear.text = datesPublished[i].text[0:4]
+ pubDateYear.text = data[i]['date'][0:4]
pubDateMonth = ET.SubElement(imprint, "pubDate")
pubDateMonth.set("type", "month")
- pubDateMonth.text = datesPublished[i].text[5:7]
+ pubDateMonth.text = data[i]['date'][5:7]
pubDateDay = ET.SubElement(imprint, "pubDate")
pubDateDay.set("type", "day")
- pubDateDay.text = datesPublished[i].text[8:10]
+ pubDateDay.text = data[i]['date'][8:10]
pubDateTime = ET.SubElement(imprint, "pubDate")
pubDateTime.set("type", "time")
- pubDateTime.text = timesPublished[i].text
+ pubDateTime.text = data[i]['time']
pubPlace = ET.SubElement(imprint, "pubPlace")
ref = ET.SubElement(pubPlace, "ref")
ref.set("type", "page_url")
- ref.set("target", pageURLs[i].text)
+ ref.set("target", data[i]['url'])
text = ET.SubElement(tei, "text")
body = ET.SubElement(text, "body")
- for p in texts[i]:
+ for p in data[i]['text']:
body.append(p)
- return tree
+ docTree = ET.ElementTree(docRoot)
+ ET.indent(docTree, " ")
+
+ return docTree
-main()
+main()
\ No newline at end of file