restructured code

Change-Id: I430313b84efa873323f82756d53fe8b7a03ff9fe

commit: 95b8f925c8b2e05d9a298dc7120f228eff9e69b7 [log] [tgz]
author: lora-sp <lora.spassova@swhk.ids-mannheim.de> Thu Apr 06 11:28:19 2023 +0200
committer: lora-sp <lora.spassova@swhk.ids-mannheim.de> Thu Apr 06 11:28:19 2023 +0200
tree: 3af594ac77b263c2edef25069949a48bf958d45a
parent: 52f1a295b8c4f92ab8544631868dc9ffeccdd7b3 [diff]
diff --git a/bunc2tei.py b/bunc2tei.py
index 055f45c..c142567 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py

@@ -1,127 +1,101 @@
-import os, sys
+import sys
 import xml.etree.ElementTree as ET
-from xml.dom import minidom
-from lxml import etree
-from io import StringIO
 
 
 def main():
-    # Create corpus structure from string and save into file 
-    corpus = "<teiCorpus>\n</teiCorpus>"
-    origRoot = ET.fromstring(corpus)
-    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent="  ")
-    if not os.path.exists('input'):
-        os.makedirs("input")
-    if not os.path.exists('output'):
-        os.makedirs("output")
-    with open("input/tree_structure.xml", "w") as f:
-        f.write(corpusStr)
+    corpusRoot = ET.Element("teiCorpus")
 
-    # Parse corpus tree
-    corpusTree = ET.parse("input/tree_structure.xml")
-    corpusRoot = corpusTree.getroot()
-
-    # Process documents and append to corpus tree
     for j in range(1, len(sys.argv)):
         try:
-            currentTree = convert(j-1, sys.argv[j])
-            currentRoot = currentTree.getroot()
-            #print(currentRoot.tag)
+            doc_data = extract_data(sys.argv[j])
+            doc_tree = create_tree(doc_data, j-1)
+            currentRoot = doc_tree.getroot()
             corpusRoot.append(currentRoot)
         except:
-            print(sys.argv[j])
+            print("Warning: could not parse file: " + sys.argv[j], file=sys.stderr) 
             continue
 
-    # Indent and save tree
+    corpusTree = ET.ElementTree(corpusRoot)
     ET.indent(corpusTree, "  ")
-    corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
+    corpusTree.write(sys.stdout, encoding='unicode')
 
 
-def convert(j, file):
-    # Parse document tree and get root
+def extract_data(file):
+    ''' Parses an xml file and saves the metadata and texts into a dictionary that is returned.
+    The dictionary is keyed by the index of each text within the file and has the following form:
+    data = {textnumber: {'title': ..., 'url': ..., 'author': ..., 'date': ..., 'time': ..., 'text': [p, ...]}}
+    '''
+
     tree = ET.parse(file)
     root = tree.getroot()
     ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
- 
-    # Store metadata and texts in lists
-    titles = root.findall(".//*[@type='title']")
-    #domains = root.findall(".//*[@type='domain']")
-    pageURLs = root.findall(".//*[@type='pageURL']")
-    #ids = root.findall(".//*[@type='id']")
-    #mainImageURLs = root.findall(".//*[@type='mainImageURL']")
-    #mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
-    #mainImageSources = root.findall(".//*[@type='mainImageSources']")
-    authors = root.findall(".//*[@type='authors']")
-    #authorURLs = root.findall(".//*[@type='authorURLs']")
-    #categories = root.findall(".//*[@type='category']")
-    #subCategories = root.findall(".//*[@type='subCategory']")
-    #tags = root.findall(".//*[@type='tags']")
-    datesPublished = root.findall(".//*[@type='datePublished']")
-    timesPublished = root.findall(".//*[@type='timePublished']")
-    #datesModified = root.findall(".//*[@type='dateModified']")
-    #timesModified = root.findall(".//*[@type='timeModified']")
-    #mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
-    #mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
-    #mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
-    texts = []
 
-    # Count text elements and remove metadata
-    number_of_texts = 0
-    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
-        number_of_texts+=1
-        for body in text:
-            for div1 in body:
-                for div2 in div1:
-                    for div3 in div2:
-                        if div3.get('type') == "metadata":
-                            div2.remove(div3)
+    data = {}
 
-                        texts.append(div2)
+    for i, text in enumerate(root.iter("{http://www.tei-c.org/ns/1.0}text")):
+        data[i] = {}
+        data[i]['title'] = text.get('title')
+        data[i]['url'] = text.get('url')
+        data[i]['author'] = text.get('author')
+        data[i]['date'] = text.get('date').split(' ')[0]
+        data[i]['time'] = text.get('date').split(' ')[1]
 
-    # Remove all elements from root
-    for elem in root.findall("*"):
-        root.remove(elem)
+        textelem = text.find(".{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div/"\
+                             "{http://www.tei-c.org/ns/1.0}div")
+    
+        data[i]['text'] = []
+        for p in textelem.findall(".{http://www.tei-c.org/ns/1.0}p"):
+            data[i]['text'].append(p)
+    
+    return data
 
-    # Rename root
-    root.tag = "teiDoc"
 
-    # Create target structure
-    for i in range(number_of_texts):
-        tei = ET.SubElement(root, "TEI")
+def create_tree(data, filenumber):
+    ''' Receives a dictionary containing the data and returns an xml tree in the 
+    desired format. Generates text sigles of the following format: BNC/filenumber.textnumber,
+    e.g. BNC/000.00000
+    '''
+    docRoot = ET.Element("teiDoc")
+
+    for i in range(len(data)):
+        tei = ET.SubElement(docRoot, "TEI")
         teiHeader = ET.SubElement(tei, "teiHeader")
         fileDesc = ET.SubElement(teiHeader, "fileDesc")
         titleStmt = ET.SubElement(fileDesc, "titleStmt")
         textSigle = ET.SubElement(titleStmt, "textSigle")
-        textSigle.text = "BNC/" + f"{j:03}" + "." + f"{i:05}"
+        textSigle.text = "BNC/" + f"{filenumber:03}" + "." + f"{i:05}"
         sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
         analytic = ET.SubElement(sourceDesc, "analytic")
         htitle = ET.SubElement(analytic, "h.title")
-        htitle.text = titles[i].text
+        htitle.text = data[i]['title']
         hauthor = ET.SubElement(analytic, "h.author")
-        hauthor.text = authors[i].text
+        hauthor.text = data[i]['author']
         imprint = ET.SubElement(sourceDesc, "imprint")
         pubDateYear = ET.SubElement(imprint, "pubDate")
         pubDateYear.set("type", "year")
-        pubDateYear.text = datesPublished[i].text[0:4]
+        pubDateYear.text = data[i]['date'][0:4]
         pubDateMonth = ET.SubElement(imprint, "pubDate")
         pubDateMonth.set("type", "month")
-        pubDateMonth.text = datesPublished[i].text[5:7]
+        pubDateMonth.text = data[i]['date'][5:7]
         pubDateDay = ET.SubElement(imprint, "pubDate")
         pubDateDay.set("type", "day")
-        pubDateDay.text = datesPublished[i].text[8:10]
+        pubDateDay.text = data[i]['date'][8:10]
         pubDateTime = ET.SubElement(imprint, "pubDate")
         pubDateTime.set("type", "time")
-        pubDateTime.text = timesPublished[i].text
+        pubDateTime.text = data[i]['time']
         pubPlace = ET.SubElement(imprint, "pubPlace")
         ref = ET.SubElement(pubPlace, "ref")
         ref.set("type", "page_url")
-        ref.set("target", pageURLs[i].text)
+        ref.set("target", data[i]['url'])
         text = ET.SubElement(tei, "text")
         body = ET.SubElement(text, "body")
-        for p in texts[i]:
+        for p in data[i]['text']:
             body.append(p)
 
-    return tree
+    docTree = ET.ElementTree(docRoot)
+    ET.indent(docTree, "  ")
+
+    return docTree
 
 
-main()
+main()
\ No newline at end of file
commit	95b8f925c8b2e05d9a298dc7120f228eff9e69b7	[log] [tgz]
author	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Thu Apr 06 11:28:19 2023 +0200
committer	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Thu Apr 06 11:28:19 2023 +0200
tree	3af594ac77b263c2edef25069949a48bf958d45a
parent	52f1a295b8c4f92ab8544631868dc9ffeccdd7b3 [diff]