Adapt to ICC-ENG

First Version 2023-04-27

Change-Id: I7b05bdadcf23f3416c2488ad3a86283676d855d9
diff --git a/eng2tei.py b/eng2tei.py
new file mode 100644
index 0000000..be20615
--- /dev/null
+++ b/eng2tei.py
@@ -0,0 +1,101 @@
+import os, sys
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
+
+
+def main():
+    """Merge the TEI files named on the command line into one corpus file.
+
+    Writes a <teiCorpus> skeleton to input/tree_structure.xml, appends the
+    <teiDoc> built by convert() for each argument and saves the result as
+    output/corpus.p5.xml. Failing files are reported on stderr and skipped.
+    """
+    # Create corpus structure from string and save into file
+    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
+    pretty = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent="  ")
+    os.makedirs("input", exist_ok=True)
+    os.makedirs("output", exist_ok=True)
+    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
+        f.write(pretty)
+
+    # Parse corpus tree
+    corpusTree = ET.parse("input/tree_structure.xml")
+    corpusRoot = corpusTree.getroot()
+
+    # Process documents and append to corpus tree
+    for j, path in enumerate(sys.argv[1:]):
+        try:
+            corpusRoot.append(convert(j, path).getroot())
+        except Exception as err:  # keep going, but say why the file failed
+            print(f"{path}: {err!r}", file=sys.stderr)
+
+    # Indent and save tree
+    ET.indent(corpusTree, "  ")
+    corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
+
+
+def convert(j, file):
+    """Re-wrap one TEI file as a <teiDoc> holding one <TEI> per body <div>.
+
+    Args:
+        j: Document number; zero-padded into the NNN part of each sigle.
+        file: Source TEI document, passed to ET.parse().
+
+    Returns:
+        The parsed ElementTree, whose root is renamed to "teiDoc" and whose
+        former children are replaced by the generated <TEI> elements.
+
+    Raises:
+        ValueError: If there are <div>s to wrap but the <bibl> title,
+            pubPlace, author or date, or any "type" attribute, is missing.
+    """
+    # Parse document tree and get root
+    tree = ET.parse(file)
+    root = tree.getroot()
+    tei_ns = "http://www.tei-c.org/ns/1.0"
+    ET.register_namespace("", tei_ns)  # write TEI elements without a prefix
+    q = "{" + tei_ns + "}"  # Clark-notation prefix for qualified tag names
+
+    # Store metadata and texts
+    found = {
+        "title": root.find(f".//*{q}bibl/{q}title"),
+        "pubPlace": root.find(f".//*{q}bibl/{q}pubPlace"),
+        "author": root.find(f".//*{q}bibl/{q}author"),
+        "date": root.find(f".//*{q}bibl/{q}date"),
+        "@type": root.find(".//*[@type]"),
+    }
+    texts = root.findall(f".//*{q}body/{q}div")
+    missing = [name for name, elem in found.items() if elem is None]
+    if texts and missing:
+        raise ValueError(f"{file}: missing " + ", ".join(missing))
+
+    # Remove all elements from root and rename it
+    for elem in root.findall("*"):
+        root.remove(elem)
+    root.tag = "teiDoc"
+
+    # Create target structure: one <TEI> per <div>. Looping over the divs
+    # themselves (not a separate count) keeps index and element in step.
+    for i, div in enumerate(texts):
+        tei = ET.SubElement(root, "TEI")
+        fileDesc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
+        titleStmt = ET.SubElement(fileDesc, "titleStmt")
+        ET.SubElement(titleStmt, "textSigle").text = f"EN/{j:03}.{i:05}"
+        ET.SubElement(titleStmt, "title").text = found["title"].text
+        ET.SubElement(titleStmt, "domain").text = found["@type"].get("type")
+        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
+        analytic = ET.SubElement(sourceDesc, "analytic")
+        ET.SubElement(analytic, "h.author").text = found["author"].text
+        imprint = ET.SubElement(sourceDesc, "imprint")
+        ET.SubElement(imprint, "pubDate", type="year").text = found["date"].text
+        # Month and day placeholders stay empty; no value is assigned to them
+        ET.SubElement(imprint, "pubDate", type="month")
+        ET.SubElement(imprint, "pubDate", type="day")
+        idno = ET.SubElement(ET.SubElement(imprint, "pubPlace"), "idno", type="URI")
+        idno.text = found["pubPlace"].text
+        ET.SubElement(tei, "text").append(div)
+
+    return tree
+
+
+if __name__ == "__main__": main()  # run only as a script, never on import