Adapt to ICC-ENG
First Version 2023-04-27
Change-Id: I7b05bdadcf23f3416c2488ad3a86283676d855d9
diff --git a/eng2tei.py b/eng2tei.py
new file mode 100644
index 0000000..be20615
--- /dev/null
+++ b/eng2tei.py
@@ -0,0 +1,101 @@
+import os, sys
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
+
+
+def main():
+    """Convert the TEI files named in sys.argv[1:] into output/corpus.p5.xml.
+
+    An empty <teiCorpus> skeleton is written to input/tree_structure.xml and
+    parsed back; the root returned by convert() for each file is appended to
+    it. A file that cannot be converted has its name printed to stdout and the
+    reason printed to stderr, then it is skipped; the other files still run.
+    """
+    os.makedirs("input", exist_ok=True)
+    os.makedirs("output", exist_ok=True)
+
+    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
+    pretty = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent=" ")
+    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
+        f.write(pretty)
+    corpusTree = ET.parse("input/tree_structure.xml")
+    corpusRoot = corpusTree.getroot()
+
+    # The zero-based command-line position is passed on as the document number.
+    for index, path in enumerate(sys.argv[1:]):
+        try:
+            corpusRoot.append(convert(index, path).getroot())
+        except Exception as err:
+            # Exception, not a bare except, so Ctrl-C and sys.exit() still abort.
+            print(path)
+            print(f"  skipped: {type(err).__name__}: {err}", file=sys.stderr)
+
+    ET.indent(corpusTree, " ")
+    corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
+
+
+def convert(j, file):
+    """Turn one TEI document into a <teiDoc> with one <TEI> per body <div>.
+
+    Every generated <TEI> gets a header built from the document-wide metadata
+    (first bibl/title, author, date and pubPlace, plus the "type" attribute of
+    the first element that has one) and a <text> wrapping the original <div>.
+
+    Args:
+        j: Zero-based document number, the NNN part of the "EN/NNN.MMMMM" sigle.
+        file: Path of the TEI XML document to parse.
+
+    Returns:
+        The parsed ElementTree; its root is renamed "teiDoc" and its former
+        children are replaced by the generated <TEI> elements.
+
+    Raises:
+        ValueError: A <div> was found but one of the metadata elements listed
+            above is missing from the document.
+    """
+    ns = "{http://www.tei-c.org/ns/1.0}"
+    tree = ET.parse(file)
+    root = tree.getroot()
+    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+
+    # Document-wide metadata: the first match of each is reused for every text.
+    meta = {name: root.find(f".//*{ns}bibl/{ns}{name}")
+            for name in ("title", "pubPlace", "author", "date")}
+    meta["@type"] = root.find(".//*[@type]")
+    texts = root.findall(f".//*{ns}body/{ns}div")
+    missing = [name for name, elem in meta.items() if elem is None]
+    if texts and missing:
+        raise ValueError(f"{file}: missing metadata: {', '.join(missing)}")
+
+    # Empty the root and rename it; the collected <div>s are re-attached below.
+    for elem in root.findall("*"):
+        root.remove(elem)
+    root.tag = "teiDoc"
+
+    # Iterate the <div> list itself, so the number of texts always matches it.
+    for i, div in enumerate(texts):
+        tei = ET.SubElement(root, "TEI")
+        fileDesc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
+        titleStmt = ET.SubElement(fileDesc, "titleStmt")
+        ET.SubElement(titleStmt, "textSigle").text = f"EN/{j:03}.{i:05}"
+        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
+        analytic = ET.SubElement(sourceDesc, "analytic")
+        ET.SubElement(titleStmt, "title").text = meta["title"].text
+        ET.SubElement(analytic, "h.author").text = meta["author"].text
+        imprint = ET.SubElement(sourceDesc, "imprint")
+        # month/day stay empty placeholders; only "year" gets the <date> text.
+        for part in ("year", "month", "day"):
+            pubDate = ET.SubElement(imprint, "pubDate")
+            pubDate.set("type", part)
+            if part == "year":
+                pubDate.text = meta["date"].text
+        idno = ET.SubElement(ET.SubElement(imprint, "pubPlace"), "idno")
+        idno.set("type", "URI")
+        idno.text = meta["pubPlace"].text
+        ET.SubElement(titleStmt, "domain").text = meta["@type"].get("type")
+        ET.SubElement(tei, "text").append(div)
+
+    return tree
+
+
+if __name__ == "__main__": main()  # run as a script only; importing must not start a conversion