commit

commit: 4201a5e1837cffd09410330dc165126db0299f27 [log] [tgz]
author: lora-sp <lora.spassova@swhk.ids-mannheim.de> Thu Mar 09 16:19:57 2023 +0100
committer: lora-sp <lora.spassova@swhk.ids-mannheim.de> Thu Mar 09 16:19:57 2023 +0100
tree: 3220dd25a1160d279ac35bd8329a2f9d073ad002
parent: ea9ccda1c361184b6a647b9462bda99c96fc9820 [diff]
diff --git a/bunc2tei.py b/bunc2tei.py
index fccc2c1..5ecff9b 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py

@@ -1,95 +1,94 @@
 import os
 import xml.etree.ElementTree as ET
 
-# Path to documents
-path = "/home/spassova/BGCorpusExamples"
-files = os.listdir(path)
+def main():
+    corpus = "<teiCorpus></teiCorpus>"
+    root = ET.fromstring(corpus)
+    path = "/home/spassova/BGCorpusExamples/"
+    files = os.listdir(path)
+    for j in range(len(files)):
+        processing(path + files[j])
+        teiDoc = ET.SubElement(root, "teiDoc")
+    
 
+def processing(file):
 # Parse tree and get root
-tree = ET.parse(path + "/" + files[0])
-root = tree.getroot()
-
+    tree = ET.parse(file)
+    root = tree.getroot()
 # Store metadata and texts in lists
-titles = root.findall(".//*[@type='title']")
-domains = root.findall(".//*[@type='domain']")
-pageURLs = root.findall(".//*[@type='pageURL']")
-ids = root.findall(".//*[@type='id']")
-mainImageURLs = root.findall(".//*[@type='mainImageURL']")
-mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
-mainImageSources = root.findall(".//*[@type='mainImageSources']")
-authors = root.findall(".//*[@type='authors']")
-authorURLs = root.findall(".//*[@type='authorURLs']")
-categories = root.findall(".//*[@type='category']")
-subCategories = root.findall(".//*[@type='subCategory']")
-tags = root.findall(".//*[@type='tags']")
-datesPublished = root.findall(".//*[@type='datePublished']")
-timesPublished = root.findall(".//*[@type='timePublished']")
-datesModified = root.findall(".//*[@type='dateModified']")
-timesModified = root.findall(".//*[@type='timeModified']")
-mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
-mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
-mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
-texts = []
-
+    titles = root.findall(".//*[@type='title']")
+    domains = root.findall(".//*[@type='domain']")
+    pageURLs = root.findall(".//*[@type='pageURL']")
+    ids = root.findall(".//*[@type='id']")
+    mainImageURLs = root.findall(".//*[@type='mainImageURL']")
+    mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
+    mainImageSources = root.findall(".//*[@type='mainImageSources']")
+    authors = root.findall(".//*[@type='authors']")
+    authorURLs = root.findall(".//*[@type='authorURLs']")
+    categories = root.findall(".//*[@type='category']")
+    subCategories = root.findall(".//*[@type='subCategory']")
+    tags = root.findall(".//*[@type='tags']")
+    datesPublished = root.findall(".//*[@type='datePublished']")
+    timesPublished = root.findall(".//*[@type='timePublished']")
+    datesModified = root.findall(".//*[@type='dateModified']")
+    timesModified = root.findall(".//*[@type='timeModified']")
+    mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
+    mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
+    mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
+    texts = []
 # Count text elements and remove metadata
-number_of_texts = 0
-for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
-    number_of_texts+=1
-    for body in text:
-        for div1 in body:
-            for div2 in div1:
-                for div3 in div2:
-                    if div3.get('type') == "metadata":
-                        div2.remove(div3)
+    number_of_texts = 0
+    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
+        number_of_texts+=1
+        for body in text:
+            for div1 in body:
+                for div2 in div1:
+                    for div3 in div2:
+                        if div3.get('type') == "metadata":
+                            div2.remove(div3)
 
-                    texts.append(div2)
-                    
-
+                        texts.append(div2)
 # Remove all elements from root
-for elem in root.findall("*"):
-    root.remove(elem)
-
+    for elem in root.findall("*"):
+        root.remove(elem)
 # Rename root
-root.tag = "teiCorpus"
-
+    root.tag = "teiDoc"
 # Create i5 structure
-for i in range(number_of_texts):
-    tei = ET.SubElement(root, "TEI")
-    teiHeader = ET.SubElement(tei, "teiHeader")
-    fileDesc = ET.SubElement(teiHeader, "fileDesc")
-    titleStmt = ET.SubElement(fileDesc, "titleStmt")
-    textSigle = ET.SubElement(titleStmt, "textSigle")
-    textSigle.text = "BNC/TST." + f"{i:05}"
-    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
-    analytic = ET.SubElement(sourceDesc, "analytic")
-    htitle = ET.SubElement(analytic, "h.title")
-    htitle.text = titles[i].text
-    hauthor = ET.SubElement(analytic, "h.author")
-    hauthor.text = authors[i].text
-    imprint = ET.SubElement(sourceDesc, "imprint")
-    pubDateYear = ET.SubElement(imprint, "pubDate")
-    pubDateYear.set("type", "year")
-    pubDateYear.text = datesPublished[i].text[0:4]
-    pubDateMonth = ET.SubElement(imprint, "pubDate")
-    pubDateMonth.set("type", "month")
-    pubDateMonth.text = datesPublished[i].text[5:7]
-    pubDateDay = ET.SubElement(imprint, "pubDate")
-    pubDateDay.set("type", "day")
-    pubDateDay.text = datesPublished[i].text[8:10]
-    pubDateTime = ET.SubElement(imprint, "pubDate")
-    pubDateTime.set("type", "time")
-    pubDateTime.text = timesPublished[i].text
-    pubPlace = ET.SubElement(imprint, "pubPlace")
-    ref = ET.SubElement(pubPlace, "ref")
-    ref.set("type", "page_url")
-    ref.set("target", pageURLs[i].text)
-    text = ET.SubElement(tei, "text")
-    body = ET.SubElement(text, "body")
-    for p in texts[i]:
-        body.append(p)
+    for i in range(number_of_texts):
+        tei = ET.SubElement(root, "TEI")
+        teiHeader = ET.SubElement(tei, "teiHeader")
+        fileDesc = ET.SubElement(teiHeader, "fileDesc")
+        titleStmt = ET.SubElement(fileDesc, "titleStmt")
+        textSigle = ET.SubElement(titleStmt, "textSigle")
+        textSigle.text = "BNC/" + f"{j:02}" + "." + f"{i:05}"
+        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
+        analytic = ET.SubElement(sourceDesc, "analytic")
+        htitle = ET.SubElement(analytic, "h.title")
+        htitle.text = titles[i].text
+        hauthor = ET.SubElement(analytic, "h.author")
+        hauthor.text = authors[i].text
+        imprint = ET.SubElement(sourceDesc, "imprint")
+        pubDateYear = ET.SubElement(imprint, "pubDate")
+        pubDateYear.set("type", "year")
+        pubDateYear.text = datesPublished[i].text[0:4]
+        pubDateMonth = ET.SubElement(imprint, "pubDate")
+        pubDateMonth.set("type", "month")
+        pubDateMonth.text = datesPublished[i].text[5:7]
+        pubDateDay = ET.SubElement(imprint, "pubDate")
+        pubDateDay.set("type", "day")
+        pubDateDay.text = datesPublished[i].text[8:10]
+        pubDateTime = ET.SubElement(imprint, "pubDate")
+        pubDateTime.set("type", "time")
+        pubDateTime.text = timesPublished[i].text
+        pubPlace = ET.SubElement(imprint, "pubPlace")
+        ref = ET.SubElement(pubPlace, "ref")
+        ref.set("type", "page_url")
+        ref.set("target", pageURLs[i].text)
+        text = ET.SubElement(tei, "text")
+        body = ET.SubElement(text, "body")
+        for p in texts[i]:
+            body.append(p)
 
-
-
-ET.indent(tree, "  ")
-ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
-tree.write("04_output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)
+    ET.indent(tree, "  ")
+    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+    tree.write(f"{j:02}" + "_" + "output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)
commit	4201a5e1837cffd09410330dc165126db0299f27	[log] [tgz]
author	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Thu Mar 09 16:19:57 2023 +0100
committer	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Thu Mar 09 16:19:57 2023 +0100
tree	3220dd25a1160d279ac35bd8329a2f9d073ad002
parent	ea9ccda1c361184b6a647b9462bda99c96fc9820 [diff]