added example files

commit: ab4e0ea4d6d6562fa275ace307e40ea0e58cbd20 [log] [tgz]
author: lora-sp <lora.spassova@swhk.ids-mannheim.de> Fri Mar 10 12:02:24 2023 +0100
committer: lora-sp <lora.spassova@swhk.ids-mannheim.de> Fri Mar 10 12:02:24 2023 +0100
tree: b1785e24c16fad014837f6edfbdeba517e4c8188
parent: df49785b33276ad004c4a42e2cb00b29eab1811a [diff] [blame]
diff --git a/bunc2tei.py b/bunc2tei.py
index bf5bf4f..d143470 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py

@@ -1,45 +1,60 @@
-import os
+import os, sys
 import xml.etree.ElementTree as ET
+from xml.dom import minidom
+
 
 def main():
-    corpus = "<teiCorpus></teiCorpus>"
-    corpusTree = ET.parse(corpus)
-    corpusRoot = ET.fromstring(corpus)
-    path = "/home/spassova/BGCorpusExamples/"
-    files = os.listdir(path)
-    for j in range(len(files)):
-        processing(path + files[j]) 
+    # Create corpus structure from string and save into file 
+    corpus = "<teiCorpus>\n</teiCorpus>"
+    origRoot = ET.fromstring(corpus)
+    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent="  ")
+    with open("tree_structure.xml", "w") as f:
+        f.write(corpusStr)
 
+    # Process all documents and append to corpusTree
+    #path = "./BGCorpusExamples/"
+    #files = os.listdir(path)
+    process(0, sys.argv[1])
+    #process(sys.argv[2]) 
+    # Parse corpus tree, indent and output
+    corpusTree = ET.parse("tree_structure.xml")
     ET.indent(corpusTree, "  ")
-    corpusTree.write(f"{j:02}" + "_" + "output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)
-    
+    corpusTree.write("output.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
 
-def processing(file):
-# Parse tree and get root
+
+def process(j, file):
+    #j = 0
+    # Parse corpus tree and get corpus root
+    corpusTree = ET.parse("tree_structure.xml")
+    corpusRoot = corpusTree.getroot()
+
+    # Parse document tree and get root
     tree = ET.parse(file)
     root = tree.getroot()
-# Store metadata and texts in lists
+
+    # Store metadata and texts in lists
     titles = root.findall(".//*[@type='title']")
-    domains = root.findall(".//*[@type='domain']")
+    #domains = root.findall(".//*[@type='domain']")
     pageURLs = root.findall(".//*[@type='pageURL']")
-    ids = root.findall(".//*[@type='id']")
-    mainImageURLs = root.findall(".//*[@type='mainImageURL']")
-    mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
-    mainImageSources = root.findall(".//*[@type='mainImageSources']")
+    #ids = root.findall(".//*[@type='id']")
+    #mainImageURLs = root.findall(".//*[@type='mainImageURL']")
+    #mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
+    #mainImageSources = root.findall(".//*[@type='mainImageSources']")
     authors = root.findall(".//*[@type='authors']")
-    authorURLs = root.findall(".//*[@type='authorURLs']")
-    categories = root.findall(".//*[@type='category']")
-    subCategories = root.findall(".//*[@type='subCategory']")
-    tags = root.findall(".//*[@type='tags']")
+    #authorURLs = root.findall(".//*[@type='authorURLs']")
+    #categories = root.findall(".//*[@type='category']")
+    #subCategories = root.findall(".//*[@type='subCategory']")
+    #tags = root.findall(".//*[@type='tags']")
     datesPublished = root.findall(".//*[@type='datePublished']")
     timesPublished = root.findall(".//*[@type='timePublished']")
-    datesModified = root.findall(".//*[@type='dateModified']")
-    timesModified = root.findall(".//*[@type='timeModified']")
-    mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
-    mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
-    mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
+    #datesModified = root.findall(".//*[@type='dateModified']")
+    #timesModified = root.findall(".//*[@type='timeModified']")
+    #mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
+    #mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
+    #mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
     texts = []
-# Count text elements and remove metadata
+
+    # Count text elements and remove metadata
     number_of_texts = 0
     for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
         number_of_texts+=1
@@ -51,19 +66,22 @@
                             div2.remove(div3)
 
                         texts.append(div2)
-# Remove all elements from root
+
+    # Remove all elements from root
     for elem in root.findall("*"):
         root.remove(elem)
-# Rename root
+
+    # Rename root
     root.tag = "teiDoc"
-# Create i5 structure
+
+    # Create target structure
     for i in range(number_of_texts):
         tei = ET.SubElement(root, "TEI")
         teiHeader = ET.SubElement(tei, "teiHeader")
         fileDesc = ET.SubElement(teiHeader, "fileDesc")
         titleStmt = ET.SubElement(fileDesc, "titleStmt")
         textSigle = ET.SubElement(titleStmt, "textSigle")
-        textSigle.text = "BNC/" + f"{j:02}" + "." + f"{i:05}"
+        textSigle.text = "BNC/" + f"{j:03}" + "." + f"{i:05}"
         sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
         analytic = ET.SubElement(sourceDesc, "analytic")
         htitle = ET.SubElement(analytic, "h.title")
@@ -96,6 +114,6 @@
     corpusRoot.append(root)    
     ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
 
-    return 
-    
 
+if __name__ == "__main__":
+    main()
commit	ab4e0ea4d6d6562fa275ace307e40ea0e58cbd20	[log] [tgz]
author	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Fri Mar 10 12:02:24 2023 +0100
committer	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Fri Mar 10 12:02:24 2023 +0100
tree	b1785e24c16fad014837f6edfbdeba517e4c8188
parent	df49785b33276ad004c4a42e2cb00b29eab1811a [diff] [blame]