Add try/except to skip input files with unescaped ampersands

commit: 09a58a06c835f42a2ab70a8a5e5bdb5a57910db5 [log] [tgz]
author: lora-sp <lora.spassova@swhk.ids-mannheim.de> Fri Mar 10 16:33:46 2023 +0100
committer: lora-sp <lora.spassova@swhk.ids-mannheim.de> Fri Mar 10 16:33:46 2023 +0100
tree: 8888a2d3ac846eea466dae5ad918f09116afec24
parent: ab4e0ea4d6d6562fa275ace307e40ea0e58cbd20 [diff] [blame]
diff --git a/bunc2tei.py b/bunc2tei.py
index d143470..f0b54ef 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py

@@ -1,6 +1,8 @@
 import os, sys
 import xml.etree.ElementTree as ET
 from xml.dom import minidom
+from lxml import etree
+from io import StringIO
 
 
 def main():
@@ -11,27 +13,32 @@
     with open("tree_structure.xml", "w") as f:
         f.write(corpusStr)
 
-    # Process all documents and append to corpusTree
-    #path = "./BGCorpusExamples/"
-    #files = os.listdir(path)
-    process(0, sys.argv[1])
-    #process(sys.argv[2]) 
-    # Parse corpus tree, indent and output
+    # Parse corpus tree
     corpusTree = ET.parse("tree_structure.xml")
+    corpusRoot = corpusTree.getroot()
+
+    # Process documents and append to corpus tree
+    for j in range(1, len(sys.argv)):
+        try:
+            currentTree = process(j-1, sys.argv[j])
+            currentRoot = currentTree.getroot()
+            #print(currentRoot.tag)
+            corpusRoot.append(currentRoot)
+        except:
+            print("sorry")
+            continue
+
+    # Indent and save tree
     ET.indent(corpusTree, "  ")
     corpusTree.write("output.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
 
 
 def process(j, file):
-    #j = 0
-    # Parse corpus tree and get corpus root
-    corpusTree = ET.parse("tree_structure.xml")
-    corpusRoot = corpusTree.getroot()
-
     # Parse document tree and get root
     tree = ET.parse(file)
     root = tree.getroot()
-
+    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+ 
     # Store metadata and texts in lists
     titles = root.findall(".//*[@type='title']")
     #domains = root.findall(".//*[@type='domain']")
@@ -110,10 +117,7 @@
         for p in texts[i]:
             body.append(p)
 
-
-    corpusRoot.append(root)    
-    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+    return tree
 
 
-if __name__ == "__main__":
-    main()
+main()
commit	09a58a06c835f42a2ab70a8a5e5bdb5a57910db5	[log] [tgz]
author	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Fri Mar 10 16:33:46 2023 +0100
committer	lora-sp <lora.spassova@swhk.ids-mannheim.de>	Fri Mar 10 16:33:46 2023 +0100
tree	8888a2d3ac846eea466dae5ad918f09116afec24
parent	ab4e0ea4d6d6562fa275ace307e40ea0e58cbd20 [diff] [blame]