Merge remote-tracking branch 'refs/remotes/origin/main'
diff --git a/bunc2tei.py b/bunc2tei.py
index 6ea85c6..738eddb 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -1,6 +1,94 @@
 import os
 import xml.etree.ElementTree as ET
 
+<<<<<<< HEAD
+# Directory that holds the source corpus documents
+path = "/home/spassova/BGCorpusExamples"
+# os.listdir() returns names in arbitrary order; sort so files[0] is reproducible
+files = sorted(os.listdir(path))
+# Parse only the first document and keep a handle on its root element
+tree = ET.parse(os.path.join(path, files[0]))
+root = tree.getroot()
+# One list per metadata field: every element whose @type equals the given value
+# NOTE(review): some @type values are plural ("authors", "tags") -- confirm against the corpus
+titles = root.findall(".//*[@type='title']")
+domains = root.findall(".//*[@type='domain']")
+pageURLs = root.findall(".//*[@type='pageURL']")
+ids = root.findall(".//*[@type='id']")
+mainImageURLs = root.findall(".//*[@type='mainImageURL']")
+mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
+mainImageSources = root.findall(".//*[@type='mainImageSources']")
+authors = root.findall(".//*[@type='authors']")
+authorURLs = root.findall(".//*[@type='authorURLs']")
+categories = root.findall(".//*[@type='category']")
+subCategories = root.findall(".//*[@type='subCategory']")
+tags = root.findall(".//*[@type='tags']")
+datesPublished = root.findall(".//*[@type='datePublished']")
+timesPublished = root.findall(".//*[@type='timePublished']")
+datesModified = root.findall(".//*[@type='dateModified']")
+timesModified = root.findall(".//*[@type='timeModified']")
+mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
+mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
+mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
+texts = []
+
+# Count the <text> elements and collect each text's content container exactly
+# once, metadata children removed, so that texts[i] lines up with text number i
+number_of_texts = 0
+for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
+    number_of_texts += 1
+    for body in text:
+        for div1 in body:
+            for div2 in div1:
+                # Iterate a snapshot: removing from div2 while looping over it skips siblings
+                for div3 in list(div2):
+                    if div3.get("type") == "metadata":
+                        div2.remove(div3)
+                texts.append(div2)
+
+# Detach every direct child so the root can be rebuilt from scratch
+for child in list(root):
+    root.remove(child)
+
+# The emptied root becomes the corpus wrapper element
+root.tag = "teiCorpus"
+
+# Build one <TEI> entry (header plus body) per counted text, following the i5 layout
+for idx in range(number_of_texts):
+    tei = ET.SubElement(root, "TEI")
+
+    # Header: the text sigle sits under teiHeader/fileDesc/titleStmt
+    file_desc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
+    sigle = ET.SubElement(ET.SubElement(file_desc, "titleStmt"), "textSigle")
+    sigle.text = f"BNC/TST.{idx:05}"
+
+    # Source description: title and author go under <analytic>
+    source_desc = ET.SubElement(file_desc, "sourceDesc")
+    analytic = ET.SubElement(source_desc, "analytic")
+    ET.SubElement(analytic, "h.title").text = titles[idx].text
+    ET.SubElement(analytic, "h.author").text = authors[idx].text
+
+    # Imprint: the publication date is cut at fixed character positions and
+    # followed by the publication time, each in its own typed <pubDate>
+    imprint = ET.SubElement(source_desc, "imprint")
+    published = datesPublished[idx].text
+    date_parts = (
+        ("year", published[0:4]),
+        ("month", published[5:7]),
+        ("day", published[8:10]),
+        ("time", timesPublished[idx].text),
+    )
+    for part_type, part_text in date_parts:
+        ET.SubElement(imprint, "pubDate", {"type": part_type}).text = part_text
+
+    # Publication place: a <ref> whose target is the page URL
+    page_ref = ET.SubElement(ET.SubElement(imprint, "pubPlace"), "ref")
+    page_ref.set("type", "page_url")
+    page_ref.set("target", pageURLs[idx].text)
+
+    # Text body: append the children of this text's collected container
+    ET.SubElement(ET.SubElement(tei, "text"), "body").extend(texts[idx])
+=======
 path = '/home/spassova/BGCorpusExamples'
 files = os.listdir(path)
 
@@ -43,10 +131,17 @@
 
 
     
+>>>>>>> refs/remotes/origin/main
 
 
 
 
+<<<<<<< HEAD
+ET.register_namespace("", "http://www.tei-c.org/ns/1.0")  # empty prefix for the TEI namespace
+ET.indent(tree, space="  ")  # two spaces per nesting level
+tree.write("04_output.xml", xml_declaration=True, encoding="utf-8", method="xml", short_empty_elements=True)
+=======
 ET.indent(tree, '  ')
 ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
-tree.write('04_output.xml', encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
\ No newline at end of file
+tree.write('04_output.xml', encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
+>>>>>>> refs/remotes/origin/main