commit
diff --git a/bunc2tei.py b/bunc2tei.py
index fccc2c1..5ecff9b 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -1,95 +1,94 @@
import os
import xml.etree.ElementTree as ET
-# Path to documents
-path = "/home/spassova/BGCorpusExamples"
-files = os.listdir(path)
+def main():
+    """Convert every file found in the hard-coded corpus directory.
+
+    Each directory entry is handed to processing() together with its
+    position in the sorted listing; that position becomes part of the
+    text sigles and of the output file name.
+    """
+    corpus = "<teiCorpus></teiCorpus>"
+    root = ET.fromstring(corpus)
+    path = "/home/spassova/BGCorpusExamples/"
+    # os.listdir() returns entries in arbitrary order; sort them so the
+    # per-file index is reproducible between runs.
+    files = sorted(os.listdir(path))
+    for j, name in enumerate(files):
+        processing(os.path.join(path, name), j)
+        # NOTE(review): the teiCorpus root only collects empty teiDoc
+        # placeholders and is never serialized in this function.
+        ET.SubElement(root, "teiDoc")
+
+
+def processing(file, j=0):
+    """Rewrite one source XML file into the i5 structure and save it.
+
+    The result is written to "<j>_output.xml" (j zero-padded to two
+    digits) relative to the current working directory.
+
+    Args:
+        file: Path of the XML file to parse.
+        j: Index of the file within the corpus; used in the text sigles
+            ("BNC/<j>.<i>") and in the output file name. Defaults to 0.
+    """
# Parse tree and get root
-tree = ET.parse(path + "/" + files[0])
-root = tree.getroot()
-
+    tree = ET.parse(file)
+    root = tree.getroot()
# Store metadata and texts in lists
-titles = root.findall(".//*[@type='title']")
-domains = root.findall(".//*[@type='domain']")
-pageURLs = root.findall(".//*[@type='pageURL']")
-ids = root.findall(".//*[@type='id']")
-mainImageURLs = root.findall(".//*[@type='mainImageURL']")
-mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
-mainImageSources = root.findall(".//*[@type='mainImageSources']")
-authors = root.findall(".//*[@type='authors']")
-authorURLs = root.findall(".//*[@type='authorURLs']")
-categories = root.findall(".//*[@type='category']")
-subCategories = root.findall(".//*[@type='subCategory']")
-tags = root.findall(".//*[@type='tags']")
-datesPublished = root.findall(".//*[@type='datePublished']")
-timesPublished = root.findall(".//*[@type='timePublished']")
-datesModified = root.findall(".//*[@type='dateModified']")
-timesModified = root.findall(".//*[@type='timeModified']")
-mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
-mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
-mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
-texts = []
-
+    # Only titles, authors, pageURLs, datesPublished and timesPublished
+    # are read further down; the other lists are collected but unused.
+    titles = root.findall(".//*[@type='title']")
+    domains = root.findall(".//*[@type='domain']")
+    pageURLs = root.findall(".//*[@type='pageURL']")
+    ids = root.findall(".//*[@type='id']")
+    mainImageURLs = root.findall(".//*[@type='mainImageURL']")
+    mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
+    mainImageSources = root.findall(".//*[@type='mainImageSources']")
+    authors = root.findall(".//*[@type='authors']")
+    authorURLs = root.findall(".//*[@type='authorURLs']")
+    categories = root.findall(".//*[@type='category']")
+    subCategories = root.findall(".//*[@type='subCategory']")
+    tags = root.findall(".//*[@type='tags']")
+    datesPublished = root.findall(".//*[@type='datePublished']")
+    timesPublished = root.findall(".//*[@type='timePublished']")
+    datesModified = root.findall(".//*[@type='dateModified']")
+    timesModified = root.findall(".//*[@type='timeModified']")
+    mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
+    mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
+    mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
+    texts = []
# Count text elements and remove metadata
-number_of_texts = 0
-for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
- number_of_texts+=1
- for body in text:
- for div1 in body:
- for div2 in div1:
- for div3 in div2:
- if div3.get('type') == "metadata":
- div2.remove(div3)
+    number_of_texts = 0
+    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
+        number_of_texts += 1
+        for body in text:
+            for div1 in body:
+                for div2 in div1:
+                    # Iterate over a snapshot: removing children from
+                    # div2 while looping over div2 itself can skip the
+                    # sibling that follows each removed element.
+                    for div3 in list(div2):
+                        if div3.get('type') == "metadata":
+                            div2.remove(div3)
- texts.append(div2)
-
-
+                    texts.append(div2)
# Remove all elements from root
-for elem in root.findall("*"):
- root.remove(elem)
-
+    for elem in root.findall("*"):
+        root.remove(elem)
# Rename root
-root.tag = "teiCorpus"
-
+    root.tag = "teiDoc"
# Create i5 structure
-for i in range(number_of_texts):
- tei = ET.SubElement(root, "TEI")
- teiHeader = ET.SubElement(tei, "teiHeader")
- fileDesc = ET.SubElement(teiHeader, "fileDesc")
- titleStmt = ET.SubElement(fileDesc, "titleStmt")
- textSigle = ET.SubElement(titleStmt, "textSigle")
- textSigle.text = "BNC/TST." + f"{i:05}"
- sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
- analytic = ET.SubElement(sourceDesc, "analytic")
- htitle = ET.SubElement(analytic, "h.title")
- htitle.text = titles[i].text
- hauthor = ET.SubElement(analytic, "h.author")
- hauthor.text = authors[i].text
- imprint = ET.SubElement(sourceDesc, "imprint")
- pubDateYear = ET.SubElement(imprint, "pubDate")
- pubDateYear.set("type", "year")
- pubDateYear.text = datesPublished[i].text[0:4]
- pubDateMonth = ET.SubElement(imprint, "pubDate")
- pubDateMonth.set("type", "month")
- pubDateMonth.text = datesPublished[i].text[5:7]
- pubDateDay = ET.SubElement(imprint, "pubDate")
- pubDateDay.set("type", "day")
- pubDateDay.text = datesPublished[i].text[8:10]
- pubDateTime = ET.SubElement(imprint, "pubDate")
- pubDateTime.set("type", "time")
- pubDateTime.text = timesPublished[i].text
- pubPlace = ET.SubElement(imprint, "pubPlace")
- ref = ET.SubElement(pubPlace, "ref")
- ref.set("type", "page_url")
- ref.set("target", pageURLs[i].text)
- text = ET.SubElement(tei, "text")
- body = ET.SubElement(text, "body")
- for p in texts[i]:
- body.append(p)
+    for i in range(number_of_texts):
+        tei = ET.SubElement(root, "TEI")
+        teiHeader = ET.SubElement(tei, "teiHeader")
+        fileDesc = ET.SubElement(teiHeader, "fileDesc")
+        titleStmt = ET.SubElement(fileDesc, "titleStmt")
+        textSigle = ET.SubElement(titleStmt, "textSigle")
+        # Sigle = corpus prefix, two-digit file index, five-digit text index.
+        textSigle.text = f"BNC/{j:02}.{i:05}"
+        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
+        analytic = ET.SubElement(sourceDesc, "analytic")
+        htitle = ET.SubElement(analytic, "h.title")
+        htitle.text = titles[i].text
+        hauthor = ET.SubElement(analytic, "h.author")
+        hauthor.text = authors[i].text
+        imprint = ET.SubElement(sourceDesc, "imprint")
+        # The slices below take characters 0-3, 5-6 and 8-9 of the date
+        # text (presumably a YYYY-MM-DD string; verify against the data).
+        pubDateYear = ET.SubElement(imprint, "pubDate")
+        pubDateYear.set("type", "year")
+        pubDateYear.text = datesPublished[i].text[0:4]
+        pubDateMonth = ET.SubElement(imprint, "pubDate")
+        pubDateMonth.set("type", "month")
+        pubDateMonth.text = datesPublished[i].text[5:7]
+        pubDateDay = ET.SubElement(imprint, "pubDate")
+        pubDateDay.set("type", "day")
+        pubDateDay.text = datesPublished[i].text[8:10]
+        pubDateTime = ET.SubElement(imprint, "pubDate")
+        pubDateTime.set("type", "time")
+        pubDateTime.text = timesPublished[i].text
+        pubPlace = ET.SubElement(imprint, "pubPlace")
+        ref = ET.SubElement(pubPlace, "ref")
+        ref.set("type", "page_url")
+        ref.set("target", pageURLs[i].text)
+        text = ET.SubElement(tei, "text")
+        body = ET.SubElement(text, "body")
+        for p in texts[i]:
+            body.append(p)
-
-
-ET.indent(tree, " ")
-ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
-tree.write("04_output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)
+    ET.indent(tree, " ")
+    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+    tree.write(f"{j:02}_output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)