Convert XML files and merge them into one corpus file
Change-Id: I0fbdd1e89658523c0f4bbcda73b41af7e277c2f8
diff --git a/bunc2tei.py b/bunc2tei.py
index 9dda9f2..055f45c 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -29,7 +29,7 @@
#print(currentRoot.tag)
corpusRoot.append(currentRoot)
except:
- print("sorry")
+ print(sys.argv[j])
continue
# Indent and save tree
diff --git a/ill-formed_docs.txt b/ill-formed_docs.txt
new file mode 100644
index 0000000..a058fc0
--- /dev/null
+++ b/ill-formed_docs.txt
@@ -0,0 +1,4 @@
+Ill-formed documents:
+
+- 132 instances of unescaped "&" in text-elements
+- Document "investor.bg - 2020-01-04.xml" contains the ill-formed line "<p><div</p>" (line 168)