Convert XML files and merge them into one corpus file

Change-Id: I0fbdd1e89658523c0f4bbcda73b41af7e277c2f8
diff --git a/bunc2tei.py b/bunc2tei.py
index 9dda9f2..055f45c 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -29,7 +29,7 @@
             #print(currentRoot.tag)
             corpusRoot.append(currentRoot)
         except:
-            print("sorry")
+            print(sys.argv[j])
             continue
 
     # Indent and save tree
diff --git a/ill-formed_docs.txt b/ill-formed_docs.txt
new file mode 100644
index 0000000..a058fc0
--- /dev/null
+++ b/ill-formed_docs.txt
@@ -0,0 +1,4 @@
+Ill-formed documents:
+
+- 132 instances of unescaped "&" in text-elements
+- doc "investor.bg - 2020-01-04.xml" contains ill-formed line "<p><div</p>" (line 168)