auxiliary scripts
diff --git a/sources.awk b/sources.awk
new file mode 100644
index 0000000..1c38420
--- /dev/null
+++ b/sources.awk
@@ -0,0 +1,17 @@
+    /<text/     {	# Runs for every record containing "<text". match() is used because gensub() returns the whole record when nothing matches; a missing attribute now yields "".
+	pubTitle = match($0, /^.+publ_title="([^"]*)".+$/, m) ? m[1] : "";
+	pubType  = match($0, /^.+publ_type="([^"]*)".+$/,  m) ? m[1] : "";
+	language = match($0, /^.+language="([^"]*)".+$/,   m) ? m[1] : "";
+	tokenc   = match($0, /^.+tokencount="([^"]*)".+$/, m) ? m[1] : "";
+	# Aggregate per title: number of matching records, last-seen type and language, running token total.
+	pubTitles[pubTitle]++;
+	pubTypes[pubTitle] = pubType;
+	languages[pubTitle]= language;
+	tokens[pubTitle] += tokenc;	# tokenc is a string; += coerces it to a number (non-numeric text counts as 0)
+    }
+
+    END {	# After all input: print one row per title -- title, record count, token total, type, language.
+	for (pubTitle in pubTitles){	# for-in traversal order is unspecified, so row order is not guaranteed
+	    print pubTitle "\t" pubTitles[pubTitle] "\t" tokens[pubTitle] "\t" pubTypes[pubTitle] "\t" languages[pubTitle];	# concatenate: commas in print would add OFS (a space) around every tab
+	}
+    }