auxiliary scripts
diff --git a/sources.awk b/sources.awk
new file mode 100644
index 0000000..1c38420
--- /dev/null
+++ b/sources.awk
@@ -0,0 +1,41 @@
+# sources.awk -- per-publication summary of <text ...> lines (requires gawk).
+#
+# Every input line containing "<text" contributes to the entry for its
+# publ_title attribute.  At the end one row per distinct title is printed:
+#
+#   title, number of <text> lines, summed tokencount, publ_type, language
+#
+# publ_type and language are taken from the last <text> line seen for the
+# title.
+
+# Return the value of attribute `name` on `line`, or "" when it is absent.
+# Returning "" matters: a gensub() substitution hands back the whole line
+# when nothing matches, which would turn a line lacking the attribute into
+# a bogus title/type/language.  The pattern is anchored and begins with a
+# greedy ".+", so the last occurrence of name="..." on the line is
+# preferred.  match() with an array argument is a gawk extension.
+function attr(line, name,    re, m) {
+    re = "^.+" name "=\"([^\"]*)\".+$"
+    return match(line, re, m) ? m[1] : ""
+}
+
+/<text/ {
+    pubTitle = attr($0, "publ_title")
+    pubType  = attr($0, "publ_type")
+    language = attr($0, "language")
+    tokenc   = attr($0, "tokencount")
+
+    pubTitles[pubTitle]++               # number of <text> lines for this title
+    pubTypes[pubTitle]  = pubType       # last value seen wins
+    languages[pubTitle] = language      # last value seen wins
+    tokens[pubTitle]   += tokenc        # a missing tokencount ("") adds 0
+}
+
+END {
+    # "for (key in array)" visits keys in an unspecified order.  The commas
+    # make print insert OFS (a space unless changed) around each "\t", so
+    # fields end up separated by " \t "; kept as is for existing consumers.
+    for (pubTitle in pubTitles) {
+        print pubTitle, "\t", pubTitles[pubTitle], "\t", tokens[pubTitle], "\t", pubTypes[pubTitle], "\t", languages[pubTitle]
+    }
+}