# Return the value of the double-quoted attribute `name` found in `line`,
# or "" when the attribute is not present.
#
# Requires gawk: the three-argument form of match() is a GNU extension
# that stores parenthesised captures in the array `m`.
# `m` is a local (AWK declares locals as extra parameters).
function attr_value(line, name,    m) {
    # The leading (^|[[:space:]]) keeps e.g. "language" from matching
    # inside a longer attribute name that merely ends with it.
    if (match(line, "(^|[[:space:]])" name "=\"([^\"]*)\"", m))
        return m[2];
    # A missing attribute yields "" instead of the whole input line,
    # which is what an unmatched gensub() would have handed back.
    return "";
}

# For every record containing "<text", pull the publication attributes
# out of the tag and accumulate per-title statistics:
#   pubTitles[title]  number of matching records seen for the title
#   pubTypes[title]   publ_type of the most recent record for the title
#   languages[title]  language of the most recent record for the title
#   tokens[title]     running sum of tokencount for the title
/<text/ {
    pubTitle = attr_value($0, "publ_title");
    pubType  = attr_value($0, "publ_type");
    language = attr_value($0, "language");
    tokenc   = attr_value($0, "tokencount");

    pubTitles[pubTitle]++;
    pubTypes[pubTitle] = pubType;
    languages[pubTitle] = language;
    # An absent tokencount is "" and therefore contributes 0 to the sum.
    tokens[pubTitle] += tokenc;
}
# After all input has been read, emit one tab-separated line per
# publication title with the columns:
#   title, record count, total token count, publication type, language
# Iteration order of `for (key in array)` is unspecified in AWK, so pipe
# the output through sort(1) if a stable order is needed.
END {
    for (pubTitle in pubTitles) {
        # printf is used instead of `print a, "\t", b, ...` because print
        # inserts OFS (a space by default) between every argument, which
        # padded each field with spaces around the tabs.
        printf "%s\t%s\t%s\t%s\t%s\n",
            pubTitle,
            pubTitles[pubTitle],
            tokens[pubTitle],
            pubTypes[pubTitle],
            languages[pubTitle];
    }
}