blob: b635ee5bfcfda84c306d32b3614af4bc6fb7c84d [file] [log] [blame]
package de.ids_mannheim.korap.tokenizer;
import static org.junit.Assert.*;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class TokenizerCoverTest {
/**
* This test suite checks for the tokenization coverage of our
* tokenizer implementation based on the EmpiriST 2015
* Gold Standard Suite, published under Creative Commons license
* BY-SA 3.0.
*
* Michael Beißwenger, Sabine Bartsch, Stefan Evert and
* Kay-Michael Würzner (2016). EmpiriST 2015: A shared task
* on the automatic linguistic annotation of computer-mediated
* communication and web corpora. In Proceedings of the 10th
* Web as Corpus Workshop (WAC-X) and the EmpiriST Shared Task,
* pages 78–90. Berlin, Germany.
*
* https://sites.google.com/site/empirist2015/home/gold
*/
// Get a data file
private String getFile (String file) {
String path = getClass().getResource(file).getFile();
StringBuilder content = new StringBuilder();
try {
BufferedReader in = new BufferedReader(
new InputStreamReader(
new FileInputStream(URLDecoder.decode(path, "UTF-8")),
"UTF-8"
)
);
String str;
while ((str = in.readLine()) != null) {
content.append(str + "\n");
}
in.close();
}
catch (IOException e) {
fail(e.getMessage());
}
return content.toString();
}
/**
* Scan Empirist articles and iterate through
*/
private class EmpiristScanner implements Iterator {
private Scanner sc;
public EmpiristScanner (String file) {
sc = new Scanner(getFile(file));
sc.useDelimiter("<(?:posting|article)[^>]+?/>");
}
// Return next posting/article
public String next () {
return sc.next().trim();
}
// Check if new posting/article exists
public boolean hasNext () {
return sc.hasNext();
}
}
/**
* To calculate the difference between the gold standard version and
* our version, we calculate the levenshtein difference between both lists.
* It's not very intuitive that way, as it does not treat merges and splits
* specifically (i.e. a merge is one replacement and one deletion, a split
* is one replacement and one insertion) - so the number is not
* really meaningful - it's just a way to measure the differences.
* It's important to note that this differs from the comparison of
* EmpiriST, where the end boundaries of all tokens are compared.
*/
public static int levenshteinForStringArrays (String[] s, String[] t) {
if (s == null || t == null) {
throw new IllegalArgumentException("Lists must not be null");
}
// Code based on Rosettacode.org
int [] costs = new int[t.length + 1];
for (int j = 0; j < costs.length; j++)
costs[j] = j;
for (int i = 1; i <= s.length; i++) {
costs[0] = i;
int nw = i - 1;
for (int j = 1; j <= t.length; j++) {
int cj = Math.min(
1 + Math.min(costs[j], costs[j - 1]),
s[i - 1].equals(t[j - 1]) ? nw : nw + 1
);
nw = costs[j];
costs[j] = cj;
}
}
return costs[t.length];
}
/**
* Compare the tokenized data of one example file
* with the gold standard and return the sum of
* levenshtein distances.
*/
public int distanceToGoldStandard (KorAPTokenizerImpl tok, String suite, String postings) {
// Load raw postings
EmpiristScanner esRaw = new EmpiristScanner(
"/empirist_gold_standard/" + suite + "/raw/" + postings + ".txt"
);
// Load tokenized postings
EmpiristScanner esTokenized = new EmpiristScanner(
"/empirist_gold_standard/" + suite + "/tokenized/" + postings + ".txt"
);
int distance = 0;
// Iterate over all postings
while (esRaw.hasNext() && esTokenized.hasNext()) {
// Get the gold standard splitted on new lines
String [] goldTokens = esTokenized.next().split("\n+");
// Tokenize the test data
String [] testTokens = tok.tokenize(esRaw.next());
if (false) {
System.err.println("-----------------");
for (int i = 0; i < Math.min(goldTokens.length, testTokens.length); i++) {
System.err.println(goldTokens[i] + " = "+ testTokens[i]);
}
}
// Calculate the edit distance of both arrays
distance += levenshteinForStringArrays(goldTokens, testTokens);
}
// Return the sum of all distances
return distance;
}
@Test
public void testTokenizerCoverEmpiristCmc () {
// Create tokenizer object
KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
String test = "cmc_test_blog_comment";
int dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist == 0);
test = "cmc_test_professional_chat";
dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist <= 20);
test = "cmc_test_social_chat";
dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist <= 23);
test = "cmc_test_twitter";
dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist <= 153);
test = "cmc_test_whatsapp";
dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist <= 0);
test = "cmc_test_wiki_discussion";
dist = distanceToGoldStandard(tok, "test_cmc", test);
assertTrue(test + " = " + dist, dist <= 24);
}
@Test
public void testTokenizerCoverEmpiristWeb () {
// Create tokenizer object
KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
String test = "web_test_001";
int dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 21);
test = "web_test_002";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 5);
test = "web_test_003";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 17);
test = "web_test_004";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 20);
test = "web_test_005";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 8);
test = "web_test_006";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 8);
test = "web_test_007";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 12);
test = "web_test_008";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 2);
test = "web_test_009";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 8);
test = "web_test_010";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist == 0);
test = "web_test_011";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 28);
test = "web_test_012";
dist = distanceToGoldStandard(tok, "test_web", test);
assertTrue(test + " = " + dist, dist <= 7);
}
}