blob: b635ee5bfcfda84c306d32b3614af4bc6fb7c84d [file] [log] [blame]
package de.ids_mannheim.korap.tokenizer;

import static org.junit.Assert.*;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
10
11@RunWith(JUnit4.class)
12public class TokenizerCoverTest {
13
14 /**
15 * This test suite checks for the tokenization coverage of our
16 * tokenizer implementation based on the EmpiriST 2015
17 * Gold Standard Suite, published under Creative Commons license
18 * BY-SA 3.0.
19 *
20 * Michael Beißwenger, Sabine Bartsch, Stefan Evert and
21 * Kay-Michael Würzner (2016). EmpiriST 2015: A shared task
22 * on the automatic linguistic annotation of computer-mediated
23 * communication and web corpora. In Proceedings of the 10th
24 * Web as Corpus Workshop (WAC-X) and the EmpiriST Shared Task,
25 * pages 78–90. Berlin, Germany.
26 *
27 * https://sites.google.com/site/empirist2015/home/gold
28 */
29
30 // Get a data file
31 private String getFile (String file) {
32 String path = getClass().getResource(file).getFile();
33
34 StringBuilder content = new StringBuilder();
35 try {
36 BufferedReader in = new BufferedReader(
37 new InputStreamReader(
38 new FileInputStream(URLDecoder.decode(path, "UTF-8")),
39 "UTF-8"
40 )
41 );
42 String str;
43 while ((str = in.readLine()) != null) {
44 content.append(str + "\n");
Marc Kupietz478632e2020-09-05 21:52:54 +020045 }
Marc Kupietz33677732020-09-04 22:07:39 +020046 in.close();
47 }
48 catch (IOException e) {
49 fail(e.getMessage());
50 }
51 return content.toString();
Marc Kupietz478632e2020-09-05 21:52:54 +020052 }
Marc Kupietz33677732020-09-04 22:07:39 +020053
54
55 /**
56 * Scan Empirist articles and iterate through
57 */
58 private class EmpiristScanner implements Iterator {
59 private Scanner sc;
60
61 public EmpiristScanner (String file) {
62 sc = new Scanner(getFile(file));
63 sc.useDelimiter("<(?:posting|article)[^>]+?/>");
64 }
65
66 // Return next posting/article
67 public String next () {
68 return sc.next().trim();
69 }
70
71 // Check if new posting/article exists
72 public boolean hasNext () {
73 return sc.hasNext();
74 }
75 }
76
77 /**
78 * To calculate the difference between the gold standard version and
79 * our version, we calculate the levenshtein difference between both lists.
80 * It's not very intuitive that way, as it does not treat merges and splits
81 * specifically (i.e. a merge is one replacement and one deletion, a split
82 * is one replacement and one insertion) - so the number is not
83 * really meaningful - it's just a way to measure the differences.
84 * It's important to note that this differs from the comparison of
85 * EmpiriST, where the end boundaries of all tokens are compared.
86 */
87 public static int levenshteinForStringArrays (String[] s, String[] t) {
88 if (s == null || t == null) {
89 throw new IllegalArgumentException("Lists must not be null");
90 }
91
92 // Code based on Rosettacode.org
93 int [] costs = new int[t.length + 1];
94
95 for (int j = 0; j < costs.length; j++)
96 costs[j] = j;
97
98 for (int i = 1; i <= s.length; i++) {
99 costs[0] = i;
100 int nw = i - 1;
101 for (int j = 1; j <= t.length; j++) {
102 int cj = Math.min(
103 1 + Math.min(costs[j], costs[j - 1]),
104 s[i - 1].equals(t[j - 1]) ? nw : nw + 1
105 );
106 nw = costs[j];
107 costs[j] = cj;
108 }
109 }
110
111 return costs[t.length];
112 }
113
114 /**
115 * Compare the tokenized data of one example file
116 * with the gold standard and return the sum of
117 * levenshtein distances.
118 */
119 public int distanceToGoldStandard (KorAPTokenizerImpl tok, String suite, String postings) {
120
121 // Load raw postings
122 EmpiristScanner esRaw = new EmpiristScanner(
123 "/empirist_gold_standard/" + suite + "/raw/" + postings + ".txt"
124 );
125
126 // Load tokenized postings
127 EmpiristScanner esTokenized = new EmpiristScanner(
128 "/empirist_gold_standard/" + suite + "/tokenized/" + postings + ".txt"
129 );
130
131 int distance = 0;
132
133 // Iterate over all postings
134 while (esRaw.hasNext() && esTokenized.hasNext()) {
135
136 // Get the gold standard splitted on new lines
137 String [] goldTokens = esTokenized.next().split("\n+");
138
139 // Tokenize the test data
140 String [] testTokens = tok.tokenize(esRaw.next());
141
142 if (false) {
143 System.err.println("-----------------");
144 for (int i = 0; i < Math.min(goldTokens.length, testTokens.length); i++) {
145 System.err.println(goldTokens[i] + " = "+ testTokens[i]);
146 }
147 }
148
149 // Calculate the edit distance of both arrays
150 distance += levenshteinForStringArrays(goldTokens, testTokens);
Marc Kupietz478632e2020-09-05 21:52:54 +0200151 }
Marc Kupietz33677732020-09-04 22:07:39 +0200152
153 // Return the sum of all distances
154 return distance;
Marc Kupietz478632e2020-09-05 21:52:54 +0200155 }
156
Marc Kupietz33677732020-09-04 22:07:39 +0200157
158 @Test
159 public void testTokenizerCoverEmpiristCmc () {
160
161 // Create tokenizer object
162 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
163
164 String test = "cmc_test_blog_comment";
165 int dist = distanceToGoldStandard(tok, "test_cmc", test);
166 assertTrue(test + " = " + dist, dist == 0);
167
168 test = "cmc_test_professional_chat";
169 dist = distanceToGoldStandard(tok, "test_cmc", test);
170 assertTrue(test + " = " + dist, dist <= 20);
171
172 test = "cmc_test_social_chat";
173 dist = distanceToGoldStandard(tok, "test_cmc", test);
174 assertTrue(test + " = " + dist, dist <= 23);
175
176 test = "cmc_test_twitter";
177 dist = distanceToGoldStandard(tok, "test_cmc", test);
178 assertTrue(test + " = " + dist, dist <= 153);
179
180 test = "cmc_test_whatsapp";
181 dist = distanceToGoldStandard(tok, "test_cmc", test);
182 assertTrue(test + " = " + dist, dist <= 0);
183
184 test = "cmc_test_wiki_discussion";
185 dist = distanceToGoldStandard(tok, "test_cmc", test);
186 assertTrue(test + " = " + dist, dist <= 24);
187
188 }
189
190 @Test
191 public void testTokenizerCoverEmpiristWeb () {
192
193 // Create tokenizer object
194 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
195
196 String test = "web_test_001";
197 int dist = distanceToGoldStandard(tok, "test_web", test);
198 assertTrue(test + " = " + dist, dist <= 21);
199
200 test = "web_test_002";
201 dist = distanceToGoldStandard(tok, "test_web", test);
202 assertTrue(test + " = " + dist, dist <= 5);
203
204 test = "web_test_003";
205 dist = distanceToGoldStandard(tok, "test_web", test);
206 assertTrue(test + " = " + dist, dist <= 17);
207
208 test = "web_test_004";
209 dist = distanceToGoldStandard(tok, "test_web", test);
210 assertTrue(test + " = " + dist, dist <= 20);
211
212 test = "web_test_005";
213 dist = distanceToGoldStandard(tok, "test_web", test);
214 assertTrue(test + " = " + dist, dist <= 8);
215
216 test = "web_test_006";
217 dist = distanceToGoldStandard(tok, "test_web", test);
218 assertTrue(test + " = " + dist, dist <= 8);
219
220 test = "web_test_007";
221 dist = distanceToGoldStandard(tok, "test_web", test);
222 assertTrue(test + " = " + dist, dist <= 12);
223
224 test = "web_test_008";
225 dist = distanceToGoldStandard(tok, "test_web", test);
226 assertTrue(test + " = " + dist, dist <= 2);
227
228 test = "web_test_009";
229 dist = distanceToGoldStandard(tok, "test_web", test);
230 assertTrue(test + " = " + dist, dist <= 8);
231
232 test = "web_test_010";
233 dist = distanceToGoldStandard(tok, "test_web", test);
234 assertTrue(test + " = " + dist, dist == 0);
235
236 test = "web_test_011";
237 dist = distanceToGoldStandard(tok, "test_web", test);
238 assertTrue(test + " = " + dist, dist <= 28);
239
240 test = "web_test_012";
241 dist = distanceToGoldStandard(tok, "test_web", test);
242 assertTrue(test + " = " + dist, dist <= 7);
243 }
244}