blob: 2d717de1ee68125d431efbb6e4e024f7914f90b4 [file] [log] [blame]
package de.ids_mannheim.korap.tokenizer;

import static org.junit.Assert.*;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
12@RunWith(JUnit4.class)
13public class TokenizerCoverTest {
14
15 /**
16 * This test suite checks for the tokenization coverage of our
17 * tokenizer implementation based on the EmpiriST 2015
18 * Gold Standard Suite, published under Creative Commons license
19 * BY-SA 3.0.
20 *
21 * Michael Beißwenger, Sabine Bartsch, Stefan Evert and
22 * Kay-Michael Würzner (2016). EmpiriST 2015: A shared task
23 * on the automatic linguistic annotation of computer-mediated
24 * communication and web corpora. In Proceedings of the 10th
25 * Web as Corpus Workshop (WAC-X) and the EmpiriST Shared Task,
26 * pages 78–90. Berlin, Germany.
27 *
28 * https://sites.google.com/site/empirist2015/home/gold
29 */
30
31 // Get a data file
32 private String getFile (String file) {
33 String path = getClass().getResource(file).getFile();
34
35 StringBuilder content = new StringBuilder();
36 try {
37 BufferedReader in = new BufferedReader(
38 new InputStreamReader(
39 new FileInputStream(URLDecoder.decode(path, "UTF-8")),
40 "UTF-8"
41 )
42 );
43 String str;
44 while ((str = in.readLine()) != null) {
45 content.append(str + "\n");
46 };
47 in.close();
48 }
49 catch (IOException e) {
50 fail(e.getMessage());
51 }
52 return content.toString();
53 };
54
55
56 /**
57 * Scan Empirist articles and iterate through
58 */
59 private class EmpiristScanner implements Iterator {
60 private Scanner sc;
61
62 public EmpiristScanner (String file) {
63 sc = new Scanner(getFile(file));
64 sc.useDelimiter("<(?:posting|article)[^>]+?/>");
65 }
66
67 // Return next posting/article
68 public String next () {
69 return sc.next().trim();
70 }
71
72 // Check if new posting/article exists
73 public boolean hasNext () {
74 return sc.hasNext();
75 }
76 }
77
78 /**
79 * To calculate the difference between the gold standard version and
80 * our version, we calculate the levenshtein difference between both lists.
81 * It's not very intuitive that way, as it does not treat merges and splits
82 * specifically (i.e. a merge is one replacement and one deletion, a split
83 * is one replacement and one insertion) - so the number is not
84 * really meaningful - it's just a way to measure the differences.
85 * It's important to note that this differs from the comparison of
86 * EmpiriST, where the end boundaries of all tokens are compared.
87 */
88 public static int levenshteinForStringArrays (String[] s, String[] t) {
89 if (s == null || t == null) {
90 throw new IllegalArgumentException("Lists must not be null");
91 }
92
93 // Code based on Rosettacode.org
94 int [] costs = new int[t.length + 1];
95
96 for (int j = 0; j < costs.length; j++)
97 costs[j] = j;
98
99 for (int i = 1; i <= s.length; i++) {
100 costs[0] = i;
101 int nw = i - 1;
102 for (int j = 1; j <= t.length; j++) {
103 int cj = Math.min(
104 1 + Math.min(costs[j], costs[j - 1]),
105 s[i - 1].equals(t[j - 1]) ? nw : nw + 1
106 );
107 nw = costs[j];
108 costs[j] = cj;
109 }
110 }
111
112 return costs[t.length];
113 }
114
115 /**
116 * Compare the tokenized data of one example file
117 * with the gold standard and return the sum of
118 * levenshtein distances.
119 */
120 public int distanceToGoldStandard (KorAPTokenizerImpl tok, String suite, String postings) {
121
122 // Load raw postings
123 EmpiristScanner esRaw = new EmpiristScanner(
124 "/empirist_gold_standard/" + suite + "/raw/" + postings + ".txt"
125 );
126
127 // Load tokenized postings
128 EmpiristScanner esTokenized = new EmpiristScanner(
129 "/empirist_gold_standard/" + suite + "/tokenized/" + postings + ".txt"
130 );
131
132 int distance = 0;
133
134 // Iterate over all postings
135 while (esRaw.hasNext() && esTokenized.hasNext()) {
136
137 // Get the gold standard splitted on new lines
138 String [] goldTokens = esTokenized.next().split("\n+");
139
140 // Tokenize the test data
141 String [] testTokens = tok.tokenize(esRaw.next());
142
143 if (false) {
144 System.err.println("-----------------");
145 for (int i = 0; i < Math.min(goldTokens.length, testTokens.length); i++) {
146 System.err.println(goldTokens[i] + " = "+ testTokens[i]);
147 }
148 }
149
150 // Calculate the edit distance of both arrays
151 distance += levenshteinForStringArrays(goldTokens, testTokens);
152 };
153
154 // Return the sum of all distances
155 return distance;
156 };
157
158
159 @Test
160 public void testTokenizerCoverEmpiristCmc () {
161
162 // Create tokenizer object
163 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
164
165 String test = "cmc_test_blog_comment";
166 int dist = distanceToGoldStandard(tok, "test_cmc", test);
167 assertTrue(test + " = " + dist, dist == 0);
168
169 test = "cmc_test_professional_chat";
170 dist = distanceToGoldStandard(tok, "test_cmc", test);
171 assertTrue(test + " = " + dist, dist <= 20);
172
173 test = "cmc_test_social_chat";
174 dist = distanceToGoldStandard(tok, "test_cmc", test);
175 assertTrue(test + " = " + dist, dist <= 23);
176
177 test = "cmc_test_twitter";
178 dist = distanceToGoldStandard(tok, "test_cmc", test);
179 assertTrue(test + " = " + dist, dist <= 153);
180
181 test = "cmc_test_whatsapp";
182 dist = distanceToGoldStandard(tok, "test_cmc", test);
183 assertTrue(test + " = " + dist, dist <= 0);
184
185 test = "cmc_test_wiki_discussion";
186 dist = distanceToGoldStandard(tok, "test_cmc", test);
187 assertTrue(test + " = " + dist, dist <= 24);
188
189 }
190
191 @Test
192 public void testTokenizerCoverEmpiristWeb () {
193
194 // Create tokenizer object
195 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
196
197 String test = "web_test_001";
198 int dist = distanceToGoldStandard(tok, "test_web", test);
199 assertTrue(test + " = " + dist, dist <= 21);
200
201 test = "web_test_002";
202 dist = distanceToGoldStandard(tok, "test_web", test);
203 assertTrue(test + " = " + dist, dist <= 5);
204
205 test = "web_test_003";
206 dist = distanceToGoldStandard(tok, "test_web", test);
207 assertTrue(test + " = " + dist, dist <= 17);
208
209 test = "web_test_004";
210 dist = distanceToGoldStandard(tok, "test_web", test);
211 assertTrue(test + " = " + dist, dist <= 20);
212
213 test = "web_test_005";
214 dist = distanceToGoldStandard(tok, "test_web", test);
215 assertTrue(test + " = " + dist, dist <= 8);
216
217 test = "web_test_006";
218 dist = distanceToGoldStandard(tok, "test_web", test);
219 assertTrue(test + " = " + dist, dist <= 8);
220
221 test = "web_test_007";
222 dist = distanceToGoldStandard(tok, "test_web", test);
223 assertTrue(test + " = " + dist, dist <= 12);
224
225 test = "web_test_008";
226 dist = distanceToGoldStandard(tok, "test_web", test);
227 assertTrue(test + " = " + dist, dist <= 2);
228
229 test = "web_test_009";
230 dist = distanceToGoldStandard(tok, "test_web", test);
231 assertTrue(test + " = " + dist, dist <= 8);
232
233 test = "web_test_010";
234 dist = distanceToGoldStandard(tok, "test_web", test);
235 assertTrue(test + " = " + dist, dist == 0);
236
237 test = "web_test_011";
238 dist = distanceToGoldStandard(tok, "test_web", test);
239 assertTrue(test + " = " + dist, dist <= 28);
240
241 test = "web_test_012";
242 dist = distanceToGoldStandard(tok, "test_web", test);
243 assertTrue(test + " = " + dist, dist <= 7);
244 }
245}