Blame - src/test/java/TokenizerCoverTest.java - KorAP/KorAP-Tokenizer

blob: 2d717de1ee68125d431efbb6e4e024f7914f90b4 [file] [log] [blame]

Marc Kupietz	3367773	2020-09-04 22:07:39 +0200	[diff] [blame^]	1	package de.ids_mannheim.korap.tokenizer;
				2
				3	import static org.junit.Assert.*;
				4	import java.util.*;
				5	import java.io.*;
				6	import java.net.URLDecoder;
				7	import org.junit.Test;
				8	import org.junit.Ignore;
				9	import org.junit.runner.RunWith;
				10	import org.junit.runners.JUnit4;
				11
				12	@RunWith(JUnit4.class)
				13	public class TokenizerCoverTest {
				14
				15	/**
				16	* This test suite checks for the tokenization coverage of our
				17	* tokenizer implementation based on the EmpiriST 2015
				18	* Gold Standard Suite, published under Creative Commons license
				19	* BY-SA 3.0.
				20	*
				21	* Michael Beißwenger, Sabine Bartsch, Stefan Evert and
				22	* Kay-Michael Würzner (2016). EmpiriST 2015: A shared task
				23	* on the automatic linguistic annotation of computer-mediated
				24	* communication and web corpora. In Proceedings of the 10th
				25	* Web as Corpus Workshop (WAC-X) and the EmpiriST Shared Task,
				26	* pages 78–90. Berlin, Germany.
				27	*
				28	* https://sites.google.com/site/empirist2015/home/gold
				29	*/
				30
				31	// Get a data file
				32	private String getFile (String file) {
				33	String path = getClass().getResource(file).getFile();
				34
				35	StringBuilder content = new StringBuilder();
				36	try {
				37	BufferedReader in = new BufferedReader(
				38	new InputStreamReader(
				39	new FileInputStream(URLDecoder.decode(path, "UTF-8")),
				40	"UTF-8"
				41	)
				42	);
				43	String str;
				44	while ((str = in.readLine()) != null) {
				45	content.append(str + "\n");
				46	};
				47	in.close();
				48	}
				49	catch (IOException e) {
				50	fail(e.getMessage());
				51	}
				52	return content.toString();
				53	};
				54
				55
				56	/**
				57	* Scan Empirist articles and iterate through
				58	*/
				59	private class EmpiristScanner implements Iterator {
				60	private Scanner sc;
				61
				62	public EmpiristScanner (String file) {
				63	sc = new Scanner(getFile(file));
				64	sc.useDelimiter("<(?:posting\|article)[^>]+?/>");
				65	}
				66
				67	// Return next posting/article
				68	public String next () {
				69	return sc.next().trim();
				70	}
				71
				72	// Check if new posting/article exists
				73	public boolean hasNext () {
				74	return sc.hasNext();
				75	}
				76	}
				77
				78	/**
				79	* To calculate the difference between the gold standard version and
				80	* our version, we calculate the levenshtein difference between both lists.
				81	* It's not very intuitive that way, as it does not treat merges and splits
				82	* specifically (i.e. a merge is one replacement and one deletion, a split
				83	* is one replacement and one insertion) - so the number is not
				84	* really meaningful - it's just a way to measure the differences.
				85	* It's important to note that this differs from the comparison of
				86	* EmpiriST, where the end boundaries of all tokens are compared.
				87	*/
				88	public static int levenshteinForStringArrays (String[] s, String[] t) {
				89	if (s == null \|\| t == null) {
				90	throw new IllegalArgumentException("Lists must not be null");
				91	}
				92
				93	// Code based on Rosettacode.org
				94	int [] costs = new int[t.length + 1];
				95
				96	for (int j = 0; j < costs.length; j++)
				97	costs[j] = j;
				98
				99	for (int i = 1; i <= s.length; i++) {
				100	costs[0] = i;
				101	int nw = i - 1;
				102	for (int j = 1; j <= t.length; j++) {
				103	int cj = Math.min(
				104	1 + Math.min(costs[j], costs[j - 1]),
				105	s[i - 1].equals(t[j - 1]) ? nw : nw + 1
				106	);
				107	nw = costs[j];
				108	costs[j] = cj;
				109	}
				110	}
				111
				112	return costs[t.length];
				113	}
				114
				115	/**
				116	* Compare the tokenized data of one example file
				117	* with the gold standard and return the sum of
				118	* levenshtein distances.
				119	*/
				120	public int distanceToGoldStandard (KorAPTokenizerImpl tok, String suite, String postings) {
				121
				122	// Load raw postings
				123	EmpiristScanner esRaw = new EmpiristScanner(
				124	"/empirist_gold_standard/" + suite + "/raw/" + postings + ".txt"
				125	);
				126
				127	// Load tokenized postings
				128	EmpiristScanner esTokenized = new EmpiristScanner(
				129	"/empirist_gold_standard/" + suite + "/tokenized/" + postings + ".txt"
				130	);
				131
				132	int distance = 0;
				133
				134	// Iterate over all postings
				135	while (esRaw.hasNext() && esTokenized.hasNext()) {
				136
				137	// Get the gold standard splitted on new lines
				138	String [] goldTokens = esTokenized.next().split("\n+");
				139
				140	// Tokenize the test data
				141	String [] testTokens = tok.tokenize(esRaw.next());
				142
				143	if (false) {
				144	System.err.println("-----------------");
				145	for (int i = 0; i < Math.min(goldTokens.length, testTokens.length); i++) {
				146	System.err.println(goldTokens[i] + " = "+ testTokens[i]);
				147	}
				148	}
				149
				150	// Calculate the edit distance of both arrays
				151	distance += levenshteinForStringArrays(goldTokens, testTokens);
				152	};
				153
				154	// Return the sum of all distances
				155	return distance;
				156	};
				157
				158
				159	@Test
				160	public void testTokenizerCoverEmpiristCmc () {
				161
				162	// Create tokenizer object
				163	KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
				164
				165	String test = "cmc_test_blog_comment";
				166	int dist = distanceToGoldStandard(tok, "test_cmc", test);
				167	assertTrue(test + " = " + dist, dist == 0);
				168
				169	test = "cmc_test_professional_chat";
				170	dist = distanceToGoldStandard(tok, "test_cmc", test);
				171	assertTrue(test + " = " + dist, dist <= 20);
				172
				173	test = "cmc_test_social_chat";
				174	dist = distanceToGoldStandard(tok, "test_cmc", test);
				175	assertTrue(test + " = " + dist, dist <= 23);
				176
				177	test = "cmc_test_twitter";
				178	dist = distanceToGoldStandard(tok, "test_cmc", test);
				179	assertTrue(test + " = " + dist, dist <= 153);
				180
				181	test = "cmc_test_whatsapp";
				182	dist = distanceToGoldStandard(tok, "test_cmc", test);
				183	assertTrue(test + " = " + dist, dist <= 0);
				184
				185	test = "cmc_test_wiki_discussion";
				186	dist = distanceToGoldStandard(tok, "test_cmc", test);
				187	assertTrue(test + " = " + dist, dist <= 24);
				188
				189	}
				190
				191	@Test
				192	public void testTokenizerCoverEmpiristWeb () {
				193
				194	// Create tokenizer object
				195	KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
				196
				197	String test = "web_test_001";
				198	int dist = distanceToGoldStandard(tok, "test_web", test);
				199	assertTrue(test + " = " + dist, dist <= 21);
				200
				201	test = "web_test_002";
				202	dist = distanceToGoldStandard(tok, "test_web", test);
				203	assertTrue(test + " = " + dist, dist <= 5);
				204
				205	test = "web_test_003";
				206	dist = distanceToGoldStandard(tok, "test_web", test);
				207	assertTrue(test + " = " + dist, dist <= 17);
				208
				209	test = "web_test_004";
				210	dist = distanceToGoldStandard(tok, "test_web", test);
				211	assertTrue(test + " = " + dist, dist <= 20);
				212
				213	test = "web_test_005";
				214	dist = distanceToGoldStandard(tok, "test_web", test);
				215	assertTrue(test + " = " + dist, dist <= 8);
				216
				217	test = "web_test_006";
				218	dist = distanceToGoldStandard(tok, "test_web", test);
				219	assertTrue(test + " = " + dist, dist <= 8);
				220
				221	test = "web_test_007";
				222	dist = distanceToGoldStandard(tok, "test_web", test);
				223	assertTrue(test + " = " + dist, dist <= 12);
				224
				225	test = "web_test_008";
				226	dist = distanceToGoldStandard(tok, "test_web", test);
				227	assertTrue(test + " = " + dist, dist <= 2);
				228
				229	test = "web_test_009";
				230	dist = distanceToGoldStandard(tok, "test_web", test);
				231	assertTrue(test + " = " + dist, dist <= 8);
				232
				233	test = "web_test_010";
				234	dist = distanceToGoldStandard(tok, "test_web", test);
				235	assertTrue(test + " = " + dist, dist == 0);
				236
				237	test = "web_test_011";
				238	dist = distanceToGoldStandard(tok, "test_web", test);
				239	assertTrue(test + " = " + dist, dist <= 28);
				240
				241	test = "web_test_012";
				242	dist = distanceToGoldStandard(tok, "test_web", test);
				243	assertTrue(test + " = " + dist, dist <= 7);
				244	}
				245	}