| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 1 | package de.ids_mannheim.korap.tokenizer; |
| 2 | |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 3 | import org.junit.Test; |
| 4 | import org.junit.runner.RunWith; |
| 5 | import org.junit.runners.Parameterized; |
| 6 | |
| 7 | import java.io.*; |
| 8 | import java.net.URL; |
| 9 | import java.nio.charset.StandardCharsets; |
| 10 | import java.nio.file.Files; |
| 11 | import java.nio.file.Paths; |
| 12 | import java.util.ArrayList; |
| 13 | import java.util.Collection; |
| 14 | |
| 15 | import static org.junit.Assert.*; |
| 16 | |
| 17 | @RunWith(Parameterized.class) |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 18 | public class IPCOffsetTests { |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 19 | |
| 20 | static final String testFiletemplate = "/other_test_data/test.%s.%s.%02d.%s.txt"; |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 21 | @Parameterized.Parameters |
| 22 | public static Collection<Object[]> data() { |
| 23 | Collection<Object[]> testData = new ArrayList<>(); |
| Marc Kupietz | f5a7e04 | 2020-10-12 10:43:24 +0200 | [diff] [blame] | 24 | for (String language : new String[]{"de", "en", "fr"}) { |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 25 | for (String encoding : new String[]{"ascii", "latin1", "utf8"}) { |
| 26 | for (int i = 1; true; i++) { |
| 27 | URL inputUrl = IPCOffsetTests.class.getResource(String.format(testFiletemplate, language, encoding, i, "input")); |
| 28 | URL positionsUrl = IPCOffsetTests.class.getResource(String.format(testFiletemplate, language, encoding, i, "positions")); |
| 29 | URL tokensUrl = IPCOffsetTests.class.getResource(String.format(testFiletemplate, language, encoding, i, "tokens")); |
| 30 | if (inputUrl == null) |
| 31 | break; |
| 32 | testData.add(new String[]{inputUrl.getFile(), positionsUrl.getFile(), tokensUrl.getFile(), language, encoding}); |
| 33 | } |
| Marc Kupietz | 8e197f3 | 2020-10-08 09:20:37 +0200 | [diff] [blame] | 34 | } |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 35 | } |
| 36 | return testData; |
| 37 | } |
| 38 | |
| 39 | private final String input; |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 40 | private final String positions; |
| 41 | private final String tokens; |
| Marc Kupietz | 8e197f3 | 2020-10-08 09:20:37 +0200 | [diff] [blame] | 42 | private final String encoding; |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 43 | private final String language; |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 44 | |
| 45 | static String readFile(String path) |
| 46 | throws IOException { |
| 47 | byte[] encoded = Files.readAllBytes(Paths.get(path)); |
| 48 | return new String(encoded, StandardCharsets.UTF_8); |
| 49 | } |
| 50 | |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 51 | public IPCOffsetTests(String input, String positions, String tokens, String language, String encoding) { |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 52 | this.input = input; |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 53 | this.positions = positions; |
| 54 | this.tokens = tokens; |
| 55 | this.language = language; |
| Marc Kupietz | 8e197f3 | 2020-10-08 09:20:37 +0200 | [diff] [blame] | 56 | this.encoding = encoding; |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 57 | } |
| 58 | |
| 59 | @Test |
| 60 | public void testMainWithOffsetsAndSentencesOnDifferentInputFiles() throws IOException { |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 61 | File tempFile = File.createTempFile("position_output", ".txt"); |
| 62 | String[] args = {"--language", language, "--encoding", encoding, "--no-tokens", "--positions", "--sentence-boundaries", "--force", "-o", tempFile.getAbsolutePath(), input}; |
| Marc Kupietz | 751868b | 2020-09-25 17:59:38 +0200 | [diff] [blame] | 63 | Main.main(args); |
| Marc Kupietz | c419d5b | 2020-09-17 15:21:26 +0200 | [diff] [blame] | 64 | String actualResult = readFile(tempFile.getAbsolutePath()); |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 65 | String goldData = readFile(positions); |
| Marc Kupietz | 6d28ed1 | 2021-07-15 21:09:47 +0200 | [diff] [blame] | 66 | assertEquals("Testing "+tempFile+ " against " + new File(positions).getName(), goldData, actualResult); |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 67 | } |
| 68 | |
| 69 | @Test |
| 70 | public void testMainWithTokenOutputOnDifferentInputFiles() throws IOException { |
| 71 | File tempFile = File.createTempFile("token_output", ".txt"); |
| 72 | String[] args = {"--language", language, "--encoding", encoding, "--tokens", "--force", "-o", tempFile.getAbsolutePath(), input}; |
| 73 | Main.main(args); |
| 74 | String actualResult = readFile(tempFile.getAbsolutePath()); |
| 75 | String goldData = readFile(tokens); |
| Marc Kupietz | 6d28ed1 | 2021-07-15 21:09:47 +0200 | [diff] [blame] | 76 | assertEquals("Testing " + tempFile + " against " + new File(tokens).getName(), goldData, actualResult); |
| Marc Kupietz | 793f85d | 2020-09-08 14:40:24 +0200 | [diff] [blame] | 77 | } |
| 78 | } |
| 79 | |