| Eliza Margaretha | 6a78069 | 2014-01-15 09:45:42 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.search; |
| 2 | |
| Eliza Margaretha | 805e27f | 2016-10-14 21:39:42 +0200 | [diff] [blame] | 3 | import static de.ids_mannheim.korap.TestSimple.getJsonString; |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 4 | import static org.junit.Assert.assertEquals; |
| Akron | 001dab3 | 2015-07-02 12:30:15 +0200 | [diff] [blame] | 5 | import static org.junit.Assert.assertTrue; |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 6 | import static org.junit.Assert.assertFalse; |
| 7 | import static org.junit.Assert.assertNull; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 8 | |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 9 | import java.io.IOException; |
| 10 | import java.util.HashMap; |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 11 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 12 | import org.junit.Test; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 13 | import org.junit.Ignore; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 14 | import org.junit.runner.RunWith; |
| 15 | import org.junit.runners.JUnit4; |
| 16 | |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 17 | import com.fasterxml.jackson.databind.JsonNode; |
| 18 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 19 | |
| 20 | import de.ids_mannheim.korap.Krill; |
| 21 | import de.ids_mannheim.korap.KrillCollection; |
| 22 | import de.ids_mannheim.korap.KrillIndex; |
| 23 | import de.ids_mannheim.korap.KrillMeta; |
| 24 | import de.ids_mannheim.korap.collection.CollectionBuilder; |
| 25 | import de.ids_mannheim.korap.index.FieldDocument; |
| 26 | import de.ids_mannheim.korap.query.QueryBuilder; |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 27 | import de.ids_mannheim.korap.response.Result; |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 28 | import de.ids_mannheim.korap.response.Match; |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 29 | import de.ids_mannheim.korap.response.SearchContext; |
| 30 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 31 | @RunWith(JUnit4.class) |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 32 | public class TestKrill { |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 33 | @Test |
| 34 | public void searchCount () { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 35 | Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b")); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 36 | |
| 37 | KrillMeta meta = k.getMeta(); |
| 38 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 39 | // Count: |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 40 | meta.setCount(30); |
| 41 | assertEquals(meta.getCount(), 30); |
| 42 | meta.setCount(20); |
| 43 | assertEquals(meta.getCount(), 20); |
| 44 | meta.setCount(-50); |
| 45 | assertEquals(meta.getCount(), 20); |
| 46 | meta.setCount(500); |
| 47 | assertEquals(meta.getCount(), meta.getCountMax()); |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 48 | }; |
| 49 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 50 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 51 | @Test |
| 52 | public void searchStartIndex () { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 53 | Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b")); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 54 | |
| 55 | KrillMeta meta = k.getMeta(); |
| 56 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 57 | // startIndex |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 58 | meta.setStartIndex(5); |
| 59 | assertEquals(meta.getStartIndex(), 5); |
| 60 | meta.setStartIndex(1); |
| 61 | assertEquals(meta.getStartIndex(), 1); |
| 62 | meta.setStartIndex(0); |
| 63 | assertEquals(meta.getStartIndex(), 0); |
| 64 | meta.setStartIndex(70); |
| 65 | assertEquals(meta.getStartIndex(), 70); |
| 66 | meta.setStartIndex(-5); |
| 67 | assertEquals(meta.getStartIndex(), 0); |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 68 | }; |
| 69 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 70 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 71 | @Test |
| 72 | public void searchQuery () { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 73 | Krill ks = new Krill(new QueryBuilder("field1").seg("a").with("b")); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 74 | // query |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 75 | assertEquals(ks.getSpanQuery().toString(), |
| 76 | "spanSegment(field1:a, field1:b)"); |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 77 | }; |
| 78 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 79 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 80 | @Test |
| 81 | public void searchIndex () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 82 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 83 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 84 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 85 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 86 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 87 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 88 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 89 | }; |
| 90 | ki.commit(); |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 91 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 92 | Krill ks = new Krill(new QueryBuilder("tokens").seg("s:Buchstaben")); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 93 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 94 | CollectionBuilder cb = new CollectionBuilder(); |
| 95 | |
| 96 | ks.getCollection().fromBuilder(cb.term("textClass", "reisen")); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 97 | |
| 98 | KrillMeta meta = ks.getMeta(); |
| 99 | meta.setCount(3); |
| 100 | meta.setStartIndex(5); |
| 101 | meta.getContext().left.setLength(1); |
| 102 | meta.getContext().right.setLength(1); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 103 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 104 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 105 | assertEquals(kr.getTotalResults(), 6); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 106 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 107 | "... dem [[Buchstaben]] A ..."); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 108 | |
| 109 | JsonNode res = ks.toJsonNode(); |
| 110 | assertEquals(3, res.at("/meta/count").asInt()); |
| 111 | assertEquals(5, res.at("/meta/startIndex").asInt()); |
| 112 | assertEquals("token", res.at("/meta/context/left/0").asText()); |
| 113 | assertEquals(1, res.at("/meta/context/left/1").asInt()); |
| 114 | assertEquals("token", res.at("/meta/context/right/0").asText()); |
| 115 | assertEquals(1, res.at("/meta/context/right/1").asInt()); |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 116 | }; |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 117 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 118 | |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 119 | @Test |
| 120 | public void searchJSON () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 121 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 122 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 123 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 124 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 125 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 126 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 127 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 128 | }; |
| 129 | ki.commit(); |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 130 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 131 | String json = getJsonString( |
| 132 | getClass().getResource("/queries/metaquery3.jsonld").getFile()); |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 133 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 134 | Krill ks = new Krill(json); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 135 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 136 | assertEquals(kr.getTotalResults(), 66); |
| 137 | assertEquals(5, kr.getItemsPerPage()); |
| 138 | assertEquals(5, kr.getStartIndex()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 139 | assertEquals("... a: A ist [[der klangreichste]] der V ...", |
| 140 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 141 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 142 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 143 | |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 144 | @Test |
| 145 | public void searchJSON2 () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 146 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 147 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 148 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 149 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 150 | "00005", "00006", "02439", "00012-fakemeta", "00030-fakemeta", |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 151 | /* |
| 152 | "02035-substring", |
| 153 | "05663-unbalanced", |
| 154 | "07452-deep" |
| 155 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 156 | }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 157 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 158 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 159 | }; |
| 160 | ki.commit(); |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 161 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 162 | String json = getJsonString( |
| 163 | getClass().getResource("/queries/metaquery4.jsonld").getFile()); |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 164 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 165 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 166 | Result kr = ks.apply(ki); |
| Nils Diewald | c86aa48 | 2014-02-12 16:58:05 +0000 | [diff] [blame] | 167 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 168 | assertEquals(kr.getTotalResults(), 1); |
| Nils Diewald | 979b2fe | 2014-09-29 16:21:41 +0000 | [diff] [blame] | 169 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 170 | ks = new Krill(json); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 171 | // Ignore the collection part of the query! |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 172 | ks.setCollection(new KrillCollection()); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 173 | kr = ks.apply(ki); |
| Nils Diewald | 979b2fe | 2014-09-29 16:21:41 +0000 | [diff] [blame] | 174 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 175 | assertEquals(kr.getTotalResults(), 5); |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 176 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 177 | json = getJsonString( |
| 178 | getClass().getResource("/queries/metaquery5.jsonld").getFile()); |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 179 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 180 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 181 | kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 182 | assertEquals(kr.getTotalResults(), 1); |
| 183 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 184 | json = getJsonString( |
| 185 | getClass().getResource("/queries/metaquery6.jsonld").getFile()); |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 186 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 187 | kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 188 | assertEquals(kr.getTotalResults(), 1); |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 189 | }; |
| 190 | |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 191 | |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 192 | // Todo: There SHOULD be a failure here, but Koral currently creates empty collections |
| 193 | @Test |
| 194 | public void queryJSONapiTest1 () { |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 195 | Krill test = new Krill( |
| 196 | "{\"@context\":\"http://korap.ids-mannheim.de/ns/koral/0.3/context.jsonld\",\"errors\":[],\"warnings\":[],\"messages\":[],\"collection\":{},\"query\":{\"@type\":\"koral:token\",\"wrap\":{\"@type\":\"koral:term\",\"layer\":\"orth\",\"key\":\"Baum\",\"match\":\"match:eq\"}},\"meta\":{}}"); |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 197 | assertFalse(test.hasErrors()); |
| 198 | }; |
| 199 | |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 200 | |
| 201 | @Test |
| 202 | public void searchJSONFailure () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 203 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 204 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 205 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 206 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 207 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 208 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 209 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 210 | }; |
| 211 | ki.commit(); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 212 | Result kr = new Krill("{ query").apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 213 | assertEquals(kr.getTotalResults(), 0); |
| 214 | assertEquals(kr.getError(0).getMessage(), "Unable to parse JSON"); |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 215 | }; |
| 216 | |
| 217 | |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 218 | @Test |
| 219 | public void searchJSONindexboundary () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 220 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 221 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 222 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 223 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 224 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 225 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 226 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 227 | }; |
| 228 | ki.commit(); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 229 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 230 | String json = getJsonString( |
| 231 | getClass().getResource("/queries/bsp-fail1.jsonld").getFile()); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 232 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 233 | Result kr = new Krill(json).apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 234 | assertEquals(0, kr.getStartIndex()); |
| 235 | assertEquals(kr.getTotalResults(), 0); |
| 236 | assertEquals(25, kr.getItemsPerPage()); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 237 | }; |
| 238 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 239 | |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 240 | @Test |
| 241 | public void searchJSONindexboundary2 () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 242 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 243 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 244 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 245 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 246 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 247 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 248 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 249 | }; |
| 250 | ki.commit(); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 251 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 252 | String json = getJsonString( |
| 253 | getClass().getResource("/queries/bsp-fail2.jsonld").getFile()); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 254 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 255 | Result kr = new Krill(json).apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 256 | assertEquals(50, kr.getItemsPerPage()); |
| 257 | assertEquals(49950, kr.getStartIndex()); |
| 258 | assertEquals(kr.getTotalResults(), 0); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 259 | }; |
| 260 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 261 | |
| Akron | 001dab3 | 2015-07-02 12:30:15 +0200 | [diff] [blame] | 262 | /* |
| 263 | * Queries should be mirrored correctly for debugging reasons. |
| 264 | */ |
| 265 | @Test |
| 266 | public void queryJSONmirrorTestBug () throws IOException { |
| 267 | // Construct index |
| 268 | KrillIndex ki = new KrillIndex(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 269 | String json = getJsonString(getClass() |
| 270 | .getResource("/queries/bugs/failing_mirror.jsonld").getFile()); |
| Akron | 001dab3 | 2015-07-02 12:30:15 +0200 | [diff] [blame] | 271 | Krill ks = new Krill(json); |
| 272 | Result kr = ks.apply(ki); |
| 273 | |
| 274 | ObjectMapper mapper = new ObjectMapper(); |
| 275 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| 276 | |
| 277 | assertEquals("Unable to parse JSON", res.at("/errors/0/1").asText()); |
| 278 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 279 | json = getJsonString( |
| 280 | getClass().getResource("/queries/bugs/failing_mirror_2.jsonld") |
| 281 | .getFile()); |
| Akron | 001dab3 | 2015-07-02 12:30:15 +0200 | [diff] [blame] | 282 | ks = new Krill(json); |
| 283 | kr = ks.apply(ki); |
| 284 | |
| 285 | res = mapper.readTree(kr.toJsonString()); |
| 286 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 287 | assertEquals(23, res.at("/meta/count").asInt()); |
| 288 | assertEquals(25, res.at("/meta/itemsPerPage").asInt()); |
| Akron | 001dab3 | 2015-07-02 12:30:15 +0200 | [diff] [blame] | 289 | assertEquals("base/s:p", res.at("/meta/context").asText()); |
| 290 | assertFalse(res.at("/query").isMissingNode()); |
| 291 | assertTrue(res.at("/query/@type").isMissingNode()); |
| 292 | assertTrue(res.at("/collection/@type").isMissingNode()); |
| 293 | }; |
| 294 | |
| 295 | |
| Nils Diewald | c6b7875 | 2013-12-05 19:05:12 +0000 | [diff] [blame] | 296 | |
| Nils Diewald | eabed8b | 2013-12-17 16:46:43 +0000 | [diff] [blame] | 297 | @Test |
| 298 | public void searchJSONcontext () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 299 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 300 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 301 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 302 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 303 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 304 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 305 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 306 | }; |
| 307 | ki.commit(); |
| Nils Diewald | eabed8b | 2013-12-17 16:46:43 +0000 | [diff] [blame] | 308 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 309 | String json = getJsonString(getClass() |
| 310 | .getResource("/queries/bsp-context.jsonld").getFile()); |
| Nils Diewald | eabed8b | 2013-12-17 16:46:43 +0000 | [diff] [blame] | 311 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 312 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 313 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 314 | assertEquals(kr.getTotalResults(), 10); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 315 | assertEquals( |
| 316 | "A bzw. a ist der erste Buchstabe des" |
| 317 | + " lateinischen [[Alphabets]] und ein Vokal." |
| 318 | + " Der Buchstabe A hat in deutschen Texten" |
| 319 | + " eine durchschnittliche Häufigkeit ...", |
| 320 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | b3a09db | 2013-12-21 00:22:02 +0000 | [diff] [blame] | 321 | |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 322 | ks.getMeta().setCount(5); |
| 323 | ks.getMeta().setStartPage(2); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 324 | kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 325 | assertEquals(kr.getTotalResults(), 10); |
| 326 | assertEquals(5, kr.getStartIndex()); |
| 327 | assertEquals(5, kr.getItemsPerPage()); |
| Nils Diewald | 891c53c | 2013-12-23 16:37:46 +0000 | [diff] [blame] | 328 | |
| Eliza Margaretha | 805e27f | 2016-10-14 21:39:42 +0200 | [diff] [blame] | 329 | json = getJsonString(getClass() |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 330 | .getResource("/queries/bsp-context-2.jsonld").getFile()); |
| Nils Diewald | 891c53c | 2013-12-23 16:37:46 +0000 | [diff] [blame] | 331 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 332 | kr = new Krill(json).apply(ki); |
| 333 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 334 | assertEquals(kr.getTotalResults(), -1); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 335 | assertEquals( |
| 336 | "... lls seit den Griechen beibehalten worden." |
| 337 | + " 3. Bedeutungen in der Biologie steht A für" |
| 338 | + " das Nukleosid Adenosin steht A die Base" |
| 339 | + " Adenin steht A für die Aminosäure Alanin" |
| 340 | + " in der Informatik steht a für den dezimalen" |
| 341 | + " [[Wert]] 97 sowohl im ASCII- als auch im" |
| 342 | + " Unicode-Zeichensatz steht A für den dezimalen" |
| 343 | + " Wert 65 sowohl im ASCII- als auch im" |
| 344 | + " Unicode-Zeichensatz als Kfz-Kennzeichen" |
| 345 | + " steht A in Deutschland für Augsburg." |
| 346 | + " in Österreich auf ...", |
| 347 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | eabed8b | 2013-12-17 16:46:43 +0000 | [diff] [blame] | 348 | }; |
| 349 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 350 | |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 351 | @Test |
| 352 | public void searchJSONstartPage () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 353 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 354 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 355 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 356 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 357 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 358 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 359 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 360 | }; |
| 361 | ki.commit(); |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 362 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 363 | String json = getJsonString( |
| 364 | getClass().getResource("/queries/bsp-paging.jsonld").getFile()); |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 365 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 366 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 367 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 368 | assertEquals(kr.getTotalResults(), 10); |
| 369 | assertEquals(5, kr.getStartIndex()); |
| 370 | assertEquals(5, kr.getItemsPerPage()); |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 371 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 372 | json = getJsonString( |
| 373 | getClass().getResource("/queries/bsp-cutoff.jsonld").getFile()); |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 374 | ks = ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 375 | kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 376 | assertEquals(kr.getTotalResults(), -1); |
| 377 | assertEquals(2, kr.getStartIndex()); |
| 378 | assertEquals(2, kr.getItemsPerPage()); |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 379 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 380 | json = getJsonString( |
| 381 | getClass().getResource("/queries/metaquery9.jsonld").getFile()); |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 382 | KrillCollection kc = new KrillCollection(json); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 383 | kc.setIndex(ki); |
| 384 | assertEquals(7, kc.numberOf("documents")); |
| Nils Diewald | 364eb64 | 2013-12-22 15:03:01 +0000 | [diff] [blame] | 385 | }; |
| Nils Diewald | eabed8b | 2013-12-17 16:46:43 +0000 | [diff] [blame] | 386 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 387 | |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 388 | @Test |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 389 | public void searchJSONitemsPerResource () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 390 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 391 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 392 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 393 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 394 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 395 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 396 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 397 | }; |
| 398 | ki.commit(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 399 | String json = getJsonString(getClass() |
| 400 | .getResource("/queries/bsp-itemsPerResource.jsonld").getFile()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 401 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 402 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 403 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 404 | assertEquals(kr.getTotalResults(), 10); |
| 405 | assertEquals(0, kr.getStartIndex()); |
| 406 | assertEquals(20, kr.getItemsPerPage()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 407 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 408 | assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID()); |
| 409 | assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID()); |
| 410 | assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID()); |
| 411 | assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID()); |
| 412 | assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID()); |
| 413 | assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 414 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 415 | ks = new Krill(json); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 416 | ks.getMeta().setItemsPerResource(1); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 417 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 418 | kr = ks.apply(ki); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 419 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 420 | assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID()); |
| 421 | assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID()); |
| 422 | assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 423 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 424 | assertEquals(kr.getTotalResults(), 3); |
| 425 | assertEquals(0, kr.getStartIndex()); |
| 426 | assertEquals(20, kr.getItemsPerPage()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 427 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 428 | ks = new Krill(json); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 429 | ks.getMeta().setItemsPerResource(2); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 430 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 431 | kr = ks.apply(ki); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 432 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 433 | assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID()); |
| 434 | assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID()); |
| 435 | assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID()); |
| 436 | assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID()); |
| 437 | assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 438 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 439 | assertEquals(kr.getTotalResults(), 5); |
| 440 | assertEquals(0, kr.getStartIndex()); |
| 441 | assertEquals(20, kr.getItemsPerPage()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 442 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 443 | ks = new Krill(json); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 444 | KrillMeta meta = ks.getMeta(); |
| 445 | meta.setItemsPerResource(1); |
| 446 | meta.setStartIndex(1); |
| 447 | meta.setCount(1); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 448 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 449 | kr = ks.apply(ki); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 450 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 451 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 452 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 453 | assertEquals(kr.getTotalResults(), 3); |
| 454 | assertEquals(1, kr.getStartIndex()); |
| 455 | assertEquals(1, kr.getItemsPerPage()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 456 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 457 | assertEquals((short) 1, kr.getItemsPerResource()); |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 458 | }; |
| 459 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 460 | |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 461 | @Test |
| 462 | public void searchJSONitemsPerResourceServer () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 463 | /* |
| 464 | * This test is a server-only implementation of |
| 465 | * TestResource#testCollection |
| 466 | */ |
| 467 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 468 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 469 | // Indexing test files |
| 470 | int uid = 1; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 471 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 472 | "00005", "00006", "02439" }) { |
| 473 | ki.addDoc(uid++, |
| 474 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 475 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 476 | }; |
| 477 | ki.commit(); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 478 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 479 | String json = getJsonString(getClass() |
| 480 | .getResource("/queries/bsp-uid-example.jsonld").getFile()); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 481 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 482 | Krill ks = new Krill(json); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 483 | ks.getMeta().setItemsPerResource(1); |
| 484 | |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 485 | KrillCollection kc = new KrillCollection(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 486 | kc.filterUIDs(new String[] { "1", "4" }); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 487 | kc.setIndex(ki); |
| 488 | ks.setCollection(kc); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 489 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 490 | Result kr = ks.apply(ki); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 491 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 492 | assertEquals(kr.getTotalResults(), 2); |
| 493 | assertEquals(0, kr.getStartIndex()); |
| 494 | assertEquals(25, kr.getItemsPerPage()); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 495 | }; |
| Nils Diewald | ba197f2 | 2014-11-01 17:21:46 +0000 | [diff] [blame] | 496 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 497 | |
| Nils Diewald | ba197f2 | 2014-11-01 17:21:46 +0000 | [diff] [blame] | 498 | @Test |
| 499 | public void searchJSONnewJSON () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 500 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 501 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 502 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 503 | FieldDocument fd = ki.addDoc(1, |
| 504 | getClass().getResourceAsStream("/goe/AGA-03828.json.gz"), true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 505 | ki.commit(); |
| Nils Diewald | ba197f2 | 2014-11-01 17:21:46 +0000 | [diff] [blame] | 506 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 507 | assertEquals(fd.getUID(), 1); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 508 | assertEquals(fd.getTextSigle(), "GOE_AGA.03828"); |
| 509 | assertEquals(fd.getDocSigle(), "GOE_AGA"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 510 | assertEquals(fd.getCorpusSigle(), "GOE"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 511 | assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten"); |
| 512 | assertNull(fd.getFieldValue("subTitle")); |
| 513 | assertEquals(fd.getFieldValue("textType"), "Autobiographie"); |
| 514 | assertNull(fd.getFieldValue("textTypeArt")); |
| 515 | assertNull(fd.getFieldValue("textTypeRef")); |
| 516 | assertNull(fd.getFieldValue("textColumn")); |
| 517 | assertNull(fd.getFieldValue("textDomain")); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 518 | // assertEquals(fd.getPages(), "529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 519 | assertEquals(fd.getFieldValue("availability"), "QAO-NC"); |
| 520 | assertEquals(fd.getFieldValue("creationDate"), "1820"); |
| 521 | assertEquals(fd.getFieldValue("pubDate"), "1982"); |
| 522 | assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von"); |
| 523 | assertNull(fd.getFieldValue("textClass")); |
| 524 | assertEquals(fd.getFieldValue("language"), "de"); |
| 525 | assertEquals(fd.getFieldValue("pubPlace"), "München"); |
| 526 | assertEquals(fd.getFieldValue("reference"), |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 527 | "Goethe, Johann Wolfgang von:" |
| 528 | + " Autobiographische Einzelheiten," |
| 529 | + " (Geschrieben bis 1832), In: Goethe," |
| 530 | + " Johann Wolfgang von: Goethes Werke," |
| 531 | + " Bd. 10, Autobiographische Schriften" |
| 532 | + " II, Hrsg.: Trunz, Erich. München: " |
| 533 | + "Verlag C. H. Beck, 1982, S. 529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 534 | assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck"); |
| 535 | assertNull(fd.getFieldValue("editor")); |
| 536 | assertNull(fd.getFieldValue("fileEditionStatement")); |
| 537 | assertNull(fd.getFieldValue("biblEditionStatement")); |
| 538 | assertNull(fd.getFieldValue("keywords")); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 539 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 540 | assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens"); |
| 541 | assertEquals(fd.getFieldValue("foundries"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 542 | "base base/paragraphs base/sentences corenlp " |
| 543 | + "corenlp/constituency corenlp/morpho " |
| 544 | + "corenlp/namedentities corenlp/sentences " |
| 545 | + "glemm glemm/morpho mate mate/morpho" |
| 546 | + " opennlp opennlp/morpho opennlp/sentences" |
| 547 | + " treetagger treetagger/morpho " |
| 548 | + "treetagger/sentences"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 549 | assertEquals(fd.getFieldValue("layerInfos"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 550 | "base/s=spans corenlp/c=spans corenlp/ne=tokens" |
| 551 | + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens" |
| 552 | + " mate/l=tokens mate/m=tokens mate/p=tokens" |
| 553 | + " opennlp/p=tokens opennlp/s=spans tt/l=tokens" |
| 554 | + " tt/p=tokens tt/s=spans"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 555 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 556 | assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke"); |
| 557 | assertNull(fd.getFieldValue("corpusSubTitle")); |
| 558 | assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von"); |
| 559 | assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich"); |
| 560 | assertEquals(fd.getFieldValue("docTitle"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 561 | "Goethe: Autobiographische Schriften II, (1817-1825, 1832)"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 562 | assertNull(fd.getFieldValue("docSubTitle")); |
| 563 | assertNull(fd.getFieldValue("docEditor")); |
| 564 | assertNull(fd.getFieldValue("docAuthor")); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 565 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 566 | Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom") |
| 567 | .with("mate/m:number:pl")); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 568 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 569 | |
| 570 | assertEquals(kr.getTotalResults(), 148); |
| 571 | assertEquals(0, kr.getStartIndex()); |
| 572 | assertEquals(25, kr.getItemsPerPage()); |
| Nils Diewald | ba197f2 | 2014-11-01 17:21:46 +0000 | [diff] [blame] | 573 | }; |
| Nils Diewald | 06368ba | 2014-11-03 20:53:27 +0000 | [diff] [blame] | 574 | |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 575 | |
| 576 | @Test |
| 577 | public void searchJSONwithPagebreaks () throws IOException { |
| 578 | // Construct index |
| 579 | KrillIndex ki = new KrillIndex(); |
| 580 | // Indexing test files |
| 581 | FieldDocument fd = ki.addDoc(1, |
| 582 | getClass().getResourceAsStream("/goe/AGA-03828-pb.json.gz"), true); |
| 583 | ki.commit(); |
| 584 | |
| 585 | assertEquals(fd.getUID(), 1); |
| 586 | assertEquals(fd.getTextSigle(), "GOE/AGA/03828"); |
| 587 | assertEquals(fd.getDocSigle(), "GOE/AGA"); |
| 588 | assertEquals(fd.getCorpusSigle(), "GOE"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 589 | assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten"); |
| 590 | assertNull(fd.getFieldValue("subTitle")); |
| 591 | assertEquals(fd.getFieldValue("textType"), "Autobiographie"); |
| 592 | assertNull(fd.getFieldValue("textTypeArt")); |
| 593 | assertNull(fd.getFieldValue("textTypeRef")); |
| 594 | assertNull(fd.getFieldValue("textColumn")); |
| 595 | assertNull(fd.getFieldValue("textDomain")); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 596 | // assertEquals(fd.getPages(), "529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 597 | // assertEquals(fd.getFieldValue("availability"), "QAO-NC"); |
| 598 | assertEquals(fd.getFieldValue("creationDate"), "1820"); |
| 599 | assertEquals(fd.getFieldValue("pubDate"), "1982"); |
| 600 | assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von"); |
| 601 | assertNull(fd.getFieldValue("textClass")); |
| 602 | assertEquals(fd.getFieldValue("language"), "de"); |
| 603 | assertEquals(fd.getFieldValue("pubPlace"), "München"); |
| 604 | assertEquals(fd.getFieldValue("reference"), |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 605 | "Goethe, Johann Wolfgang von:" |
| 606 | + " Autobiographische Einzelheiten," |
| 607 | + " (Geschrieben bis 1832), In: Goethe," |
| 608 | + " Johann Wolfgang von: Goethes Werke," |
| 609 | + " Bd. 10, Autobiographische Schriften" |
| 610 | + " II, Hrsg.: Trunz, Erich. München: " |
| 611 | + "Verlag C. H. Beck, 1982, S. 529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 612 | assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck"); |
| 613 | assertNull(fd.getFieldValue("editor")); |
| 614 | assertNull(fd.getFieldValue("fileEditionStatement")); |
| 615 | assertNull(fd.getFieldValue("biblEditionStatement")); |
| 616 | assertNull(fd.getFieldValue("keywords")); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 617 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 618 | assertEquals(fd.getFieldValue("tokenSource"), "base#tokens_aggr"); |
| 619 | assertEquals(fd.getFieldValue("foundries"), |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 620 | "dereko dereko/structure "+ |
| 621 | "dereko/structure/base-sentences-paragraphs-pagebreaks"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 622 | assertEquals(fd.getFieldValue("layerInfos"), "dereko/s=spans"); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 623 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 624 | assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke"); |
| 625 | assertNull(fd.getFieldValue("corpusSubTitle")); |
| 626 | assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von"); |
| 627 | assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich"); |
| 628 | assertEquals(fd.getFieldValue("docTitle"), |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 629 | "Goethe: Autobiographische Schriften II, (1817-1825, 1832)"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 630 | assertNull(fd.getFieldValue("docSubTitle")); |
| 631 | assertNull(fd.getFieldValue("docEditor")); |
| 632 | assertNull(fd.getFieldValue("docAuthor")); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 633 | |
| 634 | Krill ks = new Krill(new QueryBuilder("tokens").seg("s:der")); |
| 635 | Result kr = ks.apply(ki); |
| 636 | |
| 637 | assertEquals(kr.getTotalResults(), 97); |
| 638 | assertEquals(0, kr.getStartIndex()); |
| 639 | assertEquals(25, kr.getItemsPerPage()); |
| 640 | |
| 641 | Match m = kr.getMatch(5); |
| 642 | assertEquals("Start page", m.getStartPage(), 529); |
| 643 | |
| 644 | ObjectMapper mapper = new ObjectMapper(); |
| 645 | JsonNode res = mapper.readTree(m.toJsonString()); |
| 646 | assertEquals(529, res.at("/pages/0").asInt()); |
| 647 | }; |
| 648 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 649 | |
| Nils Diewald | 06368ba | 2014-11-03 20:53:27 +0000 | [diff] [blame] | 650 | @Test |
| 651 | public void searchJSONnewJSON2 () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 652 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 653 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 654 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 655 | FieldDocument fd = ki.addDoc(1, |
| 656 | getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 657 | ki.commit(); |
| Nils Diewald | 06368ba | 2014-11-03 20:53:27 +0000 | [diff] [blame] | 658 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 659 | assertEquals(fd.getUID(), 1); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 660 | assertEquals(fd.getTextSigle(), "BZK_D59.00089"); |
| 661 | assertEquals(fd.getDocSigle(), "BZK_D59"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 662 | assertEquals(fd.getCorpusSigle(), "BZK"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 663 | assertEquals(fd.getFieldValue("title"), "Saragat-Partei zerfällt"); |
| 664 | assertEquals(fd.getFieldValue("pubDate"), "1959-02-19"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 665 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 666 | assertNull(fd.getFieldValue("subTitle")); |
| 667 | assertNull(fd.getFieldValue("author")); |
| 668 | assertNull(fd.getFieldValue("editor")); |
| 669 | assertEquals(fd.getFieldValue("pubPlace"), "Berlin"); |
| 670 | assertNull(fd.getFieldValue("publisher")); |
| 671 | assertEquals(fd.getFieldValue("textType"), "Zeitung: Tageszeitung"); |
| 672 | assertNull(fd.getFieldValue("textTypeArt")); |
| 673 | assertEquals(fd.getFieldValue("textTypeRef"), "Tageszeitung"); |
| 674 | assertEquals(fd.getFieldValue("textDomain"), "Politik"); |
| 675 | assertEquals(fd.getFieldValue("creationDate"), "1959-02-19"); |
| 676 | assertEquals(fd.getFieldValue("availability"), "ACA-NC-LC"); |
| 677 | assertEquals(fd.getFieldValue("textColumn"), "POLITIK"); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 678 | // assertNull(fd.getPages()); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 679 | assertEquals(fd.getFieldValue("textClass"), "politik ausland"); |
| 680 | assertNull(fd.getFieldValue("fileEditionStatement")); |
| 681 | assertNull(fd.getFieldValue("biblEditionStatement")); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 682 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 683 | assertEquals(fd.getFieldValue("language"), "de"); |
| 684 | assertEquals(fd.getFieldValue("reference"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 685 | "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14," |
| 686 | + " Berliner Ausgabe, S. 7. - Sachgebiet: Politik, " |
| 687 | + "Originalressort: POLITIK; Saragat-Partei zerfällt"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 688 | assertNull(fd.getFieldValue("publisher")); |
| 689 | assertNull(fd.getFieldValue("keywords")); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 690 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 691 | assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 692 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 693 | assertEquals(fd.getFieldValue("foundries"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 694 | "base base/paragraphs base/sentences corenlp " |
| 695 | + "corenlp/constituency corenlp/morpho corenlp/namedentities" |
| 696 | + " corenlp/sentences glemm glemm/morpho mate mate/morpho" |
| 697 | + " opennlp opennlp/morpho opennlp/sentences treetagger" |
| 698 | + " treetagger/morpho treetagger/sentences"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 699 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 700 | assertEquals(fd.getFieldValue("layerInfos"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 701 | "base/s=spans corenlp/c=spans corenlp/ne=tokens" |
| 702 | + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens" |
| 703 | + " mate/l=tokens mate/m=tokens mate/p=tokens" |
| 704 | + " opennlp/p=tokens opennlp/s=spans tt/l=tokens" |
| 705 | + " tt/p=tokens tt/s=spans"); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 706 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 707 | assertEquals(fd.getFieldValue("corpusTitle"), "Bonner Zeitungskorpus"); |
| 708 | assertNull(fd.getFieldValue("corpusSubTitle")); |
| 709 | assertNull(fd.getFieldValue("corpusAuthor")); |
| 710 | assertNull(fd.getFieldValue("corpusEditor")); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 711 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 712 | assertEquals(fd.getFieldValue("docTitle"), "Neues Deutschland"); |
| 713 | assertEquals(fd.getFieldValue("docSubTitle"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 714 | "Organ des Zentralkomitees der Sozialistischen " |
| 715 | + "Einheitspartei Deutschlands"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 716 | assertNull(fd.getFieldValue("docEditor")); |
| 717 | assertNull(fd.getFieldValue("docAuthor")); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 718 | |
| 719 | Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom") |
| 720 | .with("mate/m:number:sg")); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 721 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 722 | |
| 723 | assertEquals(kr.getTotalResults(), 6); |
| 724 | assertEquals(0, kr.getStartIndex()); |
| 725 | assertEquals(25, kr.getItemsPerPage()); |
| Nils Diewald | 06368ba | 2014-11-03 20:53:27 +0000 | [diff] [blame] | 726 | }; |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 727 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 728 | |
| Nils Diewald | 7cf8c6d | 2014-05-28 18:37:38 +0000 | [diff] [blame] | 729 | @Test |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 730 | public void searchJSONcosmasBoundaryBug () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 731 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 732 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 733 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 734 | FieldDocument fd = ki.addDoc(1, |
| 735 | getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 736 | ki.commit(); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 737 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 738 | String json = getJsonString(getClass() |
| 739 | .getResource("/queries/bugs/cosmas_boundary.jsonld").getFile()); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 740 | |
| Nils Diewald | 8904c1d | 2015-02-26 16:13:18 +0000 | [diff] [blame] | 741 | QueryBuilder kq = new QueryBuilder("tokens"); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 742 | Krill ks = new Krill(kq.focus(1, |
| Akron | 4f52a63 | 2018-02-09 19:02:40 +0100 | [diff] [blame] | 743 | kq.contains(kq.tag("base/s:s"), kq.nr(1, kq.seg("s:Leben"))))); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 744 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 745 | Result kr = ks.apply(ki); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 746 | assertEquals(kr.getSerialQuery(), |
| Akron | a26184e | 2018-12-05 15:37:34 +0100 | [diff] [blame] | 747 | "focus(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}),sorting)"); |
| margaretha | f70addb | 2015-04-27 13:17:18 +0200 | [diff] [blame] | 748 | assertEquals(40, kr.getMatch(0).getStartPos()); |
| 749 | assertEquals(41, kr.getMatch(0).getEndPos()); |
| 750 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 751 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 752 | "... Initiative\" eine neue politische Gruppierung ins " |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 753 | + "[[{1:Leben}]] gerufen hatten. Pressemeldungen zufolge haben sich ..."); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 754 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 755 | // Try with high class - don't highlight |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 756 | ks = new Krill(kq.focus(129, |
| Akron | 4f52a63 | 2018-02-09 19:02:40 +0100 | [diff] [blame] | 757 | kq.contains(kq.tag("base/s:s"), kq.nr(129, kq.seg("s:Leben"))))); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 758 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 759 | kr = ks.apply(ki); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 760 | assertEquals(kr.getSerialQuery(), |
| Akron | a26184e | 2018-12-05 15:37:34 +0100 | [diff] [blame] | 761 | "focus(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}),sorting)"); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 762 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 763 | "... Initiative\" eine neue politische Gruppierung ins " |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 764 | + "[[Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ..."); |
| Nils Diewald | 0fa2da2 | 2014-11-05 03:31:32 +0000 | [diff] [blame] | 765 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 766 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 767 | kr = ks.apply(ki); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 768 | assertEquals(kr.getSerialQuery(), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 769 | "focus(129: spanElementDistance({129: tokens:s:Namen}, " |
| Akron | a26184e | 2018-12-05 15:37:34 +0100 | [diff] [blame] | 770 | + "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]),sorting)"); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 771 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 772 | "... ihren Austritt erklärt und unter dem [[Namen \"Einheitsbewegung " |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 773 | + "der sozialistischen Initiative\" eine neue politische Gruppierung " |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 774 | + "ins Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ..."); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 775 | assertEquals(kr.getTotalResults(), 1); |
| 776 | assertEquals(0, kr.getStartIndex()); |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 777 | }; |
| 778 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 779 | |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 780 | @Test |
| 781 | public void searchJSONmultipleClassesBug () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 782 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 783 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 784 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 785 | ki.addDoc(1, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), |
| 786 | true); |
| 787 | ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), |
| 788 | true); |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 789 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 790 | ki.commit(); |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 791 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 792 | String json = getJsonString( |
| 793 | getClass().getResource("/queries/bugs/multiple_classes.jsonld") |
| 794 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 795 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 796 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 797 | Result kr = ks.apply(ki); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 798 | assertEquals(kr.getSerialQuery(), |
| 799 | "{4: spanNext({1: spanNext({2: tokens:s:ins}, " |
| 800 | + "{3: tokens:s:Leben})}, tokens:s:gerufen)}"); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 801 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 802 | "... sozialistischen Initiative\" eine neue politische" |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 803 | + " Gruppierung [[{4:{1:{2:ins} {3:Leben}} gerufen}]] hatten. " |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 804 | + "Pressemeldungen zufolge haben sich in ..."); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 805 | assertEquals(kr.getTotalResults(), 2); |
| 806 | assertEquals(0, kr.getStartIndex()); |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 807 | }; |
| 808 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 809 | |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 810 | @Test |
| 811 | public void searchJSONmultipleClassesBugTokenList () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 812 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 813 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 814 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 815 | ki.addDoc(1, getClass().getResourceAsStream("/goe/AGA-03828.json.gz"), |
| 816 | true); |
| 817 | ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), |
| 818 | true); |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 819 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 820 | ki.commit(); |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 821 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 822 | String json = getJsonString( |
| 823 | getClass().getResource("/queries/bugs/multiple_classes.jsonld") |
| 824 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 825 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 826 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 827 | Result kr = ks.apply(ki); |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 828 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 829 | ObjectMapper mapper = new ObjectMapper(); |
| 830 | JsonNode res = mapper.readTree(kr.toTokenListJsonString()); |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 831 | |
| Akron | d504f21 | 2015-06-20 00:27:54 +0200 | [diff] [blame] | 832 | assertEquals(1, res.at("/meta/totalResults").asInt()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 833 | assertEquals( |
| 834 | "{4: spanNext({1: spanNext({2: tokens:s:ins}, " |
| 835 | + "{3: tokens:s:Leben})}, tokens:s:gerufen)}", |
| Akron | d504f21 | 2015-06-20 00:27:54 +0200 | [diff] [blame] | 836 | res.at("/meta/serialQuery").asText()); |
| 837 | assertEquals(0, res.at("/meta/startIndex").asInt()); |
| 838 | assertEquals(25, res.at("/meta/itemsPerPage").asInt()); |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 839 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 840 | assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText()); |
| 841 | assertEquals(328, res.at("/matches/0/tokens/0/0").asInt()); |
| 842 | assertEquals(331, res.at("/matches/0/tokens/0/1").asInt()); |
| 843 | assertEquals(332, res.at("/matches/0/tokens/1/0").asInt()); |
| 844 | assertEquals(337, res.at("/matches/0/tokens/1/1").asInt()); |
| 845 | assertEquals(338, res.at("/matches/0/tokens/2/0").asInt()); |
| 846 | assertEquals(345, res.at("/matches/0/tokens/2/1").asInt()); |
| Nils Diewald | 277e9ce | 2014-11-06 03:42:11 +0000 | [diff] [blame] | 847 | }; |
| Nils Diewald | c7d08d9 | 2014-11-05 21:30:05 +0000 | [diff] [blame] | 848 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 849 | |
| Nils Diewald | b84e727 | 2014-11-07 01:27:38 +0000 | [diff] [blame] | 850 | @Test |
| 851 | public void searchJSONmultitermRewriteBug () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 852 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 853 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | b84e727 | 2014-11-07 01:27:38 +0000 | [diff] [blame] | 854 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 855 | assertEquals(ki.numberOf("documents"), 0); |
| 856 | |
| 857 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 858 | FieldDocument fd = ki.addDoc(1, |
| 859 | getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 860 | ki.commit(); |
| 861 | |
| 862 | assertEquals(ki.numberOf("documents"), 1); |
| 863 | assertEquals("BZK", fd.getCorpusSigle()); |
| 864 | |
| 865 | // [tt/p="A.*"]{0,3}[tt/p="N.*"] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 866 | String json = getJsonString( |
| 867 | getClass().getResource("/queries/bugs/multiterm_rewrite.jsonld") |
| 868 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 869 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 870 | Krill ks = new Krill(json); |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 871 | KrillCollection kc = ks.getCollection(); |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 872 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 873 | // No index was set |
| 874 | assertEquals(-1, kc.numberOf("documents")); |
| 875 | kc.setIndex(ki); |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 876 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 877 | // Index was set but vc restricted to WPD |
| 878 | assertEquals(0, kc.numberOf("documents")); |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 879 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 880 | /* |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 881 | kc.extend(new CollectionBuilder().or("corpusSigle", "BZK")); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 882 | */ |
| 883 | CollectionBuilder cb = new CollectionBuilder(); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 884 | kc.fromBuilder(cb.orGroup().with(kc.getBuilder()) |
| 885 | .with(cb.term("corpusSigle", "BZK"))); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 886 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 887 | ks.setCollection(kc); |
| 888 | assertEquals(1, kc.numberOf("documents")); |
| Nils Diewald | 1220e3e | 2014-11-08 03:18:58 +0000 | [diff] [blame] | 889 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 890 | Result kr = ks.apply(ki); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 891 | |
| 892 | assertEquals(kr.getSerialQuery(), |
| 893 | "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), " |
| 894 | + "spanNext(spanRepetition(SpanMultiTermQueryWrapper" |
| 895 | + "(tokens:/tt/p:A.*/){1,3}), " |
| 896 | + "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])"); |
| Nils Diewald | b84e727 | 2014-11-07 01:27:38 +0000 | [diff] [blame] | 897 | |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 898 | assertEquals(68,kr.getTotalResults()); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 899 | assertEquals(0, kr.getStartIndex()); |
| Nils Diewald | 5871e4d | 2014-11-07 03:48:25 +0000 | [diff] [blame] | 900 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 901 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 902 | "[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ..."); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 903 | assertEquals(kr.getMatch(1).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 904 | "[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ..."); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 905 | assertEquals(kr.getMatch(2).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 906 | "Saragat-Partei zerfällt [[Rom]] (ADN) " |
| Akron | 43cea66 | 2016-02-15 23:43:59 +0100 | [diff] [blame] | 907 | + "die von dem Rechtssozialisten Saragat ..."); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 908 | assertEquals(kr.getMatch(3).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 909 | "Saragat-Partei zerfällt Rom ([[ADN]]) " |
| Akron | 43cea66 | 2016-02-15 23:43:59 +0100 | [diff] [blame] | 910 | + "die von dem Rechtssozialisten Saragat geführte ..."); |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 911 | assertEquals("... auseinander, nachdem vor einiger Zeit mehrere " |
| 912 | + "[[prominente Mitglieder]] ihren Austritt erklärt " |
| 913 | + "und unter dem ...", kr.getMatch(23).getSnippetBrackets()); |
| Nils Diewald | b84e727 | 2014-11-07 01:27:38 +0000 | [diff] [blame] | 914 | }; |
| 915 | |
| 916 | |
| Nils Diewald | 56dc258 | 2014-11-04 21:33:46 +0000 | [diff] [blame] | 917 | @Test |
| Akron | e4fdce4 | 2015-11-13 16:06:10 +0100 | [diff] [blame] | 918 | public void searchJSONtokenDistanceSpanBug () throws IOException { |
| 919 | // Construct index |
| 920 | KrillIndex ki = new KrillIndex(); |
| 921 | ki.addDoc(1, getClass().getResourceAsStream("/goe/AGX-00002.json"), |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 922 | false); |
| Akron | e4fdce4 | 2015-11-13 16:06:10 +0100 | [diff] [blame] | 923 | ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 924 | true); |
| Akron | e4fdce4 | 2015-11-13 16:06:10 +0100 | [diff] [blame] | 925 | ki.commit(); |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 926 | |
| Akron | e4fdce4 | 2015-11-13 16:06:10 +0100 | [diff] [blame] | 927 | // ({1:Sonne []* Erde} | {2: Erde []* Sonne}) |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 928 | String json = getJsonString(getClass() |
| 929 | .getResource("/queries/bugs/tokendistancespan_bug.jsonld") |
| 930 | .getFile()); |
| Akron | e4fdce4 | 2015-11-13 16:06:10 +0100 | [diff] [blame] | 931 | |
| 932 | Krill ks = new Krill(json); |
| 933 | Result kr = ks.apply(ki); |
| 934 | ObjectMapper mapper = new ObjectMapper(); |
| 935 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| 936 | assertTrue(res.at("/errors").isMissingNode()); |
| 937 | }; |
| 938 | |
| 939 | |
| 940 | @Test |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 941 | public void searchJSONCollection () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 942 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 943 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 944 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 945 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 946 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 947 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 948 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 949 | }; |
| 950 | ki.commit(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 951 | String json = getJsonString(getClass() |
| 952 | .getResource("/queries/metaquery8-nocollection.jsonld") |
| 953 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 954 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 955 | Krill ks = new Krill(json); |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 956 | Result kr = ks.apply(ki); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 957 | assertEquals(kr.getTotalResults(), 276); |
| 958 | assertEquals(0, kr.getStartIndex()); |
| 959 | assertEquals(10, kr.getItemsPerPage()); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 960 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 961 | json = getJsonString( |
| 962 | getClass().getResource("/queries/metaquery8.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 963 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 964 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 965 | kr = ks.apply(ki); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 966 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 967 | assertEquals(kr.getTotalResults(), 147); |
| 968 | assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID()); |
| 969 | assertEquals(0, kr.getStartIndex()); |
| 970 | assertEquals(10, kr.getItemsPerPage()); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 971 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 972 | json = getJsonString(getClass() |
| 973 | .getResource("/queries/metaquery8-filtered.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 974 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 975 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 976 | kr = ks.apply(ki); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 977 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 978 | assertEquals(kr.getTotalResults(), 28); |
| 979 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| 980 | assertEquals(0, kr.getStartIndex()); |
| 981 | assertEquals(10, kr.getItemsPerPage()); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 982 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 983 | json = getJsonString(getClass() |
| 984 | .getResource("/queries/metaquery8-filtered-further.jsonld") |
| 985 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 986 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 987 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 988 | kr = ks.apply(ki); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 989 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 990 | assertEquals(kr.getTotalResults(), 0); |
| 991 | assertEquals(0, kr.getStartIndex()); |
| 992 | assertEquals(10, kr.getItemsPerPage()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 993 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 994 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 995 | json = getJsonString(getClass() |
| 996 | .getResource("/queries/metaquery8-filtered-nested.jsonld") |
| 997 | .getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 998 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 999 | ks = new Krill(json); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 1000 | kr = ks.apply(ki); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 1001 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 1002 | /* |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1003 | assertEquals("filter with QueryWrapperFilter(" |
| 1004 | + "+(ID:WPD_AAA.00003 (+tokens:s:die" |
| 1005 | + " +tokens:s:Schriftzeichen)))", |
| 1006 | ks.getCollection().getFilter(1).toString()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 1007 | */ |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 1008 | assertEquals( |
| 1009 | "AndGroup(OrGroup(ID:WPD_AAA.00001 ID:WPD_AAA.00002) OrGroup(ID:WPD_AAA.00003 AndGroup(tokens:s:die tokens:s:Schriftzeichen)))", |
| 1010 | ks.getCollection().toString()); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 1011 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1012 | assertEquals(kr.getTotalResults(), 119); |
| 1013 | assertEquals(0, kr.getStartIndex()); |
| 1014 | assertEquals(10, kr.getItemsPerPage()); |
| Nils Diewald | fb4d7b0 | 2014-04-09 17:56:17 +0000 | [diff] [blame] | 1015 | }; |
| 1016 | |
| Nils Diewald | 1e5d594 | 2014-05-20 13:29:53 +0000 | [diff] [blame] | 1017 | |
| 1018 | @Test |
| 1019 | public void searchJSONSentenceContext () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1020 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 1021 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1022 | // Indexing test files |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 1023 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1024 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1025 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1026 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1027 | }; |
| 1028 | ki.commit(); |
| Nils Diewald | 1e5d594 | 2014-05-20 13:29:53 +0000 | [diff] [blame] | 1029 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1030 | String json = getJsonString(getClass() |
| 1031 | .getResource("/queries/bsp-context-2.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1032 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 1033 | Krill ks = new Krill(json); |
| Nils Diewald | f5ab4b2 | 2015-02-25 20:55:16 +0000 | [diff] [blame] | 1034 | ks.getMeta().setCutOff(false); |
| 1035 | SearchContext sc = ks.getMeta().getContext(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1036 | sc.left.setLength((short) 10); |
| 1037 | sc.right.setLength((short) 10); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1038 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 1039 | Result kr = ks.apply(ki); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 1040 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1041 | assertEquals(kr.getMatch(1).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 1042 | "... dezimalen [[Wert]] 65 sowohl ..."); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1043 | assertEquals(kr.getTotalResults(), 3); |
| 1044 | assertEquals(0, kr.getStartIndex()); |
| 1045 | assertEquals(25, kr.getItemsPerPage()); |
| Akron | 499c94c | 2016-02-04 13:13:43 +0100 | [diff] [blame] | 1046 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1047 | assertFalse( |
| 1048 | kr.getContext().toJsonNode().toString().equals("\"base/s:s\"")); |
| Nils Diewald | 1e5d594 | 2014-05-20 13:29:53 +0000 | [diff] [blame] | 1049 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1050 | json = getJsonString(getClass() |
| 1051 | .getResource("/queries/bsp-context-sentence.jsonld").getFile()); |
| Nils Diewald | 1e5d594 | 2014-05-20 13:29:53 +0000 | [diff] [blame] | 1052 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 1053 | kr = new Krill(json).apply(ki); |
| Akron | 43cea66 | 2016-02-15 23:43:59 +0100 | [diff] [blame] | 1054 | assertEquals(kr.getContext().toJsonNode().toString(), "\"base/s:s\""); |
| Akron | 499c94c | 2016-02-04 13:13:43 +0100 | [diff] [blame] | 1055 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1056 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 1057 | "steht a für den dezimalen [[Wert]] 97 sowohl im ASCII-" |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1058 | + " als auch im Unicode-Zeichensatz"); |
| 1059 | assertEquals(kr.getMatch(1).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 1060 | "steht A für den dezimalen [[Wert]] 65 sowohl im ASCII-" |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1061 | + " als auch im Unicode-Zeichensatz"); |
| 1062 | assertEquals(kr.getMatch(2).getSnippetBrackets(), |
| 1063 | "In einem Zahlensystem mit einer Basis größer " |
| 1064 | + "als 10 steht A oder a häufig für den dezimalen" |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 1065 | + " [[Wert]] 10, siehe auch Hexadezimalsystem."); |
| Nils Diewald | 1e5d594 | 2014-05-20 13:29:53 +0000 | [diff] [blame] | 1066 | }; |
| 1067 | |
| 1068 | |
| Nils Diewald | 2276e1c | 2014-04-10 15:01:59 +0000 | [diff] [blame] | 1069 | @Test |
| Nils Diewald | 5418763 | 2014-06-11 14:39:29 +0000 | [diff] [blame] | 1070 | public void searchJSONbug () throws IOException { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1071 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 1072 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1073 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1074 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 1075 | "00005", "00006", "02439" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1076 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1077 | true); |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1078 | }; |
| 1079 | ki.commit(); |
| Nils Diewald | 5418763 | 2014-06-11 14:39:29 +0000 | [diff] [blame] | 1080 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1081 | String json = getJsonString( |
| 1082 | getClass().getResource("/queries/bsp-bug.jsonld").getFile()); |
| Nils Diewald | 5418763 | 2014-06-11 14:39:29 +0000 | [diff] [blame] | 1083 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 1084 | Result kr = new Krill(json).apply(ki); |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 1085 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1086 | assertEquals(kr.getError(0).getMessage(), |
| 1087 | "Operation needs operand list"); |
| Nils Diewald | 5418763 | 2014-06-11 14:39:29 +0000 | [diff] [blame] | 1088 | }; |
| 1089 | |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1090 | |
| 1091 | @Test |
| Akron | f785dae | 2016-08-10 17:12:40 +0200 | [diff] [blame] | 1092 | public void searchJSONdistanceWithRegexesBug () throws IOException { |
| 1093 | // Construct index |
| 1094 | KrillIndex ki = new KrillIndex(); |
| 1095 | // Indexing test files |
| 1096 | for (String i : new String[] { "00001" }) { |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1097 | // , "00002", "00003", "00004", "00005", "00006", "02439" |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1098 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Akron | f785dae | 2016-08-10 17:12:40 +0200 | [diff] [blame] | 1099 | true); |
| 1100 | }; |
| 1101 | ki.commit(); |
| 1102 | |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1103 | // "der" []{2,3} [opennlp/p="NN"] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1104 | String json = getJsonString(getClass() |
| 1105 | .getResource("/queries/bugs/distances_with_regex_bug.jsonld") |
| 1106 | .getFile()); |
| Akron | f785dae | 2016-08-10 17:12:40 +0200 | [diff] [blame] | 1107 | |
| 1108 | Result kr = new Krill(json).apply(ki); |
| 1109 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1110 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1111 | "Mit Ausnahme von Fremdwörtern und Namen ist das A der einzige Buchstabe im Deutschen, [[der zweifach am Anfang]] eines Wortes stehen darf, etwa im Wort Aal."); |
| Akron | f785dae | 2016-08-10 17:12:40 +0200 | [diff] [blame] | 1112 | |
| 1113 | }; |
| 1114 | |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 1115 | |
| Nils Diewald | ef7124e | 2014-11-12 20:08:13 +0000 | [diff] [blame] | 1116 | /** |
| 1117 | * This is a breaking test for #179 |
| 1118 | */ |
| 1119 | @Test |
| 1120 | public void searchJSONexpansionBug () throws IOException { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1121 | // Construct index |
| 1122 | KrillIndex ki = new KrillIndex(); |
| 1123 | // Indexing test files |
| 1124 | ki.addDoc(getClass().getResourceAsStream("/wiki/00002.json.gz"), true); |
| 1125 | ki.commit(); |
| 1126 | |
| 1127 | // Expansion bug |
| 1128 | // der alte Digraph Aa durch Å |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1129 | String json = getJsonString(getClass() |
| 1130 | .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1131 | |
| 1132 | Result kr = new Krill(json).apply(ki); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1133 | assertEquals( |
| 1134 | "... Buchstabe des Alphabetes. In Dänemark ist " |
| 1135 | + "[[der alte Digraph Aa durch Å]] ersetzt worden, " |
| 1136 | + "in Eigennamen und Ortsnamen ...", |
| 1137 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1138 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| 1139 | assertEquals(kr.getTotalResults(), 1); |
| 1140 | |
| Akron | 9f1a55b | 2016-04-20 19:11:06 +0200 | [diff] [blame] | 1141 | |
| 1142 | // TODO: base/s:t needs to be defined!!! |
| 1143 | QueryBuilder qb = new QueryBuilder("tokens"); |
| 1144 | kr = new Krill(qb.tag("base/s:t")).apply(ki); |
| Akron | 9f1a55b | 2016-04-20 19:11:06 +0200 | [diff] [blame] | 1145 | assertEquals(kr.getTotalResults(), 1); |
| 1146 | |
| 1147 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1148 | // der alte Digraph Aa durch [] |
| 1149 | // Works with one document |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1150 | json = getJsonString(getClass() |
| 1151 | .getResource("/queries/bugs/expansion_bug.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1152 | |
| 1153 | kr = new Krill(json).apply(ki); |
| 1154 | |
| Akron | 9f1a55b | 2016-04-20 19:11:06 +0200 | [diff] [blame] | 1155 | // focus(254: spanContain(<tokens:base/s:t />, {254: spanNext(spanNext(spanNext(spanNext(tokens:s:der, tokens:s:alte), tokens:s:Digraph), tokens:s:Aa), spanExpansion(tokens:s:durch, []{1, 1}, right))})) |
| 1156 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1157 | assertEquals( |
| 1158 | "... Buchstabe des Alphabetes. In Dänemark ist " |
| 1159 | + "[[der alte Digraph Aa durch Å]] ersetzt worden, " |
| 1160 | + "in Eigennamen und Ortsnamen ...", |
| 1161 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1162 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| 1163 | assertEquals(kr.getTotalResults(), 1); |
| 1164 | |
| 1165 | // Now try with one file ahead |
| 1166 | ki = new KrillIndex(); |
| 1167 | for (String i : new String[] { "00001", "00002" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1168 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1169 | true); |
| 1170 | }; |
| 1171 | ki.commit(); |
| 1172 | |
| 1173 | // Expansion bug |
| 1174 | // der alte Digraph Aa durch Å |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1175 | json = getJsonString(getClass() |
| 1176 | .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1177 | |
| 1178 | kr = new Krill(json).apply(ki); |
| 1179 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1180 | assertEquals( |
| 1181 | "... Buchstabe des Alphabetes. In Dänemark ist " |
| 1182 | + "[[der alte Digraph Aa durch Å]] ersetzt worden, " |
| 1183 | + "in Eigennamen und Ortsnamen ...", |
| 1184 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1185 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| 1186 | assertEquals(kr.getTotalResults(), 1); |
| 1187 | |
| 1188 | // der alte Digraph Aa durch [] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1189 | json = getJsonString(getClass() |
| 1190 | .getResource("/queries/bugs/expansion_bug.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1191 | |
| 1192 | kr = new Krill(json).apply(ki); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1193 | assertEquals( |
| 1194 | "... Buchstabe des Alphabetes. In Dänemark ist " |
| 1195 | + "[[der alte Digraph Aa durch Å]] ersetzt worden, " |
| 1196 | + "in Eigennamen und Ortsnamen ...", |
| 1197 | kr.getMatch(0).getSnippetBrackets()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 1198 | assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID()); |
| 1199 | assertEquals(kr.getTotalResults(), 1); |
| Nils Diewald | ef7124e | 2014-11-12 20:08:13 +0000 | [diff] [blame] | 1200 | }; |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1201 | |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1202 | |
| 1203 | @Test |
| Akron | dfc9357 | 2016-08-10 19:01:34 +0200 | [diff] [blame] | 1204 | public void queryJSONzeroRepetitionBug () throws IOException { |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1205 | // der{0} |
| 1206 | KrillIndex ki = new KrillIndex(); |
| 1207 | ki.addDoc(getClass().getResourceAsStream("/wiki/00001.json.gz"), true); |
| 1208 | ki.commit(); |
| Akron | dfc9357 | 2016-08-10 19:01:34 +0200 | [diff] [blame] | 1209 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1210 | String json = getJsonString(getClass() |
| 1211 | .getResource("/queries/bugs/zero_repetition_bug.jsonld") |
| 1212 | .getFile()); |
| Akron | dfc9357 | 2016-08-10 19:01:34 +0200 | [diff] [blame] | 1213 | |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1214 | Result kr = new Krill(json).apply(ki); |
| 1215 | |
| 1216 | assertEquals(783, kr.getError(0).getCode()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1217 | assertEquals("This query can't match anywhere", |
| 1218 | kr.getError(0).getMessage()); |
| Akron | f9def5e | 2016-10-10 21:26:46 +0200 | [diff] [blame] | 1219 | }; |
| 1220 | |
| Akron | 13db615 | 2016-02-19 14:08:38 +0100 | [diff] [blame] | 1221 | |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1222 | /** |
| Akron | 13db615 | 2016-02-19 14:08:38 +0100 | [diff] [blame] | 1223 | * This is a Schreibgebrauch ressource that didn't work for |
| 1224 | * element queries. |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1225 | */ |
| 1226 | @Test |
| 1227 | public void searchSchreibgebrauchData () throws IOException { |
| 1228 | // Construct index |
| 1229 | KrillIndex ki = new KrillIndex(); |
| 1230 | // Indexing test files |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 1231 | ki.addDoc( |
| 1232 | getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"), |
| 1233 | true); |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1234 | ki.commit(); |
| 1235 | |
| 1236 | Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s")); |
| 1237 | |
| Akron | 13db615 | 2016-02-19 14:08:38 +0100 | [diff] [blame] | 1238 | assertEquals(k.getSpanQuery().toString(), "<tokens:base/s:s />"); |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1239 | |
| 1240 | Result kr = k.apply(ki); |
| 1241 | assertEquals(kr.getTotalResults(), 1); |
| 1242 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| Akron | f05fde6 | 2016-08-03 23:46:17 +0200 | [diff] [blame] | 1243 | "[[Selbst ist der Jeck]]"); |
| Akron | 8abefa1 | 2016-02-13 05:35:42 +0100 | [diff] [blame] | 1244 | |
| 1245 | assertEquals(kr.getMatch(0).getTextSigle(), "PRO-DUD_BSP-2013-01.32"); |
| 1246 | }; |
| 1247 | |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1248 | |
| 1249 | /** |
| 1250 | * This is a Schreibgebrauch ressource that didn't work for |
| 1251 | * element queries. |
| 1252 | */ |
| 1253 | @Test |
| 1254 | public void searchNewDeReKoData () throws IOException { |
| 1255 | // Construct index |
| 1256 | KrillIndex ki = new KrillIndex(); |
| 1257 | // Indexing test files |
| 1258 | // Indexing test files |
| 1259 | FieldDocument fd = ki.addDoc(1, |
| 1260 | getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"), |
| 1261 | true); |
| 1262 | ki.commit(); |
| 1263 | |
| 1264 | assertEquals(fd.getUID(), 1); |
| 1265 | assertEquals(fd.getTextSigle(), "GOE/AGA/03828"); |
| 1266 | assertEquals(fd.getDocSigle(), "GOE/AGA"); |
| 1267 | assertEquals(fd.getCorpusSigle(), "GOE"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1268 | assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten"); |
| 1269 | assertNull(fd.getFieldValue("subTitle")); |
| 1270 | assertEquals(fd.getFieldValue("textType"), "Autobiographie"); |
| 1271 | assertNull(fd.getFieldValue("textTypeArt")); |
| 1272 | assertNull(fd.getFieldValue("textTypeRef")); |
| 1273 | assertNull(fd.getFieldValue("textColumn")); |
| 1274 | assertNull(fd.getFieldValue("textDomain")); |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1275 | // assertEquals(fd.getPages(), "529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1276 | assertEquals(fd.getFieldValue("availability"), "QAO-NC"); |
| 1277 | assertEquals(fd.getFieldValue("creationDate"), "1820"); |
| 1278 | assertEquals(fd.getFieldValue("pubDate"), "1982"); |
| 1279 | assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von"); |
| 1280 | assertNull(fd.getFieldValue("textClass")); |
| 1281 | assertEquals(fd.getFieldValue("language"), "de"); |
| 1282 | assertEquals(fd.getFieldValue("pubPlace"), "München"); |
| 1283 | assertEquals(fd.getFieldValue("reference"), |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1284 | "Goethe, Johann Wolfgang von:" |
| 1285 | + " Autobiographische Einzelheiten," |
| 1286 | + " (Geschrieben bis 1832), In: Goethe," |
| 1287 | + " Johann Wolfgang von: Goethes Werke," |
| 1288 | + " Bd. 10, Autobiographische Schriften" |
| 1289 | + " II, Hrsg.: Trunz, Erich. München: " |
| 1290 | + "Verlag C. H. Beck, 1982, S. 529-547"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1291 | assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck"); |
| 1292 | assertNull(fd.getFieldValue("editor")); |
| 1293 | assertNull(fd.getFieldValue("fileEditionStatement")); |
| 1294 | assertNull(fd.getFieldValue("biblEditionStatement")); |
| 1295 | assertNull(fd.getFieldValue("keywords")); |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1296 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1297 | assertEquals(fd.getFieldValue("tokenSource"), "base#tokens"); |
| 1298 | assertEquals(fd.getFieldValue("foundries"), |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1299 | "corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure dereko/structure/base-sentences-paragraphs-pagebreaks malt malt/dependency marmot marmot/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1300 | assertEquals(fd.getFieldValue("layerInfos"), |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1301 | "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens"); |
| 1302 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1303 | assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke"); |
| 1304 | assertNull(fd.getFieldValue("corpusSubTitle")); |
| 1305 | assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von"); |
| 1306 | assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich"); |
| 1307 | assertEquals(fd.getFieldValue("docTitle"), |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1308 | "Goethe: Autobiographische Schriften II, (1817-1825, 1832)"); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 1309 | assertNull(fd.getFieldValue("docSubTitle")); |
| 1310 | assertNull(fd.getFieldValue("docEditor")); |
| 1311 | assertNull(fd.getFieldValue("docAuthor")); |
| Akron | 2ea48e6 | 2017-04-28 20:23:30 +0200 | [diff] [blame] | 1312 | |
| 1313 | Krill ks = new Krill(new QueryBuilder("tokens").seg("marmot/m:case:nom") |
| 1314 | .with("marmot/m:number:pl")); |
| 1315 | Result kr = ks.apply(ki); |
| 1316 | |
| 1317 | assertEquals(kr.getTotalResults(), 141); |
| 1318 | assertEquals(0, kr.getStartIndex()); |
| 1319 | assertEquals(25, kr.getItemsPerPage()); |
| 1320 | }; |
| 1321 | |
| Akron | 70ce0c0 | 2018-05-25 23:44:26 +0200 | [diff] [blame] | 1322 | @Test |
| 1323 | public void searchLongMatch () throws IOException { |
| 1324 | |
| 1325 | // Construct index |
| 1326 | KrillIndex ki = new KrillIndex(); |
| 1327 | // Indexing test files |
| 1328 | ki.addDoc( |
| 1329 | getClass().getResourceAsStream("/goe/AGX-00002.json"), |
| 1330 | false); |
| 1331 | ki.commit(); |
| 1332 | |
| 1333 | Krill k = new Krill(new QueryBuilder("tokens").tag("xy/z:long")); |
| 1334 | |
| 1335 | assertEquals(k.getSpanQuery().toString(), "<tokens:xy/z:long />"); |
| 1336 | |
| 1337 | Result kr = k.apply(ki); |
| 1338 | assertEquals(kr.getTotalResults(), 1); |
| 1339 | assertEquals(2, kr.getMatch(0).getStartPos()); |
| 1340 | assertEquals(52, kr.getMatch(0).getEndPos()); |
| 1341 | assertEquals(kr.getMatch(0).getSnippetBrackets(), |
| 1342 | "Maximen und [[Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur]<!>], so hat er sie gleich in ..."); |
| 1343 | assertEquals(kr.getMatch(0).getSnippetHTML(), |
| 1344 | "<span class=\"context-left\">Maximen und </span><span class=\"match\"><mark>Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur</mark><span class=\"cutted\"></span></span><span class=\"context-right\">, so hat er sie gleich in<span class=\"more\"></span></span>"); |
| 1345 | assertEquals(kr.getMatch(0).getTextSigle(), "GOE_AGX.00002"); |
| 1346 | }; |
| 1347 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 1348 | }; |