| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.search; |
| 2 | |
| 3 | import java.util.*; |
| 4 | import java.io.*; |
| 5 | |
| 6 | import static de.ids_mannheim.korap.TestSimple.*; |
| 7 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 8 | import de.ids_mannheim.korap.Krill; |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 9 | import de.ids_mannheim.korap.KrillCollection; |
| Nils Diewald | 0339d46 | 2015-02-26 14:53:56 +0000 | [diff] [blame] | 10 | import de.ids_mannheim.korap.KrillQuery; |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 11 | import de.ids_mannheim.korap.KrillIndex; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 12 | import de.ids_mannheim.korap.index.FieldDocument; |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 13 | import de.ids_mannheim.korap.response.Result; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 14 | import java.nio.file.Files; |
| 15 | import java.nio.file.FileSystem; |
| 16 | import java.nio.file.Path; |
| 17 | import java.nio.charset.StandardCharsets; |
| 18 | import java.nio.ByteBuffer; |
| 19 | |
| Akron | 932dd59 | 2021-07-27 12:52:46 +0200 | [diff] [blame] | 20 | import org.apache.commons.lang3.StringUtils; |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 21 | import org.apache.lucene.analysis.Analyzer; |
| 22 | import org.apache.lucene.analysis.TokenStream; |
| 23 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 24 | |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 25 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 26 | import com.fasterxml.jackson.databind.JsonNode; |
| 27 | |
| 28 | import static org.junit.Assert.*; |
| 29 | import org.junit.Test; |
| 30 | import org.junit.Ignore; |
| 31 | import org.junit.runner.RunWith; |
| 32 | import org.junit.runners.JUnit4; |
| 33 | |
| 34 | @RunWith(JUnit4.class) |
| 35 | public class TestMetaFields { |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 36 | |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 37 | @Test |
| 38 | public void searchMetaFields () throws IOException { |
| 39 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 40 | // Construct index |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 41 | KrillIndex ki = new KrillIndex(); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 42 | // Indexing test files |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | for (String i : new String[] { "00001", "00002" }) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 44 | ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | true); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 46 | }; |
| 47 | ki.commit(); |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 48 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 49 | String jsonString = getJsonString(getClass() |
| 50 | .getResource("/queries/metas/fields.jsonld").getFile()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 51 | |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 52 | Krill ks = new Krill(jsonString); |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 53 | |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 54 | Result kr = ks.apply(ki); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 55 | assertEquals((long) 17, kr.getTotalResults()); |
| 56 | assertEquals(0, kr.getStartIndex()); |
| 57 | assertEquals(9, kr.getItemsPerPage()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 58 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 59 | ObjectMapper mapper = new ObjectMapper(); |
| 60 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| Akron | b116644 | 2015-06-27 00:34:19 +0200 | [diff] [blame] | 61 | |
| Akron | b116644 | 2015-06-27 00:34:19 +0200 | [diff] [blame] | 62 | // mirror fields |
| 63 | assertEquals(9, res.at("/meta/count").asInt()); |
| 64 | |
| 65 | if (res.at("/meta/fields/0").asText().equals("UID")) { |
| 66 | assertEquals("corpusID", res.at("/meta/fields/1").asText()); |
| 67 | } |
| 68 | else { |
| 69 | assertEquals("corpusID", res.at("/meta/fields/0").asText()); |
| 70 | assertEquals("UID", res.at("/meta/fields/1").asText()); |
| 71 | }; |
| 72 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 73 | assertEquals(0, res.at("/matches/0/UID").asInt()); |
| 74 | assertEquals("WPD", res.at("/matches/0/corpusID").asText()); |
| Akron | 12f1f5b | 2015-06-24 15:56:52 +0200 | [diff] [blame] | 75 | assertTrue(res.at("/matches/0/docID").isMissingNode()); |
| 76 | assertTrue(res.at("/matches/0/textSigle").isMissingNode()); |
| 77 | assertTrue(res.at("/matches/0/ID").isMissingNode()); |
| 78 | assertTrue(res.at("/matches/0/author").isMissingNode()); |
| 79 | assertTrue(res.at("/matches/0/title").isMissingNode()); |
| 80 | assertTrue(res.at("/matches/0/subTitle").isMissingNode()); |
| 81 | assertTrue(res.at("/matches/0/textClass").isMissingNode()); |
| 82 | assertTrue(res.at("/matches/0/pubPlace").isMissingNode()); |
| 83 | assertTrue(res.at("/matches/0/pubDate").isMissingNode()); |
| 84 | assertTrue(res.at("/matches/0/foundries").isMissingNode()); |
| 85 | assertTrue(res.at("/matches/0/layerInfos").isMissingNode()); |
| 86 | assertTrue(res.at("/matches/0/tokenization").isMissingNode()); |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 87 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 88 | jsonString = getJsonString(getClass() |
| 89 | .getResource("/queries/metas/fields_2.jsonld").getFile()); |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 90 | ks = new Krill(jsonString); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 91 | kr = ks.apply(ki); |
| 92 | assertEquals((long) 17, kr.getTotalResults()); |
| 93 | assertEquals(0, kr.getStartIndex()); |
| 94 | assertEquals(2, kr.getItemsPerPage()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 95 | |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 96 | mapper = new ObjectMapper(); |
| 97 | res = mapper.readTree(kr.toJsonString()); |
| 98 | assertEquals(0, res.at("/matches/0/UID").asInt()); |
| Akron | 12f1f5b | 2015-06-24 15:56:52 +0200 | [diff] [blame] | 99 | assertTrue(res.at("/matches/0/corpusID").isMissingNode()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 100 | assertEquals("Ruru,Jens.Ol,Aglarech", |
| 101 | res.at("/matches/0/author").asText()); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 102 | assertEquals("A", res.at("/matches/0/title").asText()); |
| 103 | assertEquals("WPD_AAA.00001", res.at("/matches/0/docID").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 104 | assertTrue(res.at("/matches/0/textSigle").isMissingNode()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 105 | assertEquals("match-WPD_AAA.00001-p6-7", |
| 106 | res.at("/matches/0/matchID").asText()); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 107 | // assertEquals("p6-7", res.at("/matches/0/matchID").asText()); |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 108 | assertTrue(res.at("/matches/0/subTitle").isMissingNode()); |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 109 | assertEquals("", res.at("/matches/0/subTitle").asText()); |
| 110 | assertEquals("", res.at("/matches/0/textClass").asText()); |
| 111 | assertEquals("", res.at("/matches/0/pubPlace").asText()); |
| 112 | assertEquals("", res.at("/matches/0/pubDate").asText()); |
| 113 | assertEquals("", res.at("/matches/0/foundries").asText()); |
| 114 | assertEquals("", res.at("/matches/0/layerInfo").asText()); |
| 115 | assertEquals("", res.at("/matches/0/tokenization").asText()); |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 116 | }; |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 117 | |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 118 | |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 119 | @Test |
| 120 | public void searchMetaFieldsNew () throws IOException { |
| 121 | |
| 122 | // Construct index |
| 123 | KrillIndex ki = new KrillIndex(); |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 124 | ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 125 | ki.commit(); |
| 126 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 127 | String jsonString = getJsonString(getClass() |
| 128 | .getResource("/queries/metas/fields_no.jsonld").getFile()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 129 | |
| 130 | Krill ks = new Krill(jsonString); |
| 131 | Result kr = ks.apply(ki); |
| 132 | ObjectMapper mapper = new ObjectMapper(); |
| 133 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| Akron | be9638d | 2019-02-07 17:09:42 +0100 | [diff] [blame] | 134 | |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 135 | assertEquals(0, res.at("/matches/0/UID").asInt()); |
| 136 | assertEquals("GOE_AGX.00002", res.at("/matches/0/textSigle").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 137 | assertEquals("Maximen und Reflexionen", |
| 138 | res.at("/matches/0/title").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 139 | assertEquals("1982", res.at("/matches/0/pubDate").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 140 | assertEquals("Goethe, Johann Wolfgang von", |
| 141 | res.at("/matches/0/author").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 142 | assertEquals("GOE_AGX", res.at("/matches/0/docSigle").asText()); |
| 143 | assertEquals("GOE", res.at("/matches/0/corpusSigle").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 144 | assertEquals("Religion und Christentum", |
| 145 | res.at("/matches/0/subTitle").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 146 | assertEquals("München", res.at("/matches/0/pubPlace").asText()); |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 147 | assertEquals( |
| 148 | "base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans", |
| 149 | res.at("/matches/0/layerInfos").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 150 | assertTrue(res.at("/matches/0/textType").isMissingNode()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 151 | assertEquals("match-GOE_AGX.00002-p7-8", |
| 152 | res.at("/matches/0/matchID").asText()); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 153 | |
| Akron | efbb754 | 2025-06-18 10:36:24 +0200 | [diff] [blame^] | 154 | assertFalse(res.at("/meta/rewrites").isMissingNode()); |
| 155 | assertEquals("Kustvakt", res.at("/meta/rewrites/0/src").asText()); |
| 156 | |
| 157 | |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 158 | // All fields |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 159 | jsonString = getJsonString(getClass() |
| 160 | .getResource("/queries/metas/fields_all.jsonld").getFile()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 161 | |
| 162 | ks = new Krill(jsonString); |
| 163 | kr = ks.apply(ki); |
| 164 | mapper = new ObjectMapper(); |
| 165 | res = mapper.readTree(kr.toJsonString()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 166 | assertEquals("Verlag C. H. Beck", |
| 167 | res.at("/matches/0/publisher").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 168 | assertEquals("Aphorismus", res.at("/matches/0/textType").asText()); |
| 169 | assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText()); |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 170 | assertEquals( |
| 171 | "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377", |
| 172 | res.at("/matches/0/reference").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 173 | assertEquals("de", res.at("/matches/0/language").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 174 | assertEquals("opennlp#tokens", |
| 175 | res.at("/matches/0/tokenSource").asText()); |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 176 | assertEquals( |
| 177 | "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences", |
| 178 | res.at("/matches/0/foundries").asText()); |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 179 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 180 | assertEquals("Goethe-Korpus", |
| 181 | res.at("/matches/0/corpusTitle").asText()); |
| Akron | 7e3a10f | 2017-05-05 15:36:20 +0200 | [diff] [blame] | 182 | assertEquals("QAO-NC", res.at("/matches/0/availability").asText()); |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 183 | assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)", |
| 184 | res.at("/matches/0/docTitle").asText()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 185 | assertEquals("1827", res.at("/matches/0/creationDate").asText()); |
| Akron | 69b958c | 2017-02-15 22:49:45 +0100 | [diff] [blame] | 186 | // assertEquals("372-377", res.at("/matches/0/pages").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 187 | assertEquals("match-GOE_AGX.00002-p7-8", |
| 188 | res.at("/matches/0/matchID").asText()); |
| Akron | efbb754 | 2025-06-18 10:36:24 +0200 | [diff] [blame^] | 189 | assertTrue(res.at("/meta/rewrites").isMissingNode()); |
| Akron | 6590c32 | 2015-07-02 16:08:13 +0200 | [diff] [blame] | 190 | |
| 191 | |
| 192 | // @All fields |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 193 | jsonString = getJsonString(getClass() |
| 194 | .getResource("/queries/metas/fields_at_all.jsonld").getFile()); |
| Akron | 6590c32 | 2015-07-02 16:08:13 +0200 | [diff] [blame] | 195 | |
| 196 | ks = new Krill(jsonString); |
| 197 | kr = ks.apply(ki); |
| 198 | mapper = new ObjectMapper(); |
| 199 | res = mapper.readTree(kr.toJsonString()); |
| 200 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 201 | assertEquals("Verlag C. H. Beck", |
| 202 | res.at("/matches/0/publisher").asText()); |
| Akron | 6590c32 | 2015-07-02 16:08:13 +0200 | [diff] [blame] | 203 | assertEquals("Aphorismus", res.at("/matches/0/textType").asText()); |
| 204 | assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText()); |
| 205 | assertEquals( |
| 206 | "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377", |
| 207 | res.at("/matches/0/reference").asText()); |
| 208 | assertEquals("de", res.at("/matches/0/language").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 209 | assertEquals("opennlp#tokens", |
| 210 | res.at("/matches/0/tokenSource").asText()); |
| Akron | 6590c32 | 2015-07-02 16:08:13 +0200 | [diff] [blame] | 211 | assertEquals( |
| 212 | "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences", |
| 213 | res.at("/matches/0/foundries").asText()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 214 | assertEquals("Goethe-Korpus", |
| 215 | res.at("/matches/0/corpusTitle").asText()); |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 216 | assertEquals("QAO-NC", res.at("/matches/0/availability").asText()); |
| Akron | 6590c32 | 2015-07-02 16:08:13 +0200 | [diff] [blame] | 217 | assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)", |
| 218 | res.at("/matches/0/docTitle").asText()); |
| 219 | assertEquals("1827", res.at("/matches/0/creationDate").asText()); |
| Akron | eb8c02b | 2024-06-26 14:34:45 +0200 | [diff] [blame] | 220 | assertTrue(res.at("/matches/0/pages").isMissingNode()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 221 | assertEquals("match-GOE_AGX.00002-p7-8", |
| 222 | res.at("/matches/0/matchID").asText()); |
| Akron | 379fed0 | 2024-06-24 10:02:29 +0200 | [diff] [blame] | 223 | |
| 224 | |
| 225 | // Missing field |
| 226 | jsonString = getJsonString(getClass() |
| 227 | .getResource("/queries/metas/fields_missing.jsonld").getFile()); |
| 228 | |
| 229 | ks = new Krill(jsonString); |
| 230 | kr = ks.apply(ki); |
| 231 | mapper = new ObjectMapper(); |
| 232 | res = mapper.readTree(kr.toJsonString()); |
| 233 | |
| 234 | assertTrue(res.at("/matches/0/publisher").isMissingNode()); |
| 235 | assertEquals("Goethe-Korpus", res.at("/matches/0/corpusTitle").asText()); |
| 236 | assertTrue(res.at("/matches/0/textType").isMissingNode()); |
| 237 | assertEquals("", res.at("/matches/0/UID").asText()); |
| 238 | assertTrue(res.at("/matches/0/namespace.new").isMissingNode()); |
| Akron | 3e0403f | 2015-06-24 20:59:13 +0200 | [diff] [blame] | 239 | }; |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 240 | |
| Akron | d34403f | 2024-06-24 11:44:12 +0200 | [diff] [blame] | 241 | @Test |
| 242 | public void searchMetaFieldsWithPeriods () throws IOException { |
| 243 | |
| 244 | // Construct index |
| 245 | KrillIndex ki = new KrillIndex(); |
| 246 | FieldDocument fd = ki.addDoc(getClass().getResourceAsStream("/others/KED-KLX-03212.json.gz"), true); |
| 247 | |
| 248 | ki.commit(); |
| 249 | |
| 250 | String jsonString = getJsonString(getClass() |
| 251 | .getResource("/queries/metas/fields_with_periods.jsonld").getFile()); |
| 252 | |
| 253 | Krill ks = new Krill(jsonString); |
| 254 | Result kr = ks.apply(ki); |
| 255 | ObjectMapper mapper = new ObjectMapper(); |
| 256 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| 257 | |
| 258 | String sv = fd.doc.getField("textSigle").stringValue(); |
| 259 | assertEquals("KED/KLX/03212", sv); |
| 260 | |
| 261 | sv = fd.doc.getField("KED.corpusRcpntLabel").stringValue(); |
| 262 | assertEquals("data:,Kinder", sv); |
| 263 | |
| 264 | assertEquals(1, res.at("/meta/totalResults").asInt()); |
| 265 | |
| 266 | assertEquals(0, res.at("/matches/0/UID").asInt()); |
| 267 | assertEquals("KED/KLX/03212", res.at("/matches/0/textSigle").asText()); |
| 268 | assertTrue(res.at("/matches/0/title").isMissingNode()); |
| Akron | 74563e1 | 2024-06-24 18:00:57 +0200 | [diff] [blame] | 269 | assertEquals("data:,Kinder", res.at("/matches/0/KED.corpusRcpntLabel").asText()); |
| Akron | d34403f | 2024-06-24 11:44:12 +0200 | [diff] [blame] | 270 | assertFalse(res.at("/matches/0/fields").isMissingNode()); |
| 271 | |
| 272 | Iterator fieldIter = res.at("/matches/0/fields").elements(); |
| 273 | |
| 274 | int checkC = 0; |
| 275 | int checkF = 0; |
| 276 | while (fieldIter.hasNext()) { |
| 277 | JsonNode field = (JsonNode) fieldIter.next(); |
| 278 | |
| 279 | String key = field.at("/key").asText(); |
| 280 | |
| 281 | switch (key) { |
| 282 | case "KED.corpusRcpntLabel": |
| 283 | assertEquals("type:attachement", field.at("/type").asText()); |
| 284 | assertEquals("koral:field", field.at("/@type").asText()); |
| 285 | assertEquals("data:,Kinder", field.at("/value").asText()); |
| 286 | checkC++; |
| 287 | break; |
| 288 | case "UID": |
| 289 | checkF++; |
| 290 | break; |
| 291 | case "textSigle": |
| 292 | assertEquals("type:string", field.at("/type").asText()); |
| 293 | assertEquals("koral:field", field.at("/@type").asText()); |
| 294 | assertEquals("KED/KLX/03212", field.at("/value").asText()); |
| 295 | checkC++; |
| 296 | break; |
| 297 | default: |
| 298 | checkF++; |
| 299 | } |
| 300 | }; |
| 301 | |
| 302 | assertEquals(2, checkC); |
| 303 | assertEquals(0, checkF); |
| 304 | }; |
| 305 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 306 | |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 307 | @Test |
| Akron | be9638d | 2019-02-07 17:09:42 +0100 | [diff] [blame] | 308 | public void searchMetaFieldsDuplicateKeys () throws IOException { |
| 309 | |
| 310 | // Construct index |
| 311 | KrillIndex ki = new KrillIndex(); |
| 312 | ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false); |
| 313 | ki.commit(); |
| 314 | |
| 315 | String jsonString = getJsonString(getClass() |
| 316 | .getResource("/queries/metas/fields_single.jsonld").getFile()); |
| 317 | |
| 318 | Krill ks = new Krill(jsonString); |
| 319 | ks.getMeta().setLimit(1); |
| 320 | Result kr = ks.apply(ki); |
| 321 | |
| 322 | String resultJson = kr.toJsonString(); |
| 323 | |
| 324 | assertTrue(resultJson.indexOf("\"textSigle\":\"GOE_AGX.00002\"") > 0); |
| 325 | assertTrue(resultJson.indexOf("\"docSigle\":\"GOE_AGX\"") > 0); |
| 326 | assertTrue(resultJson.indexOf("\"corpusSigle\":\"GOE\"") > 0); |
| Akron | d475d99 | 2021-11-23 18:39:47 +0100 | [diff] [blame] | 327 | // assertTrue(resultJson.indexOf("\"UID\":") > 0); |
| Akron | be9638d | 2019-02-07 17:09:42 +0100 | [diff] [blame] | 328 | assertTrue(resultJson.indexOf("\"availability\":") > 0); |
| 329 | |
| 330 | assertEquals( |
| 331 | resultJson.indexOf("\"textSigle\":\"GOE_AGX.00002\""), |
| 332 | resultJson.lastIndexOf("\"textSigle\":\"GOE_AGX.00002\"") |
| 333 | ); |
| 334 | assertEquals( |
| 335 | resultJson.indexOf("\"docSigle\":\"GOE_AGX\""), |
| 336 | resultJson.lastIndexOf("\"docSigle\":\"GOE_AGX\"") |
| 337 | ); |
| 338 | assertEquals( |
| 339 | resultJson.indexOf("\"corpusSigle\":\"GOE\""), |
| 340 | resultJson.lastIndexOf("\"corpusSigle\":\"GOE\"") |
| 341 | ); |
| 342 | assertEquals( |
| 343 | resultJson.indexOf("\"UID\":0"), |
| 344 | resultJson.lastIndexOf("\"UID\":0") |
| 345 | ); |
| 346 | assertEquals( |
| 347 | resultJson.indexOf("\"availability\":"), |
| 348 | resultJson.lastIndexOf("\"availability\":") |
| 349 | ); |
| 350 | }; |
| 351 | |
| 352 | @Test |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 353 | public void searchCollectionFields () throws IOException { |
| 354 | KrillIndex ki = new KrillIndex(); |
| 355 | FieldDocument fd = new FieldDocument(); |
| 356 | fd.addString("corpusSigle", "ABC"); |
| 357 | fd.addString("docSigle", "ABC-123"); |
| 358 | fd.addString("textSigle", "ABC-123-0001"); |
| 359 | fd.addText("title", "Die Wahlverwandschaften"); |
| 360 | fd.addText("author", "Johann Wolfgang von Goethe"); |
| Akron | a6dabb7 | 2019-01-09 13:09:41 +0100 | [diff] [blame] | 361 | fd.addKeywords("textClass", "reisen wissenschaft"); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 362 | fd.addInt("pubDate", 20130617); |
| 363 | fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" |
| 364 | + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]"); |
| 365 | ki.addDoc(fd); |
| 366 | |
| 367 | FieldDocument fd2 = new FieldDocument(); |
| 368 | fd2.addString("corpusSigle", "ABC"); |
| 369 | fd2.addString("docSigle", "ABC-125"); |
| 370 | fd2.addString("textSigle", "ABC-125-0001"); |
| 371 | fd2.addText("title", "Die Glocke"); |
| 372 | fd2.addText("author", "Schiller, Friedrich"); |
| Akron | a6dabb7 | 2019-01-09 13:09:41 +0100 | [diff] [blame] | 373 | fd2.addKeywords("textClass", "Reisen geschichte"); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 374 | fd2.addInt("pubDate", 20130203); |
| 375 | fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" |
| 376 | + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]"); |
| 377 | ki.addDoc(fd2); |
| 378 | ki.commit(); |
| 379 | |
| 380 | // textClass = reisen & wissenschaft |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 381 | String jsonString = getJsonString(getClass() |
| 382 | .getResource("/queries/collections/collection_textClass.jsonld") |
| 383 | .getFile()); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 384 | Krill ks = new Krill(jsonString); |
| 385 | KrillCollection kc = ks.getCollection(); |
| 386 | kc.setIndex(ki); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 387 | assertEquals(1, kc.numberOf("documents")); |
| 388 | |
| 389 | // textClass = reisen |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 390 | jsonString = getJsonString(getClass() |
| 391 | .getResource( |
| 392 | "/queries/collections/collection_textClass_2.jsonld") |
| 393 | .getFile()); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 394 | ks = new Krill(jsonString); |
| 395 | kc = ks.getCollection(); |
| 396 | kc.setIndex(ki); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 397 | assertEquals(2, kc.numberOf("documents")); |
| 398 | |
| 399 | /* |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 400 | TokenStream ts = fd2.doc.getField("author").tokenStream( |
| 401 | (Analyzer) ki.writer().getAnalyzer(), |
| 402 | (TokenStream) null |
| 403 | ); |
| 404 | // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); |
| 405 | CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 406 | |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 407 | ts.reset(); |
| 408 | while (ts.incrementToken()) { |
| 409 | String term = charTermAttribute.toString(); |
| 410 | System.err.println(">>" + term + "<<"); |
| 411 | }; |
| 412 | */ |
| 413 | |
| 414 | // author = wolfgang |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 415 | jsonString = getJsonString(getClass() |
| 416 | .getResource("/queries/collections/collection_goethe.jsonld") |
| 417 | .getFile()); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 418 | ks = new Krill(jsonString); |
| 419 | kc = ks.getCollection(); |
| 420 | kc.setIndex(ki); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 421 | assertEquals(1, kc.numberOf("documents")); |
| 422 | |
| 423 | // author = Wolfgang |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 424 | jsonString = getJsonString(getClass() |
| 425 | .getResource("/queries/collections/collection_goethe_2.jsonld") |
| 426 | .getFile()); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 427 | ks = new Krill(jsonString); |
| 428 | kc = ks.getCollection(); |
| 429 | kc.setIndex(ki); |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 430 | assertEquals(1, kc.numberOf("documents")); |
| 431 | |
| 432 | Result kr = ks.apply(ki); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 433 | |
| Akron | 484c3c1 | 2015-07-07 20:25:44 +0200 | [diff] [blame] | 434 | ObjectMapper mapper = new ObjectMapper(); |
| 435 | JsonNode res = mapper.readTree(kr.toJsonString()); |
| 436 | assertEquals(1, res.at("/meta/totalResults").asInt()); |
| 437 | }; |
| 438 | |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 439 | |
| 440 | @Test |
| 441 | public void searchMetaContext () throws IOException { |
| 442 | |
| 443 | // All fields |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 444 | String jsonString = getJsonString(getClass() |
| 445 | .getResource("/queries/metas/context_paragraph.jsonld") |
| 446 | .getFile()); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 447 | |
| 448 | Krill ks = new Krill(jsonString); |
| 449 | assertTrue(ks.getMeta().getContext().isSpanDefined()); |
| 450 | assertEquals("base/p", ks.getMeta().getContext().getSpanContext()); |
| 451 | }; |
| Akron | 8bb3bc3 | 2018-12-12 19:34:56 +0100 | [diff] [blame] | 452 | |
| Akron | 9de655e | 2021-07-05 15:23:31 +0200 | [diff] [blame] | 453 | |
| 454 | @Test |
| 455 | public void searchMetaAndSnippets () throws IOException { |
| 456 | |
| 457 | // All fields |
| 458 | String jsonString = getJsonString(getClass() |
| 459 | .getResource("/queries/metas/no-snippets.jsonld") |
| 460 | .getFile()); |
| 461 | |
| 462 | Krill ks = new Krill(jsonString); |
| 463 | assertFalse(ks.getMeta().hasSnippets()); |
| 464 | }; |
| 465 | |
| Akron | 8bb3bc3 | 2018-12-12 19:34:56 +0100 | [diff] [blame] | 466 | |
| 467 | @Test |
| 468 | public void searchMetaAssets () throws IOException { |
| 469 | KrillIndex ki = new KrillIndex(); |
| 470 | FieldDocument fd = new FieldDocument(); |
| 471 | fd.addString("textSigle", "ABC-123-0002"); |
| 472 | fd.addText("title", "Die Wahlverwandtschaften"); |
| 473 | fd.addText("author", "Johann Wolfgang von Goethe"); |
| Akron | a6dabb7 | 2019-01-09 13:09:41 +0100 | [diff] [blame] | 474 | fd.addKeywords("textClass", "reisen wissenschaft"); |
| Akron | c7a2abc | 2019-01-17 14:21:34 +0100 | [diff] [blame] | 475 | fd.addDate("pubDate", 20130617); |
| Akron | 8bb3bc3 | 2018-12-12 19:34:56 +0100 | [diff] [blame] | 476 | fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" |
| 477 | + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]"); |
| 478 | fd.addAttachement("WikiLink", "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel"); |
| 479 | ki.addDoc(fd); |
| 480 | ki.commit(); |
| 481 | |
| 482 | assertEquals(fd.doc.getField("textSigle").stringValue(), "ABC-123-0002"); |
| 483 | assertEquals(fd.doc.getField("title").stringValue(), "Die Wahlverwandtschaften"); |
| 484 | assertEquals(fd.doc.getField("author").stringValue(), "Johann Wolfgang von Goethe"); |
| 485 | assertEquals(fd.doc.getField("textClass").stringValue(), "reisen wissenschaft"); |
| 486 | assertEquals(fd.doc.getField("pubDate").stringValue(), "20130617"); |
| 487 | assertEquals(fd.doc.getField("WikiLink").stringValue(), "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel"); |
| 488 | } |
| 489 | |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 490 | }; |