blob: 186654925c2a3f77d45b434bdd216d95139ed83e [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap;
2
3import java.util.*;
4import java.io.IOException;
5
Nils Diewald5c375702015-02-09 20:58:24 +00006import static org.junit.Assert.*;
7
Nils Diewalde4986d72015-02-27 17:35:00 +00008import de.ids_mannheim.korap.index.MultiTermTokenStream;
9import de.ids_mannheim.korap.index.MultiTermToken;
Nils Diewald5c375702015-02-09 20:58:24 +000010import de.ids_mannheim.korap.util.CorpusDataException;
Nils Diewaldf399a672013-11-18 17:55:22 +000011
12import org.apache.lucene.document.Document;
13import org.apache.lucene.document.TextField;
14import org.apache.lucene.document.StringField;
15import org.apache.lucene.document.IntField;
16import org.apache.lucene.document.Field;
17import org.apache.lucene.document.FieldType;
18import org.apache.lucene.index.IndexWriter;
19
20/**
Nils Diewalda14ecd62015-02-26 21:00:20 +000021 * Helper class for testing the KrillIndex framework (Normal).
Nils Diewaldbb33da22015-03-04 16:24:25 +000022 *
Nils Diewald5c375702015-02-09 20:58:24 +000023 * @author diewald
Nils Diewaldf399a672013-11-18 17:55:22 +000024 */
25public class Test {
26
Nils Diewaldbb33da22015-03-04 16:24:25 +000027 public static void addDoc (IndexWriter w, Map<String, String> m)
28 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +000029 Document doc = new Document();
30 String[] strInt = { "pubDate" };
31 String[] strStr = { "id", "corpus", "pubPlace" };
32 String[] strTxt = { "title", "subtitle", "textClass" };
Nils Diewaldf399a672013-11-18 17:55:22 +000033
Nils Diewald5c375702015-02-09 20:58:24 +000034 // Text fields
35 for (String s : strTxt) {
36 doc.add(new TextField(s, m.get(s), Field.Store.YES));
37 };
Nils Diewaldf399a672013-11-18 17:55:22 +000038
Nils Diewald5c375702015-02-09 20:58:24 +000039 // String fields
40 for (String s : strStr) {
41 doc.add(new StringField(s, m.get(s), Field.Store.YES));
42 };
Nils Diewaldf399a672013-11-18 17:55:22 +000043
Nils Diewald5c375702015-02-09 20:58:24 +000044 // Integer fields
45 for (String s : strInt) {
Eliza Margaretha6f989202016-10-14 21:48:29 +020046 doc.add(new IntField(s, Integer.parseInt(m.get(s)),
47 Field.Store.YES));
Nils Diewald5c375702015-02-09 20:58:24 +000048 };
Nils Diewaldf399a672013-11-18 17:55:22 +000049
Nils Diewaldbb33da22015-03-04 16:24:25 +000050 FieldType textFieldWithTermVectors = new FieldType(
51 TextField.TYPE_STORED);
Nils Diewald5c375702015-02-09 20:58:24 +000052 textFieldWithTermVectors.setStoreTermVectors(true);
53 textFieldWithTermVectors.setStoreTermVectorOffsets(true);
54 textFieldWithTermVectors.setStoreTermVectorPositions(true);
55 textFieldWithTermVectors.setStoreTermVectorPayloads(true);
Nils Diewaldf399a672013-11-18 17:55:22 +000056
Nils Diewaldbb33da22015-03-04 16:24:25 +000057 Field textFieldAnalyzed = new Field("text", m.get("textStr"),
58 textFieldWithTermVectors);
Nils Diewaldf399a672013-11-18 17:55:22 +000059
Nils Diewald5c375702015-02-09 20:58:24 +000060 MultiTermTokenStream ts = getTermVector(m.get("text"));
Nils Diewaldf399a672013-11-18 17:55:22 +000061
Nils Diewaldbb33da22015-03-04 16:24:25 +000062 textFieldAnalyzed.setTokenStream(ts);
Nils Diewaldf399a672013-11-18 17:55:22 +000063
Nils Diewald5c375702015-02-09 20:58:24 +000064 doc.add(textFieldAnalyzed);
Nils Diewaldf399a672013-11-18 17:55:22 +000065
Nils Diewald5c375702015-02-09 20:58:24 +000066 // Add document to writer
67 w.addDocument(doc);
Nils Diewaldf399a672013-11-18 17:55:22 +000068 };
69
Nils Diewaldbb33da22015-03-04 16:24:25 +000070
Nils Diewaldf399a672013-11-18 17:55:22 +000071 public static MultiTermTokenStream getTermVector (String stream) {
Nils Diewald5c375702015-02-09 20:58:24 +000072 MultiTermTokenStream ts = new MultiTermTokenStream();
Nils Diewaldf399a672013-11-18 17:55:22 +000073
Nils Diewald5c375702015-02-09 20:58:24 +000074 int pos = 0;
75 for (String seg : stream.split(" ")) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000076
Nils Diewald5c375702015-02-09 20:58:24 +000077 String[] tokseg = seg.split("\\|");
Nils Diewaldf399a672013-11-18 17:55:22 +000078
Nils Diewald5c375702015-02-09 20:58:24 +000079 try {
80 MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
Nils Diewaldbb33da22015-03-04 16:24:25 +000081
Nils Diewald5c375702015-02-09 20:58:24 +000082 mtt.add("T");
83 mtt.add('i', tokseg[0].toLowerCase());
84 mtt.add('p', tokseg[1]);
85 mtt.add('l', tokseg[2]);
Nils Diewaldf399a672013-11-18 17:55:22 +000086
Nils Diewald5c375702015-02-09 20:58:24 +000087 if (tokseg.length == 4) {
88 for (String morph : tokseg[3].split(";")) {
89 mtt.add('m', morph);
90 }
91 };
92 if (tokseg.length == 5) {
93 mtt.add('e', tokseg[4]);
94 };
Nils Diewaldbb33da22015-03-04 16:24:25 +000095
Nils Diewald5c375702015-02-09 20:58:24 +000096 ts.addMultiTermToken(mtt);
97 }
98 catch (CorpusDataException cde) {
99 fail(cde.getErrorCode() + ": " + cde.getMessage());
100 };
101 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102
Nils Diewald5c375702015-02-09 20:58:24 +0000103 return ts;
Nils Diewaldf399a672013-11-18 17:55:22 +0000104 };
105};