blob: 9f338035eaf42835292044c9674306ccfafa2c21 [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap;
2
3import java.util.*;
4import java.io.IOException;
5
Nils Diewald5c375702015-02-09 20:58:24 +00006import static org.junit.Assert.*;
7
Nils Diewalde4986d72015-02-27 17:35:00 +00008import de.ids_mannheim.korap.index.MultiTermTokenStream;
9import de.ids_mannheim.korap.index.MultiTermToken;
Nils Diewald5c375702015-02-09 20:58:24 +000010import de.ids_mannheim.korap.util.CorpusDataException;
Nils Diewaldf399a672013-11-18 17:55:22 +000011
12import org.apache.lucene.document.Document;
13import org.apache.lucene.document.TextField;
14import org.apache.lucene.document.StringField;
15import org.apache.lucene.document.IntField;
16import org.apache.lucene.document.Field;
17import org.apache.lucene.document.FieldType;
18import org.apache.lucene.index.IndexWriter;
19
20/**
Nils Diewalda14ecd62015-02-26 21:00:20 +000021 * Helper class for testing the KrillIndex framework (Normal).
Nils Diewaldbb33da22015-03-04 16:24:25 +000022 *
Nils Diewald5c375702015-02-09 20:58:24 +000023 * @author diewald
Nils Diewaldf399a672013-11-18 17:55:22 +000024 */
25public class Test {
26
Nils Diewaldbb33da22015-03-04 16:24:25 +000027 public static void addDoc (IndexWriter w, Map<String, String> m)
28 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +000029 Document doc = new Document();
30 String[] strInt = { "pubDate" };
31 String[] strStr = { "id", "corpus", "pubPlace" };
32 String[] strTxt = { "title", "subtitle", "textClass" };
Nils Diewaldf399a672013-11-18 17:55:22 +000033
Nils Diewald5c375702015-02-09 20:58:24 +000034 // Text fields
35 for (String s : strTxt) {
36 doc.add(new TextField(s, m.get(s), Field.Store.YES));
37 };
Nils Diewaldf399a672013-11-18 17:55:22 +000038
Nils Diewald5c375702015-02-09 20:58:24 +000039 // String fields
40 for (String s : strStr) {
41 doc.add(new StringField(s, m.get(s), Field.Store.YES));
42 };
Nils Diewaldf399a672013-11-18 17:55:22 +000043
Nils Diewald5c375702015-02-09 20:58:24 +000044 // Integer fields
45 for (String s : strInt) {
46 doc.add(new IntField(s, Integer.parseInt(m.get(s)), Field.Store.YES));
47 };
Nils Diewaldf399a672013-11-18 17:55:22 +000048
Nils Diewaldbb33da22015-03-04 16:24:25 +000049 FieldType textFieldWithTermVectors = new FieldType(
50 TextField.TYPE_STORED);
Nils Diewald5c375702015-02-09 20:58:24 +000051 textFieldWithTermVectors.setStoreTermVectors(true);
52 textFieldWithTermVectors.setStoreTermVectorOffsets(true);
53 textFieldWithTermVectors.setStoreTermVectorPositions(true);
54 textFieldWithTermVectors.setStoreTermVectorPayloads(true);
Nils Diewaldf399a672013-11-18 17:55:22 +000055
Nils Diewaldbb33da22015-03-04 16:24:25 +000056 Field textFieldAnalyzed = new Field("text", m.get("textStr"),
57 textFieldWithTermVectors);
Nils Diewaldf399a672013-11-18 17:55:22 +000058
Nils Diewald5c375702015-02-09 20:58:24 +000059 MultiTermTokenStream ts = getTermVector(m.get("text"));
Nils Diewaldf399a672013-11-18 17:55:22 +000060
Nils Diewaldbb33da22015-03-04 16:24:25 +000061 textFieldAnalyzed.setTokenStream(ts);
Nils Diewaldf399a672013-11-18 17:55:22 +000062
Nils Diewald5c375702015-02-09 20:58:24 +000063 doc.add(textFieldAnalyzed);
Nils Diewaldf399a672013-11-18 17:55:22 +000064
Nils Diewald5c375702015-02-09 20:58:24 +000065 // Add document to writer
66 w.addDocument(doc);
Nils Diewaldf399a672013-11-18 17:55:22 +000067 };
68
Nils Diewaldbb33da22015-03-04 16:24:25 +000069
Nils Diewaldf399a672013-11-18 17:55:22 +000070 public static MultiTermTokenStream getTermVector (String stream) {
Nils Diewald5c375702015-02-09 20:58:24 +000071 MultiTermTokenStream ts = new MultiTermTokenStream();
Nils Diewaldf399a672013-11-18 17:55:22 +000072
Nils Diewald5c375702015-02-09 20:58:24 +000073 int pos = 0;
74 for (String seg : stream.split(" ")) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000075
Nils Diewald5c375702015-02-09 20:58:24 +000076 String[] tokseg = seg.split("\\|");
Nils Diewaldf399a672013-11-18 17:55:22 +000077
Nils Diewald5c375702015-02-09 20:58:24 +000078 try {
79 MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
Nils Diewaldbb33da22015-03-04 16:24:25 +000080
Nils Diewald5c375702015-02-09 20:58:24 +000081 mtt.add("T");
82 mtt.add('i', tokseg[0].toLowerCase());
83 mtt.add('p', tokseg[1]);
84 mtt.add('l', tokseg[2]);
Nils Diewaldf399a672013-11-18 17:55:22 +000085
Nils Diewald5c375702015-02-09 20:58:24 +000086 if (tokseg.length == 4) {
87 for (String morph : tokseg[3].split(";")) {
88 mtt.add('m', morph);
89 }
90 };
91 if (tokseg.length == 5) {
92 mtt.add('e', tokseg[4]);
93 };
Nils Diewaldbb33da22015-03-04 16:24:25 +000094
Nils Diewald5c375702015-02-09 20:58:24 +000095 ts.addMultiTermToken(mtt);
96 }
97 catch (CorpusDataException cde) {
98 fail(cde.getErrorCode() + ": " + cde.getMessage());
99 };
100 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000101
Nils Diewald5c375702015-02-09 20:58:24 +0000102 return ts;
Nils Diewaldf399a672013-11-18 17:55:22 +0000103 };
104};