blob: 0e8d68bcfef08565769eb111eb80a84c59ea92de [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap;
2
3import java.util.*;
4import java.io.IOException;
5
6import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
7import de.ids_mannheim.korap.analysis.MultiTermToken;
8
9import org.apache.lucene.document.Document;
10import org.apache.lucene.document.TextField;
11import org.apache.lucene.document.StringField;
12import org.apache.lucene.document.IntField;
13import org.apache.lucene.document.Field;
14import org.apache.lucene.document.FieldType;
15import org.apache.lucene.index.IndexWriter;
16
17/**
18 * @author Nils Diewald
19 *
20 * Helper class for testing the KorapIndex framework (Normal).
21 */
22public class Test {
23
24 public static void addDoc(IndexWriter w, Map<String, String> m) throws IOException {
25 Document doc = new Document();
26 String[] strInt = { "pubDate" };
27 String[] strStr = { "id", "corpus", "pubPlace" };
28 String[] strTxt = { "title", "subtitle", "textClass" };
29
30 // Text fields
31 for (String s : strTxt) {
32 doc.add(new TextField(s, m.get(s), Field.Store.YES));
33 };
34
35 // String fields
36 for (String s : strStr) {
37 doc.add(new StringField(s, m.get(s), Field.Store.YES));
38 };
39
40 // Integer fields
41 for (String s : strInt) {
42 doc.add(new IntField(s, Integer.parseInt(m.get(s)), Field.Store.YES));
43 };
44
45 FieldType textFieldWithTermVectors = new FieldType(TextField.TYPE_STORED);
46 textFieldWithTermVectors.setStoreTermVectors(true);
47 textFieldWithTermVectors.setStoreTermVectorOffsets(true);
48 textFieldWithTermVectors.setStoreTermVectorPositions(true);
49 textFieldWithTermVectors.setStoreTermVectorPayloads(true);
50
51
52 Field textFieldAnalyzed = new Field(
53 "text",
54 m.get("textStr"),
55 textFieldWithTermVectors
56 );
57
58 MultiTermTokenStream ts = getTermVector(m.get("text"));
59
60 textFieldAnalyzed.setTokenStream( ts );
61
62 doc.add(textFieldAnalyzed);
63
64 // Add document to writer
65 w.addDocument(doc);
66 };
67
68 public static MultiTermTokenStream getTermVector (String stream) {
69 MultiTermTokenStream ts = new MultiTermTokenStream();
70
71 int pos = 0;
72 for (String seg : stream.split(" ")) {
73
74 String[] tokseg = seg.split("\\|");
75
76 MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
77
78 mtt.add("T");
79 mtt.add('i', tokseg[0].toLowerCase());
80 mtt.add('p', tokseg[1]);
81 mtt.add('l', tokseg[2]);
82
83 if (tokseg.length == 4) {
84 for (String morph : tokseg[3].split(";")) {
85 mtt.add('m', morph);
86 }
87 };
88 if (tokseg.length == 5) {
89 mtt.add('e', tokseg[4]);
90 };
91
92 ts.addMultiTermToken(mtt);
93 };
94
95 return ts;
96 };
97};