| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| 3 | import java.util.*; |
| 4 | import java.io.IOException; |
| 5 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 6 | import static org.junit.Assert.*; |
| 7 | |
| Nils Diewald | e4986d7 | 2015-02-27 17:35:00 +0000 | [diff] [blame] | 8 | import de.ids_mannheim.korap.index.MultiTermTokenStream; |
| 9 | import de.ids_mannheim.korap.index.MultiTermToken; |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 10 | import de.ids_mannheim.korap.util.CorpusDataException; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 11 | |
| 12 | import org.apache.lucene.document.Document; |
| 13 | import org.apache.lucene.document.TextField; |
| 14 | import org.apache.lucene.document.StringField; |
| 15 | import org.apache.lucene.document.IntField; |
| 16 | import org.apache.lucene.document.Field; |
| 17 | import org.apache.lucene.document.FieldType; |
| 18 | import org.apache.lucene.index.IndexWriter; |
| 19 | |
| 20 | /** |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 21 | * Helper class for testing the KrillIndex framework (Normal). |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 22 | * |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 23 | * @author diewald |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 24 | */ |
| 25 | public class Test { |
| 26 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 27 | public static void addDoc (IndexWriter w, Map<String, String> m) |
| 28 | throws IOException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 29 | Document doc = new Document(); |
| 30 | String[] strInt = { "pubDate" }; |
| 31 | String[] strStr = { "id", "corpus", "pubPlace" }; |
| 32 | String[] strTxt = { "title", "subtitle", "textClass" }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 33 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 34 | // Text fields |
| 35 | for (String s : strTxt) { |
| 36 | doc.add(new TextField(s, m.get(s), Field.Store.YES)); |
| 37 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 38 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 39 | // String fields |
| 40 | for (String s : strStr) { |
| 41 | doc.add(new StringField(s, m.get(s), Field.Store.YES)); |
| 42 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 43 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 44 | // Integer fields |
| 45 | for (String s : strInt) { |
| 46 | doc.add(new IntField(s, Integer.parseInt(m.get(s)), Field.Store.YES)); |
| 47 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 48 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 49 | FieldType textFieldWithTermVectors = new FieldType( |
| 50 | TextField.TYPE_STORED); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 51 | textFieldWithTermVectors.setStoreTermVectors(true); |
| 52 | textFieldWithTermVectors.setStoreTermVectorOffsets(true); |
| 53 | textFieldWithTermVectors.setStoreTermVectorPositions(true); |
| 54 | textFieldWithTermVectors.setStoreTermVectorPayloads(true); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 55 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 56 | Field textFieldAnalyzed = new Field("text", m.get("textStr"), |
| 57 | textFieldWithTermVectors); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 58 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 59 | MultiTermTokenStream ts = getTermVector(m.get("text")); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 60 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 61 | textFieldAnalyzed.setTokenStream(ts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 62 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 63 | doc.add(textFieldAnalyzed); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 64 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 65 | // Add document to writer |
| 66 | w.addDocument(doc); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 67 | }; |
| 68 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 69 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 70 | public static MultiTermTokenStream getTermVector (String stream) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 71 | MultiTermTokenStream ts = new MultiTermTokenStream(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 72 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 73 | int pos = 0; |
| 74 | for (String seg : stream.split(" ")) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 75 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 76 | String[] tokseg = seg.split("\\|"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 77 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 78 | try { |
| 79 | MultiTermToken mtt = new MultiTermToken('s', tokseg[0]); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 80 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 81 | mtt.add("T"); |
| 82 | mtt.add('i', tokseg[0].toLowerCase()); |
| 83 | mtt.add('p', tokseg[1]); |
| 84 | mtt.add('l', tokseg[2]); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 85 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 86 | if (tokseg.length == 4) { |
| 87 | for (String morph : tokseg[3].split(";")) { |
| 88 | mtt.add('m', morph); |
| 89 | } |
| 90 | }; |
| 91 | if (tokseg.length == 5) { |
| 92 | mtt.add('e', tokseg[4]); |
| 93 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 94 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 95 | ts.addMultiTermToken(mtt); |
| 96 | } |
| 97 | catch (CorpusDataException cde) { |
| 98 | fail(cde.getErrorCode() + ": " + cde.getMessage()); |
| 99 | }; |
| 100 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 101 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 102 | return ts; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 103 | }; |
| 104 | }; |