Improve keyword indexing so fields are retrievable as keyword lists
Change-Id: Iad74b910ef66bbf684fa06ad1e6bac848a4da9ff
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 81cb49f..b7cf517 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -67,11 +67,14 @@
tvNoField.setStoreTermVectorPayloads(true);
tvNoField.setStoreTermVectorOffsets(false);
+ keywords.setStoreTermVectors(false);
+ /*
keywords.setStoreTermVectors(true);
keywords.setStoreTermVectorPositions(false);
keywords.setStoreTermVectorPayloads(false);
keywords.setStoreTermVectorOffsets(false);
- keywords.setIndexOptions(IndexOptions.DOCS);
+ */
+ keywords.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
};
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 009e18b..81d4c2e 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -91,7 +91,7 @@
private static final int CONTEXT = -99998;
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
// Mapper for JSON serialization
ObjectMapper mapper = new ObjectMapper();
diff --git a/src/main/java/de/ids_mannheim/korap/response/MetaField.java b/src/main/java/de/ids_mannheim/korap/response/MetaField.java
index 095dd89..e997313 100644
--- a/src/main/java/de/ids_mannheim/korap/response/MetaField.java
+++ b/src/main/java/de/ids_mannheim/korap/response/MetaField.java
@@ -56,7 +56,7 @@
};
}
- // Value is textual
+ // Value is textual or keywords
else {
// Value is a list
if (this.values.size() > 1) {
@@ -69,7 +69,7 @@
}
// Value is a single
- else {
+ else if (this.values.size() > 0) {
json.put("value", this.values.get(0));
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/response/MetaFields.java b/src/main/java/de/ids_mannheim/korap/response/MetaFields.java
index 8e0bd42..213d2d6 100644
--- a/src/main/java/de/ids_mannheim/korap/response/MetaFields.java
+++ b/src/main/java/de/ids_mannheim/korap/response/MetaFields.java
@@ -11,6 +11,14 @@
import com.fasterxml.jackson.databind.node.ArrayNode;
import de.ids_mannheim.korap.index.AbstractDocument;
+import java.io.IOException;
+
+import de.ids_mannheim.korap.index.KeywordAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.StringReader;
+
import java.util.*;
import org.apache.lucene.index.*;
@@ -77,15 +85,45 @@
// Field has a textual value
else if (s != null) {
- // Field is not indexed
+ // Stored
if (iFieldType.indexOptions() == IndexOptions.NONE) {
mf.type = "type:store";
+ mf.values.add(s.toString());
}
+
+ // Keywords
+ else if (iFieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS) {
+ mf.type = "type:keywords";
+
+ // Analyze keywords
+ try {
+ StringReader reader = new StringReader(s.toString());
+ KeywordAnalyzer kwa = new KeywordAnalyzer();
+ TokenStream ts = kwa.tokenStream("-", reader);
+ CharTermAttribute term;
+ ts.reset();
+ while (ts.incrementToken()) {
+ term = ts.getAttribute(CharTermAttribute.class);
+ mf.values.add(term.toString());
+ };
+ ts.close();
+ reader.close();
+ }
+ catch (IOException e) {
+ log.error("Unable to split {}={}", iField.name(), s.toString());
+ }
+ }
+
+ // Text
else if (iFieldType.indexOptions() != IndexOptions.DOCS) {
mf.type = "type:text";
- };
+ mf.values.add(s.toString());
+ }
- mf.values.add(s.toString());
+ // String
+ else {
+ mf.values.add(s.toString());
+ };
}
else {
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
index 610df0b..36360c5 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
@@ -17,7 +17,7 @@
private final static Logger log = LoggerFactory.getLogger(Match.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
private LinkedList<HighlightCombinatorElement> combine;
private Stack<Integer> balanceStack = new Stack<>();
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
index 272a9f1..c39825a 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
@@ -34,7 +34,7 @@
private final static Logger log = LoggerFactory.getLogger(Match.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
// Constructor for highlighting elements
public HighlightCombinatorElement (byte type, int number) {
diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml
index 20f61ae..fa441c5 100644
--- a/src/main/resources/log4j2.xml
+++ b/src/main/resources/log4j2.xml
@@ -19,6 +19,11 @@
level="trace">
<AppenderRef ref="Console"/>
</Logger>
+ <Logger name="de.ids_mannheim.korap.KrillIndex"
+ additivity="false"
+ level="trace">
+ <AppenderRef ref="Console"/>
+ </Logger>
-->
</Loggers>
</Configuration>
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
index f15af52..3ca878d 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
@@ -242,7 +242,7 @@
break;
case "keyword":
- assertEquals("type:string", field.at("/type").asText());
+ assertEquals("type:keywords", field.at("/type").asText());
assertEquals("koral:field", field.at("/@type").asText());
assertEquals("baum", field.at("/value/0").asText());
assertEquals("wald", field.at("/value/1").asText());
@@ -301,13 +301,10 @@
break;
case "foundries":
- // TODO:
- // This should better be an array!
- assertEquals("type:string", field.at("/type").asText());
- assertEquals("dereko dereko/structure " +
- "dereko/structure/base-sentences-paragraphs-pagebreaks "+
- "lwc lwc/dependency treetagger treetagger/morpho",
- field.at("/value").asText());
+ assertEquals("type:keywords", field.at("/type").asText());
+ assertEquals("dereko", field.at("/value/0").asText());
+ assertEquals("dereko/structure", field.at("/value/1").asText());
+ assertEquals("dereko/structure/base-sentences-paragraphs-pagebreaks", field.at("/value/2").asText());
break;
};
};