blob: ecea801cad3837d7b5a44857ad966b1f27ce6630 [file] [log] [blame]
package de.ids_mannheim.korap;
import java.util.*;
import java.io.*;
import static org.junit.Assert.*;
import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.analysis.*;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.util.QueryException;
import static de.ids_mannheim.korap.util.KorapByte.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.Bits;
/**
* @author Nils Diewald
*
* Helper class for testing the KorapIndex framework (Simple).
*/
public class TestSimple {
// Add document
public static void addDoc(IndexWriter w, Map<String, String> m) throws IOException {
Document doc = new Document();
FieldType textFieldWithTermVectors = new FieldType(TextField.TYPE_STORED);
textFieldWithTermVectors.setStoreTermVectors(true);
/*
No offsets are stored.
textFieldWithTermVectors.setStoreTermVectorOffsets(true);
*/
textFieldWithTermVectors.setStoreTermVectorPositions(true);
textFieldWithTermVectors.setStoreTermVectorPayloads(true);
Field textFieldAnalyzed = new Field(
"text",
m.get("textStr"),
textFieldWithTermVectors
);
MultiTermTokenStream ts = getTermVector(m.get("text"));
textFieldAnalyzed.setTokenStream( ts );
doc.add(textFieldAnalyzed);
// Add document to writer
w.addDocument(doc);
};
// Get Term Vector
public static MultiTermTokenStream getTermVector (String stream) {
MultiTermTokenStream ts = new MultiTermTokenStream();
int pos = 0;
for (String seg : stream.split(" ")) {
// System.err.println("** Prepare " + seg);
String[] tokens = seg.split("\\|");
int i = 0;
while (tokens[i].length() == 0)
i++;
MultiTermToken mtt = new MultiTermToken(tokens[i]);
// System.err.println("** Add term " + tokens[i]);
i++;
for (; i < tokens.length; i++) {
if (tokens[i].length() == 0)
continue;
mtt.add(tokens[i]);
};
ts.addMultiTermToken(mtt);
};
return ts;
};
// Get query wrapper based on json file
public static SpanQueryWrapper getJSONQuery (String jsonFile) {
SpanQueryWrapper sqwi;
try {
String json = getString(jsonFile);
sqwi = new KorapQuery("tokens").fromJson(json);
}
catch (QueryException e) {
fail(e.getMessage());
sqwi = new KorapQuery("tokens").seg("???");
};
return sqwi;
};
// Get string
public static String getString (String path) {
StringBuilder contentBuilder = new StringBuilder();
try {
BufferedReader in = new BufferedReader(new FileReader(path));
String str;
while ((str = in.readLine()) != null) {
contentBuilder.append(str);
};
in.close();
} catch (IOException e) {
fail(e.getMessage());
}
return contentBuilder.toString();
};
// getSpan Info
public static List<String> getSpanInfo (IndexReader reader, SpanQuery query)
throws IOException {
Map<Term, TermContext> termContexts = new HashMap<>();
List<String> spanArray = new ArrayList<>();
for (AtomicReaderContext atomic : reader.leaves()) {
Bits bitset = atomic.reader().getLiveDocs();
// Spans spans = NearSpansOrdered();
Spans spans = query.getSpans(atomic, bitset, termContexts);
while (spans.next()) {
StringBuffer payloadString = new StringBuffer();
int docid = atomic.docBase + spans.doc();
if (spans.isPayloadAvailable()) {
for (byte[] payload : spans.getPayload()) {
/* retrieve payload for current matching span */
payloadString.append(byte2int(payload)).append(",");
payloadString.append(byte2int(payload, 2));
// payloadString.append(byte2int(payload, 1));
payloadString.append(" (" + payload.length + ")");
payloadString.append(" | ");
};
};
spanArray.add(
"Doc: " +
docid +
" with " +
spans.start() +
"-" +
spans.end() +
" || " +
payloadString.toString()
);
};
};
return spanArray;
};
};