blob: c869c0a13821ecfa976af5bcf8024bfc3abb9d4d [file] [log] [blame]
package de.ids_mannheim.korap.analysis;
import de.ids_mannheim.korap.analysis.MultiTerm;
import de.ids_mannheim.korap.analysis.MultiTermToken;
import static de.ids_mannheim.korap.util.KorapByte.*;
import org.apache.lucene.util.BytesRef;
import java.util.*;
import java.util.regex.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Reader;
import java.io.IOException;
/*
Todo:
- !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
- Payload is [4ByteOffsetStart][4ByteOffsetEnd]
*/
/**
 * A Lucene {@link TokenStream} that emits the terms of a list of
 * {@link MultiTermToken}s, one {@link MultiTerm} per call to
 * {@link #incrementToken()}.
 *
 * <p>The stream is filled either programmatically through the
 * {@code addMultiTermToken} methods or by parsing a bracketed string
 * representation such as {@code [a|b][c]}: every bracket pair becomes one
 * MultiTermToken and the {@code |}-separated segments become its terms.
 *
 * @author Nils Diewald
 * @version 0.3
 * @see org.apache.lucene.analysis.TokenStream
 */
public class MultiTermTokenStream extends TokenStream {

    /*
      TODO: Update to new TokenStream API
      http://www.hankcs.com/program/java/lucene-4-6-1-java-lang-illegalstateexception-tokenstream-contract-violation.html
    */

    /**
     * Matches one bracketed token: "[", an optional "(digits-digits)" prefix
     * that is matched but not captured, the token content (group 1), and "]".
     */
    private static final Pattern TOKEN_PATTERN =
        Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");

    // Compile-time switch: while this is false the guarded trace calls are dead code.
    public static final boolean DEBUG = false;

    // One logger for the class instead of one per stream instance.
    private static final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);

    private final CharTermAttribute charTermAttr;
    private final PositionIncrementAttribute posIncrAttr;
    private final PayloadAttribute payloadAttr;

    private final List<MultiTermToken> multiTermTokens;

    // Cursor: index of the current token and of the next term inside that token.
    private int mttIndex = 0, mtIndex = 0;

    /**
     * Creates an empty stream and registers the term, position increment
     * and payload attributes.
     */
    public MultiTermTokenStream () {
        this.charTermAttr    = this.addAttribute(CharTermAttribute.class);
        this.posIncrAttr     = this.addAttribute(PositionIncrementAttribute.class);
        this.payloadAttr     = this.addAttribute(PayloadAttribute.class);
        this.multiTermTokens = new ArrayList<MultiTermToken>(100);
    }

    /**
     * Creates a stream from its string representation.
     *
     * @param stream The MultiTermTokenStream as a string representation.
     */
    public MultiTermTokenStream (String stream) {
        this();
        this._fromString(stream);
    }

    /**
     * Creates a stream from a reader. The reader is consumed completely
     * before parsing; it is not closed by this constructor.
     *
     * @param stream The MultiTermTokenStream as a reader object.
     * @throws IOException If reading from the reader fails.
     */
    public MultiTermTokenStream (Reader stream) throws IOException {
        this();
        StringBuilder sb = new StringBuilder(4096);
        char[] buf = new char[128];
        int charsRead;
        while ((charsRead = stream.read(buf)) > 0) {
            sb.append(buf, 0, charsRead);
        }
        this._fromString(sb.toString());
    }

    /**
     * Add a MultiTermToken to the end of the MultiTermTokenStream.
     *
     * @param mtt A MultiTermToken.
     */
    public void addMultiTermToken (MultiTermToken mtt) {
        this.multiTermTokens.add(mtt);
    }

    /**
     * Add a MultiTermToken by means of MultiTerms to the end of
     * the MultiTermTokenStream.
     *
     * @param term The first MultiTerm of the token.
     * @param moreTerms Any further MultiTerms of the token.
     */
    public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
        this.addMultiTermToken(new MultiTermToken(term, moreTerms));
    }

    /**
     * Add a MultiTermToken by means of a single MultiTerm to the end of
     * the MultiTermTokenStream.
     *
     * @param prefix A prefix character of a surface form of a MultiTerm.
     * @param surface A surface string of a MultiTerm.
     */
    public void addMultiTermToken (char prefix, String surface) {
        this.addMultiTermToken(new MultiTermToken(prefix, surface));
    }

    /**
     * Add a MultiTermToken by means of a series of surface strings
     * to the end of the MultiTermTokenStream.
     *
     * @param surface The first surface string of the token.
     * @param moreTerms Any further surface strings of the token.
     */
    public void addMultiTermToken (String surface, String ... moreTerms) {
        this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
    }

    /**
     * Add meta information to the MultiTermTokenStream.
     * The information is stored as a '-'-prefixed term on the first token.
     *
     * @param key A string for denoting the meta information.
     * @param value The value of the meta key as a string.
     * @throws IndexOutOfBoundsException If the stream contains no token yet.
     */
    public void addMeta (String key, String value) {
        MultiTerm mt = new MultiTerm('-', key);
        mt.setPayload(value);
        this.addToFirstToken(mt);
    }

    /**
     * Add meta information to the MultiTermTokenStream.
     * The information is stored as a '-'-prefixed term on the first token.
     *
     * @param key A string for denoting the meta information.
     * @param value The value of the meta key as a byte array.
     * @throws IndexOutOfBoundsException If the stream contains no token yet.
     */
    public void addMeta (String key, byte[] value) {
        MultiTerm mt = new MultiTerm('-', key);
        mt.setPayload(value);
        this.addToFirstToken(mt);
    }

    /**
     * Add meta information to the MultiTermTokenStream.
     * The information is stored as a '-'-prefixed term on the first token.
     *
     * @param key A string for denoting the meta information.
     * @param value The value of the meta key as a short value.
     * @throws IndexOutOfBoundsException If the stream contains no token yet.
     */
    public void addMeta (String key, short value) {
        MultiTerm mt = new MultiTerm('-', key);
        mt.setPayload(value);
        this.addToFirstToken(mt);
    }

    /**
     * Add meta information to the MultiTermTokenStream.
     * The information is stored as a '-'-prefixed term on the first token.
     *
     * @param key A string for denoting the meta information.
     * @param value The value of the meta key as a long value.
     * @throws IndexOutOfBoundsException If the stream contains no token yet.
     */
    public void addMeta (String key, long value) {
        MultiTerm mt = new MultiTerm('-', key);
        mt.setPayload(value);
        this.addToFirstToken(mt);
    }

    /**
     * Add meta information to the MultiTermTokenStream.
     * The information is stored as a '-'-prefixed term on the first token.
     *
     * @param key A string for denoting the meta information.
     * @param value The value of the meta key as an integer value.
     * @throws IndexOutOfBoundsException If the stream contains no token yet.
     */
    public void addMeta (String key, int value) {
        MultiTerm mt = new MultiTerm('-', key);
        mt.setPayload(value);
        this.addToFirstToken(mt);
    }

    /**
     * Increment the token in the MultiTermTokenStream.
     * This overrides the function in Lucene's TokenStream.
     *
     * <p>Each call exposes the next MultiTerm through the term, position
     * increment and payload attributes. When all terms of all tokens have
     * been emitted, the cursor is rewound via {@link #reset()} and
     * {@code false} is returned.
     *
     * @return {@code true} if a term was emitted, {@code false} if the
     *         stream is exhausted.
     */
    @Override
    public final boolean incrementToken() throws IOException {
        this.payloadAttr.setPayload(null);

        // Last token reached
        if (this.multiTermTokens.size() == this.mttIndex) {
            reset();
            return false;
        }

        // Get current token
        MultiTermToken mtt = this.multiTermTokens.get(this.mttIndex);

        // Last term of the current token reached
        if (mtt.terms.size() == this.mtIndex) {
            this.mtIndex = 0;
            this.mttIndex++;

            // Last term of last token reached
            if (this.multiTermTokens.size() == this.mttIndex) {
                reset();
                return false;
            }

            // Get next token
            mtt = this.multiTermTokens.get(this.mttIndex);
        }

        // Get current term
        MultiTerm mt = mtt.terms.get(this.mtIndex);

        // Set the relative position to the former term
        posIncrAttr.setPositionIncrement(mt.posIncr);
        charTermAttr.setEmpty();
        charTermAttr.append(mt.term);

        BytesRef payload = new BytesRef();

        // There is offset information
        if (mt.start != mt.end) {
            if (DEBUG) {
                log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
            }

            // Add offsets to BytesRef payload: start first, then end
            payload.append(new BytesRef(int2byte(mt.start)));
            payload.append(new BytesRef(int2byte(mt.end)));
        }

        // There is payload in the MultiTerm
        if (mt.payload != null) {
            payload.append(mt.payload);
            if (DEBUG) {
                log.trace("Create payload[1] {}", payload.toString());
            }
        }

        // There is payload in the current token to index
        if (payload.length > 0) {
            payloadAttr.setPayload(payload);
            if (DEBUG) {
                log.trace("Set payload[2] {}", payload.toString());
            }
        }

        if (DEBUG) {
            StringBuilder sb = new StringBuilder("Index: [");
            sb.append(mt.term);
            if (payload.length > 0) {
                sb.append('$').append(payload.toString());
            }
            sb.append(']');
            sb.append(" with increment ").append(mt.posIncr);
            log.trace(sb.toString());
        }

        this.mtIndex++;
        return true;
    }

    /**
     * Serializes the stream by concatenating the string representations
     * of all its MultiTermTokens.
     *
     * @return The concatenated token strings.
     */
    @Override
    public String toString () {
        // Purely local buffer, so the unsynchronized StringBuilder suffices.
        StringBuilder sb = new StringBuilder();
        for (MultiTermToken mtt : this.multiTermTokens) {
            sb.append(mtt.toString());
        }
        return sb.toString();
    }

    /**
     * Rewinds the cursor to the first term of the first token.
     */
    @Override
    public void reset() {
        this.mttIndex = 0;
        this.mtIndex = 0;
    }

    // Appends a term to the first token; fails with IndexOutOfBoundsException on an empty stream.
    private void addToFirstToken (MultiTerm mt) {
        this.multiTermTokens.get(0).add(mt);
    }

    // Parses the bracketed string representation and appends one token per bracket pair.
    private void _fromString (String stream) {
        Matcher matcher = TOKEN_PATTERN.matcher(stream);
        while (matcher.find()) {
            // NOTE(review): content made up solely of '|' characters splits into an
            // empty array, so seg[0] below throws ArrayIndexOutOfBoundsException.
            String[] seg = matcher.group(1).split("\\|");
            MultiTermToken mtt = new MultiTermToken(seg[0]);

            // Local counter: the former shared static short was unsafe across
            // instances/threads and could overflow.
            for (int i = 1; i < seg.length; i++) {
                mtt.add(seg[i]);
            }
            this.addMultiTermToken(mtt);
        }
    }
}