blob: f9ff3d6a7ffe5b225d36751d88e312f404841783 [file] [log] [blame]
package de.ids_mannheim.korap.analysis;
import static de.ids_mannheim.korap.util.KorapArray.*;
import org.apache.lucene.util.BytesRef;
import java.nio.ByteBuffer;
import java.util.*;
/**
 * MultiTerm represents a term in a MultiTermToken.
 *
 * A MultiTerm consists of a term surface, optional character offsets
 * (start and end) and an optional payload. It can be built from a
 * compact string notation (see {@link #MultiTerm(String)}) or be
 * populated through its setters.
 *
 * Parsing keeps all intermediate state in local variables, so
 * constructing instances does not touch any shared static data.
 *
 * @author Nils Diewald
 * @version 0.3
 */
public class MultiTerm {

    /** Start offset of the term (0 unless set or parsed). */
    public int start = 0;

    /** End offset of the term (0 unless set or parsed). */
    public int end = 0;

    /** The term surface. */
    public String term = null;

    /** Position increment of the term, initialised to 1. */
    public Integer posIncr = 1;

    /** Flag indicating that the term contains stored offsets. */
    public boolean storeOffsets = false;

    /** The payload of the term, or null if there is none. */
    public BytesRef payload = null;

    // Initial size and growth step of the buffer used for typed payloads.
    // Eight bytes is the size of the largest supported value (a long).
    private static final int PAYLOAD_BUFFER_STEP = 8;


    /**
     * The constructor.
     *
     * @param term The term surface.
     *        Offsets can be written as an appended and dash separated
     *        pair of integers following a hash sign, payloads can be
     *        written following a dollar sign.
     *        Payloads can be typed as being a byte (b), a short (s),
     *        an integer (i), or a long (l) in leading angular brackets.
     *        Several typed values may follow each other and are
     *        concatenated. All other payloads are treated as being
     *        UTF-8 character sequences.
     *
     *        Examples:
     *        <pre>
     *        MultiTerm test = new MultiTerm("test");
     *        MultiTerm test = new MultiTerm("test#0-4");
     *        MultiTerm test = new MultiTerm("test#0-4$Example");
     *        MultiTerm test = new MultiTerm("test#0-4$&lt;i&gt;1278");
     *        </pre>
     *
     *        Strings that are malformed fail silently.
     */
    public MultiTerm (String term) {
        _fromString(term);
    }


    /**
     * The constructor with a separated prefix.
     * new MultiTerm('a', "bcd") is equivalent to
     * new MultiTerm("a:bcd");
     *
     * @param prefix A special prefix for the term.
     * @param term The term surface.
     *
     * @see #MultiTerm(String)
     */
    public MultiTerm (char prefix, String term) {
        StringBuilder sb = new StringBuilder();
        _fromString(sb.append(prefix).append(':').append(term).toString());
    }


    /**
     * The empty constructor.
     * The term surface is initialised with an empty string.
     */
    public MultiTerm () {
        this.term = "";
    }


    /**
     * Sets the term value.
     *
     * @param term The term as a string
     */
    public void setTerm (String term) {
        this.term = term;
    }


    /**
     * Returns the term value.
     *
     * @return The term value.
     */
    public String getTerm () {
        return this.term;
    }


    /**
     * Set the payload as a byte value.
     *
     * @param pl The payload. Must not be null, as the value is unboxed.
     */
    public void setPayload (Byte pl) {
        this.payload = new BytesRef(ByteBuffer.allocate(1).put(pl).array());
    }


    /**
     * Set the payload as a short value.
     *
     * @param pl The payload.
     */
    public void setPayload (short pl) {
        this.payload = new BytesRef(ByteBuffer.allocate(2).putShort(pl).array());
    }


    /**
     * Set the payload as an integer value.
     *
     * @param pl The payload.
     */
    public void setPayload (int pl) {
        this.payload = new BytesRef(ByteBuffer.allocate(4).putInt(pl).array());
    }


    /**
     * Set the payload as a long value.
     *
     * @param pl The payload.
     */
    public void setPayload (long pl) {
        this.payload = new BytesRef(ByteBuffer.allocate(8).putLong(pl).array());
    }


    /**
     * Set the payload as a string value.
     *
     * @param pl The payload.
     */
    public void setPayload (String pl) {
        this.payload = new BytesRef(pl);
    }


    /**
     * Set the payload as a byte array.
     *
     * @param pl The payload.
     */
    public void setPayload (byte[] pl) {
        this.payload = new BytesRef(pl);
    }


    /**
     * Set the payload as a BytesRef.
     * The reference is stored as given, it is not copied.
     *
     * @param pl The payload.
     */
    public void setPayload (BytesRef pl) {
        this.payload = pl;
    }


    /**
     * Get the payload.
     *
     * @return The payload as a BytesRef, or null if no payload is set.
     */
    public BytesRef getPayload () {
        return this.payload;
    }


    /**
     * Set the start position of the term.
     *
     * @param value The start position.
     */
    public void setStart (int value) {
        this.start = value;
    }


    /**
     * Get the start position.
     *
     * @return The start position.
     */
    public int getStart () {
        return this.start;
    }


    /**
     * Set the end position of the term.
     *
     * @param value The end position.
     */
    public void setEnd (int value) {
        this.end = value;
    }


    /**
     * Get the end position.
     *
     * @return The end position.
     */
    public int getEnd () {
        return this.end;
    }


    /**
     * Set the flag for stored offsets.
     *
     * @param value Boolean value indicating that the term
     *        contains stored offsets.
     */
    public void hasStoredOffsets (boolean value) {
        this.storeOffsets = value;
    }


    /**
     * Check if there are offsets stored.
     *
     * @return Boolean value indicating that the term
     *         contains stored offsets.
     */
    public boolean hasStoredOffsets () {
        return this.storeOffsets;
    }


    /*
     * Populate term, offsets and payload from the compact string
     * notation "surface#start-end$payload". Everything after the
     * first dollar sign is the payload, everything between the first
     * hash sign and the dollar sign is the offset pair.
     */
    private void _fromString (String term) {
        String[] termSurface = term.split("\\$", 2);

        // Payload is given
        if (termSurface.length == 2) {
            this.payload = parsePayload(termSurface[1]);
        }

        // Parse offset information
        String[] surfaceAndOffset = termSurface[0].split("\\#", 2);
        if (surfaceAndOffset.length == 2) {
            parseOffsets(surfaceAndOffset[1]);
        }

        this.term = surfaceAndOffset[0];
    }


    /*
     * Turn the payload part of the string notation into a BytesRef.
     * Returns null if a typed payload is malformed, so the payload
     * of the term stays unset in that case.
     */
    private static BytesRef parsePayload (String payloadStr) {

        // A typed payload starts with a marker like "<i>". The length
        // check keeps empty or very short payloads from raising an
        // index error; they are stored as plain strings instead.
        boolean typed = payloadStr.length() >= 3
            && payloadStr.charAt(0) == '<'
            && payloadStr.charAt(2) == '>';

        // Payload is a string
        if (!typed) {
            return new BytesRef(payloadStr);
        }

        // Split payload at type marker boundaries
        String[] parts = payloadStr.split("(?=<)|(?<=>)");

        // Java 7 yields an empty leading element for the zero-width
        // match at the beginning of the string, Java 8 and later do
        // not. Skip the element only if it is actually there.
        int idx = (parts.length > 0 && parts[0].isEmpty()) ? 1 : 0;

        ByteBuffer buffer = ByteBuffer.allocate(PAYLOAD_BUFFER_STEP);

        try {
            // The parts alternate between type marker and value
            for (; idx < parts.length; idx += 2) {
                String marker = parts[idx];

                boolean known = "<b>".equals(marker)
                    || "<s>".equals(marker)
                    || "<i>".equals(marker)
                    || "<l>".equals(marker);

                // Unknown markers are skipped together with their value
                if (!known) {
                    continue;
                }

                // A marker without a value is malformed
                if (idx + 1 >= parts.length) {
                    return null;
                }

                // Make sure even the largest value (a long) fits
                if (buffer.remaining() < PAYLOAD_BUFFER_STEP) {
                    ByteBuffer grown = ByteBuffer.allocate(
                        buffer.capacity() + PAYLOAD_BUFFER_STEP
                    );
                    buffer.flip();
                    grown.put(buffer);
                    buffer = grown;
                }

                String value = parts[idx + 1];
                switch (marker) {
                case "<b>": // byte
                    buffer.put(Byte.parseByte(value));
                    break;
                case "<s>": // short
                    buffer.putShort(Short.parseShort(value));
                    break;
                case "<i>": // integer
                    buffer.putInt(Integer.parseInt(value));
                    break;
                case "<l>": // long
                    buffer.putLong(Long.parseLong(value));
                    break;
                default:
                    // Not reachable, only known markers get here
                    break;
                }
            }
        }
        catch (NumberFormatException e) {
            // Malformed numbers fail silently, no payload is set
            return null;
        }

        // Only the bytes written so far belong to the payload
        return new BytesRef(Arrays.copyOf(buffer.array(), buffer.position()));
    }


    /*
     * Parse an offset pair like "0-4" and store it in start and end.
     * Both values are parsed before either is assigned, so a malformed
     * pair leaves both offsets untouched.
     */
    private void parseOffsets (String offsetStr) {

        // Split start and end position of the offset
        String[] offset = offsetStr.split("\\-", 2);

        // Start and end have to be given
        if (offset.length != 2 || offset[0].isEmpty()) {
            return;
        }

        try {
            int parsedStart = Integer.parseInt(offset[0]);
            int parsedEnd = Integer.parseInt(offset[1]);
            this.start = parsedStart;
            this.end = parsedEnd;
        }
        catch (NumberFormatException e) {
            // Malformed offsets fail silently
        }
    }


    /*
     * Append the payload following a dollar sign, if there is one.
     * The payload is written as a UTF-8 character sequence. In case
     * utf8ToString() raises an AssertionError, the payload is written
     * as "<?>" followed by its toString() form with blanks replaced
     * by commas.
     */
    private void appendPayload (StringBuilder sb) {
        if (this.payload == null) {
            return;
        }

        sb.append('$');
        try {
            sb.append(this.payload.utf8ToString());
        }
        catch (AssertionError e) {
            sb.append("<?>")
              .append(this.payload.toString().replace(' ', ','));
        }
    }


    /**
     * Represent the MultiTerm as a string.
     * Offsets are attached following a hash sign,
     * payloads are attached following a dollar sign.
     * All payloads are written as UTF-8 character sequences.
     * Offsets are only written if start and end differ.
     *
     * @return The string representation including offsets.
     *
     * @see #toStringShort()
     */
    @Override
    public String toString () {
        StringBuilder sb = new StringBuilder(this.term);

        if (this.start != this.end) {
            sb.append('#')
              .append(this.start)
              .append('-')
              .append(this.end);
        }

        appendPayload(sb);
        return sb.toString();
    }


    /**
     * Represent the MultiTerm as a string.
     * Payloads are attached following a dollar sign.
     * All payloads are written as UTF-8 character sequences.
     * Offsets are neglected.
     *
     * @return The string representation without offsets.
     *
     * @see #toString()
     */
    public String toStringShort () {
        StringBuilder sb = new StringBuilder(this.term);
        appendPayload(sb);
        return sb.toString();
    }
}