src/main/java/de/ids_mannheim/korap/model/MultiTerm.java - KorAP/Krill - Gitiles

 package de.ids_mannheim.korap.model;

 import static de.ids_mannheim.korap.util.KorapArray.*;
 import de.ids_mannheim.korap.util.CorpusDataException;
 import org.apache.lucene.util.BytesRef;
 import java.nio.ByteBuffer;
 import java.util.*;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /*
  * Don't use ByteBuffer!
  */
 /**
  * A MultiTerm represents a single term (e.g. a word, an annotation, a relation)
  * that can be part of a MultiTermToken.
  *
  * A MultiTerm consists of a term representation string, optional character offset
  * information that matches the term to the character stream of the input text,
  * and an arbitrary payload.
  *
  * There is a simple string representation of MultiTerms supported:
  * The string is the first sequence of characters.
  * Offsets are written as an appended and dash separated pair of integers.
  * Payloads are written following a dollar sign.
  * Payload segments can be typed as being a byte (b), a short (s), an integer (i),
  * or a long (l) value in leading angular brackets.
  * All other (untyped) payloads are treated as being UTF-8 character sequences.
  *
  * <blockquote><pre>
  *   MultiTerm test1 = new MultiTerm("test");
  *   MultiTerm test2 = new MultiTerm("test#0-4");
  *   MultiTerm test3 = new MultiTerm("test#0-4$Example");
  *   MultiTerm test4 = new MultiTerm("test#0-4$&lt;i&gt;1278");
  * </pre></blockquote>
  *
  * <strong>Warning</strong>: Strings that are malformed fail silently!
  *
  * @author diewald
  */
 public class MultiTerm implements Comparable<MultiTerm> {
     public int start = 0;
     public int end = 0;
     public String term = null;
     private boolean storeOffsets = false;
     public BytesRef payload = null;

     // This advices the java compiler to ignore all loggings
     public static final boolean DEBUG = false;

     // Shared by all instances, so constructing a term does not
     // trigger a logger lookup.
     private static final Logger log = LoggerFactory.getLogger(MultiTerm.class);


     /**
      * Construct a new MultiTerm object with an empty term,
      * no offsets and no payload.
      */
     public MultiTerm () {
         this.term = "";
     }


     /**
      * Construct a new MultiTerm object.
      *
      * @param term The term surface (see synopsis).
      * @throws CorpusDataException if offset information is given
      *         but is incomplete or not numeric.
      */
     public MultiTerm (String term) throws CorpusDataException {
         _fromString(term);
     }


     /**
      * Construct a new MultiTerm object.
      *
      * In addition to the normal surface representation,
      * this supports a prefix notation.
      * The following expressions are equal:
      *
      * <blockquote><pre>
      *   MultiTerm test1 = new MultiTerm('a', "bcd");
      *   MultiTerm test2 = new MultiTerm("a:bcd");
      * </pre></blockquote>
      *
      * @param prefix A special prefix for the term.
      * @param term The term surface (see synopsis).
      * @throws CorpusDataException if offset information is given
      *         but is incomplete or not numeric.
      */
     public MultiTerm (char prefix, String term) throws CorpusDataException {
         StringBuilder sb = new StringBuilder();
         _fromString(sb.append(prefix).append(':').append(term).toString());
     }


     /**
      * Get the term value of the MultiTerm.
      *
      * @return The term as a string.
      */
     public String getTerm () {
         return this.term;
     }


     /**
      * Set the term value of the MultiTerm.
      *
      * @param term The term as a string.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setTerm (String term) {
         this.term = term;
         return this;
     }


     /**
      * Get the payload.
      *
      * @return The payload as a BytesRef, or null if none is set.
      */
     public BytesRef getPayload () {
         return this.payload;
     }


     /**
      * Set the payload as a {@link Byte} value.
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (Byte pl) {
         this.payload = new BytesRef(ByteBuffer.allocate(1).put(pl).array());
         return this;
     }


     /**
      * Set the payload as a short value (2 bytes).
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (short pl) {
         this.payload = new BytesRef(ByteBuffer.allocate(2).putShort(pl).array());
         return this;
     }


     /**
      * Set the payload as an integer value (4 bytes).
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (int pl) {
         this.payload = new BytesRef(ByteBuffer.allocate(4).putInt(pl).array());
         return this;
     }


     /**
      * Set the payload as a long value (8 bytes).
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (long pl) {
         this.payload = new BytesRef(ByteBuffer.allocate(8).putLong(pl).array());
         return this;
     }


     /**
      * Set the payload as a string value.
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (String pl) {
         this.payload = new BytesRef(pl);
         return this;
     }


     /**
      * Set the payload as a byte array.
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (byte[] pl) {
         this.payload = new BytesRef(pl);
         return this;
     }


     /**
      * Set the payload as a {@link BytesRef} object.
      * The object is stored as given, it is not copied.
      *
      * @param pl The payload.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setPayload (BytesRef pl) {
         this.payload = pl;
         return this;
     }


     /**
      * Get the start position.
      *
      * @return The start position.
      */
     public int getStart () {
         return this.start;
     }


     /**
      * Set the start position.
      *
      * @param start The start position.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setStart (int start) {
         this.start = start;
         return this;
     }


     /**
      * Get the end position.
      *
      * @return The end position.
      */
     public int getEnd () {
         return this.end;
     }


     /**
      * Set the end position.
      *
      * @param end The end position.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm setEnd (int end) {
         this.end = end;
         return this;
     }


     /**
      * Check if there are offsets stored.
      *
      * @return Boolean value indicating that the term
      *         contains stored offsets.
      */
     public boolean hasStoredOffsets () {
         return this.storeOffsets;
     }


     /**
      * Set the flag for stored offsets, in case they are relevant.
      *
      * @param value Boolean value indicating that the term
      *        contains stored offsets.
      * @return The {@link MultiTerm} object for chaining.
      */
     public MultiTerm hasStoredOffsets (boolean value) {
         this.storeOffsets = value;
         return this;
     }


     /**
      * Represent the MultiTerm as a string (see synopsis).
      * Offsets are attached following a hash sign
      * (only if start and end differ),
      * payloads are attached following a dollar sign.
      * All payloads are written as UTF-8 character sequences.
      *
      * @return The string representation.
      * @see #toStringShort()
      */
     @Override
     public String toString () {
         StringBuilder sb = new StringBuilder(this.term);
         if (this.start != this.end) {
             sb.append('#')
                 .append(this.start)
                 .append('-')
                 .append(this.end);
         }
         appendPayload(sb);
         return sb.toString();
     }


     /**
      * Compare two MultiTerms by their offsets.
      *
      * If one of both terms has no payload, the terms are
      * treated as equal. Otherwise the terms are ordered by
      * their end position first and their start position second.
      *
      * @param o The MultiTerm to compare with.
      * @return -1, 0 or 1.
      */
     @Override
     public int compareTo (MultiTerm o) {
         if (this.payload == null || o.payload == null) {
             return 0;
         }
         if (this.end != o.end) {
             return this.end < o.end ? -1 : 1;
         }
         if (this.start != o.start) {
             return this.start < o.start ? -1 : 1;
         }
         return 0;
     }


     /**
      * Represent the MultiTerm as a string.
      * Payloads are attached following a dollar sign.
      * All payloads are written as UTF-8 character sequences.
      * Offsets are ignored.
      *
      * @return The string representation without offsets.
      * @see #toString()
      */
     public String toStringShort () {
         StringBuilder sb = new StringBuilder(this.term);
         appendPayload(sb);
         return sb.toString();
     }


     /*
      * Append the payload (if any) following a dollar sign.
      * If the UTF-8 decoding trips an assertion, the payload is
      * written as "<?>" followed by the BytesRef string
      * representation with blanks replaced by commas.
      */
     private void appendPayload (StringBuilder sb) {
         if (this.payload == null) {
             return;
         }
         sb.append('$');
         try {
             sb.append(this.payload.utf8ToString());
         }
         catch (AssertionError e) {
             sb.append("<?>")
                 .append(this.payload.toString().replace(' ', ','));
         }
     }


     /*
      * Deserialize MultiTerm from string representation.
      *
      * Everything following the first dollar sign is the payload.
      * In the remainder, everything following the first hash sign
      * is the offset pair and everything before is the term.
      * All parsing state is kept in local variables.
      */
     private void _fromString (String term) throws CorpusDataException {
         String[] termSurface = term.split("\\$", 2);

         // Payload is given
         if (termSurface.length == 2) {
             String payloadStr = termSurface[1];

             // Payload starts with a type marker like "<i>".
             // The length check keeps empty or very short payloads
             // from failing with an index error.
             if (payloadStr.length() >= 3
                 && payloadStr.charAt(0) == '<'
                 && payloadStr.charAt(2) == '>') {
                 this.payload = parseTypedPayload(payloadStr);
             }

             // Payload is a string
             else {
                 this.payload = new BytesRef(payloadStr);
             }
         }

         // Parse offset information
         String[] stringOffset = termSurface[0].split("\\#", 2);

         if (stringOffset.length == 2) {

             // Split start and end position of the offset
             String[] offset = stringOffset[1].split("\\-", 2);

             // Start and end is given
             if (offset.length == 2 && offset[0].length() > 0) {
                 try {
                     this.start = Integer.parseInt(offset[0]);
                     this.end   = Integer.parseInt(offset[1]);
                 }
                 catch (NumberFormatException e) {
                     throw new CorpusDataException(
                         952,
                         "Given offset information is not numeric"
                     );
                 }
             }
             else {
                 throw new CorpusDataException(
                     953,
                     "Given offset information is incomplete"
                 );
             }
         }
         this.term = stringOffset[0];
     }


     /*
      * Parse a typed payload like "<i>12<s>3" into its byte
      * representation. Returns null if a value is missing or
      * not parseable (malformed payloads fail silently).
      */
     private static BytesRef parseTypedPayload (String payloadStr) {

         // Split payload at type marker boundaries
         String[] segments = payloadStr.split("(?=<)|(?<=>)");

         // Up to Java 7 the zero-width match at the beginning yields
         // a leading empty string, since Java 8 it does not.
         // Skip it if present, so both behaviours are supported.
         int idx = (segments.length > 0 && segments[0].isEmpty()) ? 1 : 0;

         // Every marker/value pair contributes at most 8 bytes
         ByteBuffer buffer = ByteBuffer.allocate(8 * segments.length);

         try {
             for (; idx < segments.length; idx += 2) {
                 switch (segments[idx]) {
                 case "<b>": // byte
                     buffer.put(Byte.parseByte(segments[idx + 1]));
                     break;
                 case "<s>": // short
                     buffer.putShort(Short.parseShort(segments[idx + 1]));
                     break;
                 case "<i>": // integer
                     buffer.putInt(Integer.parseInt(segments[idx + 1]));
                     break;
                 case "<l>": // long
                     buffer.putLong(Long.parseLong(segments[idx + 1]));
                     break;
                 default:
                     // Unknown markers are skipped together with their value
                     break;
                 }
             }
         }
         catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
             if (DEBUG) {
                 log.warn(e.getMessage());
             }
             return null;
         }

         // Only the bytes written so far belong to the payload
         return new BytesRef(Arrays.copyOf(buffer.array(), buffer.position()));
     }
 }
	package de.ids_mannheim.korap.model;

	import static de.ids_mannheim.korap.util.KorapArray.*;
	import de.ids_mannheim.korap.util.CorpusDataException;
	import org.apache.lucene.util.BytesRef;
	import java.nio.ByteBuffer;
	import java.util.*;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/*
	* Don't use ByteBuffer!
	*/
	/**
	* A MultiTerm represents a single term (e.g. a word, an annotation, a relation)
	* that can be part of a MultiTermToken.
	*
	* A MultiTerm consists of a term representation string, optional character offset
	* information that matches the term to the character stream of the input text,
	* and an arbitrary payload.
	*
	* There is a simple string representation of MultiTerms supported:
	* The string is the first sequence of characters.
	* Offsets are written as an appended and dash separated pair of integers.
	* Payloads are written following a dollar sign.
	* Payload segments can be typed as being a byte (b), a short (s), an integer (i),
	* or a long (l) value in leading angular brackets.
	* All other (untyped) payloads are treated as being UTF-8 character sequences.
	*
	* <blockquote><pre>
	* MultiTerm test1 = new MultiTerm("test");
	* MultiTerm test2 = new MultiTerm("test#0-4");
	* MultiTerm test3 = new MultiTerm("test#0-4$Example");
	* MultiTerm test4 = new MultiTerm("test#0-4$&lt;i&gt;1278");
	* </pre></blockquote>
	*
	* <strong>Warning</strong>: Strings that are malformed fail silently!
	*
	* @author diewald
	*/
	public class MultiTerm implements Comparable<MultiTerm> {
	public int start, end = 0;
	public String term = null;
	private boolean storeOffsets = false;
	public BytesRef payload = null;

	private static ByteBuffer bb = ByteBuffer.allocate(8);
	private static String[] stringOffset;

	private static short i, l;

	// This advices the java compiler to ignore all loggings
	public static final boolean DEBUG = false;
	private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);


	/**
	* Construct a new MultiTerm object.
	*/
	public MultiTerm () {
	this.term = "";
	};


	/**
	* Construct a new MultiTerm object.
	*
	* @param term The term surface (see synopsis).
	*/
	public MultiTerm (String term) throws CorpusDataException {
	_fromString(term);
	};


	/**
	* Construct a new MultiTerm object.
	*
	* In addition to the normal surface representation,
	* this supports a prefix notation.
	* The following expressions are equal:
	*
	* <blockquote><pre>
	* MultiTerm test1 = new MultiTerm('a', "bcd");
	* MultiTerm test2 = new MultiTerm("a:bcd");
	* </pre></blockquote>
	*
	* @param prefix A special prefix for the term.
	* @param term The term surface (see synopsis).
	*/
	public MultiTerm (char prefix, String term) throws CorpusDataException {
	StringBuilder sb = new StringBuilder();
	_fromString(sb.append(prefix).append(':').append(term).toString());
	};


	/**
	* Get the term value of the MultiTerm.
	*
	* @return The term as a string.
	*/
	public String getTerm () {
	return this.term;
	};


	/**
	* Set the term value of the MultiTerm.
	*
	* @param term The term as a string.
	* @return The {@link MultIterm} object for chaining.
	*/
	public MultiTerm setTerm (String term) {
	this.term = term;
	return this;
	};


	/**
	* Get the payload.
	*
	* @return The payload as a BytesRef.
	*/
	public BytesRef getPayload () {
	return this.payload;
	};


	/**
	* Set the payload as a {@link Byte} value.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (Byte pl) {
	this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
	return this;
	};


	/**
	* Set the payload as a short value.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (short pl) {
	this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
	return this;
	};


	/**
	* Set the payload as an integer value.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (int pl) {
	this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
	return this;
	};


	/**
	* Set the payload as a long value.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (long pl) {
	this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
	return this;
	};


	/**
	* Set the payload as a string value.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (String pl) {
	this.payload = new BytesRef(pl);
	return this;
	};


	/**
	* Set the payload as a byte array.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (byte[] pl) {
	this.payload = new BytesRef(pl);
	return this;
	};


	/**
	* Set the payload as a {@link BytesRef} object.
	*
	* @param pl The payload.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setPayload (BytesRef pl) {
	this.payload = pl;
	return this;
	};


	/**
	* Get the start position.
	*
	* @return The start position.
	*/
	public int getStart () {
	return this.start;
	};


	/**
	* Set the start position.
	*
	* @param start The start position.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setStart (int start) {
	this.start = start;
	return this;
	};


	/**
	* Get the end position.
	*
	* @return The end position.
	*/
	public int getEnd () {
	return this.end;
	};


	/**
	* Set the end position.
	*
	* @param end The end position.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm setEnd (int end) {
	this.end = end;
	return this;
	};


	/**
	* Check if there are offsets stored.
	*
	* @return Boolean value indicating that the term
	* contains stored offsets.
	*/
	public boolean hasStoredOffsets () {
	return this.storeOffsets;
	};


	/**
	* Set the flag for stored offsets, in case they are relevant.
	*
	* @param value Boolean value indicating that the term
	* contains stored offsets.
	* @return The {@link MultiTerm} object for chaining.
	*/
	public MultiTerm hasStoredOffsets (boolean value) {
	this.storeOffsets = value;
	return this;
	};


	/**
	* Represent the MultiTerm as a string (see synopsis).
	* Offsets are attached following a hash sign,
	* payloads are attached following a dollar sign.
	* All payloads are written as UTF-8 character sequences.
	*
	* @see #toStringShort().
	*/
	public String toString () {
	StringBuilder sb = new StringBuilder(this.term);
	if (this.start != this.end) {
	sb.append('#')
	.append(this.start)
	.append('-')
	.append(this.end);
	};

	if (this.payload != null) {
	sb.append('$');
	try {
	sb.append(this.payload.utf8ToString());
	}
	catch (AssertionError e) {
	sb.append("<?>")
	.append(this.payload.toString().replace(' ', ','));
	};
	};

	return sb.toString();
	};

	@Override
	public int compareTo (MultiTerm o) {
	if (this.payload == null \|\| o.payload == null)
	return 0;
	if (this.end < o.end)
	return -1;
	else if (this.end > o.end)
	return 1;
	else if (this.start < o.start)
	return -1;
	else if (this.start > o.start)
	return 1;
	return 0;
	};


	/**
	* Represent the MultiTerm as a string.
	* Payloads are attached following a dollar sign.
	* All payloads are written as UTF-8 character sequences.
	* Offsets are neglected.
	*
	* Offsets are ignored.
	*
	* @see #toString().
	*/
	public String toStringShort () {
	StringBuilder sb = new StringBuilder(this.term);
	if (this.payload != null) {
	sb.append('$');
	try {
	sb.append(this.payload.utf8ToString());
	}
	catch (AssertionError e) {
	sb.append("<?>")
	.append(this.payload.toString().replace(' ', ','));
	};
	};
	return sb.toString();
	};


	/*
	* Deserialize MultiTerm from string representation.
	*/
	private void _fromString (String term) throws CorpusDataException {
	String[] termSurface = term.split("\\$", 2);

	// Payload is given
	if (termSurface.length == 2) {
	String payloadStr = termSurface[1];

	// Payload has a type
	if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {

	// Rewind bytebuffer
	bb.rewind();

	// Split payload at type marker boundaries
	String[] pls = payloadStr.split("(?=<)\|(?<=>)");

	l = 0; // Bytearray length

	try {
	for (i = 1; i < pls.length;) {

	// Resize the bytebuffer
	if ((bb.capacity() - l) < 8) {
	bb = ByteBuffer.allocate(bb.capacity() + 8).
	put(bb.array());
	bb.position(l);
	};

	switch (pls[i]) {
	case "<b>": // byte
	bb.put(Byte.parseByte(pls[i+1]));
	l++;
	break;
	case "<s>": // short
	bb.putShort(Short.parseShort(pls[i+1]));
	l+=2;
	break;
	case "<i>": // integer
	bb.putInt(Integer.parseInt(pls[i+1]));
	l+=4;
	break;
	case "<l>": // long
	bb.putLong(Long.parseLong(pls[i+1]));
	l+=8;
	break;
	};
	i+=2;
	};

	byte[] bytes = new byte[l];
	System.arraycopy(bb.array(), 0, bytes, 0, l);
	this.payload = new BytesRef(bytes);
	}
	catch (Exception e) {
	if (DEBUG)
	log.warn(e.getMessage());
	};
	}

	// Payload is a string
	else {
	this.payload = new BytesRef(payloadStr);
	};
	};

	// Parse offset information
	stringOffset = termSurface[0].split("\\#", 2);

	if (stringOffset.length == 2) {

	// Split start and end position of the offset
	String[] offset = stringOffset[1].split("\\-", 2);

	// Start and end is given
	if (offset.length == 2 && offset[0].length() > 0) {
	try {
	this.start = Integer.parseInt(offset[0]);
	this.end = Integer.parseInt(offset[1]);

	}
	catch (NumberFormatException e) {
	throw new CorpusDataException(
	952,
	"Given offset information is not numeric"
	);
	};
	}
	else {
	throw new CorpusDataException(
	953,
	"Given offset information is incomplete"
	);
	};
	};
	this.term = stringOffset[0];
	};
	};