Introduced a term collector based on matches; doesn't do anything meaningful yet, other than lifting the SLOC
diff --git a/pom.xml b/pom.xml
index 81ed84b..e07c7c6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -85,6 +85,15 @@
<version>1.0</version>
</dependency>
+ <!-- among others Base64 support -->
+ <!--
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.4</version>
+ </dependency>
+ -->
+
<!-- Jackson -->
<!-- see https://github.com/FasterXML/jackson-core -->
<!-- https://github.com/FasterXML/jackson-databind -->
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index 1f65da2..c41d7f9 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -31,6 +31,7 @@
// TODO: Make a cache for the bits!!! DELETE IT IN CASE OF AN EXTENSION OR A FILTER!
+// TODO: Maybe a constantScoreQuery can make things faster?
// accepts as first parameter the index
// THIS MAY CHANGE for stuff like combining virtual collections
diff --git a/src/main/java/de/ids_mannheim/korap/KorapDocument.java b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
index 2a86bcf..9c6dc5a 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
@@ -28,8 +28,8 @@
private String author, textClass, corpusID,
pubPlace, ID, title, subTitle,
foundries, tokenization,
- layerInfo;
-
+ layerInfo, field;
+
private KorapDate pubDate;
/**
@@ -178,4 +178,12 @@
public String getLayerInfo () {
return this.layerInfo;
};
+
+ public void setField (String field) {
+ this.field = field;
+ };
+
+ public String getField () {
+ return this.field;
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 612fe1f..9679d31 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -13,6 +13,13 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause;
+
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.QueryWrapperFilter;
+
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -49,6 +56,12 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.FixedBitSet;
+
+// Automata
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -59,6 +72,7 @@
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
+import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.document.KorapPrimaryData;
import org.slf4j.Logger;
@@ -140,7 +154,6 @@
fieldsToLoad.add("foundries");
fieldsToLoad.add("layerInfo");
fieldsToLoad.add("tokenization");
- // don't load tokenization
// Base analyzer for searching and indexing
// StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
@@ -153,7 +166,6 @@
analyzerPerField
);
-
// Create configuration with base analyzer
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
@@ -307,6 +319,7 @@
// Go to first term (initialization phase)
// TODO: THIS MAY BE WRONG!
+// TODO: Delete this and test!
docs.nextPosition();
// Copy payload with the offset of the BytesRef
@@ -331,10 +344,10 @@
* Search for the number of occurrences of different types,
* e.g. "documents", "sentences" etc.
*
- * @param foundry The foundry to search in.
+ * @param field The field containing the textual data and the annotations.
* @param type The type of meta information, e.g. "documents" or "sentences".
*/
- public long numberOf (KorapCollection collection, String foundry, String type) throws IOException {
+ public long numberOf (KorapCollection collection, String field, String type) throws IOException {
// Short cut for documents
if (type.equals("documents")) {
if (collection.getCount() <= 0) {
@@ -354,8 +367,8 @@
};
// Create search term
- Term term = new Term(foundry, "-:" + type);
- // System.err.println(">> Search for -:" + type + " in " + foundry);
+ Term term = new Term(field, "-:" + type);
+ // System.err.println(">> Search for -:" + type + " in " + field);
long occurrences = 0;
try {
@@ -377,8 +390,8 @@
return occurrences;
};
- public long numberOf (String foundry, String type) throws IOException {
- return this.numberOf(new KorapCollection(this), foundry, type);
+ public long numberOf (String field, String type) throws IOException {
+ return this.numberOf(new KorapCollection(this), field, type);
};
@@ -400,12 +413,12 @@
* e.g. "documents", "sentences" etc., in a specific set of documents.
*
* @param docvec The document vector for filtering the search space.
- * @param foundry The foundry to search in.
+ * @param field The field containing the textual data and the annotations.
* @param type The type of meta information, e.g. "documents" or "sentences".
*
* @see #numberOf(String, String)
*/
- public long numberOf (Bits docvec, String foundry, String type) throws IOException {
+ public long numberOf (Bits docvec, String field, String type) throws IOException {
// Shortcut for documents
if (type.equals("documents")) {
@@ -413,7 +426,7 @@
return os.cardinality();
};
- Term term = new Term(foundry, "-:" + type);
+ Term term = new Term(field, "-:" + type);
int occurrences = 0;
try {
@@ -428,20 +441,6 @@
return occurrences;
};
-
- /*
- Accepts a KorapInfo (with startPos, endPos, docID ... etc.)
- everything that comes from an ID
- and collects all information based on a prefix (like cnx/p etc.)
-
- KorapInfo is associated with a KorapMatch and has an array with all informations
- per position in the match.
-
- public KorapInfo infoOf (KorapMatch km, String prefix) {
-
- };
- */
-
@Deprecated
public long countDocuments () throws IOException {
log.warn("countDocuments() is DEPRECATED in favor of numberOf(\"documents\")!");
@@ -455,9 +454,141 @@
return this.numberOf("tokens");
};
+ /**
+ * Get a match.
+ * BE AWARE - THIS IS STILL A PLAYGROUND!
+ */
+ public KorapMatch getMatch (String id) {
+
+ String corpusID = "WPD";
+ String docID = "WPD_AAA.00003";
+ String field = "tokens"; // text field
+ String foundry = "mate";
+ String layer = "l";
+ int startPos = 20;
+ int endPos = 30;
+ Boolean includeSpans = true;
+
+ KorapMatch km = (KorapMatch) null;
+ LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
+
+ StringBuffer regex = new StringBuffer();
+
+ // Todo: Ignore -: stuff!
+
+ if (includeSpans)
+ regex.append("((<>|<|>):)?");
+ else
+ regex.append("[^<>]");
+ if (foundry != null)
+ regex.append(foundry).append('/');
+ if (layer != null)
+ regex.append(layer).append(":");
+ regex.append(".+?");
+
+ BooleanQuery bool = new BooleanQuery();
+ bool.add(new TermQuery(new Term("ID", docID)), BooleanClause.Occur.MUST);
+ bool.add(new TermQuery(new Term("corpusID", corpusID)), BooleanClause.Occur.MUST);
+
+ Filter filter = (Filter) new QueryWrapperFilter(bool);
+
+ // Create an automaton for prefixed terms of interest:
+ CompiledAutomaton fst = new CompiledAutomaton(
+ new RegExp(regex.toString()).toAutomaton()
+ );
+
+ try {
+ for (AtomicReaderContext atomic : this.reader().leaves()) {
+ DocIdSetIterator filterIter = filter.getDocIdSet(
+ atomic,
+ atomic.reader().getLiveDocs()
+ ).iterator();
+
+ // Go to the matching doc
+ int localDocID = filterIter.nextDoc();
+ if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
+ continue;
+
+ // We've found the correct document!
+ HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
+ fieldsToLoadLocal.add(field);
+
+ // Load the necessary fields of the document
+ Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
+ // Get terms from the document
+ Terms docTerms = atomic.reader().getTermVector(localDocID, field);
+
+ km = new KorapMatch(
+ new PositionsToOffset(atomic, field),
+ localDocID,
+ startPos,
+ endPos
+ );
+
+ // A termsEnum object could be reused here
+ final TermsEnum termsEnum = docTerms.intersect(fst, null);
+
+ // Create a bitset for the correct document
+ // Yeah ... I know ... it could've been easier probably
+ FixedBitSet bitset = new FixedBitSet(atomic.reader().numDocs());
+ bitset.or(filterIter);
+
+ DocsAndPositionsEnum docs = (DocsAndPositionsEnum) null;
+
+ // Iterate over all terms in the document
+ while (termsEnum.next() != null) {
+ docs = termsEnum.docsAndPositions(
+ bitset,
+ docs,
+ DocsAndPositionsEnum.FLAG_PAYLOADS
+ );
+
+ // Init docs
+ docs.nextDoc();
+
+ // How often does this term occur in the document?
+ int termOccurrences = docs.freq();
+
+ // Iterate over all occurrences
+ for (int i = 0; i < termOccurrences; i++) {
+
+ // Init positions and get the current
+ int pos = docs.nextPosition();
+
+ // Check, if the position of the term is in the interesting area
+ if (pos >= startPos && pos <= endPos) {
+ termList.add(new TermInfo(
+ termsEnum.term().utf8ToString(),
+ pos,
+ docs.getPayload()
+ ));
+ };
+ };
+ };
+
+ break;
+ };
+ }
+ catch (IOException e) {
+ // ...
+ };
+
+ return km;
+ };
+
+
+ // TODO: collect all information based on a prefix (like cnx/p etc.)
+ // TODO: Generate a meaningful structure (e.g. a tree)
+ /*
+ KorapInfo is associated with a KorapMatch and has an array with all information
+ per position in the match.
+
+ public KorapInfo infoOf (KorapMatch km, String prefix);
+ */
+
/**
- * search
+ * Search in the index.
*/
public KorapResult search (SpanQuery query) {
return this.search(new KorapCollection(this), new KorapSearch(query));
@@ -516,7 +647,9 @@
this.termContexts = new HashMap<Term, TermContext>();
SpanQuery query = ks.getQuery();
- String foundry = query.getField();
+
+ // Get the field of textual data and annotations
+ String field = query.getField();
// Todo: Make kr subclassing ks - so ks has a method for a new KorapResult!
KorapResult kr = new KorapResult(
@@ -530,7 +663,7 @@
);
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
- fieldsToLoadLocal.add(foundry);
+ fieldsToLoadLocal.add(field);
int i = 0;
long t1 = 0, t2 = 0;
@@ -564,7 +697,7 @@
// Use OpenBitSet;
Bits bitset = collection.bits(atomic);
- PositionsToOffset pto = new PositionsToOffset(atomic, foundry);
+ PositionsToOffset pto = new PositionsToOffset(atomic, field);
// Spans spans = NearSpansOrdered();
Spans spans = query.getSpans(atomic, (Bits) bitset, termContexts);
@@ -677,8 +810,7 @@
match.internalDocID = docID;
- // match.foundry = foundry; // This is "tokens" or "base" or so
-
+ match.setField(field);
match.setAuthor(doc.get("author"));
match.setTextClass(doc.get("textClass"));
match.setDocID(doc.get("ID"));
@@ -688,7 +820,7 @@
match.setCorpusID(doc.get("corpusID"));
match.setPubDate(doc.get("pubDate"));
- log.trace("I've got a match in {} of {}", match.getID(), count);
+ log.trace("I've got a match in {} of {}", match.getDocID(), count);
// Temporary (later meta fields in term vector)
match.setFoundries(doc.get("foundries"));
@@ -697,7 +829,7 @@
match.setLayerInfo(doc.get("layerInfo"));
match.setPrimaryData(
- new KorapPrimaryData(doc.get(foundry))
+ new KorapPrimaryData(doc.get(field))
);
atomicMatches.add(match);
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index 8b48239..8f54f5e 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -1,6 +1,7 @@
package de.ids_mannheim.korap;
import java.util.*;
import java.lang.StringBuffer;
+import java.nio.ByteBuffer;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -8,6 +9,8 @@
import de.ids_mannheim.korap.index.PositionsToOffset;
import static de.ids_mannheim.korap.util.KorapHTML.*;
+// import org.apache.commons.codec.binary.Base64;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -30,7 +33,7 @@
// Snippet information
@JsonIgnore
public short leftContext,
- rightContext;
+ rightContext;
@JsonIgnore
public int startPos,
@@ -40,13 +43,16 @@
public int potentialStartPosChar = -1,
potentialEndPosChar = -1;
+ private int startOffsetChar = 0;
+
@JsonIgnore
public boolean leftTokenContext,
rightTokenContext;
private String tempSnippet,
snippetHTML,
- snippetBrackets;
+ snippetBrackets,
+ identifier;
private HighlightCombinator snippetStack;
@@ -55,6 +61,7 @@
private Collection<byte[]> payload;
private ArrayList<int[]> highlight;
+ private LinkedList<int[]> span;
private PositionsToOffset positionsToOffset;
private boolean processed = false;
@@ -104,7 +111,7 @@
public void addHighlight (int start, int end, int number) {
if (this.highlight == null)
- this.highlight = new ArrayList<int[]>();
+ this.highlight = new ArrayList<int[]>(16);
log.trace("Add highlight of class {} from {} to {}", number, start, end);
this._reset();
@@ -128,25 +135,57 @@
@Override
@JsonProperty("ID")
public String getID () {
- StringBuffer sb = new StringBuffer();
- if (this.getDocID() != null)
- sb.append(this.getDocID());
- sb.append('#');
+
+ if (this.identifier != null)
+ return this.identifier;
+
+ StringBuffer sb = new StringBuffer("match-");
+
+ // Get prefix string corpus/doc
+ if (this.getCorpusID() != null) {
+ sb.append(this.getCorpusID());
+
+ if (this.getDocID() != null) {
+ sb.append('-');
+ sb.append(this.getDocID());
+ };
+ }
+ else {
+ sb.append(this.localDocID);
+ };
+
+ sb.append('p');
+
+ // Get Position information
sb.append(startPos).append('-').append(endPos);
+
if (this.highlight != null) {
for (int[] h : this.highlight) {
- sb.append(',').append(h[2]).append(':');
+ sb.append('(').append(h[2]).append(')');
sb.append(h[0]).append('-').append(h[1]);
};
};
- return sb.toString();
+ if (this.processed) {
+ sb.append('c');
+ for (int[] s : this.span) {
+ if (s[2] != -1)
+ sb.append('(').append(s[2]).append(')');
+ sb.append(s[0] + this.startOffsetChar);
+ sb.append('-');
+ sb.append(s[1] + this.startOffsetChar);
+ };
+ };
+ return (this.identifier = sb.toString());
};
private void _reset () {
this.processed = false;
this.snippetHTML = null;
this.snippetBrackets = null;
+ this.identifier = null;
+ if (this.span != null)
+ this.span.clear();
};
// Start building highlighted snippets
@@ -158,10 +197,12 @@
log.trace("Start highlight processing ...");
// Get the list of spans for matches and highlighting
- LinkedList<int[]> spans = this._processHighlightSpans(
- leftTokenContext,
- rightTokenContext
- );
+ if (this.span == null || this.span.size() == 0) {
+ this._processHighlightSpans(
+ leftTokenContext,
+ rightTokenContext
+ );
+ };
/*
for (int[] s : spans) {
@@ -171,7 +212,7 @@
*/
// Create a stack for highlighted elements (opening and closing elements)
- ArrayList<int[]> stack = this._processHighlightStack(spans);
+ ArrayList<int[]> stack = this._processHighlightStack();
/*
for (int[] s : stack) {
@@ -603,15 +644,15 @@
// This sorts all highlight and match spans to make them nesting correctly,
// even in case they overlap
// TODO: Not very fast - improve!
- private ArrayList<int[]> _processHighlightStack (LinkedList<int[]> spans) {
+ private ArrayList<int[]> _processHighlightStack () {
log.trace("Create Stack");
LinkedList<int[]> openList = new LinkedList<int[]>();
LinkedList<int[]> closeList = new LinkedList<int[]>();
- openList.addAll(spans);
- closeList.addAll(spans);
+ openList.addAll(span);
+ closeList.addAll(span);
Collections.sort(openList, new OpeningTagComparator());
Collections.sort(closeList, new ClosingTagComparator());
@@ -639,8 +680,8 @@
};
- private LinkedList<int[]> _processHighlightSpans (boolean leftTokenContext,
- boolean rightTokenContext) {
+ private void _processHighlightSpans (boolean leftTokenContext,
+ boolean rightTokenContext) {
int startOffsetChar,
endOffsetChar,
startPosChar,
@@ -675,7 +716,10 @@
// right context
if (rightTokenContext) {
- endOffsetChar = this.positionsToOffset.end(ldid, this.endPos + this.rightContext - 1);
+ endOffsetChar = this.positionsToOffset.end(
+ ldid,
+ this.endPos + this.rightContext - 1
+ );
log.trace("For endOffset {} ({}+{}-1) pto returns {}", (this.endPos + this.rightContext - 1), this.endPos, this.rightContext, endOffsetChar);
}
else {
@@ -703,10 +747,11 @@
if (endOffsetChar != -1 && endOffsetChar < endPosChar)
endOffsetChar = endPosChar;
+ this.startOffsetChar = startOffsetChar;
+
log.trace("Offsetposition {} till {} with contexts {} and {}", startOffsetChar, endOffsetChar, leftContext, rightContext);
-
if (endOffsetChar > -1 && endOffsetChar < this.getPrimaryDataLength()) {
this.tempSnippet = this.getPrimaryData(startOffsetChar, endOffsetChar);
}
@@ -717,12 +762,15 @@
log.trace("Temporary snippet is \"{}\"", this.tempSnippet);
- LinkedList<int[]> spans = new LinkedList<int[]>();
+ if (this.span == null)
+ this.span = new LinkedList<int[]>();
+
+ this.identifier = null;
// Todo: Simplify
int[] intArray = new int[]{ startPosChar - startOffsetChar, endPosChar - startOffsetChar, -1, 0};
log.trace("IntArray: {}", intArray);
- spans.add(intArray);
+ this.span.add(intArray);
// highlights
// -- I'm not sure about this.
@@ -745,11 +793,8 @@
log.trace("PTO-start: {}", start + startOffsetChar);
log.trace("PTO-end: {}", end + startOffsetChar);
- spans.add(intArray);
+ this.span.add(intArray);
};
};
-
- return spans;
};
-
};
diff --git a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
index b9bf69d..a447ccd 100644
--- a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
@@ -2,12 +2,13 @@
import java.util.*;
+import org.apache.lucene.index.Term;
+
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
-import org.apache.lucene.index.Term;
import org.apache.lucene.search.NumericRangeQuery;
import de.ids_mannheim.korap.util.KorapDate;
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 99a0cab..292592c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -15,6 +15,7 @@
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import java.util.*;
@@ -58,6 +59,7 @@
keywords.setStoreTermVectorPositions(false);
keywords.setStoreTermVectorPayloads(false);
keywords.setStoreTermVectorOffsets(false);
+ keywords.setIndexOptions(IndexOptions.DOCS_ONLY);
}
// see http://www.cowtowncoder.com/blog/archives/2011/07/entry_457.html
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
new file mode 100644
index 0000000..6859c8f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -0,0 +1,20 @@
+package de.ids_mannheim.korap.index;
+
+import java.util.*;
+import org.apache.lucene.util.BytesRef;
+
+public class TermInfo {
+
+ private String prefix, foundry, layer, value;
+ private int pos = 0;
+ private BytesRef payload;
+
+ // Temporary:
+ private String name;
+
+ public TermInfo (String name, int pos, BytesRef payload) {
+ this.name = name;
+ this.pos = pos;
+ this.payload = payload;
+ };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 13e3871..121ef3c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -49,6 +49,6 @@
assertEquals("SnippetBrackets (0)", "... bcabca[{2:b{a}}]c", kr.match(0).snippetBrackets());
- assertEquals("ID (0)", "#7-9,0:8-8,2:7-8", kr.match(0).getID());
+ assertEquals("ID (0)", "match-0p7-9(0)8-8(2)7-8c7-9(0)8-9(2)7-9", kr.match(0).getID());
};
-};
\ No newline at end of file
+};