Half-finished info retriever
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 9b740ea..1bdad67 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -73,6 +73,7 @@
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.TermInfo;
+import de.ids_mannheim.korap.index.SpanInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -116,9 +117,9 @@
private ObjectMapper mapper = new ObjectMapper();
- private static ByteBuffer bb = ByteBuffer.allocate(4);
+ private static ByteBuffer bb = ByteBuffer.allocate(4);
private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
-
+ private static ByteBuffer bbTerm = ByteBuffer.allocate(16);
private byte[] pl = new byte[4];
@@ -450,6 +451,15 @@
return this.numberOf("tokens");
};
+
+ public KorapMatch getMatch (String id) {
+ return this.getMatchInfo(id, false, null, null, false, true);
+ };
+
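+ /**
+ * Get match info for a match, i.e. the annotations of the match
+ * restricted to a given foundry and layer.
+ * A rough usage sketch (the parameter values are illustrative only,
+ * as the id is not parsed yet and the document is still hard coded):
+ *
+ *   KorapMatch km = ki.getMatchInfo(id, "mate", "l", true, true);
+ */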
+ public KorapMatch getMatchInfo (String id, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
+ return this.getMatchInfo(id, true, foundry, layer, includeSpans, includeHighlights);
+ };
+
/**
* Get a match.
* BE AWARE - THIS IS STILL A PLAYGROUND!
@@ -462,22 +472,25 @@
public KorapInfo infoOf (KorapMatch km, String prefix);
*/
- public KorapMatch getMatch (String id) {
+ public KorapMatch getMatchInfo (String id, boolean info, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
// List of terms to populate
- LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
+ SpanInfo termList = new SpanInfo();
KorapMatch match = new KorapMatch();
// That's purely temporary
+ // These values should eventually be derived from the match ID:
String corpusID = "WPD";
String docID = "WPD_AAA.00003";
- String field = "tokens"; // text field
- String foundry = "mate";
- String layer = "l";
int startPos = 25;
int endPos = 30;
- Boolean includeSpans = true;
+
+ foundry = "mate";
+ layer = "l";
+ includeSpans = true;
+
+ String field = "tokens"; // text field
// Create a filter based on the corpusID and the docID
BooleanQuery bool = new BooleanQuery();
@@ -485,114 +498,106 @@
bool.add(new TermQuery(new Term("corpusID", corpusID)), BooleanClause.Occur.MUST);
Filter filter = (Filter) new QueryWrapperFilter(bool);
- // Create an automaton for prefixed terms of interest based on a Regex
- // Todo: Ignore -: stuff!
- StringBuffer regex = new StringBuffer();
- if (includeSpans)
- regex.append("(((\"<>\"|\"<\"|\">\")\":\")?");
- else
- regex.append("[^<>]");
- if (foundry != null)
- regex.append(foundry).append('/');
- if (layer != null)
- regex.append(layer).append(":");
- regex.append("(.){1,})|_[0-9]+");
+ CompiledAutomaton fst = null;
- RegExp regexObj = new RegExp(regex.toString());
- CompiledAutomaton fst = new CompiledAutomaton(regexObj.toAutomaton());
- log.trace("The final regex is {}", regex.toString());
+ if (info) {
+ /* Create an automaton for prefixed terms of interest.
+ * The regular expression can restrict the terms to a given
+ * foundry, to a given layer (in case a foundry is given),
+ * and can optionally include span annotations.
+ */
+ StringBuffer regex = new StringBuffer();
+
+ if (includeSpans)
+ regex.append("(((\"<>\"|\"<\"|\">\")\":\")?");
+ else
+ regex.append("[^<>-]");
+ if (foundry != null) {
+ regex.append(foundry).append('/');
+ if (layer != null)
+ regex.append(layer).append(":");
+ }
+ else if (includeSpans) {
+ regex.append("[^-]");
+ };
+ regex.append("(.){1,})|_[0-9]+");
+
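+ /* For example (an illustration only - the exact terms depend on
+ * the index): with foundry "mate", layer "l" and includeSpans set
+ * to true, the automaton accepts terms like "mate/l:Baum",
+ * span terms like "<>:mate/l:..." and position terms like "_25",
+ * but rejects terms of other foundries or layers.
+ */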
+ RegExp regexObj = new RegExp(regex.toString());
+ fst = new CompiledAutomaton(regexObj.toAutomaton());
+ log.trace("The final regex is {}", regex.toString());
+ };
+
try {
-
// Iterate over all atomic indices and find the matching document
for (AtomicReaderContext atomic : this.reader().leaves()) {
- /*
- DocIdSetIterator filterIter = filter.getDocIdSet(
- atomic,
- atomic.reader().getLiveDocs()
- ).iterator();
- */
+ // Retrieve the single document of interest
DocIdSet filterSet = filter.getDocIdSet(
atomic,
atomic.reader().getLiveDocs()
);
// Create a bitset for the correct document
- // Yeah ... I know ... it could've been easier probably
- /*
- FixedBitSet bitset = new FixedBitSet(atomic.reader().numDocs());
- bitset.or(filterIter);
- */
Bits bitset = filterSet.bits();
- // Go to the matching doc
- // int localDocID = bitset.iterator().nextDoc();
+ // Go to the matching doc - and remember its ID
int localDocID = filterSet.iterator().nextDoc();
- // log.trace("Found documents {} with the docID {}", bitset.cardinality(), localDocID);
-
if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
continue;
- // We've found the correct document!
+ // We've found the correct document! Hurray!
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
fieldsToLoadLocal.add(field);
// Get terms from the document
Terms docTerms = atomic.reader().getTermVector(localDocID, field);
- /* ---
- *
- */
- log.trace("docTerms has Payloads: {}", docTerms.hasPayloads());
- log.trace("docTerms has Positions: {}", docTerms.hasPositions());
-
// Load the necessary fields of the document
Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
// Put some more information to the match
match.setPositionsToOffset(new PositionsToOffset(atomic, field));
match.setLocalDocID(localDocID);
-
- log.trace("pto and localDocID defined");
-
match.setStartPos(startPos);
match.setEndPos(endPos);
match.populateDocument(doc, field, fieldsToLoadLocal);
- log.trace("We have found the correct document: {}", match.getTitle());
- // log.trace("The match is: {}", doc.get("tokens"));
+ log.trace("The document is called '{}'", match.getTitle());
- // A termsEnum object could be reused here
+ if (!info)
+ break;
+
+ // Restrict the document's terms to the terms of interest
TermsEnum termsEnum = docTerms.intersect(fst, null);
- DocsAndPositionsEnum docs = (DocsAndPositionsEnum) null;
- // DocsAndPositionsEnum docs;
+ DocsAndPositionsEnum docs = null;
// Iterate over all terms in the document
while (termsEnum.next() != null) {
+ // Get the positions and payloads of the term in the document.
+ // The bit vector of this enum may differ from the filter's
+ // (it is unclear why), and with it the local doc ID.
+ // That's why null is passed here instead of the bitset.
docs = termsEnum.docsAndPositions(
- null, //bitset.bits(),
+ null,
docs,
DocsAndPositionsEnum.FLAG_PAYLOADS
);
+ // Init document iterator
docs.nextDoc();
- // log.trace("Check for '{}'({}) in document {}({}) from {}", termsEnum.term().utf8ToString(), termsEnum.totalTermFreq(), docs.docID(), localDocID, bitset.cardinality());
+ // Should never happen ... but hell.
if (docs.docID() == DocIdSetIterator.NO_MORE_DOCS)
continue;
- // Init docs
- /*
- if (docs.advance(localDocID) == DocIdSetIterator.NO_MORE_DOCS || docs.docID() != localDocID)
- continue;
- */
-
// How often does this term occur in the document?
int termOccurrences = docs.freq();
+
+ // String representation of the term
String termString = termsEnum.term().utf8ToString();
// Iterate over all occurrences
@@ -601,24 +606,29 @@
// Init positions and get the current
int pos = docs.nextPosition();
- log.trace(">> {}: {}-{}-{}!",
- termString, docs.freq(), pos, docs.getPayload());
-
- BytesRef payload = docs.getPayload();
-
- byte[] pl = new byte[12];
-
- if (payload != null)
- System.arraycopy(payload.bytes, payload.offset, pl, 0, payload.length);
-
-
// Check if the position of the term is in the area of interest
- if (pos >= startPos && pos <= endPos) {
- termList.add(new TermInfo(
- termString,
- pos,
- pl
- ));
+ if (pos >= startPos && pos < endPos) {
+
+ log.trace(
+ ">> {}: {}-{}-{}",
+ termString,
+ docs.freq(),
+ pos, docs.getPayload()
+ );
+
+ BytesRef payload = docs.getPayload();
+
+ // Copy the payload
+ bbTerm.clear();
+ if (payload != null) {
+ bbTerm.put(
+ payload.bytes,
+ payload.offset,
+ payload.length
+ );
+ };
+
+ termList.add(new TermInfo(termString, pos, bbTerm));
};
};
};
@@ -629,6 +639,10 @@
// ...
};
+ for (TermInfo t : termList.getTerms()) {
+ log.trace(
+ "Add term {}/{}:{} to {}-{}",
+ t.getFoundry(),
+ t.getLayer(),
+ t.getValue(),
+ t.getStartChar(),
+ t.getEndChar()
+ );
+ };
+
return match;
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index cc4fa60..d7b8e79 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -261,7 +261,7 @@
sb.append(this.localDocID);
};
- sb.append('p');
+ sb.append("-p");
// Get Position information
sb.append(startPos).append('-').append(endPos);
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index 379a57e..92bbb53 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -1,19 +1,182 @@
package de.ids_mannheim.korap.index;
import java.util.*;
+import java.nio.ByteBuffer;
+import java.util.regex.*;
+import de.ids_mannheim.korap.KorapMatch;
-public class TermInfo {
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
- private String prefix, foundry, layer, value;
+public class TermInfo implements Comparable<TermInfo> {
+
+ // Logger
+ private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
+
+ private String foundry, layer, value, term, type;
+ // type can be "term", "pos", "span", "relSrc" or "relTarget"
+
private int pos = 0;
- private byte[] payload;
+ private ByteBuffer payload;
+ private boolean analyzed = false;
- // Temporary:
- private String name;
+ private int startChar = -1,
+ endChar = -1,
+ startPos = -1,
+ endPos = -1;
- public TermInfo (String name, int pos, byte[] payload) {
- this.name = name;
- this.pos = pos;
- this.payload = payload;
+ private byte depth = (byte) 0;
+
+ private Pattern prefixRegex = Pattern.compile("([^/]+)/([^:]+):(.+?)");
+ private Matcher matcher;
+
+ public TermInfo (String term, int pos, ByteBuffer payload) {
+ this.term = term;
+ this.startPos = pos;
+ this.endPos = pos + 1;
+ this.payload = payload;
+ };
+
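+ /* Analyze the term and its payload.
+ * The payload layout consumed below is (a sketch):
+ *   pos terms:  startChar (int), endChar (int)
+ *   span terms: startChar (int), endChar (int), endPos (int),
+ *               optionally followed by a depth byte
+ *   relations:  endPos (int)
+ */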
+ public TermInfo analyze () {
+ if (analyzed)
+ return this;
+
+ int ttype = 0;
+ String tterm = this.term;
+ this.payload.rewind();
+
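+ /* The leading characters encode the term type,
+ * for example (illustrative values):
+ *   "mate/l:Baum"   -> term
+ *   "_25"           -> pos
+ *   "<>:mate/c:NP"  -> span
+ *   "<:mate/d:HEAD" -> relation target
+ *   ">:mate/d:HEAD" -> relation source
+ */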
+ switch (tterm.charAt(0)) {
+ case '<':
+ // "<>:mate/l:..."
+ if (tterm.charAt(1) == '>') {
+ // span
+ this.type = "span";
+ tterm = tterm.substring(3);
+ ttype = 2;
+ }
+ // rel-target
+ else {
+ this.type = "relTarget";
+ tterm = tterm.substring(2);
+ ttype = 3;
+ };
+ break;
+ case '>':
+ // rel-src
+ this.type = "relSrc";
+ tterm = tterm.substring(2);
+ ttype = 3;
+ break;
+
+ case '_':
+ // pos
+ this.type = "pos";
+ ttype = 1;
+ tterm = tterm.substring(1);
+ break;
+ default:
+ // term
+ this.type = "term";
+ };
+
+ // Analyze term value
+ if (ttype != 1) {
+ log.trace("Check {} for {}", tterm, prefixRegex.toString());
+ matcher = prefixRegex.matcher(tterm);
+ if (matcher.matches() && matcher.groupCount() == 3) {
+ this.foundry = matcher.group(1);
+ this.layer = matcher.group(2);
+ this.value = matcher.group(3);
+ };
+ }
+
+ // for positions
+ else {
+ this.value = tterm;
+ this.startChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
+ };
+
+ // for spans
+ if (ttype == 2) {
+ this.startChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
+ };
+
+ // for spans and relations
+ if (ttype > 1)
+ this.endPos = this.payload.getInt();
+
+ if (ttype == 2 && this.payload.hasRemaining()) {
+ this.depth = this.payload.get();
+ };
+
+ // Note: the payload layout differs depending on the term type
+ analyzed = true;
+ return this;
+ };
+
+ public String getType () {
+ return this.type;
+ };
+
+ public int getStartChar () {
+ return this.startChar;
+ };
+
+ public void setStartChar (int pos) {
+ this.startChar = pos;
+ };
+
+ public int getEndChar () {
+ return this.endChar;
+ };
+
+ public void setEndChar (int pos) {
+ this.endChar = pos;
+ };
+
+ public int getStartPos () {
+ return this.startPos;
+ };
+
+ public int getEndPos () {
+ return this.endPos;
+ };
+
+ public byte getDepth () {
+ return this.depth;
+ };
+
+ public String getFoundry () {
+ return this.foundry;
+ };
+
+ public String getLayer () {
+ return this.layer;
+ };
+
+ public String getValue () {
+ return this.value;
+ };
+
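+ /* Sort terms by their start character offset;
+ * terms with the same offset are ordered by depth,
+ * deeper elements first.
+ */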
+ @Override
+ public int compareTo (TermInfo obj) {
+ this.analyze();
+ obj.analyze();
+
+ if (this.startChar < obj.startChar) {
+ return -1;
+ }
+ else if (this.startChar > obj.startChar) {
+ return 1;
+ }
+ else if (this.depth < obj.depth) {
+ return 1;
+ }
+ else if (this.depth > obj.depth) {
+ return -1;
+ };
+ return 0;
};
};
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index addc5d8..1bb4035 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -15,7 +15,7 @@
#log4j.logger.de.ids_mannheim.korap.KorapCollection = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout