Updated MatchInfo parsing to handle <span class=match>.
Change-Id: I6c4fceb1ce0c338a09c78f947282de81aad447df
diff --git a/src/main/java/de/mannheim/ids/korap/sru/Annotation.java b/src/main/java/de/mannheim/ids/korap/sru/Annotation.java
index 43081c3..65f6030 100644
--- a/src/main/java/de/mannheim/ids/korap/sru/Annotation.java
+++ b/src/main/java/de/mannheim/ids/korap/sru/Annotation.java
@@ -1,6 +1,6 @@
package de.mannheim.ids.korap.sru;
-/**
+/**
* @author margaretha
*
*/
@@ -10,14 +10,14 @@
private long start;
private long end;
private String value;
- private boolean isKeyword;
-
- public Annotation (int id, String value, long start, long end, boolean isKeyword) {
+ private int hitLevel;
+
+ public Annotation (int id, String value, long start, long end, int hitLevel) {
this.id = id;
this.value = value;
this.start = start;
this.end = end;
- this.isKeyword = isKeyword;
+ this.hitLevel = hitLevel;
}
public int getId() {
@@ -52,11 +52,11 @@
this.value = value;
}
- public boolean isKeyword() {
- return isKeyword;
+ public int getHitLevel() {
+ return hitLevel;
}
-
- public void setKeyword(boolean isKeyword) {
- this.isKeyword = isKeyword;
+
+ public void setHitLevel(int hitLevel) {
+ this.hitLevel = hitLevel;
}
}
diff --git a/src/main/java/de/mannheim/ids/korap/sru/AnnotationHandler.java b/src/main/java/de/mannheim/ids/korap/sru/AnnotationHandler.java
index 33b6096..4cc66b4 100644
--- a/src/main/java/de/mannheim/ids/korap/sru/AnnotationHandler.java
+++ b/src/main/java/de/mannheim/ids/korap/sru/AnnotationHandler.java
@@ -4,7 +4,6 @@
import java.util.List;
import java.util.Map;
-import org.hamcrest.core.IsSame;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
@@ -17,6 +16,7 @@
.getLogger(AnnotationHandler.class);
private boolean startSegment = true;
+ private boolean startSentence = false;
private int matchLevel = 0;
@@ -41,24 +41,30 @@
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
- if (qName.equals("mark")) {
- text = textBuilder.toString();
- textBuilder = new StringBuilder();
- if (!text.isEmpty()) {
- addAnnotationToMap(text, annotationLayers.get(0),
- (matchLevel > 1), textStartOffset, textEndOffset);
- textStartOffset = textEndOffset;
- }
- matchLevel++;
- }
- if (matchLevel > 0 && qName.equals("span")
- && attributes.getQName(0).equals("title")) {
+ if (startSentence && attributes.getValue("title") != null && qName.equals("span")) {
if (startSegment) {
segmentStartOffset = segmentEndOffset;
startSegment = false;
}
annotationStrings.add(attributes.getValue("title"));
-
+ }
+ else if (attributes.getValue("class") !=null && qName.equals("span")){
+ if (attributes.getValue("class").equals("match")){
+ startSentence = true;
+ }
+ else {
+ startSentence = false;
+ }
+ }
+ else if (qName.equals("mark")) {
+ text = textBuilder.toString();
+ textBuilder = new StringBuilder();
+ if (!text.isEmpty()) {
+ addAnnotationToMap(text, annotationLayers.get(0),
+ matchLevel, textStartOffset, textEndOffset);
+ textStartOffset = textEndOffset;
+ }
+ matchLevel++;
}
super.startElement(uri, localName, qName, attributes);
@@ -79,22 +85,22 @@
segmentBuilder = new StringBuilder();
}
else if (annotationLayer.getLayerCode().equals(layerCode)) {
- addAnnotationToMap(value, annotationLayer, false);
+ addAnnotationToMap(value, annotationLayer, 0);
break;
}
}
}
private void addAnnotationToMap(String value,
- AnnotationLayer annotationLayer, boolean isKeyword) {
- addAnnotationToMap(value, annotationLayer, isKeyword, segmentStartOffset, segmentEndOffset);
+ AnnotationLayer annotationLayer, int hitLevel) {
+ addAnnotationToMap(value, annotationLayer, hitLevel, segmentStartOffset, segmentEndOffset);
}
private void addAnnotationToMap(String value,
- AnnotationLayer annotationLayer, boolean isKeyword, long startOffset, long endOffset) {
+ AnnotationLayer annotationLayer, int hitLevel, long startOffset, long endOffset) {
Annotation annotation = new Annotation(id, value, startOffset,
- endOffset, isKeyword);
+ endOffset, hitLevel);
Map<Integer, List<Annotation>> map = annotationLayer.getAnnotationMap();
@@ -114,18 +120,14 @@
if (qName.equals("mark")) {
- annotationLayers.get(0);
-
text = textBuilder.toString();
textBuilder = new StringBuilder();
-
addAnnotationToMap(text, annotationLayers.get(0),
- (matchLevel > 1), textStartOffset, textEndOffset);
+ matchLevel, textStartOffset, textEndOffset);
textStartOffset = textEndOffset;
-
matchLevel--;
}
- else if (!startSegment) {
+ else if (!startSegment && qName.equals("span")) {
for (String annotationStr : annotationStrings) {
parseAnnotation(annotationStr);
}
@@ -134,6 +136,13 @@
annotationStrings.clear();
}
}
+
+ @Override
+ public void endDocument() throws SAXException {
+ text = textBuilder.toString();
+ addAnnotationToMap(text, annotationLayers.get(0),
+ matchLevel, textStartOffset, textEndOffset);
+ }
@Override
public void characters(char[] ch, int start, int length)
diff --git a/src/main/java/de/mannheim/ids/korap/sru/KorapMatch.java b/src/main/java/de/mannheim/ids/korap/sru/KorapMatch.java
index cf1db12..b5c2889 100644
--- a/src/main/java/de/mannheim/ids/korap/sru/KorapMatch.java
+++ b/src/main/java/de/mannheim/ids/korap/sru/KorapMatch.java
@@ -2,6 +2,8 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
@@ -9,10 +11,10 @@
@JsonIgnoreProperties(ignoreUnknown = true)
public class KorapMatch {
- private String ID;
- private String positionID;
- private String docID;
- private String corpusID;
+ private String matchID;
+ private String positionId;
+ private String docId;
+ private String corpusId;
private String leftContext;
private String keyword;
private String rightContext;
@@ -20,41 +22,51 @@
private String text;
private List<AnnotationLayer> annotationLayers = new ArrayList<AnnotationLayer>();
+
+ private static Pattern idPattern = Pattern.compile("match-(.*)_(.*)-p([0-9]+-[0-9]+)");
public KorapMatch () {}
-
- @JsonProperty("ID")
- public String getID() {
- return ID;
+
+ @JsonProperty("matchID")
+ public String getMatchId() {
+ return matchID;
}
- public void setID(String id) {
- this.ID = id;
+ public void setMatchId(String id) {
+ this.matchID = id;
+ }
+
+ public void parseMatchId(){
+ Matcher matcher = idPattern.matcher(matchID);
+ if (matcher.find()){
+ this.corpusId = matcher.group(1);
+ this.docId = matcher.group(2);
+ this.positionId = "p"+matcher.group(3);
+ }
+ }
+
+ public void setPositionId(String positionId) {
+ this.positionId = positionId;
}
- public void setPositionID() {
- String[] idParts = ID.split("-");
- this.positionID = idParts[2] + "-" + idParts[3];
+ public String getPositionId() {
+ return positionId;
}
- public String getPositionID() {
- return positionID;
+ public String getDocId() {
+ return docId;
}
- public String getDocID() {
- return docID;
+ public void setDocId(String docID) {
+ this.docId = docID.replace(corpusId + "_", "");
}
- public void setDocID(String docID) {
- this.docID = docID.replace(corpusID + "_", "");
+ public String getCorpusId() {
+ return corpusId;
}
- public String getCorpusID() {
- return corpusID;
- }
-
- public void setCorpusID(String corpusID) {
- this.corpusID = corpusID;
+ public void setCorpusId(String corpusId) {
+ this.corpusId = corpusId;
}
public String getLeftContext() {
diff --git a/src/main/java/de/mannheim/ids/korap/sru/KorapMatchHandler.java b/src/main/java/de/mannheim/ids/korap/sru/KorapMatchHandler.java
index 952fca3..1cd91ee 100644
--- a/src/main/java/de/mannheim/ids/korap/sru/KorapMatchHandler.java
+++ b/src/main/java/de/mannheim/ids/korap/sru/KorapMatchHandler.java
@@ -6,7 +6,7 @@
public class KorapMatchHandler extends DefaultHandler{
- KorapMatch match;
+ private KorapMatch match;
boolean isLeftContext, isRightContext, isKeyword, isMore;
private StringBuilder sbLeft, sbRight, sbKey;
diff --git a/src/main/java/de/mannheim/ids/korap/sru/QueryLanguage.java b/src/main/java/de/mannheim/ids/korap/sru/QueryLanguage.java
index 98d2d28..7bccd81 100644
--- a/src/main/java/de/mannheim/ids/korap/sru/QueryLanguage.java
+++ b/src/main/java/de/mannheim/ids/korap/sru/QueryLanguage.java
@@ -2,4 +2,8 @@
public enum QueryLanguage {
CQL, FCSQL;
+
+ public String toString() {
+ return super.toString().toLowerCase();
+ };
}