| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 1 | package de.mannheim.ids.korap.sru; |
| 2 | |
| 3 | import java.io.ByteArrayInputStream; |
| 4 | import java.io.IOException; |
| 5 | import java.io.InputStream; |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 6 | import java.net.URISyntaxException; |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 7 | import java.util.Arrays; |
| 8 | import java.util.List; |
| 9 | import java.util.Map; |
| 10 | import java.util.Set; |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 11 | |
| 12 | import javax.xml.parsers.ParserConfigurationException; |
| 13 | import javax.xml.parsers.SAXParser; |
| 14 | import javax.xml.parsers.SAXParserFactory; |
| 15 | import javax.xml.stream.XMLStreamException; |
| 16 | import javax.xml.stream.XMLStreamWriter; |
| 17 | |
| 18 | import org.slf4j.Logger; |
| 19 | import org.slf4j.LoggerFactory; |
| 20 | import org.xml.sax.SAXException; |
| 21 | |
| 22 | import eu.clarin.sru.server.SRUConstants; |
| 23 | import eu.clarin.sru.server.SRUDiagnosticList; |
| 24 | import eu.clarin.sru.server.SRUException; |
| 25 | import eu.clarin.sru.server.SRUSearchResultSet; |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 26 | import eu.clarin.sru.server.fcs.AdvancedDataViewWriter; |
| 27 | import eu.clarin.sru.server.fcs.Layer; |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 28 | import eu.clarin.sru.server.fcs.XMLStreamWriterHelper; |
| 29 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 30 | /** |
| 31 | * Prepares and creates a search result set for a search retrieve URL |
| 32 | * call. |
| 33 | * |
| 34 | * @author margaretha |
| 35 | * |
| 36 | */ |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 37 | public class KorapSRUSearchResultSet extends SRUSearchResultSet { |
| 38 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 39 | private Logger logger = (Logger) LoggerFactory |
| 40 | .getLogger(KorapSRUSearchResultSet.class); |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 41 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 42 | private int i = -1; |
| 43 | private KorapResult korapResult; |
| 44 | private List<String> dataviews; |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 45 | private SAXParser saxParser; |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 46 | private Layer textLayer; |
| 47 | private AnnotationHandler annotationHandler; |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 48 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 49 | /** |
| 50 | * Constructs a KorapSRUSearchResultSet for the given KorapResult. |
| 51 | * |
| 52 | * @param diagnostics |
| 53 | * a list of SRU diagnostics |
| 54 | * @param korapResult |
| 55 | * the query result |
| 56 | * @param dataviews |
| 57 | * the required dataviews to generate |
| 58 | * @param textlayer |
| 59 | * the text layer |
| 60 | * @param annotationLayers |
| 61 | * the list of annotation layers |
| 62 | * @throws SRUException |
| 63 | */ |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 64 | public KorapSRUSearchResultSet (SRUDiagnosticList diagnostics, |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 65 | KorapResult korapResult, List<String> dataviews, Layer textlayer, |
| 66 | List<AnnotationLayer> annotationLayers) throws SRUException { |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 67 | super(diagnostics); |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 68 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 69 | SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); |
| 70 | try { |
| 71 | saxParser = saxParserFactory.newSAXParser(); |
| 72 | } |
| 73 | catch (ParserConfigurationException | SAXException e) { |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 74 | throw new SRUException(SRUConstants.SRU_GENERAL_SYSTEM_ERROR, e); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 75 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 76 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 77 | this.korapResult = korapResult; |
| 78 | this.dataviews = dataviews; |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 79 | this.textLayer = textlayer; |
| 80 | annotationHandler = new AnnotationHandler(annotationLayers); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 81 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 82 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 83 | @Override |
| 84 | public int getTotalRecordCount() { |
| 85 | return korapResult.getTotalResults(); |
| 86 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 87 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 88 | @Override |
| 89 | public int getRecordCount() { |
| 90 | return korapResult.getMatchSize(); |
| 91 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 92 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 93 | @Override |
| 94 | public String getRecordSchemaIdentifier() { |
| 95 | return KorapSRU.CLARIN_FCS_RECORD_SCHEMA; |
| 96 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 97 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 98 | @Override |
| 99 | public boolean nextRecord() throws SRUException { |
| 100 | return (++i < korapResult.getMatchSize() ? true : false); |
| 101 | } |
| 102 | |
| 103 | @Override |
| 104 | public String getRecordIdentifier() { |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 105 | return korapResult.getMatch(i).getMatchId(); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 106 | } |
| 107 | |
| 108 | @Override |
| 109 | public void writeRecord(XMLStreamWriter writer) throws XMLStreamException { |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 110 | KorapMatch match = korapResult.getMatch(i); |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 111 | match.parseMatchId(); |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 112 | XMLStreamWriterHelper.writeStartResource(writer, match.getMatchId(), |
| 113 | null); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 114 | XMLStreamWriterHelper.writeStartResourceFragment(writer, null, null); |
| 115 | |
| 116 | List<AnnotationLayer> annotationLayers; |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 117 | annotationLayers = parseAnnotations(match); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 118 | |
| 119 | writeAdvancedDataView(writer, annotationLayers); |
| 120 | |
| 121 | XMLStreamWriterHelper.writeEndResourceFragment(writer); |
| 122 | XMLStreamWriterHelper.writeEndResource(writer); |
| 123 | } |
| 124 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 125 | /** |
| 126 | * Parses the current match snippet from KorAP search API into |
| 127 | * keyword, left context and right context. |
| 128 | * |
| 129 | * @return a KorapMatch |
| 130 | * @throws XMLStreamException |
| 131 | */ |
| 132 | @Deprecated |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 133 | private KorapMatch parseMatch() throws XMLStreamException { |
| 134 | KorapMatch match = korapResult.getMatch(i); |
| 135 | String snippet = "<snippet>" + match.getSnippet() + "</snippet>"; |
| 136 | InputStream is = new ByteArrayInputStream(snippet.getBytes()); |
| 137 | try { |
| 138 | saxParser.parse(is, new KorapMatchHandler(match)); |
| 139 | } |
| 140 | catch (SAXException | IOException e) { |
| 141 | throw new XMLStreamException(e); |
| 142 | } |
| 143 | return match; |
| 144 | } |
| 145 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 146 | /** |
| 147 | * Retrieves and parses the annotations of a match from KorAP |
| 148 | * MatchInfo API. |
| 149 | * |
| 150 | * @param match |
| 151 | * a KorapMatch |
| 152 | * @return a list of annotation layers containing the match |
| 153 | * annotations. |
| 154 | * @throws XMLStreamException |
| 155 | */ |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 156 | private List<AnnotationLayer> parseAnnotations(KorapMatch match) |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 157 | throws XMLStreamException { |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 158 | if (match == null) { |
| 159 | throw new NullPointerException("KorapMatch is null."); |
| 160 | } |
| 161 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 162 | try { |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 163 | String annotationSnippet = KorapClient.retrieveAnnotations( |
| Eliza Margaretha | e0e40a3 | 2016-11-09 19:16:08 +0100 | [diff] [blame] | 164 | match.getCorpusId(), match.getDocId(), match.getTextId(), |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 165 | match.getPositionId(), "*"); |
| 166 | InputStream is = new ByteArrayInputStream( |
| Eliza Margaretha | e0e40a3 | 2016-11-09 19:16:08 +0100 | [diff] [blame] | 167 | annotationSnippet.getBytes("UTF-8")); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 168 | saxParser.parse(is, annotationHandler); |
| 169 | } |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 170 | catch (SAXException | IOException | URISyntaxException e) { |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 171 | throw new XMLStreamException(e); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 172 | } |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 173 | |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 174 | return annotationHandler.getAnnotationLayers(); |
| 175 | } |
| 176 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 177 | /** |
| 178 | * Writes advanced data views, namely segment annotations for each |
| 179 | * annotation layer. |
| 180 | * |
| 181 | * @param writer |
| 182 | * an XMLStreamWriter |
| 183 | * @param annotationLayers |
| 184 | * a list of annotation layers |
| 185 | * @throws XMLStreamException |
| 186 | */ |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 187 | private void writeAdvancedDataView(XMLStreamWriter writer, |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 188 | List<AnnotationLayer> annotationLayers) throws XMLStreamException { |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 189 | |
| 190 | AdvancedDataViewWriter helper = new AdvancedDataViewWriter( |
| 191 | AdvancedDataViewWriter.Unit.ITEM); |
| 192 | |
| 193 | addAnnotationsToWriter(helper, annotationLayers); |
| 194 | |
| 195 | helper.writeHitsDataView(writer, textLayer.getResultId()); |
| 196 | |
| 197 | if (dataviews.contains("adv")) { |
| 198 | helper.writeAdvancedDataView(writer); |
| 199 | } |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 200 | } |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 201 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 202 | /** |
| 203 | * Adds all annotations to the AdvancedDataViewWriter. |
| 204 | * |
| 205 | * @param helper |
| 206 | * an AdvancedDataViewWriter |
| 207 | * @param annotationLayers |
| 208 | * a list of annotation layers containing match |
| 209 | * annotations. |
| 210 | */ |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 211 | private void addAnnotationsToWriter(AdvancedDataViewWriter helper, |
| 212 | List<AnnotationLayer> annotationLayers) { |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 213 | |
| 214 | Map<Integer, List<Annotation>> map; |
| 215 | for (AnnotationLayer annotationLayer : annotationLayers) { |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 216 | map = annotationLayer.getAnnotationMap(); |
| 217 | Set<Integer> keyset = map.keySet(); |
| 218 | Integer[] keyArray = keyset.toArray(new Integer[keyset.size()]); |
| 219 | Arrays.sort(keyArray); |
| 220 | for (int key : keyArray) { |
| 221 | List<Annotation> annotations = map.get(key); |
| 222 | if (annotations == null) { |
| 223 | continue; |
| 224 | } |
| 225 | |
| 226 | // FCS advanced dataview does not allow multiple |
| 227 | // annotations on the same segment. |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 228 | // for (Annotation annotation : annotations){ |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 229 | Annotation annotation = annotations.get(0); |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 230 | |
| margaretha | d7fda43 | 2016-08-17 15:49:02 +0200 | [diff] [blame] | 231 | if (annotation.getHitLevel() > 0) { |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 232 | helper.addSpan(annotationLayer.getLayerId(), |
| 233 | annotation.getStart(), annotation.getEnd(), |
| 234 | annotation.getValue(), annotation.getHitLevel()); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 235 | } |
| 236 | else { |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 237 | helper.addSpan(annotationLayer.getLayerId(), |
| 238 | annotation.getStart(), annotation.getEnd(), |
| 239 | annotation.getValue()); |
| 240 | } |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 241 | } |
| margaretha | 43ea731 | 2016-08-08 19:00:23 +0200 | [diff] [blame] | 242 | map.clear(); |
| margaretha | 4a5f1c2 | 2016-08-03 17:34:32 +0200 | [diff] [blame] | 243 | } |
| 244 | } |
| margaretha | 4ec2cd3 | 2016-02-29 09:46:36 +0000 | [diff] [blame] | 245 | } |