| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 4 | import java.util.ArrayList; |
| 5 | import java.util.List; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 6 | import java.util.Map; |
| 7 | |
| 8 | import org.apache.lucene.index.AtomicReaderContext; |
| 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 11 | import org.apache.lucene.search.spans.SpanQuery; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 12 | import org.apache.lucene.util.Bits; |
| 13 | import org.slf4j.Logger; |
| 14 | import org.slf4j.LoggerFactory; |
| 15 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 16 | import de.ids_mannheim.korap.query.SpanAttributeQuery; |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 17 | import de.ids_mannheim.korap.query.SpanWithAttributeQuery; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 18 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 19 | /** |
| 20 | * Span enumeration of element or relation spans (referent spans) having and/or |
| 21 | * <em>not</em> having some attributes. This class only handles <em>and</em> |
| 22 | * operation on attributes. |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 23 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 24 | * Use SpanOrQuery to perform <em>or</em> operation on attributes, i.e. choose |
| 25 | * between two elements with some attribute constraints. Note that the attribute |
| 26 | * constraints have to be formulated in Conjunctive Normal Form (CNF). |
| 27 | * |
| 28 | * @author margaretha |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 29 | * */ |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 30 | public class SpansWithAttribute extends SpansWithId { |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 31 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 32 | private SpansWithId referentSpans; |
| 33 | private List<AttributeSpans> attributeList; |
| 34 | private List<AttributeSpans> notAttributeList; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 35 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 36 | protected Logger logger = LoggerFactory.getLogger(SpansWithAttribute.class); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 37 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 38 | /** |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 39 | * Constructs SpansWithAttribute from the given |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 40 | * {@link SpanWithAttributeQuery} and {@link SpansWithId}, such as |
| 41 | * elementSpans and relationSpans. |
| 42 | * |
| 43 | * @param spanWithAttributeQuery a spanWithAttributeQuery |
| 44 | * @param spansWithId a SpansWithId |
| 45 | * @param context |
| 46 | * @param acceptDocs |
| 47 | * @param termContexts |
| 48 | * @throws IOException |
| 49 | */ |
| 50 | public SpansWithAttribute(SpanWithAttributeQuery spanWithAttributeQuery, |
| 51 | SpansWithId spansWithId, AtomicReaderContext context, |
| 52 | Bits acceptDocs, Map<Term, TermContext> termContexts) |
| 53 | throws IOException { |
| 54 | super(spanWithAttributeQuery, context, acceptDocs, termContexts); |
| 55 | referentSpans = spansWithId; |
| 56 | referentSpans.hasSpanId = true; // dummy setting enabling reading elementRef |
| 57 | hasMoreSpans = referentSpans.next(); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 58 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 59 | attributeList = new ArrayList<AttributeSpans>(); |
| 60 | notAttributeList = new ArrayList<AttributeSpans>(); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 61 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 62 | List<SpanQuery> sqs = spanWithAttributeQuery.getClauseList(); |
| 63 | if (sqs != null) { |
| 64 | for (SpanQuery sq : sqs) { |
| 65 | addAttributes((SpanAttributeQuery) sq, context, acceptDocs, |
| 66 | termContexts); |
| 67 | } |
| 68 | } else { |
| 69 | addAttributes( |
| 70 | (SpanAttributeQuery) spanWithAttributeQuery |
| 71 | .getSecondClause(), |
| 72 | context, acceptDocs, termContexts); |
| 73 | } |
| 74 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 75 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 76 | /** |
| 77 | * Adds the given {@link SpanAttributeQuery} to the attributeList or |
| 78 | * notAttributeList depending on the query, whether it is a negation or not. |
| 79 | * |
| 80 | * @param sq a SpanAttributeQuery |
| 81 | * @param context |
| 82 | * @param acceptDocs |
| 83 | * @param termContexts |
| 84 | * @throws IOException |
| 85 | */ |
| 86 | private void addAttributes(SpanAttributeQuery sq, |
| 87 | AtomicReaderContext context, Bits acceptDocs, |
| 88 | Map<Term, TermContext> termContexts) throws IOException { |
| 89 | AttributeSpans as = (AttributeSpans) sq.getSpans(context, acceptDocs, |
| 90 | termContexts); |
| 91 | if (sq.isNegation()) { |
| 92 | notAttributeList.add(as); |
| 93 | as.next(); |
| 94 | } else { |
| 95 | attributeList.add(as); |
| 96 | hasMoreSpans &= as.next(); |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | @Override |
| 101 | public boolean next() throws IOException { |
| 102 | isStartEnumeration = false; |
| 103 | return advance(); |
| 104 | } |
| 105 | |
| 106 | /** |
| 107 | * Searches for the next match by first identify a possible element |
| 108 | * position, and then ensuring that the element contains all the attributes |
| 109 | * and <em>do not</em> contain any of the not attributes. |
| 110 | * |
| 111 | * @return <code>true</code> if the a match is found, <code>false</code> |
| 112 | * otherwise. |
| 113 | * @throws IOException |
| 114 | */ |
| 115 | private boolean advance() throws IOException { |
| 116 | |
| 117 | while (hasMoreSpans && searchSpanPosition()) { |
| 118 | //logger.info("element: " + withAttributeSpans.start() + ","+ withAttributeSpans.end() + |
| 119 | // " ref:"+withAttributeSpans.getSpanId()); |
| 120 | |
| 121 | if (checkReferentId() && checkNotReferentId()) { |
| 122 | this.matchDocNumber = referentSpans.doc(); |
| 123 | this.matchStartPosition = referentSpans.start(); |
| 124 | this.matchEndPosition = referentSpans.end(); |
| 125 | this.matchPayload = referentSpans.getPayload(); |
| 126 | this.spanId = referentSpans.getSpanId(); |
| 127 | |
| 128 | if (attributeList.size() > 0) |
| 129 | hasMoreSpans = attributeList.get(0).next(); |
| 130 | |
| 131 | hasMoreSpans &= referentSpans.next(); |
| 132 | return true; |
| 133 | } |
| 134 | } |
| 135 | return false; |
| 136 | } |
| 137 | |
| 138 | /** |
| 139 | * Searches for a possible referentSpan having the same document number and |
| 140 | * start position as the attributes', and the position is different from the |
| 141 | * <em>not attributes'</em> positions. |
| 142 | * |
| 143 | * @return <code>true</code> if the referentSpan position is valid, |
| 144 | * <code>false</code> otherwise. |
| 145 | * @throws IOException |
| 146 | */ |
| 147 | private boolean searchSpanPosition() throws IOException { |
| 148 | while (hasMoreSpans) { |
| 149 | if (referentSpans.getSpanId() < 1) { // the element does not have an attribute |
| 150 | hasMoreSpans = referentSpans.next(); |
| 151 | continue; |
| 152 | } |
| 153 | if (checkAttributeListPosition()) { |
| 154 | advanceNotAttributes(); |
| 155 | // logger.info("element is found: "+ withAttributeSpans.start()); |
| 156 | return true; |
| 157 | } |
| 158 | } |
| 159 | return false; |
| 160 | } |
| 161 | |
| 162 | /** |
| 163 | * Advances the attributes to be in the same document and start position as |
| 164 | * the referentSpan. |
| 165 | * |
| 166 | * @return <code>true</code> if the attributes are in the same document and |
| 167 | * start position as the referentSpan. |
| 168 | * @throws IOException |
| 169 | */ |
| 170 | private boolean checkAttributeListPosition() throws IOException { |
| 171 | int currentPosition = referentSpans.start(); |
| 172 | boolean isSame = true; |
| 173 | boolean isFirst = true; |
| 174 | |
| 175 | for (AttributeSpans a : attributeList) { |
| 176 | if (!ensureSamePosition(referentSpans, a)) |
| 177 | return false; |
| 178 | // logger.info("pos:" + withAttributeSpans.start()); |
| 179 | if (isFirst) { |
| 180 | isFirst = false; |
| 181 | currentPosition = referentSpans.start(); |
| 182 | } else if (currentPosition != referentSpans.start()) { |
| 183 | currentPosition = referentSpans.start(); |
| 184 | isSame = false; |
| 185 | |
| 186 | } |
| 187 | } |
| 188 | // logger.info("same pos: "+isSame+ ", pos "+withAttributeSpans.start()); |
| 189 | return isSame; |
| 190 | } |
| 191 | |
| 192 | /** |
| 193 | * Advances the element or attribute spans to be in the same document and |
| 194 | * start position. |
| 195 | * */ |
| 196 | private boolean ensureSamePosition(SpansWithId spans, |
| 197 | AttributeSpans attributes) throws IOException { |
| 198 | |
| 199 | while (hasMoreSpans && ensureSameDoc(spans, attributes)) { |
| 200 | if (attributes.start() == spans.start()) |
| 201 | return true; |
| 202 | else if (attributes.start() > spans.start()) |
| 203 | hasMoreSpans = spans.next(); |
| 204 | else |
| 205 | hasMoreSpans = attributes.next(); |
| 206 | } |
| 207 | |
| 208 | return false; |
| 209 | } |
| 210 | |
| 211 | /** |
| 212 | * Advances the <em>not-attributes</em> to be in the same or greater |
| 213 | * document number than referentSpans' document number. If a |
| 214 | * <em>not-attribute</em> is in the same document, it is advanced to be in |
| 215 | * the same as or greater start position than the current referentSpan. |
| 216 | * |
| 217 | * @throws IOException |
| 218 | */ |
| 219 | private void advanceNotAttributes() throws IOException { |
| 220 | |
| 221 | for (AttributeSpans a : notAttributeList) { |
| 222 | // advance the doc# of not AttributeSpans |
| 223 | // logger.info("a "+a.start()); |
| 224 | while (!a.isFinish() && a.doc() <= referentSpans.doc()) { |
| 225 | |
| 226 | if (a.doc() == referentSpans.doc() |
| 227 | && a.start() >= referentSpans.start()) |
| 228 | break; |
| 229 | |
| 230 | if (!a.next()) |
| 231 | a.setFinish(true); |
| 232 | } |
| 233 | } |
| 234 | //return true; |
| 235 | } |
| 236 | |
| 237 | /** |
| 238 | * Ensures that the referent id of each attributeSpans in the attributeList |
| 239 | * is the same as the spanId of the actual referentSpans. |
| 240 | * |
| 241 | * @return <code>true</code> if the spanId of the current referentSpans is |
| 242 | * the same as all the referentId of all the attributeSpans in the |
| 243 | * attributeList, <code>false</code> otherwise. |
| 244 | * @throws IOException |
| 245 | */ |
| 246 | private boolean checkReferentId() throws IOException { |
| 247 | for (AttributeSpans attribute : attributeList) { |
| 248 | if (referentSpans.getSpanId() != attribute.getReferentId()) { |
| 249 | if (referentSpans.getSpanId() < attribute.getReferentId()) |
| 250 | hasMoreSpans = attribute.next(); |
| 251 | else { |
| 252 | hasMoreSpans = referentSpans.next(); |
| 253 | } |
| 254 | |
| 255 | return false; |
| 256 | } |
| 257 | } |
| 258 | return true; |
| 259 | } |
| 260 | |
| 261 | /** |
| 262 | * Ensures that the referentSpans do <em>not</em> contain the |
| 263 | * <em>not attributes</em> (with negation). In other words, the spanId must |
| 264 | * not the same as the <em>not attribute</em>'s referentId. |
| 265 | * |
| 266 | * @return <code>true</code> if the referentSpan does not have the same |
| 267 | * spanId as the referentIds of all the not attributes, |
| 268 | * <code>false</code> otherwise. |
| 269 | * @throws IOException |
| 270 | */ |
| 271 | private boolean checkNotReferentId() throws IOException { |
| 272 | for (AttributeSpans notAttribute : notAttributeList) { |
| 273 | if (!notAttribute.isFinish() |
| 274 | && referentSpans.start() == notAttribute.start() |
| 275 | && referentSpans.getSpanId() == notAttribute |
| 276 | .getReferentId()) { |
| 277 | hasMoreSpans = referentSpans.next(); |
| 278 | return false; |
| 279 | } |
| 280 | } |
| 281 | return true; |
| 282 | } |
| 283 | |
| 284 | @Override |
| 285 | public boolean skipTo(int target) throws IOException { |
| 286 | if (hasMoreSpans && (referentSpans.doc() < target)) { |
| 287 | if (!referentSpans.skipTo(target)) { |
| 288 | return false; |
| 289 | } |
| 290 | } |
| 291 | isStartEnumeration = false; |
| 292 | return advance(); |
| 293 | } |
| 294 | |
| 295 | @Override |
| 296 | public long cost() { |
| 297 | |
| 298 | long cost = 0; |
| 299 | for (AttributeSpans as : attributeList) { |
| 300 | cost += as.cost(); |
| 301 | } |
| 302 | for (AttributeSpans as : notAttributeList) { |
| 303 | cost += as.cost(); |
| 304 | } |
| 305 | return referentSpans.cost() + cost; |
| 306 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 307 | |
| 308 | } |