| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 4 | import java.util.ArrayList; |
| 5 | import java.util.List; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 6 | import java.util.Map; |
| 7 | |
| 8 | import org.apache.lucene.index.AtomicReaderContext; |
| 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 11 | import org.apache.lucene.search.spans.SpanQuery; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 12 | import org.apache.lucene.util.Bits; |
| 13 | import org.slf4j.Logger; |
| 14 | import org.slf4j.LoggerFactory; |
| 15 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 16 | import de.ids_mannheim.korap.query.SpanAttributeQuery; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 17 | import de.ids_mannheim.korap.query.SpanElementAttributeQuery; |
| 18 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 19 | /** Span enumeration of elements that have some attribute and/or do <em>not</em> |
| 20 | * have some attributes. This class handles <em>and</em> operation on attributes. |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 21 | * |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 22 | * Use SpanOrQuery to perform <em>or</em> operation on attributes, i.e. choose |
| 23 | * between two elements with some attribute constraints. Note that the attribute |
| 24 | * constraints have to be in Conjunctive Normal Form (CNF). |
| 25 | * |
| 26 | * @author margaretha |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 27 | * */ |
| 28 | public class ElementAttributeSpans extends SimpleSpans{ |
| 29 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 30 | private ElementSpans elements; |
| 31 | private List<AttributeSpans> attributeList; |
| 32 | private List<AttributeSpans> notAttributeList; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 33 | |
| 34 | protected Logger logger = LoggerFactory.getLogger(ElementAttributeSpans.class); |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 35 | |
| 36 | // This advices the java compiler to ignore all loggings |
| 37 | public static final boolean DEBUG = false; |
| 38 | |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 39 | public ElementAttributeSpans(SpanElementAttributeQuery simpleSpanQuery, |
| 40 | AtomicReaderContext context, Bits acceptDocs, |
| 41 | Map<Term, TermContext> termContexts) throws IOException { |
| 42 | super(simpleSpanQuery, context, acceptDocs, termContexts); |
| 43 | elements = (ElementSpans) firstSpans; |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 44 | elements.isElementRef = true; // dummy setting enabling reading elementRef |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 45 | hasMoreSpans = elements.next(); |
| 46 | |
| 47 | attributeList = new ArrayList<AttributeSpans>(); |
| 48 | notAttributeList = new ArrayList<AttributeSpans>(); |
| 49 | |
| 50 | List<SpanQuery> sqs = simpleSpanQuery.getClauseList(); |
| 51 | AttributeSpans as; |
| 52 | for (SpanQuery sq: sqs){ |
| 53 | as = (AttributeSpans) sq.getSpans(context, acceptDocs, termContexts); |
| 54 | if (((SpanAttributeQuery) sq).isNegation()){ |
| 55 | notAttributeList.add(as); |
| 56 | as.next(); |
| 57 | } |
| 58 | else { |
| 59 | attributeList.add(as); |
| 60 | hasMoreSpans &= as.next(); |
| 61 | } |
| 62 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 63 | } |
| 64 | |
| 65 | @Override |
| 66 | public boolean next() throws IOException { |
| 67 | isStartEnumeration=false; |
| 68 | return advance(); |
| 69 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 70 | |
| 71 | /** Search for the next match by first identify a possible |
| 72 | * element position, and then ensuring that the element contains |
| 73 | * all the attributes and <em>do not</em> contain any of the |
| 74 | * not attributes. |
| 75 | * */ |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 76 | private boolean advance() throws IOException { |
| 77 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 78 | while (hasMoreSpans && computeElementPosition()){ |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 79 | if (DEBUG) |
| 80 | logger.info("element: " + elements.start() + ","+ elements.end() + |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 81 | " ref:"+elements.getElementRef()); |
| Eliza Margaretha | 669e7a8 | 2014-06-26 12:57:18 +0000 | [diff] [blame] | 82 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 83 | if (checkElementRef() && checkNotElementRef()){ |
| 84 | this.matchDocNumber = elements.doc(); |
| 85 | this.matchStartPosition = elements.start(); |
| 86 | this.matchEndPosition = elements.end(); |
| 87 | this.matchPayload = elements.getPayload(); |
| 88 | hasMoreSpans = attributeList.get(0).next(); |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 89 | if (DEBUG) |
| 90 | logger.info("MATCH "+matchDocNumber); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 91 | |
| 92 | hasMoreSpans = elements.next(); |
| 93 | return true; |
| 94 | } |
| 95 | } |
| 96 | return false; |
| 97 | } |
| 98 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 99 | /** Ensuring all the attribute spans having the same elementRef with |
| 100 | * the actual element's elementRef. |
| 101 | * */ |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 102 | private boolean checkElementRef() throws IOException{ |
| 103 | |
| 104 | for (AttributeSpans attribute: attributeList){ |
| 105 | if (elements.getElementRef() != attribute.getElementRef()){ |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 106 | if (DEBUG) |
| 107 | logger.info("attribute ref doesn't match"); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 108 | if (elements.getElementRef() < attribute.getElementRef()) |
| 109 | hasMoreSpans = attribute.next(); |
| 110 | else { |
| 111 | hasMoreSpans = elements.next(); |
| 112 | } |
| 113 | |
| 114 | return false; |
| 115 | } |
| 116 | } |
| 117 | return true; |
| 118 | } |
| 119 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 120 | /** Ensuring elements do not contain the not attributes. In other words, |
| 121 | * the elementRef is not the same as the not attribute's elementRefs. |
| 122 | * */ |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 123 | private boolean checkNotElementRef() throws IOException{ |
| 124 | for (AttributeSpans notAttribute: notAttributeList){ |
| Eliza Margaretha | 7788a98 | 2014-08-29 16:10:52 +0000 | [diff] [blame] | 125 | if (!notAttribute.isFinish() && |
| 126 | elements.start() == notAttribute.start() && |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 127 | elements.getElementRef() == notAttribute.getElementRef()){ |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 128 | if (DEBUG) |
| 129 | logger.info("not attribute ref exists"); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 130 | hasMoreSpans = elements.next(); |
| 131 | return false; |
| 132 | } |
| 133 | } |
| 134 | return true; |
| 135 | } |
| 136 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 137 | /** Search for a possible element having the same doc and start position as |
| 138 | * the attributes. |
| 139 | * */ |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 140 | private boolean computeElementPosition() throws IOException { |
| 141 | |
| 142 | while (hasMoreSpans){ |
| 143 | |
| 144 | if (elements.getElementRef() < 1){ // the element does not have an attribute |
| Eliza Margaretha | 669e7a8 | 2014-06-26 12:57:18 +0000 | [diff] [blame] | 145 | elements.isElementRef = true; // dummy setting enabling reading elementRef |
| 146 | hasMoreSpans = elements.next(); |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 147 | if (DEBUG) |
| 148 | logger.info("skip"); |
| Eliza Margaretha | 669e7a8 | 2014-06-26 12:57:18 +0000 | [diff] [blame] | 149 | continue; |
| 150 | } |
| 151 | |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 152 | if (checkAttributeListPosition() && |
| 153 | checkNotAttributeListPosition()){ |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 154 | if (DEBUG) |
| 155 | logger.info("element is found: "+ elements.start()); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 156 | return true; |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 157 | } |
| 158 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 159 | |
| 160 | return false; |
| 161 | } |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 162 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 163 | /** Advancing the not attributes to be in the same or greater doc# than |
| 164 | * element doc#. If a not attribute is in the same doc, advance it to |
| 165 | * be in the same or greater start position than the element. |
| 166 | * |
| 167 | * */ |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 168 | private boolean checkNotAttributeListPosition() throws IOException{ |
| 169 | |
| 170 | for (AttributeSpans a : notAttributeList){ |
| 171 | // advance the doc# of not AttributeSpans |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 172 | if (DEBUG) |
| 173 | logger.info("a "+a.start()); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 174 | while (!a.isFinish() && a.doc() <= elements.doc()){ |
| 175 | |
| 176 | if (a.doc() == elements.doc() && |
| 177 | a.start() >= elements.start()) |
| 178 | break; |
| 179 | |
| 180 | if (!a.next()) a.setFinish(true); |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | return true; |
| 185 | } |
| 186 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 187 | /** Advancing the attributes to be in the same doc and start position |
| 188 | * as the element. |
| 189 | * */ |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 190 | private boolean checkAttributeListPosition() throws IOException{ |
| 191 | int currentPosition = elements.start(); |
| 192 | boolean isSame = true; |
| 193 | boolean isFirst = true; |
| 194 | |
| 195 | for (AttributeSpans a : attributeList){ |
| 196 | if(!ensureSamePosition(elements, a)) return false; |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 197 | if (DEBUG) |
| 198 | logger.info("pos:" + elements.start()); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 199 | if (isFirst){ |
| 200 | isFirst = false; |
| 201 | currentPosition = elements.start(); |
| 202 | } |
| 203 | else if (currentPosition != elements.start()){ |
| 204 | currentPosition = elements.start(); |
| 205 | isSame = false; |
| 206 | |
| 207 | } |
| 208 | } |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 209 | if (DEBUG) |
| 210 | logger.info("same pos: "+isSame+ ", pos "+elements.start()); |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 211 | return isSame; |
| 212 | } |
| 213 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 214 | /** Advance the element or attribute spans to be in the same doc |
| 215 | * and start position. |
| 216 | * */ |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 217 | private boolean ensureSamePosition(ElementSpans elements, |
| 218 | AttributeSpans attributes) throws IOException { |
| 219 | |
| 220 | while (hasMoreSpans && ensureSameDoc(elements, attributes)){ |
| 221 | if (attributes.start() == elements.start()) |
| 222 | return true; |
| 223 | else if (attributes.start() > elements.start()) |
| 224 | hasMoreSpans = elements.next(); |
| 225 | else |
| 226 | hasMoreSpans= attributes.next(); |
| 227 | } |
| 228 | |
| 229 | return false; |
| 230 | } |
| 231 | |
| 232 | @Override |
| 233 | public boolean skipTo(int target) throws IOException { |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 234 | if (hasMoreSpans && (elements.doc() < target)){ |
| 235 | if (!elements.skipTo(target)){ |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 236 | return false; |
| 237 | } |
| 238 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 239 | isStartEnumeration=false; |
| 240 | return advance(); |
| 241 | } |
| 242 | |
| 243 | @Override |
| 244 | public long cost() { |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 245 | |
| 246 | long cost = 0; |
| 247 | for (AttributeSpans as: attributeList){ |
| 248 | cost += as.cost(); |
| 249 | } |
| 250 | for (AttributeSpans as: notAttributeList){ |
| 251 | cost += as.cost(); |
| 252 | } |
| 253 | return elements.cost() + cost; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 254 | } |
| 255 | |
| 256 | |
| 257 | } |