| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.ArrayList; |
| 5 | import java.util.List; |
| 6 | import java.util.Map; |
| 7 | |
| 8 | import org.apache.lucene.index.AtomicReaderContext; |
| 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| 11 | import org.apache.lucene.search.spans.Spans; |
| 12 | import org.apache.lucene.util.Bits; |
| 13 | |
| 14 | import de.ids_mannheim.korap.query.SpanExpansionQuery; |
| 15 | |
| Eliza Margaretha | 7dd8712 | 2014-09-16 15:34:46 +0000 | [diff] [blame^] | 16 | /** Spans expanded with min m tokens and max n tokens, and throughout all |
| 17 | * the expansions do not contain the notClause. |
| 18 | * |
| 19 | * @author margaretha |
| 20 | * */ |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 21 | public class ExpandedExclusionSpans extends SimpleSpans{ |
| 22 | |
| 23 | private int min, max; |
| 24 | private int direction; |
| 25 | private List<CandidateSpan> candidateSpans; |
| 26 | private boolean hasMoreNotClause; |
| 27 | private Spans notClause; |
| 28 | |
| 29 | public ExpandedExclusionSpans(SpanExpansionQuery spanExpansionQuery, |
| 30 | AtomicReaderContext context, Bits acceptDocs, |
| 31 | Map<Term, TermContext> termContexts) throws IOException { |
| 32 | super(spanExpansionQuery, context, acceptDocs, termContexts); |
| 33 | |
| 34 | if (spanExpansionQuery.getSecondClause() == null){ |
| 35 | throw new IllegalArgumentException("The SpanExpansionQuery " + |
| 36 | "is not valid. The spanquery to exclude (notClause) cannot " + |
| 37 | "be null."); |
| 38 | } |
| 39 | |
| 40 | /*if (spanExpansionQuery.getMin() < 1){ |
| 41 | throw new IllegalArgumentException("Min occurrence for notClause " + |
| 42 | "must be at least 1."); |
| 43 | }*/ |
| 44 | |
| 45 | this.min = spanExpansionQuery.getMin(); |
| 46 | this.max = spanExpansionQuery.getMax(); |
| 47 | this.direction = spanExpansionQuery.getDirection(); |
| 48 | |
| 49 | this.notClause = secondSpans; |
| 50 | this.hasMoreNotClause = notClause.next(); |
| 51 | |
| 52 | candidateSpans = new ArrayList<CandidateSpan>(); |
| 53 | hasMoreSpans = firstSpans.next(); |
| 54 | } |
| 55 | |
| 56 | @Override |
| 57 | public boolean next() throws IOException { |
| 58 | matchPayload.clear(); |
| 59 | isStartEnumeration = false; |
| 60 | return advance(); |
| 61 | } |
| 62 | |
| 63 | private boolean advance() throws IOException { |
| 64 | while (hasMoreSpans || candidateSpans.size() > 0){ |
| Eliza Margaretha | 7dd8712 | 2014-09-16 15:34:46 +0000 | [diff] [blame^] | 65 | if (candidateSpans.size() > 0){ |
| 66 | // set a candidate span as a match |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 67 | CandidateSpan cs = candidateSpans.get(0); |
| 68 | matchDocNumber = cs.getDoc(); |
| 69 | matchStartPosition = cs.getStart(); |
| 70 | matchEndPosition = cs.getEnd(); |
| 71 | matchPayload = cs.getPayloads(); |
| 72 | candidateSpans.remove(0); |
| 73 | return true; |
| 74 | } |
| 75 | else if (!hasMoreNotClause || notClause.doc() > firstSpans.doc()){ |
| 76 | generateCandidates(min, max, direction); |
| 77 | hasMoreSpans = firstSpans.next(); |
| 78 | } |
| 79 | else findMatches(); |
| 80 | } |
| 81 | return false; |
| 82 | } |
| 83 | |
| 84 | private void findMatches() throws IOException { |
| 85 | while (hasMoreNotClause && notClause.doc() <= firstSpans.doc()){ |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 86 | if (notClause.doc() == firstSpans.doc()){ |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 87 | if (direction < 0 ){ // left |
| Eliza Margaretha | 7dd8712 | 2014-09-16 15:34:46 +0000 | [diff] [blame^] | 88 | expandLeft(); |
| 89 | } // right |
| 90 | else { expandRight(); } |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 91 | break; |
| 92 | } |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 93 | else if (!notClause.next()) hasMoreNotClause = false; |
| 94 | } |
| 95 | } |
| 96 | |
| Eliza Margaretha | 7dd8712 | 2014-09-16 15:34:46 +0000 | [diff] [blame^] | 97 | private void expandLeft() throws IOException{ |
| 98 | //int counter = max; |
| 99 | int maxPos = max; |
| 100 | CandidateSpan lastNotClause = null; |
| 101 | while (hasMoreNotClause && |
| 102 | notClause.start() < firstSpans.start()){ |
| 103 | |
| 104 | // between max and firstspan.start() |
| 105 | if (notClause.start() >= firstSpans.start() - maxPos){ |
| 106 | maxPos = firstSpans.start() - notClause.start() -1; |
| 107 | lastNotClause = new CandidateSpan(notClause); |
| 108 | //counter--; |
| 109 | } |
| 110 | if (!notClause.next()) hasMoreNotClause = false; |
| 111 | } |
| 112 | |
| 113 | // if a notClause is between max and firstspan.start, |
| 114 | // then maxPos = last NotClause pos -1 |
| 115 | generateCandidates(min, maxPos, direction); |
| 116 | |
| 117 | if (lastNotClause != null) |
| 118 | while ((hasMoreSpans = firstSpans.next()) |
| 119 | // the next notClause is not in between max and firstspan.start() |
| 120 | && notClause.start() > firstSpans.start() |
| 121 | // the last notClause is in between max and firstspan.start() |
| 122 | && lastNotClause.getStart() < firstSpans.start() |
| 123 | && lastNotClause.getStart() >= firstSpans.start() - max |
| 124 | ){ |
| 125 | |
| 126 | maxPos = firstSpans.start() - lastNotClause.getStart() -1; |
| 127 | generateCandidates(min, maxPos, direction); |
| 128 | } |
| 129 | else hasMoreSpans = firstSpans.next(); |
| 130 | } |
| 131 | |
| 132 | private void expandRight() throws IOException{ |
| 133 | int expansionEnd = firstSpans.end() + max; |
| 134 | int maxPos = max; |
| 135 | boolean isFound = false; |
| 136 | |
| 137 | CandidateSpan firstNotClause = null; |
| 138 | //System.out.println("main start:"+firstSpans.start()); |
| 139 | while (hasMoreNotClause && notClause.start() < expansionEnd){ |
| 140 | // between firstspan.end() and expansionEnd |
| 141 | if (!isFound && notClause.start() >= firstSpans.end()){ |
| 142 | maxPos = notClause.start() - firstSpans.end() -1; |
| 143 | firstNotClause = new CandidateSpan(notClause); |
| 144 | isFound = true; |
| 145 | } |
| 146 | if (!notClause.next()) hasMoreNotClause = false; |
| 147 | } |
| 148 | // if a notClause is between firstSpan.end and max |
| 149 | // then maxPos = the first notClause pos -1 |
| 150 | generateCandidates(min, maxPos, direction); |
| 151 | |
| 152 | if (firstNotClause !=null){ |
| 153 | while ((hasMoreSpans = firstSpans.next()) |
| 154 | // in between |
| 155 | && firstNotClause.getStart() < firstSpans.end() + max |
| 156 | && firstNotClause.getStart() >= firstSpans.end()) |
| 157 | { |
| 158 | //System.out.println("first start:"+firstNotClause.getStart()+", main start:"+firstSpans.start()); |
| 159 | maxPos = firstNotClause.getStart() - firstSpans.end() -1; |
| 160 | generateCandidates(min, maxPos, direction); |
| 161 | } |
| 162 | } |
| 163 | else hasMoreSpans = firstSpans.next(); |
| 164 | } |
| 165 | |
| Eliza Margaretha | c8d0a7a | 2014-09-16 13:08:30 +0000 | [diff] [blame] | 166 | private void generateCandidates(int minPos, int maxPos, int direction) |
| 167 | throws IOException { |
| 168 | int counter; |
| 169 | int start, end; |
| 170 | |
| 171 | if (direction < 0 ) { // left |
| 172 | counter = maxPos; |
| 173 | while (counter >= min){ |
| 174 | start = firstSpans.start() - counter; |
| 175 | if (start > -1 ){ |
| 176 | |
| 177 | end = firstSpans.end(); |
| 178 | //System.out.println(start+","+end); |
| 179 | candidateSpans.add(new CandidateSpan(start, end, firstSpans.doc(), |
| 180 | firstSpans.cost(), firstSpans.getPayload())); |
| 181 | } |
| 182 | counter --; |
| 183 | } |
| 184 | } |
| 185 | else { // right |
| 186 | counter = minPos; |
| 187 | while(counter <= maxPos){ |
| 188 | start = firstSpans.start(); |
| 189 | end = firstSpans.end() + counter; |
| 190 | //System.out.println(start+","+end); |
| 191 | candidateSpans.add(new CandidateSpan(start, end, firstSpans.doc(), |
| 192 | firstSpans.cost(), firstSpans.getPayload())); |
| 193 | counter++; |
| 194 | } |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | |
| 199 | @Override |
| 200 | public boolean skipTo(int target) throws IOException { |
| 201 | if (hasMoreSpans && (firstSpans.doc() < target)){ |
| 202 | if (!firstSpans.skipTo(target)){ |
| 203 | hasMoreSpans = false; |
| 204 | return false; |
| 205 | } |
| 206 | } |
| 207 | matchPayload.clear(); |
| 208 | return advance(); |
| 209 | } |
| 210 | |
| 211 | @Override |
| 212 | public long cost() { |
| 213 | // TODO Auto-generated method stub |
| 214 | return 0; |
| 215 | } |
| 216 | } |