| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.ArrayList; |
| 5 | import java.util.List; |
| 6 | import java.util.Map; |
| 7 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 8 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| 11 | import org.apache.lucene.util.Bits; |
| 12 | |
| 13 | import de.ids_mannheim.korap.query.SpanDistanceQuery; |
| 14 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 15 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 16 | * Enumeration of token-based distance span matches consisting of two |
| 17 | * child |
| 18 | * spans having an actual distance in the range of the minimum and |
| 19 | * maximum |
| 20 | * distance parameters specified in the corresponding query. A |
| 21 | * TokenDistanceSpan |
| 22 | * starts from the minimum start positions of its child spans and ends |
| 23 | * at the |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 24 | * maximum end positions of the child spans. |
| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 25 | * |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 26 | * @author margaretha |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 27 | */ |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 28 | public class TokenDistanceSpans extends OrderedDistanceSpans { |
| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 29 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 30 | /** |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 31 | * Constructs TokenDistanceSpans from the given query. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 32 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 33 | * @param query |
| 34 | * a SpanDistanceQuery |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 35 | * @param context |
| 36 | * @param acceptDocs |
| 37 | * @param termContexts |
| 38 | * @throws IOException |
| 39 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 40 | public TokenDistanceSpans (SpanDistanceQuery query, |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 41 | LeafReaderContext context, Bits acceptDocs, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 42 | Map<Term, TermContext> termContexts) |
| 43 | throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 44 | super(query, context, acceptDocs, termContexts); |
| 45 | hasMoreSpans = hasMoreFirstSpans; |
| 46 | } |
| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 47 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 48 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 49 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 50 | protected void setCandidateList () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 51 | if (candidateListDocNum == secondSpans.doc()) { |
| 52 | copyPossibleCandidates(); |
| 53 | addNewCandidates(); |
| 54 | candidateListIndex = -1; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 55 | } |
| 56 | else { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 57 | candidateList.clear(); |
| 58 | if (hasMoreFirstSpans && ensureSameDoc(firstSpans, secondSpans)) { |
| 59 | candidateListDocNum = firstSpans.doc(); |
| 60 | addNewCandidates(); |
| 61 | candidateListIndex = -1; |
| 62 | } |
| 63 | } |
| 64 | } |
| 65 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 66 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 67 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 68 | * Restructures the candidateList to contain only candidate |
| 69 | * (first) spans |
| 70 | * which are still possible to create a match, from the candidate |
| 71 | * list |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 72 | * prepared for the previous second spans. |
| 73 | * |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 74 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 75 | private void copyPossibleCandidates () { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 76 | List<CandidateSpan> temp = new ArrayList<>(); |
| 77 | for (CandidateSpan cs : candidateList) { |
| 78 | if (cs.getEnd() + maxDistance > secondSpans.start()) |
| 79 | temp.add(cs); |
| 80 | } |
| 81 | candidateList = temp; |
| 82 | } |
| 83 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 84 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 85 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 86 | * Add new possible firstspan candidates for the current |
| 87 | * secondspan. |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 88 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 89 | private void addNewCandidates () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 90 | while (hasMoreFirstSpans && firstSpans.doc() == candidateListDocNum |
| 91 | && firstSpans.start() < secondSpans.end()) { |
| 92 | |
| 93 | if (firstSpans.end() + maxDistance > secondSpans.start()) |
| 94 | candidateList.add(new CandidateSpan(firstSpans)); |
| 95 | |
| 96 | hasMoreFirstSpans = firstSpans.next(); |
| 97 | } |
| 98 | } |
| 99 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 100 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 101 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 102 | protected boolean findMatch () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 103 | CandidateSpan candidateSpan = candidateList.get(candidateListIndex); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 104 | if (minDistance == 0 && |
| 105 | // intersection |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 106 | candidateSpan.getStart() < secondSpans.end() |
| 107 | && secondSpans.start() < candidateSpan.getEnd()) { |
| 108 | |
| 109 | setMatchProperties(candidateSpan, true); |
| 110 | return true; |
| 111 | } |
| 112 | |
| 113 | int actualDistance = secondSpans.start() - candidateSpan.getEnd() + 1; |
| 114 | if (candidateSpan.getStart() < secondSpans.start() |
| 115 | && minDistance <= actualDistance |
| 116 | && actualDistance <= maxDistance) { |
| 117 | |
| 118 | setMatchProperties(candidateSpan, false); |
| 119 | return true; |
| 120 | } |
| 121 | return false; |
| 122 | } |
| 123 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 124 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 125 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 126 | public long cost () { |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 127 | if (candidateList.size() > 0) { |
| 128 | long cost = 0; |
| 129 | for (CandidateSpan candidateSpan : candidateList) { |
| 130 | cost += candidateSpan.getCost(); |
| 131 | } |
| 132 | return cost + secondSpans.cost(); |
| 133 | } |
| 134 | else { |
| 135 | return firstSpans.cost() + secondSpans.cost(); |
| 136 | } |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 137 | } |
| 138 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 139 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 140 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 141 | protected boolean isSecondSpanValid () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 142 | return true; |
| 143 | } |
| Eliza Margaretha | 8e274e3 | 2014-01-28 15:09:30 +0000 | [diff] [blame] | 144 | } |