| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.response.match; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 2 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 3 | import java.util.*; |
| 4 | import java.util.regex.*; |
| 5 | |
| 6 | public class MatchIdentifier extends DocIdentifier { |
| 7 | private int startPos, endPos = -1; |
| 8 | |
| 9 | private ArrayList<int[]> pos = new ArrayList<>(8); |
| 10 | |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 11 | // TODO: "contains" is necessary for a compatibility bug in Kustvakt |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 12 | Pattern idRegex = Pattern.compile("^(?:match-|contains-)" |
| 13 | + "(?:([^!]+?)[!\\.])?" + "([^!]+)-p([0-9]+)-([0-9]+)" |
| 14 | + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$"); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 15 | Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)"); |
| 16 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 17 | |
| 18 | public MatchIdentifier () {}; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 19 | |
| 20 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 21 | public MatchIdentifier (String id) { |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 22 | |
| 23 | // Replace for legacy reasons with incompatible versions of Kustvakt |
| 24 | id = id.replaceAll("^(contains-|match-)([^-!_\\.]+?)!\\2_", "$1$2_"); |
| 25 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 26 | Matcher matcher = idRegex.matcher(id); |
| 27 | if (matcher.matches()) { |
| Akron | 8f6f7a3 | 2015-06-25 01:03:15 +0200 | [diff] [blame] | 28 | |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 29 | // <legacy> |
| 30 | // and test compatibility |
| 31 | if (id.contains("!") || !id.contains("_")) { |
| 32 | this.setCorpusID(matcher.group(1)); |
| 33 | this.setDocID(matcher.group(2)); |
| 34 | } |
| 35 | // </legacy> |
| 36 | else { |
| 37 | // this.getCorpusID() + "." + this.getDocID() |
| 38 | this.setTextSigle(matcher.group(1) + '.' + matcher.group(2)); |
| 39 | }; |
| Akron | 8f6f7a3 | 2015-06-25 01:03:15 +0200 | [diff] [blame] | 40 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 41 | this.setStartPos(Integer.parseInt(matcher.group(3))); |
| 42 | this.setEndPos(Integer.parseInt(matcher.group(4))); |
| 43 | |
| 44 | if (matcher.group(5) != null) { |
| 45 | matcher = posRegex.matcher(matcher.group(5)); |
| 46 | while (matcher.find()) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 47 | this.addPos(Integer.parseInt(matcher.group(2)), |
| 48 | Integer.parseInt(matcher.group(3)), |
| 49 | Integer.parseInt(matcher.group(1))); |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 50 | }; |
| 51 | }; |
| 52 | }; |
| 53 | }; |
| 54 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 55 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 56 | public int getStartPos () { |
| 57 | return this.startPos; |
| 58 | }; |
| 59 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 60 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 61 | public void setStartPos (int pos) { |
| 62 | if (pos >= 0) |
| 63 | this.startPos = pos; |
| 64 | }; |
| 65 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 66 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 67 | public int getEndPos () { |
| 68 | return this.endPos; |
| 69 | }; |
| 70 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 71 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 72 | public void setEndPos (int pos) { |
| 73 | if (pos >= 0) |
| 74 | this.endPos = pos; |
| 75 | }; |
| 76 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 77 | |
| 78 | public void addPos (int start, int end, int number) { |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 79 | if (start >= 0 && end >= 0 && number >= 0) |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 80 | this.pos.add(new int[] { start, end, number }); |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 81 | }; |
| 82 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 83 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 84 | public ArrayList<int[]> getPos () { |
| 85 | return this.pos; |
| 86 | }; |
| 87 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 88 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 89 | public String toString () { |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 90 | StringBuilder sb = new StringBuilder("match-"); |
| 91 | |
| Akron | 8f6f7a3 | 2015-06-25 01:03:15 +0200 | [diff] [blame] | 92 | if (this.docID == null) { |
| 93 | if (this.textSigle == null) |
| 94 | return null; |
| 95 | |
| 96 | sb.append(this.textSigle); |
| 97 | } |
| 98 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 99 | // Get prefix string corpus/doc |
| Akron | 8f6f7a3 | 2015-06-25 01:03:15 +0200 | [diff] [blame] | 100 | // LEGACY |
| 101 | else if (this.corpusID != null) { |
| 102 | sb.append(this.corpusID).append('!').append(this.docID); |
| 103 | } |
| 104 | else { |
| 105 | sb.append(this.docID); |
| 106 | }; |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 107 | |
| Akron | 640458c | 2015-06-25 12:36:15 +0200 | [diff] [blame] | 108 | sb.append('-').append(this.getPositionString()); |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 109 | return sb.toString(); |
| 110 | }; |
| 111 | |
| 112 | |
| 113 | public String getPositionString () { |
| 114 | StringBuilder sb = new StringBuilder(); |
| 115 | sb.append('p').append(this.startPos).append('-').append(this.endPos); |
| 116 | |
| 117 | // Get Position information |
| 118 | for (int[] i : this.pos) { |
| 119 | sb.append('(').append(i[2]).append(')'); |
| 120 | sb.append(i[0]).append('-').append(i[1]); |
| 121 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 122 | |
| Nils Diewald | ff0f874 | 2015-02-26 20:42:45 +0000 | [diff] [blame] | 123 | return sb.toString(); |
| 124 | }; |
| 125 | }; |