| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 1 | package de.ids_mannheim.korap.tokenizer; |
| 2 | /** |
| 3 | * Licensed to the Apache Software Foundation (ASF) under one or more |
| 4 | * contributor license agreements. See the NOTICE file distributed with |
| 5 | * this work for additional information regarding copyright ownership. |
| 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 7 | * (the "License"); you may not use this file except in compliance with |
| 8 | * the License. You may obtain a copy of the License at |
| 9 | * |
| 10 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | * |
| 12 | * Unless required by applicable law or agreed to in writing, software |
| 13 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | * See the License for the specific language governing permissions and |
| 16 | * limitations under the License. |
| 17 | */ |
| 18 | |
| 19 | /** |
| 20 | Modifications |
| 21 | Copyright 2014 David Hall |
| 22 | |
| 23 | Licensed under the Apache License, Version 2.0 (the "License") |
| 24 | you may not use this file except in compliance with the License. |
| 25 | You may obtain a copy of the License at |
| 26 | |
| 27 | http://www.apache.org/licenses/LICENSE-2.0 |
| 28 | |
| 29 | Unless required by applicable law or agreed to in writing, software |
| 30 | distributed under the License is distributed on an "AS IS" BASIS, |
| 31 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 32 | See the License for the specific language governing permissions and |
| 33 | limitations under the License. |
| 34 | */ |
| 35 | |
| 36 | /** |
| 37 | Further Modifications |
| 38 | Copyright 2016 Marc Kupietz |
| 39 | |
| 40 | Licensed under the Apache License, Version 2.0 (the "License") |
| 41 | you may not use this file except in compliance with the License. |
| 42 | You may obtain a copy of the License at |
| 43 | |
| 44 | http://www.apache.org/licenses/LICENSE-2.0 |
| 45 | |
| 46 | Unless required by applicable law or agreed to in writing, software |
| 47 | distributed under the License is distributed on an "AS IS" BASIS, |
| 48 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 49 | See the License for the specific language governing permissions and |
| 50 | limitations under the License. |
| 51 | */ |
| 52 | import java.io.*; |
| 53 | import java.lang.StringBuffer; |
| 54 | import java.util.ArrayList; |
| 55 | import java.util.List; |
| 56 | import opennlp.tools.util.Span; |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 57 | |
| 58 | @Languages({ /*$"\""+target.language+"\" })$*/ /*-*/ ""}) |
| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 59 | %% |
| 60 | |
| 61 | /** |
| 62 | * Based on the Epic tokenizer (https://github.com/dlwh/epic) |
| 63 | * ... which is ... |
| 64 | * Based on Lucene's StandardTokenizerImpl, but heavily modified. |
| 65 | */ |
| 66 | %class DerekoDfaTokenizer_/*$target.language$*/ |
| 67 | %unicode |
| 68 | %public |
| 69 | %implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector |
| 70 | %type Span |
| 71 | %function getNextToken |
| 72 | %char |
| 73 | |
| 74 | %{ |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 75 | private static final CharSequence[] targetLanguages = { /*$"\""+target.language+"\"};$*/ /*-*/ "" }; |
| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 76 | private boolean xmlEcho = false; |
| 77 | private boolean normalize = false; |
| 78 | private boolean debug = false; |
| 79 | private boolean newSentence = true; |
| 80 | private long startOffset = 0; |
| 81 | private long previousFileEndOffset = -1; |
| 82 | private int tokenId = 0; |
| 83 | private boolean atEOT = false; |
| 84 | private boolean splitSentences = false; |
| 85 | private boolean echo = false; |
| 86 | private boolean printOffsets = false; |
| 87 | private boolean printTokens = false; |
| 88 | private PrintStream outputStream = System.out; |
| 89 | |
| 90 | @Override |
| 91 | public CharSequence[] getTargetLanguages() { |
| 92 | return targetLanguages; |
| 93 | } |
| 94 | |
/**
 * Creates a tokenizer with no input attached: {@code zzReader} is set to
 * {@code null} and has to be replaced (for example through
 * {@code setInputReader} or the {@code yyreset} call in {@code tokenizePos})
 * before any scanning can take place.
 */
public DerekoDfaTokenizer_/*$target.language$*/() {
    this.zzReader = null;
}
| 98 | |
| 99 | @Override |
| 100 | public void setInputReader(Reader inputReader) { |
| 101 | this.zzReader = inputReader; |
| 102 | } |
| 103 | |
| 104 | @Override |
| 105 | public void setSplitSentences(boolean splitSentences) { |
| 106 | this.splitSentences = splitSentences; |
| 107 | } |
| 108 | |
| 109 | @Override |
| 110 | public void setEcho(boolean echo) { |
| 111 | this.echo = echo; |
| 112 | } |
| 113 | |
| 114 | @Override |
| 115 | public void setPrintOffsets(boolean printOffsets) { |
| 116 | this.printOffsets = printOffsets; |
| 117 | } |
| 118 | |
| 119 | @Override |
| 120 | public void setPrintTokens(boolean printTokens) { |
| 121 | this.printTokens = printTokens; |
| 122 | } |
| 123 | |
| 124 | @Override |
| 125 | public void setOutputStream(PrintStream outputStream) { |
| 126 | this.outputStream = outputStream; |
| 127 | } |
| 128 | |
| 129 | @Override |
| 130 | public void setNormalize(boolean normalize) { |
| 131 | this.normalize = normalize; |
| 132 | } |
| 133 | |
| 134 | @Override |
| 135 | public void scan() throws IOException { |
| 136 | List<Span> list = new ArrayList<Span>(); |
| 137 | Span token; |
| 138 | while (!zzAtEOF) { |
| 139 | token = this.getNextToken(); |
| 140 | if (atEOT) { |
| 141 | if (echo) { |
| 142 | printTokenPositions(list, splitSentences); |
| 143 | list.clear(); |
| 144 | } |
| 145 | atEOT = false; |
| 146 | } |
| 147 | if (token != null) { |
| 148 | list.add(token); |
| 149 | } |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | @Override |
| 154 | public String[] tokenize(String s) { |
| 155 | Span[] spans; |
| 156 | int i; |
| 157 | String[] tokens; |
| 158 | |
| 159 | spans = tokenizePos(s); |
| 160 | tokens = new String[spans.length]; |
| 161 | for (i = 0; i < spans.length; i++) { |
| 162 | tokens[i] = spans[i].getType(); |
| 163 | } |
| 164 | return tokens; |
| 165 | } |
| 166 | |
| 167 | void printTokenPositions(List<Span> spanList, boolean sentencize) { |
| 168 | int sentenceStart = -1; |
| 169 | StringBuilder tokenStringBuffer = new StringBuilder(); |
| 170 | StringBuilder sentenceStringBuffer = new StringBuilder(); |
| 171 | for (int i = 0; i < spanList.size(); i++) { |
| 172 | Span s = spanList.get(i); |
| 173 | if (sentenceStart == -1) |
| 174 | sentenceStart = s.getStart(); |
| 175 | if (printOffsets) { |
| 176 | tokenStringBuffer.append(s.getStart()) |
| 177 | .append(" ") |
| 178 | .append(s.getEnd()); |
| 179 | if (i < spanList.size() - 1) |
| 180 | tokenStringBuffer.append(" "); |
| 181 | } |
| 182 | if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) { |
| 183 | sentenceStringBuffer.append(sentenceStart) |
| 184 | .append(" ") |
| 185 | .append(s.getEnd()); |
| 186 | sentenceStart = -1; |
| 187 | if (i < spanList.size() - 1) |
| 188 | sentenceStringBuffer.append(" "); |
| 189 | } |
| 190 | } |
| 191 | outputStream.println(tokenStringBuffer.toString()); |
| 192 | if (sentencize) |
| 193 | outputStream.println(sentenceStringBuffer.toString()); |
| 194 | } |
| 195 | |
| 196 | @Override |
| 197 | public Span[] tokenizePos(String s) { |
| 198 | Span token; |
| 199 | int i = 0; |
| 200 | List<Span> list = new ArrayList<Span>(); |
| 201 | tokenId = 0; |
| 202 | yyreset(new StringReader(s)); |
| 203 | try { |
| 204 | while (!this.zzAtEOF) { |
| 205 | token = this.getNextToken(); |
| 206 | if (atEOT) { |
| 207 | if (echo) { |
| 208 | printTokenPositions(list, splitSentences); |
| 209 | list.clear(); |
| 210 | } |
| 211 | atEOT = false; |
| 212 | } |
| 213 | if (token != null) { |
| 214 | list.add(token); |
| 215 | } |
| 216 | } |
| 217 | } catch (java.io.IOException e) { |
| 218 | System.err.println("IO error scanning " + s); |
| 219 | System.err.println(e); |
| 220 | } |
| 221 | return (list.toArray(new Span[list.size()])); |
| 222 | } |
| 223 | |
| 224 | @Override |
| 225 | public String[] sentDetect(String s) { |
| 226 | Span[] spans; |
| 227 | int i; |
| 228 | String[] sentences; |
| 229 | |
| 230 | spans = sentPosDetect(s); |
| 231 | sentences = new String[spans.length]; |
| 232 | for (i = 0; i < spans.length; i++) { |
| 233 | sentences[i] = spans[i].getType(); |
| 234 | } |
| 235 | return sentences; |
| 236 | } |
| 237 | |
| 238 | @Override |
| 239 | public Span[] sentPosDetect(String s) { |
| 240 | final Span tokens[] = tokenizePos(s); |
| 241 | ArrayList<Span> sentences = new ArrayList<Span>(); |
| 242 | int sentenceStart = 0; |
| 243 | if (tokens.length > 0) |
| 244 | tokens[0].getStart(); |
| 245 | for (int i = 0; i < tokens.length; i++) { |
| 246 | if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) { |
| 247 | sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd()))); |
| 248 | if (i < tokens.length - 1) { |
| 249 | sentenceStart = tokens[i + 1].getStart(); |
| 250 | } |
| 251 | } |
| 252 | } |
| 253 | return sentences.toArray(new Span[0]); |
| 254 | } |
| 255 | |
| 256 | public final long yychar() { |
| 257 | return yychar; |
| 258 | } |
| 259 | |
| 260 | final Span currentToken() { |
| 261 | return currentToken(yytext()); |
| 262 | } |
| 263 | |
| 264 | public boolean isSentenceBound(String s) { |
| 265 | return s.matches("^[.?!]+$"); |
| 266 | } |
| 267 | |
| 268 | final Span currentToken(String normalizedValue) { |
| 269 | String value; |
| 270 | long lengthDiff = 0; |
| 271 | previousFileEndOffset = -1; |
| 272 | |
| 273 | if (normalize) { |
| 274 | value = normalizedValue; |
| 275 | } else { |
| 276 | value = yytext(); |
| 277 | lengthDiff = value.length() - value.codePointCount(0, value.length()); |
| 278 | } |
| 279 | if (startOffset > yychar || startOffset < 0) { // how can this happen? |
| 280 | startOffset = 0; |
| 281 | } |
| 282 | long from = (yychar - startOffset), |
| 283 | to = (yychar - startOffset + yylength() - lengthDiff); |
| 284 | if (xmlEcho) { |
| 285 | outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value); |
| 286 | } else if (echo && printTokens) { |
| 287 | outputStream.println(value); |
| 288 | } |
| 289 | startOffset += lengthDiff; |
| 290 | tokenId++; |
| 291 | return new Span((int) from, (int) to, value); |
| 292 | } |
| 293 | |
| 294 | final void fileEnd() { |
| 295 | startOffset = yychar + yylength(); |
| 296 | // do not end a file multiple times because of additional EOT characters |
| 297 | if (startOffset == previousFileEndOffset) |
| 298 | return; |
| 299 | atEOT = true; |
| 300 | previousFileEndOffset = startOffset; |
| 301 | tokenId = 0; |
| 302 | } |
| 303 | |
| 304 | final Span xmlPassage() { |
| 305 | if (xmlEcho) { |
| 306 | String dings = yytext(); |
| 307 | if (dings.indexOf("<text") >= 0) { |
| 308 | startOffset = yychar + yylength(); |
| 309 | tokenId = 0; |
| 310 | } |
| 311 | outputStream.println(dings.replaceAll("[\n\r]+", "")); |
| 312 | return null; |
| 313 | } else { |
| 314 | return currentToken(); |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | final void zipArchive() { |
| 319 | String name; |
| 320 | String matched = yytext(); |
| 321 | int start = 10; |
| 322 | name = matched.substring(start, matched.length() - 1); |
| 323 | outputStream.println("<archive name=\"" + name + "\"/>"); |
| 324 | } |
| 325 | |
| 326 | final void zippedFile() { |
| 327 | String name; |
| 328 | String matched = yytext(); |
| 329 | int start = 13; |
| 330 | name = matched.substring(start, matched.length() - 3); |
| 331 | outputStream.println("<file name=\"" + name + "\"/>"); |
| 332 | } |
| 333 | %} |
| 334 | |
| 335 | THAI = [\u0E00-\u0E59] |
| 336 | |
| 337 | // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function) |
| 338 | ALPHANUM = ({LETTER}|{THAI}|[:digit:]|_)+ |
| 339 | |
| 340 | // case insensitivity is useful sometimes |
| 341 | // a = [aA] |
| 342 | // b = [bB] |
| 343 | c = [cC] |
| 344 | // d = [dD] |
| 345 | e = [eE] |
| 346 | // f = [fF] |
| 347 | g = [gG] |
| 348 | // h = [hH] |
| 349 | // i = [iI] |
| 350 | // j = [jJ] |
| 351 | // k = [kK] |
| 352 | l = [lL] |
| 353 | // m = [mM] |
| 354 | // n = [nN] |
| 355 | o = [oO] |
| 356 | // p = [pP] |
| 357 | // q = [qQ] |
| 358 | // r = [rR] |
| 359 | // s = [sS] |
| 360 | // t = [tT] |
| 361 | // u = [uU] |
| 362 | // v = [vV] |
| 363 | w = [wW] |
| 364 | // x = [xX] |
| 365 | // y = [yY] |
| 366 | // z = [zZ] |
| 367 | |
| 368 | ALPHA = ({LETTER}|¨)+ |
| 369 | |
| 370 | NEWLINE = [\n\r] |
| 371 | |
| 372 | // acronyms: U.S.A., I.B.M., etc. |
| 373 | // use a post-filter to remove dots |
| 374 | // ABBRNYM = {LETTER} "." ({LETTER} ".")+ |
| 375 | |
| 376 | // ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+ |
| 377 | |
| 378 | // hostname |
| 379 | HOST = ({ALPHANUM}|"-"){4,15} ((".") ({ALPHANUM}|"-"){2,16})+ |
| 380 | |
| 381 | EMDASH = (--|---|[\u2014\u2015\u2e3a\u2e3b\ufe58]+) |
| 382 | |
| 383 | DASH = ([\-\u2011\u2012\u2013\u2e1a\ufe63\uff0d]) |
| 384 | |
| 385 | SLASH = [⁄∕//] |
| 386 | |
| 387 | |
| 388 | // url |
| 389 | |
| 390 | // url spec lifted from Lucene |
| 391 | |
| 392 | // URL and E-mail syntax specifications: |
| 393 | // |
| 394 | // RFC-952: DOD INTERNET HOST TABLE SPECIFICATION |
| 395 | // RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION |
| 396 | // RFC-1123: Requirements for Internet Hosts - Application and Support |
| 397 | // RFC-1738: Uniform Resource Locators (URL) |
| 398 | // RFC-3986: Uniform Resource Identifier (URI): Generic Syntax |
| 399 | // RFC-5234: Augmented BNF for Syntax Specifications: ABNF |
| 400 | // RFC-5321: Simple Mail Transfer Protocol |
| 401 | // RFC-5322: Internet Message Format |
| 402 | |
| 403 | // http://code.ohloh.net/file?fid=wEylHt__FppVh8Ub_GTsx__CTK4&cid=d0f5PFFYrnk&s=UAX29URLEmailTokenizerImpl&filterChecked=true&fp=473333&mp,=1&ml=1&me=1&md=1&projSelected=true#L0 |
| 404 | |
| 405 | DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])? |
| 406 | DomainNameLoose = {DomainLabel} (("."|"[dot]") {DomainLabel})* |
| Marc Kupietz | e3282b0 | 2020-10-13 10:29:23 +0200 | [diff] [blame] | 407 | WWWDomainName = "www" (("."|"[dot]") {DomainLabel})* |
| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 408 | |
| 409 | IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5]) |
| 410 | IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} |
| 411 | IPv6Hex16Bit = [0-9A-Fa-f]{1,4} |
| 412 | IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit}) |
| 413 | IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits} |
| 414 | | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits} |
| 415 | | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits} |
| 416 | | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits} |
| 417 | | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits} |
| 418 | | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits} |
| 419 | | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits} |
| 420 | | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} |
| 421 | | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::" |
| 422 | |
| 423 | URIunreserved = [-._~A-Za-z0-9] |
| 424 | URIpercentEncoded = "%" [0-9A-Fa-f]{2} |
| 425 | URIsubDelims = [!$&\'()*+,;=] |
| 426 | URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})* |
| 427 | URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@" |
| 428 | URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])* |
| 429 | URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])* |
| 430 | URIport = ":" [0-9]{1,5} |
| 431 | URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} |
| 432 | URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} |
| 433 | |
| 434 | URIauthorityStrict = {URIhostStrict} {URIport}? |
| 435 | URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}? |
| 436 | |
| 437 | HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])* |
| 438 | HTTPpath = ("/" {HTTPsegment})* |
| 439 | HTTPscheme = [hH][tT][tT][pP][sS]? "://" |
| 440 | HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}? |
| 441 | // {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses |
| Marc Kupietz | e3282b0 | 2020-10-13 10:29:23 +0200 | [diff] [blame] | 442 | HTTPurlNoScheme = ( {URIauthorityStrict} | {WWWDomainName} ) {HTTPpath}? {URIquery}? {URIfragment}? |
| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 443 | HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme} |
| 444 | |
| 445 | FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])* |
| 446 | FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})* |
| 447 | FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD] |
| 448 | FTPscheme = [fF][tT][pP] "://" |
| 449 | FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}? |
| 450 | |
| 451 | FILEscheme = [fF][iI][lL][eE] "://" |
| 452 | FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}? |
| 453 | |
| 454 | URL = {HTTPurl} | {FTPurl} | {FILEurl} |
| 455 | |
| 456 | // EMAILquotedString without space |
| 457 | // EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u001F\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"] |
| 458 | // original version from lucene |
| 459 | // EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"] |
| 460 | EMAILatomText = [A-Za-z0-9!#$%&\'*+-/=?\^_`{|}~] |
| 461 | EMAILlabel = {EMAILatomText}+ |
| 462 | EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})* |
| 463 | EMAILdomainLiteralText = {ALPHANUM}|{DomainNameLoose} |
| 464 | //EMAILdomainLiteralText = ([\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F]|[\\][\u0000-\u007F])*{ALPHANUM} |
| 465 | // DFA minimization allows {IPv6Address} and {IPv4Address} to be included |
| 466 | // in the {EMAILbracketedHost} definition without incurring any size penalties, |
| 467 | // since {EMAILdomainLiteralText} recognizes all valid IP addresses. |
| 468 | // The IP address regexes are included in {EMAILbracketedHost} simply as a |
| 469 | // reminder that they are acceptable bracketed host forms. |
| 470 | EMAILbracketedHost = "["? ({EMAILdomainLiteralText}+ | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"? |
| 471 | EMAIL = {EMAILlocalPart} ("@"|"["at"]") ({EMAILbracketedHost}) |
| 472 | |
| 473 | // {ALPHANUM} "://" {HOST} (ALPHANUM|\/)* |
| 474 | // URL = ({ALPHA}({ALPHANUM}|-)+:(/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)([^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) |
| 475 | |
| 476 | |
| 477 | // floating point, serial, model numbers, ip addresses, etc. |
| 478 | // every other segment must have at least one digit |
| 479 | NUM = ({ALPHANUM} {P} {HAS_DIGIT} |
| 480 | | {HAS_DIGIT} {P} {ALPHANUM} |
| 481 | | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+ |
| 482 | | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ |
| 483 | | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ |
| 484 | | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+) |
| 485 | |
| 486 | |
| 487 | /* floating point literals */ |
| 488 | DoubleLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}? |
| 489 | |
| 490 | FLit1 = [0-9]+ \. [0-9]* |
| 491 | FLit2 = \. [0-9]+ |
| 492 | FLit3 = [0-9]+ |
| 493 | Exponent = [eE] [+-]? [0-9]+ |
| 494 | |
| 495 | // punctuation |
| 496 | P = ("_"|"-"|"."|",")|{SLASH} |
| 497 | |
| 498 | Q = [’\'`] |
| 499 | |
| 500 | PUNCT = ({P}|{Q}|[?!@#$%\^&*_:;\]\[\"»«\202\204\206\207\213\221\222\223\224\225\226\227\233]) |
| 501 | |
| 502 | // at least one digit |
| 503 | HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])* |
| 504 | |
| 505 | |
| 506 | LETTER = ([:letter:]|¨) |
| 507 | |
| 508 | ENGLISH_CLITIC = ({Q}(ll|d|ve|s|re|LL|D|VE|S|RE|m|M|n|N|[eE][mM])?|[nN]{Q}[Tt]) |
| 509 | |
| 510 | FRENCH_CLITIC = (-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là) |
| 511 | |
| 512 | IRISH_O = [Oo]{Q} |
| 513 | |
| 514 | FRENCH_INIT_CLITIC = ([dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\') |
| 515 | |
| 516 | CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC}) |
| 517 | |
| 518 | INIT_CLITIC = ({FRENCH_INIT_CLITIC}) |
| 519 | |
| 520 | POLISH_CONDITIONAL_CLITIC = (by) |
| 521 | |
| 522 | POLISH_CONDITIONAL_ENDING = (m|ś|śmy|ście)? |
| 523 | |
| 524 | POLISH_PAST_ENDING_1 = (ś|śmy|ście) |
| 525 | POLISH_PAST_ENDING_2 = ([mś]?|śmy|ście) |
| 526 | |
| 527 | WHITESPACE = \s |
| 528 | |
| 529 | ENDMARKER = (\n?\004\n?) |
| 530 | XML = <(\/text|\?xml|\?xml-model|\/?raw_text|\/?metadata) ?[^\004\n>]{0,100}> |
| 531 | |
| 532 | EMOTICON = ( [<>]?[BX;8:=][o\-\']?[DdPp()\/3>oO*]+|<\/?3+|ಠ_ಠ|\(-.-\)|\(T_T\)|\(♥_♥\)|\)\':|\)-:|\(-:|\)=|\)o:|\)x|:\'C|:\/|:<|:C|:[|=\(|=\)|=D|=P|>:|D\':|D:|\:|]:|x\(|\^\^|o.O|oO|\\{o}\/|\\m\/|:;\)\)|_\)\)|\*_\*|._.|:wink:|>_<|\*<:-\)|[:;]\)|[;;]" "\)) |
| 533 | |
| 534 | OMISSIONWORD = ({LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|{LETTER}+[\'`]{LETTER}+) |
| 535 | |
| 536 | EXTENSION = (html|htm|doc|docx|pdf|jpg|mp3|mp4|ogg|png|avi|txt|xls|xml|aac|DOC|DOCX|GIF|JPG|JPEG) |
| 537 | FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION} |
| 538 | |
| 539 | PLUSAMPERSAND = (&|'|>|&K|<|&M|"|&RQ|\+Ale|\+ALe|\+Anima|\+APD|\+co|\+Co|\+GF\+|\+Leif|\+Strang|\+Teamgeist|A&A|A&E|A&F|A&M|A&O|A&P|A&R|A&V|A&W|A\+\+|A\+\+\+|A\+E|A\+f|AAC\+|ABC&D|AC\+|AD&D|AE&E|AES\+F|AEW&C|AFM\+E|AGTL\+|Altenpflege\+ProPflege|Analyse\+kritik|anlagen\+verfahren|ANT\+|Anynet\+|Applus\+|Arch\+|ARCH\+|ART\+COM|AS&P|ASC\+T|ASEAN\+|Asis&t|AT&L|AT&S|AT&SF|AT&T|ATV\+|Auer\+Weber|Auer\+Weber\+Assoziierte|Axis&Allies|B&B|B&C|B&F|B&G|B&H|B&I|B&K|B&M|B&MTJR|B&NES|B&O|B&Q|B&R|B&T|B&V|B&W|B\+B|B\+R|B\+T|Baby&Co|Bayern\+|BB&T|BD\+|Beast\+|BEAST\+|Beck\+Schubert|Belle&Sebastian|BFE\+|BG\+BRG|BIBEL\+ORIENT|Bild\+Funk|Binder\+Co|Blohm\+Voss|Blood\+|Blut\+Eisen|BM&F|BM&FBovespa|Bolles\+Wilson|Bottega\+Ehrhardt|Brangs\+Heinrich|BRF\+|Briner\+Kern|BUCH&media|Burghardt\+Schmidt|bus\+bahn|C&A|C&C|C&D|C&L|C&M|C&O|C&P|C&R|C&S|C&T|C&W|C\+\+|C\+\+Builder|C\+c|C\+C|C\+M\+B|Ca\+\+|Cafe\+co|Cafe\+Co|Canal\+|Cantata\+\+|CB&I|CC&G|CCC&StL|CD&E|CD&V|CD\+DVD|CD\+G|CDIA\+|Celtic\+|Cendres\+M|Chage&Aska|Chage&Asuka|Channel\+smile|Charm\+\+|Chip&Chap|CI&CEQ|CI\+|Click&Buy|Cocl&Seff|Com&Com|COM\+|Comicplus\+|COR&FJA|CS&S|CT&T|ctc\+\+|Ctrl\+Alt\+Del|CTRL\+ALT\+DEL|Cube\+|Cyfra\+|CYFRA\+|D&A|D&AD|D&b|D&B|D&D|D&G|D&O|D&RGW|D&S|D&W|D\+Q|DAB\+|DACH\+HOLZ|DAML\+OIL|DBM&T|Dc\+\+|DC\+\+|DDDBM&T|Despe&Siga|DF&S|Digital\+|DirectConnect\+\+|Dissing\+Weitling|DL\+NT|DLSW\+|Do&Co|Dok&Deb|Dorma\+kaba|DP&L|Drm\+|DRM\+|DTS\+\+|DU&ICH|DVD\+R|Dvd\+rw|DVD\+RW|E&a|E&N|E&Y|E\+|E\+e|E\+h|E\+H|EAAC\+|Ebert\+Jacobi|ECO\+|EG&G|Eigen\+Art|Eins\+Alles|Electromobility\+|En\+|Endress\+Hauser|Erasmus\+|ES&T|ETV\+|EV\+|Eve&rave|Every\+|F&A|F&B|F&E|F&F|F&K|F\+F|F\+U|Familie&Co|FAT\+|Film\+|FILM\+SCHULE|Fischer\+Kr|Fix\+Foxi|FLUXUS\+|FMHL\+|Form\+zweck|fuhrpark\+management|G&B|G&D|G&IF|G&L|G&V|G\+\+|G\+H|G\+J|G\+tt|GC&CS|GDI\+|ge\+her|GG&L|Go\+|GO\+|Google\+|Goran\+Vujic|GRAF\+ZYX|Gruner\+Jahr|Gtk\+|GTK\+|GTL\+|GTX\+|Guide\+|H&BC|H&H|H&K|H&M|h&m|H&N|H&R|H&S|H\+BEDV|H\+H|H\+N|H\+S|Haase&band|Hahn\+Kolb|HAHN\+K
OLB|Hasta\+Coda|Haubitz&Zoche|Haubitz\+Zoche|HBCI\+|HD\+|Health&Care|Heim\+Handwerk|Heute\+|HFS\+|hne\+Nagel|HSPA\+|HT&L|HTML\+TIME|Huber\+Suhner|Hunger&Seide|I&A|I&K|I&Q|I&u|I&U|I\+D|I\+R|Ich\+Ich|ID&T|Idee\+spiel|Ihp\+|II\+|IIc\+|III\+|IK\+|In&phone|In&Phone|info\+|Interkama\+|IT&Production|J&B|J&D|J&J|J&M|J&P|J&S|J&T|J\+\+|J\+S|Jazz\+Az|Jenna\+Ron|Johnson&Johnson|JU\+TE|Jugend\+Sport|Jugend\+Technik|Jump&Run|K&k|K&K|K&L|K&M|K&N|K&R|K&S|K&U|K\+\+|K\+A|K\+H|K\+K|k\+Metal|K\+R|K\+S|K\+W|Kai\+Sven|Kaiser\+Kraft|KAISER\+KRAFT|Kino&Co|KINO&CO|Kino\+|Kirche\+Leben|Klassik&JazzMagazin|Kurz&F|L&B|L&C|L&M|L&N|L&P|L&S|L\+R|L\+T|Lancia\+Voyager|Landis\+Gyr|LB&SCR|Leader\+|LEADER\+|Lederer\+Ragnarsd|Leicht&Cross|Lenord\+Bauer|Leslie\+Lohman|Libsigc\+\+|Life&Style|LIFE\+|Light\+Building|Lippmann\+Rau|LISA\+|Lords&Knights|LT&SR|Lussi\+Halter|M&A|M&B|M&D|M&G|M&i|M&I|M&M|M&Ms|M&N|M&S|M&T|M\+a|M\+C|M\+M|M\+O|M\+s|M\+S|M\+W|Maildir\+\+|Mann\+Hummel|Markt\+Technik|Means\+\+|Melodie&Rhythmus|Metadata\+|Miles&more|Milk\+|Mining\+geo|Mix&Genest|mmerly\+Frey|Monet\+|Motion\+picture|MPP\+|MS&D|MS&L|MStP&SSM|Music&Voice|N&CRR|N&ER|n&gut|N&R|N&W|N\+M|Na\+|NADHH\+|Nah&gut|Natur\+kosmos|natur\+mensch|Nc\+|NI&Co|nig\+Neurath|Nike\+iPod|Nintendogs\+Cats|Notepad\+\+|NYW&B|O&K|O&L|O&M|Ola\+|OMNeT\+\+|ORFsport\+|Ost\+Front|P&A|P&C|P&E|P&G|P&I|P&ID|P&L|P&M|P&O|P&P|P&R|P&T|P&TLuxembourg|P&W|P\+M|P\+R|P\+S|PAL\+|Pan&Scan|Papier&Stift|Park&Charge|Park&Rail|Park&Ride|Park&Suites|PB&J|Peek&Cloppenburg|Pen&Paper|Pepperl&Fuchs|Pepperl\+Fuchs|Peste&Sida|PG&E|Pirelli&C|Pittel\+Brausewetter|Plug&play|Plus\+|POB&A|Pol&is|POL&IS|POLO\+|Poses\+\+|PP&P|Pratt&Whitney|Princess\+|Prius\+|Procter&Gamble|Prozac\+|PS&P|Pur\+|Q&A|Q&Q|Q\+Q|Quanta\+|R&A|R&B|R&D|R&ER|R&F|R&G|R&I|R&M|R&Q|R&R|R&S|R\+C|R\+S|R\+V|Rail&Fly|REDD\+|Reise&Touristik|Relax\+ng|RF&P|Richter\+Frenzel|Rio\+|Rohde&Schwarz|RT\+|Run&Dine|S&B|S&D|S&G|S&H|S&K|S&M|S&P|S&T|S&w|S&W|S\+D|S\+G|S\+T|S\+U|Sales&Services|Sam&Max|Schedule\+|Schiff&Hafen|Schlund\+P
artner|Schmelzle\+Partner|Schmidt\+Clemens|science\+business|Science\+Business|sd&m|Sd&m|Sdr\+|Serve&Volley|Severin\+K|SiMPLE\+\+|SMS&park|SMW\+|Soap&Skin|Solo\+|Spar\+Kreditbank|Spar\+Leihkasse|speed\+|Speed\+|Spoga\+gafa|SPORT\+|Sport\+Technik|SS\+|St&H|St&Z|Standard&Poor|Standard&Poors|Station&Service|Steib\+Steib|Stil&Stadt|Strategy&|Strg\+Alt\+Entf|StrongDC\+\+|Such&Find|Sumol\+Compal|SVS&E|SVWZ\+|SW&S|Swift\+|SXGA\+|T&D|T&L|T&N|T&T|T\+A|T\+T|TACACS\+|Tanz&FolkFest|Taylor&Francis|text\+kritik|TEXT\+KRITIK|textil\+mode|Timidity\+\+|TMRM\+|Toni&Guy|toon\+|Touch&Travel|Track\+|Trends\+More|TT&C|TT&R|ttir\+Oei|TV\+Synchron|U&D|U\+\+|U\+F|Ultimate\+\+|Urban&Fischer|URW\+\+|USC&GS|UTC\+|V&A|V&R|V&S|V&W|Valentien\+Valentien|VC\+\+|VF\+|Vieweg\+Teubner|VISEO\+|Vision\+Technik|VisualDSP\+\+|VIVA\+|VL&D|Vorschau\+R|Vorster&Gr|VT&MA|W&B|W&F|W&G|W&H|W&p|W&V|W&W|WB\+|Wein\+Markt|Wienstroth&Hammans|Winkler\+D|Wirtschaft\+Markt|WP&YR|WS&P|WSXGA\+|WXGA\+|X\+\+|X\+Y|Xbase\+\+|XHTML\+SMIL|Y&R|Y&T|Yin&Yang|Yotsuba&|Young&Queer|Z\+W|Zeidler&Wimmel|Zinc&Germanium) |
| 540 | |
| 541 | TWITTER_HANDLE = @{ALPHA}{ALPHANUM}? |
| 542 | TWITTER_HASHTAG = #{ALPHANUM} |
| 543 | |
| 544 | // blocks of question marks and exclamation marks are one token |
| 545 | LONG_END_PUNCT = [?!][?!1]+ |
| 546 | |
| 547 | WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an) |
| 548 | |
| 549 | // pragmas used for anonymization etc. |
| 550 | PRAGMA = \[_[A-Z\-]+_\] |
| 551 | |
| Marc Kupietz | 74141b3 | 2020-10-01 23:23:18 +0200 | [diff] [blame] | 552 | %include language-specific_/*$target.language$*/.jflex-macro |
| Marc Kupietz | 67eed1c | 2020-09-28 21:37:16 +0200 | [diff] [blame] | 553 | |
| 554 | %s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE |
| 555 | |
| 556 | %% |
| 557 | {ENDMARKER} { fileEnd(); return null; } |
| 558 | |
| 559 | |
| 560 | // dates and fractions |
| 561 | |
| 562 | <POLISH_CONDITIONAL_MODE>{POLISH_CONDITIONAL_CLITIC} / {POLISH_CONDITIONAL_ENDING} { yybegin(YYINITIAL); return currentToken(); } |
| 563 | <POLISH_CONDITIONAL_MODE>[^b]. { throw new RuntimeException("..." + currentToken());} |
| 564 | {EMDASH} {return currentToken();} |
| 565 | {URL} { return currentToken(); } |
| 566 | |
| 567 | // special words |
| 568 | {c}an / not {return currentToken();} |
| 569 | {l}em / me {return currentToken();} |
| 570 | {g}on / na {return currentToken();} |
| 571 | {g}im / me {return currentToken();} |
| 572 | {w}an / na {return currentToken();} |
| 573 | {g}ot / ta {return currentToken();} |
| 574 | |
| 575 | {LETTER}\. {return currentToken();} |
| 576 | {LETTER}{2,12} / \.[:uppercase:] {return currentToken();} |
| 577 | {PLUSAMPERSAND} {return currentToken();} |
| 578 | {SEABBR}\. {return currentToken();} |
| 579 | {PRAGMA} {return currentToken();} |
| 580 | {FNAME} {return currentToken();} |
| 581 | |
| 582 | // contractions and other clitics |
| 583 | {INIT_CLITIC}{CLITIC} {return currentToken();} |
| 584 | |
| 585 | // polish clitics |
| 586 | {ALPHANUM}{ALPHANUM}+[lł][aeoiy]? / {POLISH_CONDITIONAL_CLITIC}{POLISH_CONDITIONAL_ENDING} {yybegin(POLISH_CONDITIONAL_MODE); return currentToken(); } |
| 587 | {ALPHANUM}{ALPHANUM}+[lł][aeoiy]? / {POLISH_PAST_ENDING_1} {return currentToken(); } |
| 588 | // need to not let lam through.... |
| 589 | {ALPHANUM}{ALPHANUM}+[ł][aeoiy]? / {POLISH_PAST_ENDING_2} {return currentToken(); } |
| 590 | |
| 591 | // times |
| 592 | [01]?[0-9]{WHITESPACE}?:[0-6][0-9] { return currentToken(yytext().replaceAll("\\s+","")); } |
| 593 | |
| 594 | // ordinals |
| 595 | [0-9]{1,3}\. {return currentToken();} |
| 596 | |
| 597 | // quotes |
| 598 | <YYINITIAL>\"/{WHITESPACE}*{ALPHANUM} { yybegin(OPEN_QUOTE); return currentToken("``"); } |
| 599 | <YYINITIAL>\'/{WHITESPACE}*{ALPHANUM} { yybegin(OPEN_QUOTE); return currentToken("`"); } |
| 600 | ‘ { yybegin(OPEN_QUOTE); return currentToken("`"); } |
| 601 | ’ { yybegin(YYINITIAL); return currentToken("'"); } |
| 602 | <OPEN_QUOTE>\" { yybegin(YYINITIAL); return currentToken("''"); } |
| 603 | “ { yybegin(YYINITIAL); return currentToken("``"); } |
| 604 | ” { yybegin(YYINITIAL); return currentToken("''"); } |
| 605 | \"/.*{ALPHANUM}+ { yybegin(OPEN_QUOTE); return currentToken("``"); } |
| 606 | \" { yybegin(YYINITIAL); return currentToken("''"); } |
| 607 | |
| 608 | ":!:" { return currentToken();} |
| 609 | "->" { return currentToken();} |
| 610 | "<-" { return currentToken();} |
| 611 | \*\*+ { return currentToken();} |
| 612 | \[\[+ { return currentToken();} |
| 613 | \]\]+ { return currentToken();} |
| 614 | |
| 615 | // normal stuff |
| 616 | // dashed words |
| 617 | {WORD}({DASH}{NEWLINE}*{WORD})+ { return currentToken();} |
| 618 | {WORD}{DASH} { return currentToken();} |
| 619 | {TWITTER_HANDLE} { return currentToken(); } |
| 620 | {TWITTER_HASHTAG} { return currentToken(); } |
| 621 | {WORD} { return currentToken();} |
| 622 | {OMISSIONWORD} { return currentToken();} |
| 623 | //{ABBRNYM} { return currentToken(); } |
| 624 | {EMAIL} { return currentToken(); } |
| 625 | {HOST} { return currentToken(); } |
| 626 | {NUM} { return currentToken(); } |
| 627 | //{ACRONYM_DEP} { return currentToken(); } |
| 628 | {NEWLINE} { } |
| 629 | {WHITESPACE} { } |
| 630 | |
| 631 | // KorAP-XML specifics |
| 632 | ^{WHITESPACE}*{XML}{NEWLINE}* {xmlPassage(); } |
| 633 | \<\/text>{NEWLINE}* {xmlPassage(); } |
| 634 | ^"Archive: "[^ \n]+".zip"\n {zipArchive(); } // handle unzip -c |
| 635 | ^" "+inflating: [^\n]{1,255}" "\n {zippedFile(); } |
| 636 | |
| 637 | // \( {return currentToken("-LRB-");} |
| 638 | // \) {return currentToken("-RRB-");} |
| 639 | //\{ {return currentToken("-LCB-");} |
| 640 | //\} {return currentToken("-RCB-");} |
| 641 | //\[ {return currentToken("-LSB-");} |
| 642 | //\] {return currentToken("-RSB-");} |
| 643 | ([.][.]+|…+) {return currentToken("...");} |
| 644 | {LONG_END_PUNCT} { return currentToken();} |
| 645 | {PUNCT} { return currentToken();} |
| 646 | {EMOTICON} { return currentToken();} |
| 647 | {DASH}{DoubleLiteral} { return currentToken();} |
| 648 | <<EOF>> { fileEnd(); return null;} |
| 649 | . { return currentToken();} |
| 650 | |
| 651 | |