blob: d1e68c2899e2170191f73e629d39a2bf9ad0e955 [file] [log] [blame]
Marc Kupietz67eed1c2020-09-28 21:37:16 +02001package de.ids_mannheim.korap.tokenizer;
2/**
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19/**
20 Modifications
21 Copyright 2014 David Hall
22
23 Licensed under the Apache License, Version 2.0 (the "License")
24 you may not use this file except in compliance with the License.
25 You may obtain a copy of the License at
26
27 http://www.apache.org/licenses/LICENSE-2.0
28
29 Unless required by applicable law or agreed to in writing, software
30 distributed under the License is distributed on an "AS IS" BASIS,
31 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32 See the License for the specific language governing permissions and
33 limitations under the License.
34*/
35
36/**
37 Further Modifications
38 Copyright 2016 Marc Kupietz
39
40 Licensed under the Apache License, Version 2.0 (the "License")
41 you may not use this file except in compliance with the License.
42 You may obtain a copy of the License at
43
44 http://www.apache.org/licenses/LICENSE-2.0
45
46 Unless required by applicable law or agreed to in writing, software
47 distributed under the License is distributed on an "AS IS" BASIS,
48 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49 See the License for the specific language governing permissions and
50 limitations under the License.
51*/
52import java.io.*;
53import java.lang.StringBuffer;
54import java.util.ArrayList;
55import java.util.List;
56import opennlp.tools.util.Span;
Marc Kupietz74141b32020-10-01 23:23:18 +020057
58@Languages({ /*$"\""+target.language+"\" })$*/ /*-*/ ""})
Marc Kupietz67eed1c2020-09-28 21:37:16 +020059%%
60
61/**
62* Based on the Epic tokenizer (https://github.com/dlwh/epic)
63* ... which is ...
64* Based on Lucene's StandardTokenizerImpl, but heavily modified.
65*/
66%class DerekoDfaTokenizer_/*$target.language$*/
67%unicode
68%public
69%implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
70%type Span
71%function getNextToken
72%char
73
74%{
Marc Kupietz74141b32020-10-01 23:23:18 +020075 private static final CharSequence[] targetLanguages = { /*$"\""+target.language+"\"};$*/ /*-*/ "" };
Marc Kupietz67eed1c2020-09-28 21:37:16 +020076 private boolean xmlEcho = false;
77 private boolean normalize = false;
78 private boolean debug = false;
79 private boolean newSentence = true;
80 private long startOffset = 0;
81 private long previousFileEndOffset = -1;
82 private int tokenId = 0;
83 private boolean atEOT = false;
84 private boolean splitSentences = false;
85 private boolean echo = false;
86 private boolean printOffsets = false;
87 private boolean printTokens = false;
88 private PrintStream outputStream = System.out;
89
90 @Override
91 public CharSequence[] getTargetLanguages() {
92 return targetLanguages;
93 }
94
/**
 * Creates a tokenizer that has no input yet; supply one with
 * {@link #setInputReader(Reader)} or use the String-based tokenize methods.
 */
public DerekoDfaTokenizer_/*$target.language$*/() {
    // No reader is attached at construction time.
    this.zzReader = null;
}
98
99 @Override
100 public void setInputReader(Reader inputReader) {
101 this.zzReader = inputReader;
102 }
103
104 @Override
105 public void setSplitSentences(boolean splitSentences) {
106 this.splitSentences = splitSentences;
107 }
108
109 @Override
110 public void setEcho(boolean echo) {
111 this.echo = echo;
112 }
113
114 @Override
115 public void setPrintOffsets(boolean printOffsets) {
116 this.printOffsets = printOffsets;
117 }
118
119 @Override
120 public void setPrintTokens(boolean printTokens) {
121 this.printTokens = printTokens;
122 }
123
124 @Override
125 public void setOutputStream(PrintStream outputStream) {
126 this.outputStream = outputStream;
127 }
128
129 @Override
130 public void setNormalize(boolean normalize) {
131 this.normalize = normalize;
132 }
133
134 @Override
135 public void scan() throws IOException {
136 List<Span> list = new ArrayList<Span>();
137 Span token;
138 while (!zzAtEOF) {
139 token = this.getNextToken();
140 if (atEOT) {
141 if (echo) {
142 printTokenPositions(list, splitSentences);
143 list.clear();
144 }
145 atEOT = false;
146 }
147 if (token != null) {
148 list.add(token);
149 }
150 }
151 }
152
153 @Override
154 public String[] tokenize(String s) {
155 Span[] spans;
156 int i;
157 String[] tokens;
158
159 spans = tokenizePos(s);
160 tokens = new String[spans.length];
161 for (i = 0; i < spans.length; i++) {
162 tokens[i] = spans[i].getType();
163 }
164 return tokens;
165 }
166
167 void printTokenPositions(List<Span> spanList, boolean sentencize) {
168 int sentenceStart = -1;
169 StringBuilder tokenStringBuffer = new StringBuilder();
170 StringBuilder sentenceStringBuffer = new StringBuilder();
171 for (int i = 0; i < spanList.size(); i++) {
172 Span s = spanList.get(i);
173 if (sentenceStart == -1)
174 sentenceStart = s.getStart();
175 if (printOffsets) {
176 tokenStringBuffer.append(s.getStart())
177 .append(" ")
178 .append(s.getEnd());
179 if (i < spanList.size() - 1)
180 tokenStringBuffer.append(" ");
181 }
182 if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) {
183 sentenceStringBuffer.append(sentenceStart)
184 .append(" ")
185 .append(s.getEnd());
186 sentenceStart = -1;
187 if (i < spanList.size() - 1)
188 sentenceStringBuffer.append(" ");
189 }
190 }
191 outputStream.println(tokenStringBuffer.toString());
192 if (sentencize)
193 outputStream.println(sentenceStringBuffer.toString());
194 }
195
196 @Override
197 public Span[] tokenizePos(String s) {
198 Span token;
199 int i = 0;
200 List<Span> list = new ArrayList<Span>();
201 tokenId = 0;
202 yyreset(new StringReader(s));
203 try {
204 while (!this.zzAtEOF) {
205 token = this.getNextToken();
206 if (atEOT) {
207 if (echo) {
208 printTokenPositions(list, splitSentences);
209 list.clear();
210 }
211 atEOT = false;
212 }
213 if (token != null) {
214 list.add(token);
215 }
216 }
217 } catch (java.io.IOException e) {
218 System.err.println("IO error scanning " + s);
219 System.err.println(e);
220 }
221 return (list.toArray(new Span[list.size()]));
222 }
223
224 @Override
225 public String[] sentDetect(String s) {
226 Span[] spans;
227 int i;
228 String[] sentences;
229
230 spans = sentPosDetect(s);
231 sentences = new String[spans.length];
232 for (i = 0; i < spans.length; i++) {
233 sentences[i] = spans[i].getType();
234 }
235 return sentences;
236 }
237
238 @Override
239 public Span[] sentPosDetect(String s) {
240 final Span tokens[] = tokenizePos(s);
241 ArrayList<Span> sentences = new ArrayList<Span>();
242 int sentenceStart = 0;
243 if (tokens.length > 0)
244 tokens[0].getStart();
245 for (int i = 0; i < tokens.length; i++) {
246 if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
247 sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
248 if (i < tokens.length - 1) {
249 sentenceStart = tokens[i + 1].getStart();
250 }
251 }
252 }
253 return sentences.toArray(new Span[0]);
254 }
255
256 public final long yychar() {
257 return yychar;
258 }
259
260 final Span currentToken() {
261 return currentToken(yytext());
262 }
263
264 public boolean isSentenceBound(String s) {
265 return s.matches("^[.?!]+$");
266 }
267
268 final Span currentToken(String normalizedValue) {
269 String value;
270 long lengthDiff = 0;
271 previousFileEndOffset = -1;
272
273 if (normalize) {
274 value = normalizedValue;
275 } else {
276 value = yytext();
277 lengthDiff = value.length() - value.codePointCount(0, value.length());
278 }
279 if (startOffset > yychar || startOffset < 0) { // how can this happen?
280 startOffset = 0;
281 }
282 long from = (yychar - startOffset),
283 to = (yychar - startOffset + yylength() - lengthDiff);
284 if (xmlEcho) {
285 outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
286 } else if (echo && printTokens) {
287 outputStream.println(value);
288 }
289 startOffset += lengthDiff;
290 tokenId++;
291 return new Span((int) from, (int) to, value);
292 }
293
294 final void fileEnd() {
295 startOffset = yychar + yylength();
296 // do not end a file multiple times because of additional EOT characters
297 if (startOffset == previousFileEndOffset)
298 return;
299 atEOT = true;
300 previousFileEndOffset = startOffset;
301 tokenId = 0;
302 }
303
304 final Span xmlPassage() {
305 if (xmlEcho) {
306 String dings = yytext();
307 if (dings.indexOf("<text") >= 0) {
308 startOffset = yychar + yylength();
309 tokenId = 0;
310 }
311 outputStream.println(dings.replaceAll("[\n\r]+", ""));
312 return null;
313 } else {
314 return currentToken();
315 }
316 }
317
318 final void zipArchive() {
319 String name;
320 String matched = yytext();
321 int start = 10;
322 name = matched.substring(start, matched.length() - 1);
323 outputStream.println("<archive name=\"" + name + "\"/>");
324 }
325
326 final void zippedFile() {
327 String name;
328 String matched = yytext();
329 int start = 13;
330 name = matched.substring(start, matched.length() - 3);
331 outputStream.println("<file name=\"" + name + "\"/>");
332 }
333%}
334
335THAI = [\u0E00-\u0E59]
336
337// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
338ALPHANUM = ({LETTER}|{THAI}|[:digit:]|_)+
339
340// case insensitivity is useful sometimes
341// a = [aA]
342// b = [bB]
343c = [cC]
344// d = [dD]
345e = [eE]
346// f = [fF]
347g = [gG]
348// h = [hH]
349// i = [iI]
350// j = [jJ]
351// k = [kK]
352l = [lL]
353// m = [mM]
354// n = [nN]
355o = [oO]
356// p = [pP]
357// q = [qQ]
358// r = [rR]
359// s = [sS]
360// t = [tT]
361// u = [uU]
362// v = [vV]
363w = [wW]
364// x = [xX]
365// y = [yY]
366// z = [zZ]
367
368ALPHA = ({LETTER}|¨)+
369
370NEWLINE = [\n\r]
371
372// acronyms: U.S.A., I.B.M., etc.
373// use a post-filter to remove dots
374// ABBRNYM = {LETTER} "." ({LETTER} ".")+
375
376// ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
377
378// hostname
379HOST = ({ALPHANUM}|"-"){4,15} ((".") ({ALPHANUM}|"-"){2,16})+
380
381EMDASH = (--|---|[\u2014\u2015\u2e3a\u2e3b\ufe58]+)
382
383DASH = ([\-\u2011\u2012\u2013\u2e1a\ufe63\uff0d])
384
385SLASH = [⁄∕//]
386
387
388// url
389
390// url spec lifted from Lucene
391
392// URL and E-mail syntax specifications:
393//
394// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
395// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
396// RFC-1123: Requirements for Internet Hosts - Application and Support
397// RFC-1738: Uniform Resource Locators (URL)
398// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
399// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
400// RFC-5321: Simple Mail Transfer Protocol
401// RFC-5322: Internet Message Format
402
403// http://code.ohloh.net/file?fid=wEylHt__FppVh8Ub_GTsx__CTK4&cid=d0f5PFFYrnk&s=UAX29URLEmailTokenizerImpl&filterChecked=true&fp=473333&mp,=1&ml=1&me=1&md=1&projSelected=true#L0
404
405DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
406DomainNameLoose = {DomainLabel} (("."|"[dot]") {DomainLabel})*
Marc Kupietze3282b02020-10-13 10:29:23 +0200407WWWDomainName = "www" (("."|"[dot]") {DomainLabel})*
Marc Kupietz67eed1c2020-09-28 21:37:16 +0200408
409IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
410IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
411IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
412IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
413IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
414 | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
415 | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
416 | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
417 | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
418 | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
419 | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
420 | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
421 | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
422
423URIunreserved = [-._~A-Za-z0-9]
424URIpercentEncoded = "%" [0-9A-Fa-f]{2}
425URIsubDelims = [!$&\'()*+,;=]
426URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
427URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
428URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
429URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
430URIport = ":" [0-9]{1,5}
431URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address}
432URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
433
434URIauthorityStrict = {URIhostStrict} {URIport}?
435URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
436
437HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
438HTTPpath = ("/" {HTTPsegment})*
439HTTPscheme = [hH][tT][tT][pP][sS]? "://"
440HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
441// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
Marc Kupietze3282b02020-10-13 10:29:23 +0200442HTTPurlNoScheme = ( {URIauthorityStrict} | {WWWDomainName} ) {HTTPpath}? {URIquery}? {URIfragment}?
Marc Kupietz67eed1c2020-09-28 21:37:16 +0200443HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
444
445FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
446FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
447FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
448FTPscheme = [fF][tT][pP] "://"
449FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
450
451FILEscheme = [fF][iI][lL][eE] "://"
452FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
453
454URL = {HTTPurl} | {FTPurl} | {FILEurl}
455
456// EMAILquotedString without space
457// EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u001F\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
458// original version from lucene
459// EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
460EMAILatomText = [A-Za-z0-9!#$%&\'*+-/=?\^_`{|}~]
461EMAILlabel = {EMAILatomText}+
462EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
463EMAILdomainLiteralText = {ALPHANUM}|{DomainNameLoose}
464//EMAILdomainLiteralText = ([\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F]|[\\][\u0000-\u007F])*{ALPHANUM}
465// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
466// in the {EMAILbracketedHost} definition without incurring any size penalties,
467// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
468// The IP address regexes are included in {EMAILbracketedHost} simply as a
469// reminder that they are acceptable bracketed host forms.
470EMAILbracketedHost = "["? ({EMAILdomainLiteralText}+ | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"?
471EMAIL = {EMAILlocalPart} ("@"|"["at"]") ({EMAILbracketedHost})
472
473 // {ALPHANUM} "://" {HOST} (ALPHANUM|\/)*
474// URL = ({ALPHA}({ALPHANUM}|-)+:(/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)([^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))
475
476
477// floating point, serial, model numbers, ip addresses, etc.
478// every other segment must have at least one digit
479NUM = ({ALPHANUM} {P} {HAS_DIGIT}
480 | {HAS_DIGIT} {P} {ALPHANUM}
481 | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
482 | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
483 | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
484 | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
485
486
487/* floating point literals */
488DoubleLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}?
489
490FLit1 = [0-9]+ \. [0-9]*
491FLit2 = \. [0-9]+
492FLit3 = [0-9]+
493Exponent = [eE] [+-]? [0-9]+
494
495// punctuation
496P = ("_"|"-"|"."|",")|{SLASH}
497
498Q = [’\'`]
499
500PUNCT = ({P}|{Q}|[?!@#$%\^&*_:;\]\[\"»«\202\204\206\207\213\221\222\223\224\225\226\227\233])
501
502// at least one digit
503HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
504
505
506LETTER = ([:letter:]|¨)
507
508ENGLISH_CLITIC = ({Q}(ll|d|ve|s|re|LL|D|VE|S|RE|m|M|n|N|[eE][mM])?|[nN]{Q}[Tt])
509
510FRENCH_CLITIC = (-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là)
511
512IRISH_O = [Oo]{Q}
513
514FRENCH_INIT_CLITIC = ([dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\')
515
516CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
517
518INIT_CLITIC = ({FRENCH_INIT_CLITIC})
519
520POLISH_CONDITIONAL_CLITIC = (by)
521
522POLISH_CONDITIONAL_ENDING = (m|ś|śmy|ście)?
523
524POLISH_PAST_ENDING_1 = (ś|śmy|ście)
525POLISH_PAST_ENDING_2 = ([mś]?|śmy|ście)
526
527WHITESPACE = \s
528
529ENDMARKER = (\n?\004\n?)
530XML = <(\/text|\?xml|\?xml-model|\/?raw_text|\/?metadata) ?[^\004\n>]{0,100}>
531
532EMOTICON = ( [<>]?[BX;8:=][o\-\']?[DdPp()\/3>oO*]+|<\/?3+|ಠ_ಠ|\(-.-\)|\(T_T\)|\(♥_♥\)|\)\':|\)-:|\(-:|\)=|\)o:|\)x|:\'C|:\/|:<|:C|:[|=\(|=\)|=D|=P|>:|D\':|D:|\:|]:|x\(|\^\^|o.O|oO|\\{o}\/|\\m\/|:;\)\)|_\)\)|\*_\*|._.|:wink:|>_<|\*<:-\)|[:;]\)|[;;]" "\))
533
534OMISSIONWORD = ({LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|{LETTER}+[\'`]{LETTER}+)
535
536EXTENSION = (html|htm|doc|docx|pdf|jpg|mp3|mp4|ogg|png|avi|txt|xls|xml|aac|DOC|DOCX|GIF|JPG|JPEG)
537FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION}
538
539PLUSAMPERSAND = (&amp;|&apos;|&gt;|&K|&lt;|&M|&quot;|&RQ|\+Ale|\+ALe|\+Anima|\+APD|\+co|\+Co|\+GF\+|\+Leif|\+Strang|\+Teamgeist|A&A|A&E|A&F|A&M|A&O|A&P|A&R|A&V|A&W|A\+\+|A\+\+\+|A\+E|A\+f|AAC\+|ABC&D|AC\+|AD&D|AE&E|AES\+F|AEW&C|AFM\+E|AGTL\+|Altenpflege\+ProPflege|Analyse\+kritik|anlagen\+verfahren|ANT\+|Anynet\+|Applus\+|Arch\+|ARCH\+|ART\+COM|AS&P|ASC\+T|ASEAN\+|Asis&t|AT&L|AT&S|AT&SF|AT&T|ATV\+|Auer\+Weber|Auer\+Weber\+Assoziierte|Axis&Allies|B&B|B&C|B&F|B&G|B&H|B&I|B&K|B&M|B&MTJR|B&NES|B&O|B&Q|B&R|B&T|B&V|B&W|B\+B|B\+R|B\+T|Baby&Co|Bayern\+|BB&T|BD\+|Beast\+|BEAST\+|Beck\+Schubert|Belle&Sebastian|BFE\+|BG\+BRG|BIBEL\+ORIENT|Bild\+Funk|Binder\+Co|Blohm\+Voss|Blood\+|Blut\+Eisen|BM&F|BM&FBovespa|Bolles\+Wilson|Bottega\+Ehrhardt|Brangs\+Heinrich|BRF\+|Briner\+Kern|BUCH&media|Burghardt\+Schmidt|bus\+bahn|C&A|C&C|C&D|C&L|C&M|C&O|C&P|C&R|C&S|C&T|C&W|C\+\+|C\+\+Builder|C\+c|C\+C|C\+M\+B|Ca\+\+|Cafe\+co|Cafe\+Co|Canal\+|Cantata\+\+|CB&I|CC&G|CCC&StL|CD&E|CD&V|CD\+DVD|CD\+G|CDIA\+|Celtic\+|Cendres\+M|Chage&Aska|Chage&Asuka|Channel\+smile|Charm\+\+|Chip&Chap|CI&CEQ|CI\+|Click&Buy|Cocl&Seff|Com&Com|COM\+|Comicplus\+|COR&FJA|CS&S|CT&T|ctc\+\+|Ctrl\+Alt\+Del|CTRL\+ALT\+DEL|Cube\+|Cyfra\+|CYFRA\+|D&A|D&AD|D&b|D&B|D&D|D&G|D&O|D&RGW|D&S|D&W|D\+Q|DAB\+|DACH\+HOLZ|DAML\+OIL|DBM&T|Dc\+\+|DC\+\+|DDDBM&T|Despe&Siga|DF&S|Digital\+|DirectConnect\+\+|Dissing\+Weitling|DL\+NT|DLSW\+|Do&Co|Dok&Deb|Dorma\+kaba|DP&L|Drm\+|DRM\+|DTS\+\+|DU&ICH|DVD\+R|Dvd\+rw|DVD\+RW|E&a|E&N|E&Y|E\+|E\+e|E\+h|E\+H|EAAC\+|Ebert\+Jacobi|ECO\+|EG&G|Eigen\+Art|Eins\+Alles|Electromobility\+|En\+|Endress\+Hauser|Erasmus\+|ES&T|ETV\+|EV\+|Eve&rave|Every\+|F&A|F&B|F&E|F&F|F&K|F\+F|F\+U|Familie&Co|FAT\+|Film\+|FILM\+SCHULE|Fischer\+Kr|Fix\+Foxi|FLUXUS\+|FMHL\+|Form\+zweck|fuhrpark\+management|G&B|G&D|G&IF|G&L|G&V|G\+\+|G\+H|G\+J|G\+tt|GC&CS|GDI\+|ge\+her|GG&L|Go\+|GO\+|Google\+|Goran\+Vujic|GRAF\+ZYX|Gruner\+Jahr|Gtk\+|GTK\+|GTL\+|GTX\+|Guide\+|H&BC|H&H|H&K|H&M|h&m|H&N|H&R|H&S|H\+BEDV|H\+H|H\+N|H\+S|Haase&band|Hah
n\+Kolb|HAHN\+KOLB|Hasta\+Coda|Haubitz&Zoche|Haubitz\+Zoche|HBCI\+|HD\+|Health&Care|Heim\+Handwerk|Heute\+|HFS\+|hne\+Nagel|HSPA\+|HT&L|HTML\+TIME|Huber\+Suhner|Hunger&Seide|I&A|I&K|I&Q|I&u|I&U|I\+D|I\+R|Ich\+Ich|ID&T|Idee\+spiel|Ihp\+|II\+|IIc\+|III\+|IK\+|In&phone|In&Phone|info\+|Interkama\+|IT&Production|J&B|J&D|J&J|J&M|J&P|J&S|J&T|J\+\+|J\+S|Jazz\+Az|Jenna\+Ron|Johnson&Johnson|JU\+TE|Jugend\+Sport|Jugend\+Technik|Jump&Run|K&k|K&K|K&L|K&M|K&N|K&R|K&S|K&U|K\+\+|K\+A|K\+H|K\+K|k\+Metal|K\+R|K\+S|K\+W|Kai\+Sven|Kaiser\+Kraft|KAISER\+KRAFT|Kino&Co|KINO&CO|Kino\+|Kirche\+Leben|Klassik&JazzMagazin|Kurz&F|L&B|L&C|L&M|L&N|L&P|L&S|L\+R|L\+T|Lancia\+Voyager|Landis\+Gyr|LB&SCR|Leader\+|LEADER\+|Lederer\+Ragnarsd|Leicht&Cross|Lenord\+Bauer|Leslie\+Lohman|Libsigc\+\+|Life&Style|LIFE\+|Light\+Building|Lippmann\+Rau|LISA\+|Lords&Knights|LT&SR|Lussi\+Halter|M&A|M&B|M&D|M&G|M&i|M&I|M&M|M&Ms|M&N|M&S|M&T|M\+a|M\+C|M\+M|M\+O|M\+s|M\+S|M\+W|Maildir\+\+|Mann\+Hummel|Markt\+Technik|Means\+\+|Melodie&Rhythmus|Metadata\+|Miles&more|Milk\+|Mining\+geo|Mix&Genest|mmerly\+Frey|Monet\+|Motion\+picture|MPP\+|MS&D|MS&L|MStP&SSM|Music&Voice|N&CRR|N&ER|n&gut|N&R|N&W|N\+M|Na\+|NADHH\+|Nah&gut|Natur\+kosmos|natur\+mensch|Nc\+|NI&Co|nig\+Neurath|Nike\+iPod|Nintendogs\+Cats|Notepad\+\+|NYW&B|O&K|O&L|O&M|Ola\+|OMNeT\+\+|ORFsport\+|Ost\+Front|P&A|P&C|P&E|P&G|P&I|P&ID|P&L|P&M|P&O|P&P|P&R|P&T|P&TLuxembourg|P&W|P\+M|P\+R|P\+S|PAL\+|Pan&Scan|Papier&Stift|Park&Charge|Park&Rail|Park&Ride|Park&Suites|PB&J|Peek&Cloppenburg|Pen&Paper|Pepperl&Fuchs|Pepperl\+Fuchs|Peste&Sida|PG&E|Pirelli&C|Pittel\+Brausewetter|Plug&play|Plus\+|POB&A|Pol&is|POL&IS|POLO\+|Poses\+\+|PP&P|Pratt&Whitney|Princess\+|Prius\+|Procter&Gamble|Prozac\+|PS&P|Pur\+|Q&A|Q&Q|Q\+Q|Quanta\+|R&A|R&B|R&D|R&ER|R&F|R&G|R&I|R&M|R&Q|R&R|R&S|R\+C|R\+S|R\+V|Rail&Fly|REDD\+|Reise&Touristik|Relax\+ng|RF&P|Richter\+Frenzel|Rio\+|Rohde&Schwarz|RT\+|Run&Dine|S&B|S&D|S&G|S&H|S&K|S&M|S&P|S&T|S&w|S&W|S\+D|S\+G|S\+T|S\+U|Sales&Services|Sam&Max|Schedule\+|Schiff&H
afen|Schlund\+Partner|Schmelzle\+Partner|Schmidt\+Clemens|science\+business|Science\+Business|sd&m|Sd&m|Sdr\+|Serve&Volley|Severin\+K|SiMPLE\+\+|SMS&park|SMW\+|Soap&Skin|Solo\+|Spar\+Kreditbank|Spar\+Leihkasse|speed\+|Speed\+|Spoga\+gafa|SPORT\+|Sport\+Technik|SS\+|St&H|St&Z|Standard&Poor|Standard&Poors|Station&Service|Steib\+Steib|Stil&Stadt|Strategy&|Strg\+Alt\+Entf|StrongDC\+\+|Such&Find|Sumol\+Compal|SVS&E|SVWZ\+|SW&S|Swift\+|SXGA\+|T&D|T&L|T&N|T&T|T\+A|T\+T|TACACS\+|Tanz&FolkFest|Taylor&Francis|text\+kritik|TEXT\+KRITIK|textil\+mode|Timidity\+\+|TMRM\+|Toni&Guy|toon\+|Touch&Travel|Track\+|Trends\+More|TT&C|TT&R|ttir\+Oei|TV\+Synchron|U&D|U\+\+|U\+F|Ultimate\+\+|Urban&Fischer|URW\+\+|USC&GS|UTC\+|V&A|V&R|V&S|V&W|Valentien\+Valentien|VC\+\+|VF\+|Vieweg\+Teubner|VISEO\+|Vision\+Technik|VisualDSP\+\+|VIVA\+|VL&D|Vorschau\+R|Vorster&Gr|VT&MA|W&B|W&F|W&G|W&H|W&p|W&V|W&W|WB\+|Wein\+Markt|Wienstroth&Hammans|Winkler\+D|Wirtschaft\+Markt|WP&YR|WS&P|WSXGA\+|WXGA\+|X\+\+|X\+Y|Xbase\+\+|XHTML\+SMIL|Y&R|Y&T|Yin&Yang|Yotsuba&|Young&Queer|Z\+W|Zeidler&Wimmel|Zinc&Germanium)
540
541TWITTER_HANDLE = @{ALPHA}{ALPHANUM}?
542TWITTER_HASHTAG = #{ALPHANUM}
543
544// blocks of question marks and exclamation marks are one token
545LONG_END_PUNCT = [?!][?!1]+
546
547WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an)
548
549// pragmas used for anonymization etc.
550PRAGMA = \[_[A-Z\-]+_\]
551
Marc Kupietz74141b32020-10-01 23:23:18 +0200552%include language-specific_/*$target.language$*/.jflex-macro
Marc Kupietz67eed1c2020-09-28 21:37:16 +0200553
554%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE
555
556%%
557{ENDMARKER} { fileEnd(); return null; }
558
559
560// dates and fractions
561
562<POLISH_CONDITIONAL_MODE>{POLISH_CONDITIONAL_CLITIC} / {POLISH_CONDITIONAL_ENDING} { yybegin(YYINITIAL); return currentToken(); }
563<POLISH_CONDITIONAL_MODE>[^b]. { throw new RuntimeException("..." + currentToken());}
564{EMDASH} {return currentToken();}
565{URL} { return currentToken(); }
566
567// special words
568{c}an / not {return currentToken();}
569{l}em / me {return currentToken();}
570{g}on / na {return currentToken();}
571{g}im / me {return currentToken();}
572{w}an / na {return currentToken();}
573{g}ot / ta {return currentToken();}
574
575{LETTER}\. {return currentToken();}
576{LETTER}{2,12} / \.[:uppercase:] {return currentToken();}
577{PLUSAMPERSAND} {return currentToken();}
578{SEABBR}\. {return currentToken();}
579{PRAGMA} {return currentToken();}
580{FNAME} {return currentToken();}
581
582// contractions and other clitics
583{INIT_CLITIC}{CLITIC} {return currentToken();}
584
585// polish clitics
586{ALPHANUM}{ALPHANUM}+[lł][aeoiy]? / {POLISH_CONDITIONAL_CLITIC}{POLISH_CONDITIONAL_ENDING} {yybegin(POLISH_CONDITIONAL_MODE); return currentToken(); }
587{ALPHANUM}{ALPHANUM}+[lł][aeoiy]? / {POLISH_PAST_ENDING_1} {return currentToken(); }
588// need to not let lam through....
589{ALPHANUM}{ALPHANUM}+[ł][aeoiy]? / {POLISH_PAST_ENDING_2} {return currentToken(); }
590
// times: one token for H:MM / HH:MM, with an optional blank before the colon
// that is dropped from the normalized value. The hour part covers 0-24; the
// former pattern [01]?[0-9] stopped at 19, so "19:15" was one token while
// "20:15" was split into three. The minute pattern is left as permissive as before.
([01]?[0-9]|2[0-4]){WHITESPACE}?:[0-6][0-9] { return currentToken(yytext().replaceAll("\\s+","")); }
593
594// ordinals
595[0-9]{1,3}\. {return currentToken();}
596
597// quotes
598<YYINITIAL>\"/{WHITESPACE}*{ALPHANUM} { yybegin(OPEN_QUOTE); return currentToken("``"); }
599<YYINITIAL>\'/{WHITESPACE}*{ALPHANUM} { yybegin(OPEN_QUOTE); return currentToken("`"); }
600‘ { yybegin(OPEN_QUOTE); return currentToken("`"); }
601’ { yybegin(YYINITIAL); return currentToken("'"); }
602<OPEN_QUOTE>\" { yybegin(YYINITIAL); return currentToken("''"); }
603“ { yybegin(YYINITIAL); return currentToken("``"); }
604” { yybegin(YYINITIAL); return currentToken("''"); }
605\"/.*{ALPHANUM}+ { yybegin(OPEN_QUOTE); return currentToken("``"); }
606\" { yybegin(YYINITIAL); return currentToken("''"); }
607
608":!:" { return currentToken();}
609"->" { return currentToken();}
610"<-" { return currentToken();}
611\*\*+ { return currentToken();}
612\[\[+ { return currentToken();}
613\]\]+ { return currentToken();}
614
615// normal stuff
616// dashed words
617{WORD}({DASH}{NEWLINE}*{WORD})+ { return currentToken();}
618{WORD}{DASH} { return currentToken();}
619{TWITTER_HANDLE} { return currentToken(); }
620{TWITTER_HASHTAG} { return currentToken(); }
621{WORD} { return currentToken();}
622{OMISSIONWORD} { return currentToken();}
623//{ABBRNYM} { return currentToken(); }
624{EMAIL} { return currentToken(); }
625{HOST} { return currentToken(); }
626{NUM} { return currentToken(); }
627//{ACRONYM_DEP} { return currentToken(); }
628{NEWLINE} { }
629{WHITESPACE} { }
630
// KorAP-XML specifics
632^{WHITESPACE}*{XML}{NEWLINE}* {xmlPassage(); }
633\<\/text>{NEWLINE}* {xmlPassage(); }
634^"Archive: "[^ \n]+".zip"\n {zipArchive(); } // handle unzip -c
635^" "+inflating: [^\n]{1,255}" "\n {zippedFile(); }
636
637// \( {return currentToken("-LRB-");}
638// \) {return currentToken("-RRB-");}
639//\{ {return currentToken("-LCB-");}
640//\} {return currentToken("-RCB-");}
641//\[ {return currentToken("-LSB-");}
642//\] {return currentToken("-RSB-");}
643([.][.]+|…+) {return currentToken("...");}
644{LONG_END_PUNCT} { return currentToken();}
645{PUNCT} { return currentToken();}
646{EMOTICON} { return currentToken();}
647{DASH}{DoubleLiteral} { return currentToken();}
648<<EOF>> { fileEnd(); return null;}
649. { return currentToken();}
650
651