blob: 378d45fe50759fe506928af2a9da3f92af059926 [file] [log] [blame]
lexer grammar PoliqarpPlusLexer;
@header {package de.ids_mannheim.korap.query.parse.poliqarpplus;}
options
{
language=Java;
}
/*
-- author: Joachim Bingel
-- date: 14-06-27
Poliqarp Query Language lexer
Language documentations:
- Adam Przepiórkowski (2004):
"The IPI PAN Corpus -- preliminary version", pp. 44
Further information:
- http://korpus.pl/index.php?page=poliqarp
Statistical extension:
- http://nlp.ipipan.waw.pl/Poliqarp/
Based on CQP
- http://cwb.sourceforge.net/files/CQP_Tutorial/
Todo: Some special characters aren't supported in REGEX and strings.
Todo: tags can be splittet at ':' in case the fieldname is 'tag'
*/
POSITION_OP : ('contains' | 'startswith' | 'startsWith' | 'endswith' | 'endsWith' | 'matches' | 'overlaps') ;
RELATION_OP : ('dominates' | 'dependency' | 'relatesTo');
MATCH_OP : ('focus' | 'shrink' | 'split'); // submatch and shrink are deprecated!
SUBMATCH_OP : 'submatch';
WITHIN : 'within';
META : 'meta';
/*
Regular expression
/x allows submatches like /^.*?RE.*?$/
/X forces full matches
/i means case insensitivity
/I forces case sensitivity
*/
FLAG_xi : '/' ( ('x'|'X') ('i'|'I')? );
FLAG_ix : '/' ( ('i'|'I') ('x'|'X')? );
/** Simple strings and Simple queries */
WS : [ \t] -> skip ;
fragment FOCC : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
fragment NO_RE : ~[ \t\/];
fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
| '(' | ')' | '|' | '"' | ',' | ':' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
NUMBER : [0-9]+;
NL : [\r\n] -> skip;
WORD : ALPHABET+;
/* Complex queries */
LPAREN : '[';
RPAREN : ']';
LRPAREN : '(';
RRPAREN : ')';
NEG : '!';
QMARK : '?';
CONJ : '&';
DISJ : '|';
COMMA : ',';
LT : '<';
GT : '>';
LBRACE : '{';
RBRACE : '}';
SLASH : '/';
COLON : ':';
TILDE : '~';
EQ : '=';
CARET : '^';
STAR : '*';
PLUS : '+';
EMPTYREL : '@';
/* Regular expressions and Regex queries */
fragment RE_char : ~('*' | '?' | '+' | '{' | '}' | '[' | ']'
| '(' | ')' | '|' | '"' | ':' | '\'' | '\\');
fragment RE_alter : ((RE_char | ('(' RE_expr ')') | RE_chgroup) '|' RE_expr )+;
fragment RE_chgroup : '[' RE_char+ ']';
fragment RE_quant : (RE_star | RE_plus | RE_occ) QMARK?;
fragment RE_opt : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '?';
fragment RE_star : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '*';
fragment RE_plus : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '+';
fragment RE_occ : (RE_char | RE_chgroup | ( '(' RE_expr ')')) FOCC;
fragment RE_group : '(' RE_expr ')';
fragment RE_expr : ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)+;
fragment RE_dquote : '"' (RE_expr | '\'' | ':' )* '"';
fragment RE_squote : '\'' (RE_expr | '\"' | ':' )* '\'';
REGEX : ( RE_dquote | RE_squote );