// blob: 571cf12355d7a0caee2ce7b58cd1414e6c714c6d [file] [log] [blame]
parser grammar PoliqarpPlusParser;
@header {package de.ids_mannheim.korap.query.parse.poliqarpplus;}
options
{
language=Java;
tokenVocab=PoliqarpPlusLexer;
}
/*
-- author: Joachim Bingel
-- date: 14-06-27
Poliqarp Query Language parser
Language documentation:
- Adam Przepiórkowski (2004):
"The IPI PAN Corpus -- preliminary version", pp. 44
Further information:
- http://korpus.pl/index.php?page=poliqarp
Statistical extension:
- http://nlp.ipipan.waw.pl/Poliqarp/
Based on CQP
- http://cwb.sourceforge.net/files/CQP_Tutorial/
Todo: Some special characters aren't supported in REGEX and strings.
Todo: Tags can be split at ':' in case the field name is 'tag'.
*/
/** Optional flag that may follow a key: exactly one of the two flag tokens. */
flag
    : FLAG_xi
    | FLAG_ix
    ;
/** Boolean operator: either the CONJ or the DISJ token. */
boolOp
    : CONJ
    | DISJ
    ;
/** Wraps a single REGEX token so that regular expressions get their own tree node. */
regex : REGEX ;
/** Key of a term, span, relation spec or bare token: a WORD, a regex or a NUMBER. */
key
    : WORD
    | regex
    | NUMBER
    ;
/** Foundry name: a single WORD, used as the optional prefix in front of SLASH and a layer. */
foundry : WORD ;
/** Layer name: a single WORD, optionally preceded by "foundry SLASH" in the rules that use it. */
layer : WORD ;
/** Value that may follow "COLON" after the key of a term: a WORD, a NUMBER or a regex. */
value
    : WORD
    | NUMBER
    | regex
    ;
/* Fields */
/**
 * A single term: any number of NEG tokens, an optional "foundry SLASH" prefix,
 * a layer, an operator and a key, optionally followed by "COLON value" and a flag.
 * A term may also be wrapped in LRPAREN/RRPAREN any number of times.
 */
term
    : NEG*
      (foundry SLASH)?
      layer termOp key
      (COLON value)?
      flag?
    | LRPAREN term RRPAREN
    ;
/** Operator between layer and key: one or two EQ tokens, or one or two TILDE tokens, each optionally preceded by NEG. */
termOp
    : NEG? EQ? EQ
    | NEG? TILDE? TILDE
    ;
/** Lower bound inside a range: a single NUMBER. */
min : NUMBER ;
/** Upper bound inside a range: a single NUMBER (also used when a range holds only one number). */
max : NUMBER ;
/** First numeric argument of a submatch: a single NUMBER. */
startpos : NUMBER ;
/** Optional second numeric argument of a submatch (after COMMA): a single NUMBER. */
length : NUMBER ;
/**
 * Numeric repetition range enclosed in LBRACE/RBRACE.
 * Four shapes are accepted; note that a lone number is parsed as 'max', not 'min'.
 */
range
    : LBRACE
      ( min COMMA max   // both bounds
      | max             // single number
      | COMMA max       // upper bound only
      | min COMMA       // lower bound only
      )
      RBRACE
    ;
/** Token brackets with nothing in between: LPAREN immediately followed by RPAREN. */
emptyToken : LPAREN RPAREN ;
/**
 * Boolean combination of terms. The left operand is a term or a parenthesised
 * termGroup; the right operand may additionally be an unparenthesised termGroup,
 * which makes operator chains nest to the right. A termGroup may itself be
 * wrapped in LRPAREN/RRPAREN.
 */
termGroup
    : ( term
      | LRPAREN termGroup RRPAREN
      )
      boolOp
      ( term
      | LRPAREN termGroup RRPAREN
      | termGroup
      )
    | LRPAREN termGroup RRPAREN
    ;
/** Repetition suffix: either a kleene operator or a braced numeric range. */
repetition
    : kleene
    | range
    ;
/** Single-token repetition operator: QMARK, STAR or PLUS. */
kleene
    : QMARK
    | STAR
    | PLUS
    ;
/**
 * A token expression, optionally preceded by any number of NEG tokens:
 * a term or termGroup enclosed in LPAREN/RPAREN, or a bare key with an optional flag.
 */
token
    : NEG*
      ( LPAREN term RPAREN
      | LPAREN termGroup RPAREN
      | key flag?
      )
    ;
/**
 * Span element delimited by LT and GT.
 *
 * Consists of an optional "(foundry SLASH)? layer termOp" prefix, a mandatory
 * key, any number of NEG tokens and an optional attribute part, which is a
 * term or termGroup, either bare or wrapped in LRPAREN/RRPAREN.
 *
 * The attribute part used to be written as "(A? | B?)", where both
 * alternatives could match the empty string, so a span without attributes was
 * ambiguous. It is now a single optional block whose alternatives keep their
 * original order; the accepted input and the resulting children are unchanged.
 */
span
    : LT
      ((foundry SLASH)? layer termOp)?
      key
      NEG*
      ( LRPAREN term RRPAREN
      | LRPAREN termGroup RRPAREN
      | term
      | termGroup
      )?
      GT
    ;
/** Position operator applied to exactly two operands, each a segment or a sequence, separated by COMMA. */
position
    : POSITION_OP
      LRPAREN
      (segment | sequence) COMMA (segment | sequence)
      RRPAREN
    ;
/**
 * Relation operator over two operands (segment or sequence) separated by COMMA.
 * The operands may be preceded by an optional specification terminated by COLON:
 * an EMPTYREL token or a relSpec, and/or a repetition, each of them optional.
 */
relation
    : RELATION_OP
      LRPAREN
      ( (EMPTYREL | relSpec)? repetition? COLON )?
      (segment | sequence) COMMA (segment | sequence)
      RRPAREN
    ;
/** Relation specification: a layer with optional "foundry SLASH" prefix, optionally followed by "termOp key". */
relSpec
    : (foundry SLASH)?
      layer
      (termOp key)?
    ;
/** Submatch operator: a startpos, an optional "COMMA length", then COLON and the operand (segment or sequence). */
submatch
    : SUBMATCH_OP
      LRPAREN
      startpos (COMMA length)? COLON
      (segment | sequence)
      RRPAREN
    ;
/** Match operator: both the spanclass_id and the operand (segment or sequence) inside the parentheses are optional. */
matching
    : MATCH_OP
      LRPAREN
      spanclass_id?
      (segment | sequence)?
      RRPAREN
    ;
/**
 * Alignment expression built around CARET: an optional leading segment followed
 * by either one or more "CARET segment" pairs or a single CARET on its own.
 */
alignment
    : segment?
      ( (CARET segment)+
      | CARET
      )
    ;
/** Two or more operands (segment, sequence or group) joined by DISJ. */
disjunction
    : (segment | sequence | group)
      ( DISJ (segment | sequence | group) )+
    ;
/** A disjunction or sequence wrapped in LRPAREN/RRPAREN. */
group
    : LRPAREN
      ( disjunction
      | sequence
      )
      RRPAREN
    ;
/** Class identifier: a NUMBER, optionally combined with further NUMBERs via boolOp, terminated by COLON. */
spanclass_id
    : NUMBER
      (boolOp NUMBER)*
      COLON
    ;
/** One or more empty tokens, each with an optional repetition suffix. */
emptyTokenSequence
    : ( emptyToken repetition? )+
    ;
/** Class defined around empty tokens: an emptyTokenSequence in LBRACE/RBRACE with an optional spanclass_id. */
emptyTokenSequenceClass
    : LBRACE
      spanclass_id?
      emptyTokenSequence
      RBRACE
    ;
/** Distance between two segments of a sequence, expressed as an emptyTokenSequence. */
distance : emptyTokenSequence ;
/** Class around a segment or sequence: LBRACE, optional spanclass_id, the operand, RBRACE. */
spanclass
    : LBRACE
      spanclass_id?
      (segment | sequence)
      RBRACE
    ;
/**
 * Basic building block of a query: one of the operator expressions, a token,
 * span, group or spanclass, or another segment in LRPAREN/RRPAREN.
 * Any of these may carry an optional repetition suffix.
 */
segment
    : ( position
      | token
      | span
      | group
      | spanclass
      | matching
      | submatch
      | relation
      | LRPAREN segment RRPAREN
      )
      repetition?
    ;
/**
 * A sequence combines segments with empty-token sequences (plain or wrapped in
 * a class), distances and alignment expressions. Several alternatives overlap,
 * so the order of the alternatives is significant; do not reorder them.
 */
sequence
: segment* (emptyTokenSequence|emptyTokenSequenceClass) // Ordering important: this alternative must precede every alternative that uses 'distance', so that a number in braces (e.g. {2}) is read as a repetition rather than as a number token inside a spanclass.
| (emptyTokenSequence|emptyTokenSequenceClass) (segment+ | sequence) (emptyTokenSequence|emptyTokenSequenceClass)?
| alignment segment* // Takes precedence over the next alternative so that preceding segments become part of 'alignment'.
| segment+ alignment segment*
| segment segment+
| segment (distance|emptyTokenSequenceClass) segment
| segment (distance|emptyTokenSequenceClass)? sequence
// Disabled alternative, kept for reference:
//| alignment (segment|sequence) alignment?
;
/** Entry point for the linguistic part of a request: a segment, a sequence or a disjunction. */
query
    : segment
    | sequence
    | disjunction
    ;
/** Optional clause after the query: the WITHIN token followed by a single WORD. */
within : WITHIN WORD ;
/**
 * === META section ===
 * Metadata filters attached to a request: the META token followed by a metaTermGroup.
 */
meta
    : META metaTermGroup
    ;
/** Body of the META section: one or more terms and/or termGroups in any order. */
metaTermGroup
    : ( term
      | termGroup
      )+
    ;
/**
 * Entry point for all requests. The linguistic query is mandatory; the
 * 'within' clause and the metadata filter are optional. The whole input must
 * be consumed up to EOF.
 */
request
    : query
      within?
      meta?
      EOF
    ;