blob: 9e9c595ef531c1a1e5ebe11eaf556b6d3f7c8c97 [file] [log] [blame]
// Parser half of the PoliqarpPlus grammar; token types come from the
// separately defined PoliqarpPlusLexer (see tokenVocab below).
parser grammar PoliqarpPlusParser;

// Emitted verbatim at the top of the generated parser source.
@header {package de.ids_mannheim.korap.query.parse.poliqarpplus;}

options
{
    language=Java;
    tokenVocab=PoliqarpPlusLexer;
}
10/*
11 -- author: Joachim Bingel
12 -- date: 14-06-27
13
14 Poliqarp Query Language parser
15
16 Language documentations:
17 - Adam Przepiórkowski (2004):
18 "The IPI PAN Corpus -- preliminary version", pp. 44
19
20 Further information:
21 - http://korpus.pl/index.php?page=poliqarp
22 Statistical extension:
23 - http://nlp.ipipan.waw.pl/Poliqarp/
24 Based on CQP
25 - http://cwb.sourceforge.net/files/CQP_Tutorial/
26
Todo: Some special characters aren't supported in REGEX and strings.
Todo: tags can be split at ':' in case the field name is 'tag'
29
30*/
31
32
// A flag is exactly one of the two flag tokens supplied by the lexer.
flag
    : FLAG_xi
    | FLAG_ix
    ;
36
// Boolean connective: either the CONJ or the DISJ token.
boolOp
    : CONJ
    | DISJ
    ;
40
// Wraps the REGEX token in its own rule so it appears as a parse-tree node.
regex
    : REGEX
    ;
44
// Key of a term or token: a WORD, a regex, or a NUMBER (tried in that order).
key
    : WORD
    | regex
    | NUMBER
    ;
50
// Foundry name: a single WORD (used as the optional prefix before SLASH).
foundry
    : WORD
    ;
54
// Layer name: a single WORD.
layer
    : WORD
    ;
58
// Value following the COLON in a term: a WORD, a NUMBER, or a regex
// (tried in that order).
value
    : WORD
    | NUMBER
    | regex
    ;
64
65/* Fields */
// A single field constraint:
//   any number of NEG tokens, an optional "foundry SLASH" prefix, then
//   layer, operator and key, optionally followed by "COLON value" and a flag.
term
    : NEG*
      (foundry SLASH)?
      layer termOp key
      (COLON value)?
      flag?
    ;
69
// Operator between layer and key: one or two EQ tokens, or one or two TILDE
// tokens, each form optionally preceded by a single NEG.
termOp
    : ( NEG? EQ? EQ
      | NEG? TILDE? TILDE
      )
    ;
73
// Lower bound of a range: a NUMBER.
min
    : NUMBER
    ;
77
// Upper bound of a range: a NUMBER.
max
    : NUMBER
    ;
81
// First numeric argument of 'submatch': a NUMBER.
startpos
    : NUMBER
    ;
85
// Optional second numeric argument of 'submatch': a NUMBER.
length
    : NUMBER
    ;
89
// Repetition range between LBRACE and RBRACE. Four shapes are accepted:
//   min COMMA max   both bounds
//   max             a lone number (parsed as 'max')
//   COMMA max       upper bound only
//   min COMMA       lower bound only
range
    : LBRACE
      ( min COMMA max
      | max
      | COMMA max
      | min COMMA
      )
      RBRACE
    ;
99
// A token without any content: LPAREN immediately followed by RPAREN.
emptyToken
    : LPAREN RPAREN
    ;
103
// Two operands joined by a boolOp.
//   left  operand: a term, or a termGroup wrapped in LRPAREN/RRPAREN
//   right operand: same as the left, or additionally a bare termGroup
//                  (which makes chains of operators nest to the right)
termGroup
    : ( term
      | LRPAREN termGroup RRPAREN
      )
      boolOp
      ( term
      | LRPAREN termGroup RRPAREN
      | termGroup
      )
    ;
107
// Repetition suffix: either a kleene operator or an explicit range.
repetition
    : kleene
    | range
    ;
112
// Single-token repetition operator: QMARK, STAR or PLUS.
kleene
    : QMARK
    | STAR
    | PLUS
    ;
118
// A token expression, optionally preceded by any number of NEG tokens:
//   - a term between LPAREN and RPAREN,
//   - a termGroup between LPAREN and RPAREN, or
//   - a bare key with an optional flag.
token
    : NEG*
      ( LPAREN term RPAREN
      | LPAREN termGroup RPAREN
      | key flag?
      )
    ;
126
// A span between LT and GT. Both alternatives share the same head:
//   an optional "(foundry SLASH)? layer termOp" prefix, a key, and NEG*.
// They differ only in how the optional trailing term/termGroup is written:
//   alternative 1: wrapped in LRPAREN ... RRPAREN
//   alternative 2: written bare
span
    : LT
      ((foundry SLASH)? layer termOp)?
      key
      NEG*
      (LRPAREN term RRPAREN|LRPAREN termGroup RRPAREN)?
      GT
    | LT
      ((foundry SLASH)? layer termOp)?
      key
      NEG*
      (term|termGroup)?
      GT
    ;
131
// Position operator applied to two operands:
//   POSITION_OP ( operand , operand )
// where each operand is a segment or a sequence.
position
    : POSITION_OP
      LRPAREN
      (segment|sequence)
      COMMA
      (segment|sequence)
      RRPAREN
    ;
135
// Relation operator applied to two operands (each a segment or a sequence).
// An optional specification may precede the operands and is terminated by
// COLON: it consists of an optional EMPTYREL or relSpec followed by an
// optional repetition.
relation
    : RELATION_OP
      LRPAREN
      ((EMPTYREL|relSpec)? repetition? COLON)?
      (segment|sequence)
      COMMA
      (segment|sequence)
      RRPAREN
    ;
139
// Relation specification: an optional "foundry SLASH" prefix, a layer, and
// an optional "termOp key" constraint.
relSpec
    : (foundry SLASH)?
      layer
      (termOp key)?
    ;
143
// Submatch operator:
//   SUBMATCH_OP ( startpos [, length] : operand )
// where the operand is a segment or a sequence.
submatch
    : SUBMATCH_OP
      LRPAREN
      startpos (COMMA length)?
      COLON
      (segment|sequence)
      RRPAREN
    ;
147
// Match operator: MATCH_OP followed by a parenthesised, optional
// spanclass_id and an optional operand (segment or sequence). Both parts
// may be absent, so an empty pair of parentheses is accepted.
matching
    : MATCH_OP
      LRPAREN
      spanclass_id?
      (segment|sequence)?
      RRPAREN
    ;
151
/**
 * Segments joined or delimited by the CARET token.
 *
 * At least one CARET is required. Accepted shapes:
 *   segment? (CARET segment)+ CARET?   one or more "CARET segment" pairs,
 *                                      optionally with a trailing CARET
 *   segment? CARET                     a single CARET, optionally after a
 *                                      segment
 *
 * The earlier form, `segment? (CARET segment)* CARET?`, consisted only of
 * optional parts and therefore also matched a lone segment and the empty
 * string. Since 'sequence' lists `alignment segment*` before
 * `segment segment+`, that allowed plain caret-less (even empty) input to be
 * parsed as an alignment. Every input that actually contains a CARET is
 * still accepted exactly as before.
 */
alignment
    : segment?
      ( (CARET segment)+ CARET?
      | CARET
      )
    ;
155
// Two or more operands separated by DISJ; each operand is a segment, a
// sequence, or a group.
disjunction
    : (segment|sequence|group)
      ( DISJ (segment|sequence|group) )+
    ;
159
// A disjunction or a sequence enclosed in LRPAREN ... RRPAREN.
group
    : LRPAREN
      ( disjunction
      | sequence
      )
      RRPAREN
    ;
163
// Class identifier prefix: a NUMBER, optionally extended by further
// "boolOp NUMBER" pairs, terminated by COLON.
spanclass_id
    : NUMBER
      (boolOp NUMBER)*
      COLON
    ;
167
// One or more empty tokens, each with an optional repetition suffix.
emptyTokenSequence
    : ( emptyToken repetition? )+
    ;
171
// Class defined around empty tokens: an emptyTokenSequence between LBRACE
// and RBRACE, optionally prefixed by a spanclass_id.
emptyTokenSequenceClass
    : LBRACE
      spanclass_id?
      emptyTokenSequence
      RBRACE
    ;
175
176
// Named wrapper around emptyTokenSequence; used between segments in
// 'sequence'.
distance
    : emptyTokenSequence
    ;
180
// Class around a segment or sequence: LBRACE ... RBRACE with an optional
// leading spanclass_id.
spanclass
    : LBRACE
      spanclass_id?
      (segment|sequence)
      RBRACE
    ;
184
// Basic query unit: exactly one of the constructs listed below (tried in
// this order), optionally followed by a repetition suffix. The last
// alternative allows a segment to be wrapped in LRPAREN ... RRPAREN.
segment
    : ( position
      | token
      | span
      | group
      | spanclass
      | matching
      | submatch
      | relation
      | LRPAREN segment RRPAREN
      )
      repetition?
    ;
Joachim Bingel6003b852014-12-18 14:20:55 +0000198
/**
 * A sequence built from segments, empty-token sequences (plain or as a
 * class) and alignments.
 *
 * The order of the alternatives is significant:
 *  - The first alternative must precede any alternative using 'distance',
 *    to give precedence to the repetition interpretation of numbers in
 *    braces (which could otherwise be mistaken for number tokens in a
 *    spanclass), e.g. {2}.
 *  - `alignment segment*` must precede `segment+ alignment segment*` to make
 *    sure preceding segments become part of 'alignment'.
 */
sequence
    : segment* (emptyTokenSequence|emptyTokenSequenceClass)
    | (emptyTokenSequence|emptyTokenSequenceClass)
      (segment+ | sequence)
      (emptyTokenSequence|emptyTokenSequenceClass)?
    | alignment segment*
    | segment+ alignment segment*
    | segment segment+
    | segment (distance|emptyTokenSequenceClass) segment
    | segment (distance|emptyTokenSequenceClass)? sequence

    // Disabled alternative, kept for reference:
    //| alignment (segment|sequence) alignment?
    ;
210
211
212/** Entry point for linguistic queries */
213
// A linguistic query is a segment, a sequence, or a disjunction (tried in
// that order).
query
    : segment
    | sequence
    | disjunction
    ;
217
// Optional clause after the query: the WITHIN token followed by a WORD.
within
    : WITHIN WORD
    ;
Joachim Bingel6003b852014-12-18 14:20:55 +0000221
222/**
223 === META section ===
224 defines metadata filters on request
225*/
226
// Metadata filter: the META token followed by a metaTermGroup.
meta
    : META metaTermGroup
    ;
// One or more terms and/or termGroups, in any mix.
metaTermGroup
    : ( term
      | termGroup
      )+
    ;
229
230/**
231 Entry point for all requests. Linguistic query is obligatory, metadata filtering
232 is optional.
233*/
// query is mandatory; the within clause and the meta section are optional.
// EOF forces the whole input to be consumed.
request
    : query
      within?
      meta?
      EOF
    ;