Blame - src/test/java/de/ids_mannheim/korap/index/TestIndex.java - KorAP/Krill

blob: d1dd6085986305d930c2be68c02542c434f3e238 [file] [log] [blame]

Eliza Margaretha	0192918	2014-02-19 11:48:59 +0000	[diff] [blame]	1	package de.ids_mannheim.korap.index;
				2
Nils Diewald	f399a67	2013-11-18 17:55:22 +0000	[diff] [blame]	3	import java.util.*;
				4	import java.io.*;
				5
				6	import de.ids_mannheim.korap.analysis.MultiTermToken;
				7	import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper;
				8	import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper;
				9	import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper;
				10	import de.ids_mannheim.korap.query.SpanWithinQuery;
				11
				12	import static de.ids_mannheim.korap.Test.*;
				13
				14	import org.apache.lucene.analysis.standard.StandardAnalyzer;
				15	import org.apache.lucene.analysis.TokenFilter;
				16	import org.apache.lucene.analysis.TokenStream;
				17	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
				18
				19	import org.apache.lucene.index.Term;
				20	import org.apache.lucene.index.TermsEnum;
				21	import org.apache.lucene.index.TermContext;
				22
				23	import org.apache.lucene.index.DocsAndPositionsEnum;
				24	import org.apache.lucene.index.DirectoryReader;
				25	import org.apache.lucene.index.IndexWriter;
				26	import org.apache.lucene.index.IndexWriterConfig;
				27	import org.apache.lucene.index.IndexWriterConfig.OpenMode;
				28	import org.apache.lucene.index.AtomicReaderContext;
				29
				30	import org.apache.lucene.queryparser.classic.ParseException;
				31	import org.apache.lucene.queryparser.classic.QueryParser;
				32
				33	import org.apache.lucene.search.IndexSearcher;
				34	import org.apache.lucene.search.Query;
				35	import org.apache.lucene.search.TermQuery;
				36	import org.apache.lucene.search.BooleanClause;
				37	import org.apache.lucene.search.BooleanQuery;
				38	import org.apache.lucene.search.PhraseQuery;
				39	import org.apache.lucene.search.NumericRangeQuery;
				40	import org.apache.lucene.search.spans.Spans;
				41	import org.apache.lucene.search.spans.SpanQuery;
				42	import org.apache.lucene.search.spans.SpanOrQuery;
				43	import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
				44	import org.apache.lucene.search.spans.SpanTermQuery;
				45	import org.apache.lucene.search.spans.SpanNearQuery;
				46	import org.apache.lucene.search.spans.SpanNotQuery;
				47	import org.apache.lucene.search.spans.NearSpansOrdered;
				48	import org.apache.lucene.search.WildcardQuery;
				49	import org.apache.lucene.search.ScoreDoc;
				50	import org.apache.lucene.search.TopScoreDocCollector;
				51	import org.apache.lucene.search.TopDocs;
				52	import org.apache.lucene.search.RegexpQuery;
				53
				54	import org.apache.lucene.store.Directory;
				55	import org.apache.lucene.store.RAMDirectory;
				56	import org.apache.lucene.store.SimpleFSDirectory; // temporary
				57
				58	import org.apache.lucene.util.Version;
				59	import org.apache.lucene.util.BytesRef;
				60	import org.apache.lucene.util.Bits;
				61
				62	import static org.junit.Assert.*;
				63	import org.junit.Test;
				64	import org.junit.Ignore;
				65	import org.junit.runner.RunWith;
				66	import org.junit.runners.JUnit4;
				67
				68	@RunWith(JUnit4.class)
				69	public class TestIndex { // extends LuceneTestCase {
				70	// Create index in RAM
				71	// private Directory index = new RAMDirectory();
				72
				73	private Directory index = new RAMDirectory();
				74
				75	@Test
				76	public void multiTermToken () {
				77	MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
				78	assertEquals(test.terms.get(0).term, "hunde");
				79	assertEquals(test.terms.get(1).term, "pos:n");
				80	assertEquals(test.terms.get(2).term, "m:gen:pl");
				81	assertEquals(test.terms.get(0).posIncr, 1, 1);
				82	assertEquals(test.terms.get(1).posIncr, 0, 1);
				83	assertEquals(test.terms.get(2).posIncr, 0, 1);
				84
				85	test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
				86	assertEquals(test.terms.get(0).term, "hunde");
				87	assertEquals(test.terms.get(1).term, "pos:n");
				88	assertEquals(test.terms.get(2).term, "m:gen:pl");
				89	assertEquals(test.terms.get(0).posIncr, 1, 1);
				90	assertEquals(test.terms.get(1).posIncr, 0, 1);
				91	assertEquals(test.terms.get(2).posIncr, 0, 1);
				92	};
				93
				94	private List initIndexer () throws IOException {
				95	List<Map<String, String>> list = new ArrayList<>();
				96
				97	Map<String, String> d1 = new HashMap<String, String>();
				98	d1.put("id", "w1");
				99	d1.put("corpus", "wiki");
				100	d1.put("author", "Nils Diewald");
				101	d1.put("title", "Wikipedia");
				102	d1.put("subtitle", "A test");
				103	d1.put("pubDate", "20130701");
				104	d1.put("pubPlace", "Mannheim");
				105	d1.put("textClass", "news sports");
				106	d1.put("textStr", "Er nahm den Hunden die Angst.");
				107	d1.put("text", "Er#0-2\|PPER\|er\|c:nom;p:3;n:sg;g:masc\|<>:s#0-29$<i>7 " +
				108	"nahm#3-7\|VVFIN\|nehmen\|p:3;n:sg;t:past;m:ind\| " +
				109	"den#8-11\|ART\|der\|c:acc;n:sg;g:masc\| " +
				110	"Hunden#12-18\|NN\|hund\|c:acc;n:sg;g:masc\| " +
				111	"die#19-22\|ART\|der\|c:nom;n:sg;g:fem\| " +
				112	"Angst#23-28\|NN\|angst\|c:nom;n:sg;g:fem\| " +
				113	".#28-29\|$.\|.\|\|");
				114	list.add(d1);
				115
				116	Map<String, String> d2 = new HashMap<String, String>();
				117
				118	d2.put("id", "w2");
				119	d2.put("corpus", "wiki");
				120	d2.put("author", "Peter Thomas");
				121	d2.put("title", "Waldartikel");
				122	d2.put("subtitle", "Another test");
				123	d2.put("pubDate", "20130723");
				124	d2.put("pubPlace", "Bielefeld");
				125	d2.put("textClass", "news");
				126	d2.put("textStr", "Sie liefen durch den Wald.");
				127	d2.put("text", "Sie#0-3\|PPER\|sie\|c:nom;p:3;n:pl;g:all\|<>:s#0-26$<i>6 " +
				128	"liefen#4-10\|VVFIN\|laufen\|p:3;n:pl;t:past;m:ind\| " +
				129	"durch#11-16\|APPR\|durch\|\| " +
				130	"den#17-20\|ART\|der\|c:acc;n:sg;g:masc\| " +
				131	"Wald#21-25\|NN\|wald\|c:acc;n:sg;g:masc\| " +
				132	".#25-26\|$.\|.\|\|");
				133	list.add(d2);
				134
				135	Map<String, String> d3 = new HashMap<String, String>();
				136	d3.put("id", "w3");
				137	d3.put("corpus", "zeitung");
				138	d3.put("author", "Michael Meier");
				139	d3.put("title", "Angst");
				140	d3.put("subtitle", "Starr vor Angst");
				141	d3.put("pubDate", "20130713");
				142	d3.put("pubPlace", "Bielefeld");
				143	d3.put("textClass", "sports");
				144	d3.put("textStr", "Er wagte nicht, sich zu ruehren. Er war starr vor Angst.");
				145	d3.put("text", "Er#0-2\|PPER\|er\|c:nom;n:sg;g:masc;p:3\|<>:s#0-32$<i>8 " +
				146	"wagte#3-8\|VVFIN\|wagen\|p:3;n:sg;t:past;m:ind\| " +
				147	"nicht#9-14\|PTKNEG\|nicht\|\| " +
				148	",#14-15\|$,\|,\|\| " +
				149	"sich#16-20\|PRF\|sich\|c:acc;p:3;n:sg\| " +
				150	"zu#21-23\|PTKZU\|zu\|\| " +
				151	"ruehren#24-31\|VVFIN\|ruehren\|\| " +
				152	".#31-32\|$.\|.\|\| " +
				153	"Er#33-35\|PPER\|er\|c:nom;p:3;n:sg;g:masc\|<>:s#33-56$<i>14 " +
				154	"war#36-39\|VAFIN\|sein\|p:3;n:sg;t:past;m:ind\| " +
				155	"starr#40-45\|ADJD\|starr\|comp:pos\| " +
				156	"vor#46-49\|APPR\|vor\|\| " +
				157	"Angst#50-55\|NN\|angst\|c:dat;n:sg;g:fem\| " +
				158	".#55-56\|$.\|.\|\|");
				159	list.add(d3);
				160
				161	return list;
				162	};
				163
				164	@Test
Nils Diewald	be5943e	2014-10-21 19:35:34 +0000	[diff] [blame]	165	public void indexLucene () throws Exception {
Nils Diewald	f399a67	2013-11-18 17:55:22 +0000	[diff] [blame]	166
				167	// Base analyzer for searching and indexing
				168	StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
				169
				170	// Based on
				171	// http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/
				172	// analysis/Analyzer.html?is-external=true
				173
				174	// Create configuration with base analyzer
				175	IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
				176
				177	// Add a document 1 with the correct fields
				178	IndexWriter w = new IndexWriter(index, config);
				179
				180	Collection docs = initIndexer();
				181
				182	@SuppressWarnings("unchecked")
				183	Iterator<Map<String,String>> i = (Iterator<Map<String,String>>) docs.iterator();
				184
				185	for (; i.hasNext(); ) {
				186	addDoc(w, i.next());
				187	};
				188
				189	assertEquals(3, w.numDocs());
				190
				191	w.close();
				192
				193	// Check directory
				194	DirectoryReader reader = DirectoryReader.open( index );
				195	assertEquals(docs.size(), reader.maxDoc());
				196	assertEquals(docs.size(), reader.numDocs());
				197
				198	// Check searcher
				199	IndexSearcher searcher = new IndexSearcher( reader );
				200
				201	// textClass
				202	// All texts of text class "news"
				203	assertEquals(2,
				204	searcher.search(
				205	new TermQuery(
				206	new Term("textClass", "news")
				207	), 10
				208	).totalHits
				209	);
				210
				211	// textClass
				212	// All texts of text class "sports"
				213	assertEquals(2,
				214	searcher.search(
				215	new TermQuery(
				216	new Term("textClass", "sports")
				217	), 10
				218	).totalHits
				219	);
				220
				221	// TextIndex
				222	// All docs containing "l:nehmen"
				223	assertEquals(1,
				224	searcher.search(
				225	new TermQuery(
				226	new Term("text", "l:nehmen")
				227	), 10
				228	).totalHits
				229	);
				230
				231	// TextIndex
				232	// All docs containing "s:den"
				233	assertEquals(2,
				234	searcher.search(
				235	new TermQuery(
				236	new Term("text", "s:den")
				237	), 10
				238	).totalHits
				239	);
				240
				241	/*
				242	assertEquals(3,
				243	searcher.search(
				244	new TermQuery(
				245	new Term("text", "T")
				246	), 10
				247	).totalHits
				248	);
				249	*/
				250
				251	// BooleanQuery
				252	// All docs containing "s:den" and "l:sie"
				253	TermQuery s_den = new TermQuery(new Term("text", "s:den"));
				254	TermQuery l_sie = new TermQuery(new Term("text", "l:sie"));
				255	BooleanQuery bool = new BooleanQuery();
				256	bool.add(s_den, BooleanClause.Occur.MUST);
				257	bool.add(l_sie, BooleanClause.Occur.MUST);
				258
				259	assertEquals(1, searcher.search(bool, 10).totalHits);
				260
				261	// BooleanQuery
				262	// All docs containing "s:den" or "l:sie"
				263	bool = new BooleanQuery();
				264	bool.add(s_den, BooleanClause.Occur.SHOULD);
				265	bool.add(l_sie, BooleanClause.Occur.SHOULD);
				266	assertEquals(2, searcher.search(bool, 10).totalHits);
				267
				268
				269	// RegexpQuery
				270	// All docs containing ".{4}en" (liefen und Hunden)
				271	RegexpQuery srquery = new RegexpQuery(
				272	new Term("text", "s:.{4}en")
				273	);
				274	assertEquals(2, searcher.search(srquery, 10).totalHits);
				275
				276	// RegexpQuery
				277	// All docs containing "E." (Er) (2x)
				278	srquery = new RegexpQuery(
				279	new Term("text", "s:E.")
				280	);
				281	assertEquals(2, searcher.search(srquery, 10).totalHits);
				282
				283	SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text", "s:E.");
				284	assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
				285
				286
				287	// RegexpQuery
				288	// All docs containing "E." (er) (0x)
				289	srquery = new RegexpQuery(
				290	new Term("text", "s:e.")
				291	);
				292	assertEquals(0, searcher.search(srquery, 10).totalHits);
				293
				294	ssrquery = new SpanRegexQueryWrapper("text", "s:e.");
				295	assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits);
				296
				297	// Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283
				298	// for Carstens question on wildcards
				299
				300	// RegexpQuery
				301	// All docs containing "E."/i ([Ee]r) (2x)
				302	srquery = new RegexpQuery(
				303	new Term("text", "i:e.")
				304	);
				305	assertEquals(2, searcher.search(srquery, 10).totalHits);
				306
				307	ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true);
				308	assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)", ssrquery.toQuery().toString());
				309	assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
				310
				311	// All docs containing "ng"/x (Angst) (2x)
				312	srquery = new RegexpQuery(
				313	new Term("text", "s:.ng.")
				314	);
				315	assertEquals(2, searcher.search(srquery, 10).totalHits);
				316
				317	// [base=angst]
				318	SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst"));
				319	assertEquals(2, searcher.search(srquery, 10).totalHits);
				320
				321	// vor Angst
				322	// [orth=vor][orth=Angst]
				323	SpanNearQuery snquery = new SpanNearQuery(
				324	new SpanQuery[] {
				325	new SpanTermQuery(new Term("text", "s:vor")),
				326	new SpanTermQuery(new Term("text", "s:Angst"))
				327	},
				328	1,
				329	true
				330	);
				331	assertEquals(1, searcher.search(snquery, 10).totalHits);
				332
				333	// Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
				334	snquery = new SpanNearQuery(
				335	new SpanQuery[] {
				336	new SpanTermQuery(new Term("text", "p:VVFIN")),
				337	new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery()
				338	},
				339	5, // slop
				340	true // inOrder
				341	// Possible: CollectPayloads
				342	);
				343	assertEquals(1, searcher.search(snquery, 10).totalHits);
				344
				345
				346	// Spannearquery [p:VVFIN][m:acc:sg:masc]
				347	snquery = new SpanNearQuery(
				348	new SpanQuery[] {
				349	new SpanTermQuery(new Term("text", "p:VVFIN")),
				350	new SpanNearQuery(
				351	new SpanQuery[] {
				352	new SpanTermQuery(new Term("text", "m:c:acc")),
				353	new SpanNearQuery(
				354	new SpanQuery[] {
				355	new SpanTermQuery(new Term("text", "m:n:sg")),
				356	new SpanTermQuery(new Term("text", "m:g:masc"))
				357	},
				358	-1,
				359	false
				360	)
				361	},
				362	-1, // slop
				363	false // inOrder
				364	// Possible: CollectPayloads
				365	)
				366	// new SpanTermQuery(new Term("text", "m:-acc:--sg:masc"))
				367	},
				368	0, // slop
				369	true // inOrder
				370	// Possible: CollectPayloads
				371	);
				372	assertEquals(1, searcher.search(snquery, 10).totalHits);
				373
				374
				375	// Spannearquery [p:VVFIN\|m:3:sg:past:ind]
				376	// Exact match!
				377	snquery = new SpanNearQuery(
				378	new SpanQuery[] {
				379	new SpanTermQuery(new Term("text", "p:VVFIN")),
				380	new SpanNearQuery(
				381	new SpanQuery[] {
				382	new SpanTermQuery(new Term("text", "m:p:3")),
				383	new SpanNearQuery(
				384	new SpanQuery[] {
				385	new SpanTermQuery(new Term("text", "m:n:sg")),
				386	new SpanNearQuery(
				387	new SpanQuery[] {
				388	new SpanTermQuery(new Term("text", "m:t:past")),
				389	new SpanTermQuery(new Term("text", "m:m:ind")),
				390	},
				391	-1,
				392	false
				393	)
				394	},
				395	-1,
				396	false
				397	)
				398	},
				399	-1,
				400	false
				401	)
				402	},
				403	// new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind"))
				404	-1, // slop
				405	false // inOrder
				406	// Possible: CollectPayloads
				407	);
				408	assertEquals(2, searcher.search(snquery, 10).totalHits);
				409
				410	// To make sure, this is not equal:
				411	// Spannearquery [p:VVFIN & m:3:sg:past:ind]
				412	// Exact match!
				413	// Maybe it IS equal
				414	snquery = new SpanNearQuery(
				415	new SpanQuery[] {
				416	new SpanTermQuery(new Term("text", "p:VVFIN")),
				417	new SpanTermQuery(new Term("text", "m:p:3")),
				418	new SpanTermQuery(new Term("text", "m:n:sg")),
				419	new SpanTermQuery(new Term("text", "m:t:past")),
				420	new SpanTermQuery(new Term("text", "m:m:ind")),
				421	},
				422	-1, // slop
				423	false // inOrder
				424	// Possible: CollectPayloads
				425	);
				426	assertNotEquals(2, searcher.search(snquery, 10).totalHits);
				427	// assertEquals(2, searcher.search(snquery, 10).totalHits);
				428
				429	// Spannearquery [p:VVFIN & m:3:sg & past:ind]
				430	SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper(
				431	"text",
				432	"p:VVFIN",
				433	"m:p:3",
				434	"m:n:sg",
				435	"m:t:past",
				436	"m:m:ind"
				437	);
				438	assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
				439
				440
				441	// Todo:
				442
				443	/*
				444	sniquery = new SpanSegmentQuery(
				445	"text",
				446	"p:VVFIN",
				447	"m:p:3",
				448	"m:n:sg",
				449	"m:t:past",
				450	"m:m:ind"
				451	);
				452	*/
				453
				454	// Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
				455	snquery = new SpanNearQuery(
				456	new SpanQuery[] {
				457	new SpanTermQuery(new Term("text", "p:VVFIN")),
				458	new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery()
				459	},
				460	5, // slop
				461	true // inOrder
				462	// Possible: CollectPayloads
				463	);
				464	assertEquals(1, searcher.search(snquery, 10).totalHits);
				465
				466	sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past", "m:m:ind", "m:n:sg");
				467	assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
				468
				469	// [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or
				470	// [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl]
				471	// TODO: Problem: What should happen in case the category does not exist?
				472	// pssible solution: & ( m:n != pl & exists(m:n))
				473	sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past");
				474	SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(), new SpanTermQuery(new Term("text", "m:n:pl")));
				475	assertEquals(2, searcher.search(snqquery, 10).totalHits);
				476
				477	// [p = NN & (m:c: = dat \| m:c = acc)]
				478	snquery = new SpanNearQuery(
				479	new SpanQuery[] {
				480	new SpanTermQuery(new Term("text", "p:NN")),
				481	new SpanOrQuery(
				482	new SpanTermQuery( new Term("text", "m:c:nom" )),
				483	new SpanTermQuery( new Term("text", "m:c:acc" ))
				484	)
				485	},
				486	-1,
				487	false
				488	);
				489
				490	assertEquals(2, searcher.search(snqquery, 10).totalHits);
				491
				492	// [p = NN & !(m:c: = nom \| m:c = acc)]
				493	snqquery = new SpanNotQuery(
				494	new SpanTermQuery(new Term("text", "p:NN")),
				495	new SpanOrQuery(
				496	new SpanTermQuery( new Term("text", "m:c:nom" )),
				497	new SpanTermQuery( new Term("text", "m:c:acc" ))
				498	)
				499	);
				500	assertEquals(1, searcher.search(snqquery, 10).totalHits);
				501
				502	// [p = NN & !(m:c = nom)]
				503	snqquery = new SpanNotQuery(
				504	new SpanTermQuery( new Term("text", "p:NN")),
				505	new SpanTermQuery( new Term("text", "m:c:nom" ))
				506	);
				507	assertEquals(3, searcher.search(snqquery, 10).totalHits);
				508
				509	// [p=NN & !(m:c = acc)]
				510	snqquery = new SpanNotQuery(
				511	new SpanTermQuery( new Term("text", "p:NN")),
				512	new SpanTermQuery( new Term("text", "m:c:acc" ))
				513	);
				514	assertEquals(2, searcher.search(snqquery, 10).totalHits);
				515
				516	// [p=PPER][][p=ART]
				517	snquery = new SpanNearQuery(
				518	new SpanQuery[] {
				519	new SpanTermQuery( new Term("text", "p:PPER")),
				520	new SpanNearQuery(
				521	new SpanQuery[] {
				522	new SpanTermQuery( new Term("text", "T")),
				523	new SpanTermQuery( new Term("text", "p:ART"))
				524	},
				525	0,
				526	true),
				527	},
				528	0,
				529	true
				530	);
				531	assertEquals(1, searcher.search(snquery, 10).totalHits);
				532
				533
				534	// Todo:
				535	// [orth=się][]{2,4}[base=bać]
				536	// [orth=się][orth!="[.!?,:]"]{,5}[base=bać]\|[base=bać][base="on\|ja\|ty\|my\|wy"]?[orth=się]
				537	// [pos=subst & orth="a.*"]{2}
				538	// [tag=subst:sg:nom:n]
				539	// [case==acc & case==gen] ??
				540	// [case~acc & case~gen]
				541	// [case~~acc]
				542	// [base=bać][orth!=się]+[orth=się] within s
				543
				544	// [][][p:VAFIN] within s
				545	// [][p:VAFIN] within s
				546
				547
				548	// [][][p:VAFIN]
				549	snquery = new SpanNearQuery(
				550	new SpanQuery[] {
				551	new SpanNearQuery(
				552	new SpanQuery[] {
				553	new SpanTermQuery( new Term("text", "T") ),
				554	new SpanTermQuery( new Term("text", "T") )
				555	},
				556	0,
				557	true
				558	),
				559	new SpanTermQuery( new Term("text", "p:VAFIN") )
				560	},
				561	0,
				562	true
				563	);
				564	assertEquals(1, searcher.search(snquery, 10).totalHits);
				565
				566	/*
				567	http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene
				568	*/
				569
				570	StringBuilder payloadString = new StringBuilder();
				571	Map<Term, TermContext> termContexts = new HashMap<>();
				572	for (AtomicReaderContext atomic : reader.leaves()) {
				573	Bits bitset = atomic.reader().getLiveDocs();
				574	// Spans spans = NearSpansOrdered();
				575	Spans spans = snquery.getSpans(atomic, bitset, termContexts);
				576
				577	while (spans.next()) {
				578	int docid = atomic.docBase + spans.doc();
				579	if (spans.isPayloadAvailable()) {
				580	for (byte[] payload : spans.getPayload()) {
				581	/* retrieve payload for current matching span */
				582	payloadString.append(new String(payload));
				583	payloadString.append(" \| ");
				584	};
				585	};
				586	};
				587	};
				588	// assertEquals(33, payloadString.length());
				589	assertEquals(0, payloadString.length());
				590
				591
				592
				593	// [][][p:VAFIN]
				594	// without collecting payloads
				595	snquery = new SpanNearQuery(
				596	new SpanQuery[] {
				597	new SpanNearQuery(
				598	new SpanQuery[] {
				599	new SpanTermQuery( new Term("text", "T") ),
				600	new SpanTermQuery( new Term("text", "T") )
				601	},
				602	0,
				603	true,
				604	false
				605	),
				606	new SpanTermQuery( new Term("text", "p:VAFIN") )
				607	},
				608	0,
				609	true,
				610	false
				611	);
				612	assertEquals(1, searcher.search(snquery, 10).totalHits);
				613
				614	payloadString = new StringBuilder();
				615	termContexts = new HashMap<>();
				616	for (AtomicReaderContext atomic : reader.leaves()) {
				617	Bits bitset = atomic.reader().getLiveDocs();
				618	// Spans spans = NearSpansOrdered();
				619	Spans spans = snquery.getSpans(atomic, bitset, termContexts);
				620
				621	while (spans.next()) {
				622	int docid = atomic.docBase + spans.doc();
				623	for (byte[] payload : spans.getPayload()) {
				624	/* retrieve payload for current matching span */
				625	payloadString.append(new String(payload));
				626	payloadString.append(" \| ");
				627	};
				628	};
				629	};
				630	assertEquals(0, payloadString.length());
				631
				632
				633	// [][][p:VAFIN] in s
Nils Diewald	cc7c0b3	2014-07-31 19:58:22 +0000	[diff] [blame]	634	// ([e:s:<][][T] \| [T & e:s:<]) [T] ([p:VAFIN & e:s:>] \| [T][][e:s:>]
Nils Diewald	f399a67	2013-11-18 17:55:22 +0000	[diff] [blame]	635
				636	/*
Nils Diewald	cc7c0b3	2014-07-31 19:58:22 +0000	[diff] [blame]	637
Nils Diewald	f399a67	2013-11-18 17:55:22 +0000	[diff] [blame]	638	SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery(
				639	"text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN")
				640	);
				641	assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits);
				642
				643	payloadString = new StringBuilder();
				644	termContexts = new HashMap<>();
				645	for (AtomicReaderContext atomic : reader.leaves()) {
				646	Bits bitset = atomic.reader().getLiveDocs();
				647	// Spans spans = NearSpansOrdered();
				648	Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
				649
				650	while (spans.next()) {
				651	int docid = atomic.docBase + spans.doc();
				652	for (byte[] payload : spans.getPayload()) {
				653	/// retrieve payload for current matching span
				654	payloadString.append(new String(payload));
				655	payloadString.append(" \| ");
				656	};
				657	};
				658	};
				659	assertEquals(0, payloadString.length(), 1);
				660
				661	ssequery = new SpanSegmentWithinQuery(
				662	"text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN")
				663	);
				664
				665	assertEquals("for " + ssequery.toQuery(),
				666	1, searcher.search(ssequery.toQuery(), 10).totalHits);
				667
				668	payloadString = new StringBuilder();
				669	termContexts = new HashMap<>();
				670	for (AtomicReaderContext atomic : reader.leaves()) {
				671	Bits bitset = atomic.reader().getLiveDocs();
				672	// Spans spans = NearSpansOrdered();
				673	Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
				674
				675	while (spans.next()) {
				676	int docid = atomic.docBase + spans.doc();
				677	for (byte[] payload : spans.getPayload()) {
				678	// retrieve payload for current matching span
				679	payloadString.append(new String(payload));
				680	payloadString.append(" \| ");
				681	};
				682	fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " \|\| " + payloadString.toString());
				683	};
				684	};
				685	assertEquals(20, payloadString.length());
				686
				687	*/
				688
				689	// --------------------______>
				690
				691
				692
				693	// Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery());
				694	/*
				695	TopDocs topDocs = is.search(snq, 1);
				696	Set<String> payloadSet = new HashSet<String>();
				697	for (int i = 0; i < topDocs.scoreDocs.length; i++) {
				698	while (spans.next()) {
				699	Collection<byte[]> payloads = spans.getPayload();
				700
				701	for (final byte [] payload : payloads) {
				702	payloadSet.add(new String(payload, "UTF-8"));
				703	}
				704	}
				705	}
				706	*/
				707
				708
				709	/*
				710	Alternativ:
				711	IndexReader reader = writer.getReader();
				712	writer.close();
				713	IndexSearcher searcher = newSearcher(reader);
				714
				715	PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());
				716
				717	Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
				718	if(VERBOSE)
				719	System.out.println("Num payloads:" + payloads.size());
				720	for (final byte [] bytes : payloads) {
				721	if(VERBOSE)
				722	System.out.println(new String(bytes, "UTF-8"));
				723	}
				724	*/
				725
				726
				727
				728	/* new: */
				729
				730	// PayloadHelper helper = new PayloadHelper();
				731
				732	// Map<Term, TermContext> termContexts = new HashMap<>();
				733	//Spans spans;
				734	//spans = snquery.getSpans(searcher.getIndexReader());
				735	// searcher = helper.setUp(similarity, 1000);
				736	/*
				737	IndexReader reader = search.getReader(querycontainer.getFoundry());
				738	Spans luceneSpans;
				739	Bits bitset = atomic.reader().getLiveDocs();
				740	for (byte[] payload : luceneSpans.getPayload())
				741
				742	/* Iterate over all matching documents */
				743	/*
				744	while (luceneSpans.next() && total < config.getMaxhits()) {
				745	Span matchSpan;
				746	StringBuilder payloadString = new StringBuilder();
				747	int docid = atomic.docBase + luceneSpans.doc();
				748	String docname = search.retrieveDocname(docid,
				749	querycontainer.getFoundry());
				750	total++;
				751
				752	for (byte[] payload : luceneSpans.getPayload())
				753	*/
				754	/* retrieve payload for current matching span */
				755	// payloadString.append(new String(payload));
				756
				757	/* create span containing result */
				758	/*
				759	matchSpan = new Span(docname);
				760	matchSpan.setIndexdocid(docid);
				761	matchSpan.setLayer(querycontainer.getLayer());
				762	matchSpan.storePayloads(payloadString.toString());
				763	matchSpans.add(matchSpan);
				764	*/
				765	/*
				766	* topdocs = searcher.search(new ConstantScoreQuery(corpusQ add
				767	* position to list of positions to be considered for later
				768	* searches
				769	*/
				770	/*
				771	validValues.put(docname,
				772	matchSpan.getPayload(config.getPrefix()));
				773	}
				774	*/
				775
				776
				777	// Todo: API made by add() typisiert für queries, strings
				778
				779	// SpanPayloadCheckQuery for sentences!
				780
				781	/* Support regular expression in SpanSegmentQuery */
				782	// new Regexp();
				783	// new Term();
				784
				785	/*
				786	Vielleicht: spanSegmentQuery(new Term(), new Wildcard(), new Regex());
				787	*/
				788
				789	// And Not ->
				790	// SpanTermDiffQuery
				791
				792	/*
				793	SpanNearQuery poquery = new SpanNearQuery(
				794
				795	);
				796	*/
				797
				798	reader.close();
				799
				800
				801	};
				802	};