autoformat
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index f0e9ddc..c140d1e 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -16,12 +16,12 @@
/**
* Abstract class representing a document in the
* Krill index.
- *
+ *
* This model is rather specific to DeReKo data and
* should be considered experimental. It may be replaced
* by a more agnostic model.
* string fields, e.g. may be combined with a prefix.
- *
+ *
* @author diewald
*/
@JsonIgnoreProperties(ignoreUnknown = true)
@@ -29,53 +29,47 @@
private String primaryData;
@JsonIgnore
- public int
- internalDocID,
- localDocID,
- UID;
+ public int internalDocID, localDocID, UID;
- private KrillDate
- pubDate,
- // newly added
- creationDate;
+ private KrillDate pubDate,
+ // newly added
+ creationDate;
private String
- // No longer supported
- ID,
- corpusID,
- field,
- layerInfo,
- tokenization,
+ // No longer supported
+ ID,
+ corpusID,
+ field,
+ layerInfo,
+ tokenization,
- // Still supported
- foundries,
- textClass,
- pubPlace,
+ // Still supported
+ foundries,
+ textClass,
+ pubPlace,
- // Newly added for the corpus/doc/text distinction of DeReKo
- textSigle, docSigle, corpusSigle,
- title, subTitle, author, editor,
- docTitle, docSubTitle, docAuthor, docEditor,
- corpusTitle, corpusSubTitle, corpusAuthor, corpusEditor,
- textType, textTypeArt, textTypeRef, textColumn, textDomain,
- fileEditionStatement, biblEditionStatement,
- publisher,
- reference,
- language,
- license,
- pages,
- keywords,
+ // Newly added for the corpus/doc/text distinction of DeReKo
+ textSigle, docSigle, corpusSigle, title, subTitle, author,
+ editor,
+ docTitle, docSubTitle, docAuthor, docEditor,
+ corpusTitle,
+ corpusSubTitle, corpusAuthor, corpusEditor, textType,
+ textTypeArt,
+ textTypeRef, textColumn, textDomain,
+ fileEditionStatement,
+ biblEditionStatement, publisher, reference, language,
+ license,
+ pages, keywords,
- // Meta information regarding annotations
- tokenSource,
- layerInfos;
+ // Meta information regarding annotations
+ tokenSource, layerInfos;
/**
* Get the publication date of the document
* as a {@link KrillDate} object.
- *
+ *
* @return A {@link KrillDate} object for chaining.
*/
@JsonIgnore
@@ -87,9 +81,9 @@
/**
* Get the publication date of the document
* as a string.
- *
+ *
* @return A string containing the {@link KrillDate}.
- */
+ */
@JsonProperty("pubDate")
public String getPubDateString () {
if (this.pubDate != null)
@@ -100,9 +94,10 @@
/**
* Set the publication date of the document.
- *
- * @param date The date as a {@link KrillDate}
- * compatible string representation.
+ *
+ * @param date
+ * The date as a {@link KrillDate} compatible string
+ * representation.
* @return A {@link KrillDate} object for chaining.
*/
public KrillDate setPubDate (String date) {
@@ -113,8 +108,9 @@
/**
* Set the publication date of the document.
- *
- * @param date The date as a {@link KrillDate} object.
+ *
+ * @param date
+ * The date as a {@link KrillDate} object.
* @return A {@link KrillDate} object for chaining.
*/
public KrillDate setPubDate (KrillDate date) {
@@ -125,7 +121,7 @@
/**
* Get the creation date of the document
* as a {@link KrillDate} object.
- *
+ *
* @return A {@link KrillDate} object for chaining.
*/
@JsonIgnore
@@ -137,7 +133,7 @@
/**
* Get the creation date of the document
* as a string.
- *
+ *
* @return A string containing the {@link KrillDate}.
*/
@JsonProperty("creationDate")
@@ -150,9 +146,10 @@
/**
* Set the creation date of the document.
- *
- * @param date The date as a {@link KrillDate}
- * compatible string representation.
+ *
+ * @param date
+ * The date as a {@link KrillDate} compatible string
+ * representation.
* @return A {@link KrillDate} object for chaining.
*/
public KrillDate setCreationDate (String date) {
@@ -163,38 +160,40 @@
/**
* Set the creation date of the document.
- *
- * @param date The date as a {@link KrillDate} object.
+ *
+ * @param date
+ * The date as a {@link KrillDate} object.
* @return A {@link KrillDate} object for chaining.
*/
public KrillDate setCreationDate (KrillDate date) {
return (this.creationDate = date);
- };
+ };
/**
* Get the name of the author of the document.
- *
+ *
* @return The name of the author as a string.
*/
public String getAuthor () {
return this.author;
};
-
+
/**
* Set the name of the author of the document.
- *
- * @param author The name of the author as a string.
+ *
+ * @param author
+ * The name of the author as a string.
*/
public void setAuthor (String author) {
this.author = author;
};
-
+
/**
* Get the text class of the document.
- *
+ *
* @return The text class of the document as a string.
*/
public String getTextClass () {
@@ -204,8 +203,9 @@
/**
* Set the text class of the document.
- *
- * @param textClass The text class of the document as a string.
+ *
+ * @param textClass
+ * The text class of the document as a string.
*/
public void setTextClass (String textClass) {
this.textClass = textClass;
@@ -214,39 +214,41 @@
/**
* Get the publication place of the document.
- *
+ *
* @return The publication place of the document as a string.
*/
public String getPubPlace () {
return this.pubPlace;
};
-
-
+
+
/**
* Set the publication place of the document.
- *
- * @param pubPlace The publication place of the document as a string.
+ *
+ * @param pubPlace
+ * The publication place of the document as a string.
*/
public void setPubPlace (String pubPlace) {
this.pubPlace = pubPlace;
- };
+ };
/**
* Get the unique identifier of the document.
- *
+ *
* @return The unique identifier of the document as an integer.
*/
@JsonProperty("UID")
public int getUID () {
return this.UID;
};
-
+
/**
* Set the unique identifier of the document.
- *
- * @param UID The unique identifier of the document as an integer.
+ *
+ * @param UID
+ * The unique identifier of the document as an integer.
* @return The invocant for chaining.
*/
public void setUID (int UID) {
@@ -256,9 +258,10 @@
/**
* Set the unique identifier of the document.
- *
- * @param UID The unique identifier of the document as a
- * string representing an integer.
+ *
+ * @param UID
+ * The unique identifier of the document as a
+ * string representing an integer.
* @return The invocant for chaining.
* @throws NumberFormatException
*/
@@ -270,7 +273,7 @@
/**
* Get the title of the document.
- *
+ *
* @return The title of the document as a string.
*/
public String getTitle () {
@@ -280,17 +283,18 @@
/**
* Set the title of the document.
- *
- * @param title The title of the document as a string.
+ *
+ * @param title
+ * The title of the document as a string.
*/
public void setTitle (String title) {
this.title = title;
};
-
-
+
+
/**
* Get the subtitle of the document.
- *
+ *
* @return The subtitle of the document as a string.
*/
public String getSubTitle () {
@@ -300,8 +304,9 @@
/**
* Set the subtitle of the document.
- *
- * @param subTitle The subtitle of the document as a string.
+ *
+ * @param subTitle
+ * The subtitle of the document as a string.
*/
public void setSubTitle (String subTitle) {
this.subTitle = subTitle;
@@ -310,7 +315,7 @@
/**
* Get the primary data of the document.
- *
+ *
* @return The primary data of the document as a string.
*/
public String getPrimaryData () {
@@ -323,9 +328,11 @@
/**
* Get the primary data of the document,
* starting with a given character offset.
- *
- * @param startOffset The starting character offset.
- * @return The substring of primary data of the document as a string.
+ *
+ * @param startOffset
+ * The starting character offset.
+ * @return The substring of primary data of the document as a
+ * string.
*/
public String getPrimaryData (int startOffset) {
return this.primaryData.substring(startOffset);
@@ -336,10 +343,13 @@
* Get the primary data of the document,
* starting with a given character offset and ending
* with a given character offset.
- *
- * @param startOffset The starting character offset.
- * @param endOffset The ending character offset.
- * @return The substring of the primary data of the document as a string.
+ *
+ * @param startOffset
+ * The starting character offset.
+ * @param endOffset
+ * The ending character offset.
+ * @return The substring of the primary data of the document as a
+ * string.
*/
public String getPrimaryData (int startOffset, int endOffset) {
return this.primaryData.substring(startOffset, endOffset);
@@ -348,19 +358,22 @@
/**
* Set the primary data of the document.
- *
- * @param primary The primary data of the document
- * as a string.
+ *
+ * @param primary
+ * The primary data of the document
+ * as a string.
*/
public void setPrimaryData (String primary) {
this.primaryData = primary;
};
+
/**
* Get the length of the primary data of the document
* (i.e. the number of characters).
- *
- * @return The length of the primary data of the document as an integer.
+ *
+ * @return The length of the primary data of the document as an
+ * integer.
*/
@JsonIgnore
public int getPrimaryDataLength () {
@@ -370,8 +383,8 @@
/**
* Get information on the foundries the document
- * is annotated with as a string.
- *
+ * is annotated with as a string.
+ *
* @return The foundry information string.
*/
public String getFoundries () {
@@ -381,9 +394,10 @@
/**
* Set information on the foundries the document
- * is annotated with.
- *
- * @param foundries The foundry information string.
+ * is annotated with.
+ *
+ * @param foundries
+ * The foundry information string.
*/
public void setFoundries (String foundries) {
this.foundries = foundries;
@@ -392,8 +406,8 @@
/**
* Get information on the layers the document
- * is annotated with as a string.
- *
+ * is annotated with as a string.
+ *
* @return The layer information string.
*/
public String getLayerInfos () {
@@ -403,9 +417,10 @@
/**
* Set information on the layers the document
- * is annotated with as a string.
- *
- * @param layerInfos The layer information string.
+ * is annotated with as a string.
+ *
+ * @param layerInfos
+ * The layer information string.
*/
public void setLayerInfos (String layerInfos) {
this.layerInfos = layerInfos;
@@ -414,8 +429,8 @@
// This is the new text id
/**
- * Get the text sigle as a string.
- *
+ * Get the text sigle as a string.
+ *
* @return The text sigle as a string.
*/
public String getTextSigle () {
@@ -425,9 +440,10 @@
// This is the new text id
/**
- * Set the text sigle as a string.
- *
- * @param textSigle The text sigle as a string.
+ * Set the text sigle as a string.
+ *
+ * @param textSigle
+ * The text sigle as a string.
*/
public void setTextSigle (String textSigle) {
this.textSigle = textSigle;
@@ -436,8 +452,8 @@
// This is the new corpus id
/**
- * Get the corpus sigle as a string.
- *
+ * Get the corpus sigle as a string.
+ *
* @return The corpus sigle as a string.
*/
public String getCorpusSigle () {
@@ -447,9 +463,10 @@
// This is the new corpus id
/**
- * Set the corpus sigle as a string.
- *
- * @param corpusSigle The corpus sigle as a string.
+ * Set the corpus sigle as a string.
+ *
+ * @param corpusSigle
+ * The corpus sigle as a string.
*/
public void setCorpusSigle (String corpusSigle) {
this.corpusSigle = corpusSigle;
@@ -457,8 +474,8 @@
/**
- * Get the document sigle as a string.
- *
+ * Get the document sigle as a string.
+ *
* @return The document sigle as a string.
*/
public String getDocSigle () {
@@ -467,9 +484,10 @@
/**
- * Set the document sigle as a string.
- *
- * @param docSigle The document sigle as a string.
+ * Set the document sigle as a string.
+ *
+ * @param docSigle
+ * The document sigle as a string.
*/
public void setDocSigle (String docSigle) {
this.docSigle = docSigle;
@@ -477,8 +495,8 @@
/**
- * Get the name of the publisher as a string.
- *
+ * Get the name of the publisher as a string.
+ *
* @return The name of the publisher as a string.
*/
public String getPublisher () {
@@ -487,9 +505,10 @@
/**
- * Set the name of the publisher as a string.
- *
- * @param publisher The name of the publisher as a string.
+ * Set the name of the publisher as a string.
+ *
+ * @param publisher
+ * The name of the publisher as a string.
*/
public void setPublisher (String publisher) {
this.publisher = publisher;
@@ -497,8 +516,8 @@
/**
- * Get the name of the editor as a string.
- *
+ * Get the name of the editor as a string.
+ *
* @return The name of the editor as a string.
*/
public String getEditor () {
@@ -507,9 +526,10 @@
/**
- * Set the name of the editor as a string.
- *
- * @param editor The name of the editor as a string.
+ * Set the name of the editor as a string.
+ *
+ * @param editor
+ * The name of the editor as a string.
*/
public void setEditor (String editor) {
this.editor = editor;
@@ -517,8 +537,8 @@
/**
- * Get the type of the text as a string.
- *
+ * Get the type of the text as a string.
+ *
* @return The type of the text as a string.
*/
public String getTextType () {
@@ -527,9 +547,10 @@
/**
- * Set the type of the text as a string.
- *
- * @param textType The type of the text as a string.
+ * Set the type of the text as a string.
+ *
+ * @param textType
+ * The type of the text as a string.
*/
public void setTextType (String textType) {
this.textType = textType;
@@ -537,8 +558,8 @@
/**
- * Get the type art of the text as a string.
- *
+ * Get the type art of the text as a string.
+ *
* @return The type art of the text as a string.
*/
public String getTextTypeArt () {
@@ -547,9 +568,10 @@
/**
- * Set the type art of the text as a string.
- *
- * @param textTypeArt The type art of the text as a string.
+ * Set the type art of the text as a string.
+ *
+ * @param textTypeArt
+ * The type art of the text as a string.
*/
public void setTextTypeArt (String textTypeArt) {
this.textTypeArt = textTypeArt;
@@ -557,9 +579,10 @@
/**
- * Set the type reference of the text as a string.
- *
- * @param textTypeRef The type reference of the text as a string.
+ * Set the type reference of the text as a string.
+ *
+ * @param textTypeRef
+ * The type reference of the text as a string.
*/
public void setTextTypeRef (String textTypeRef) {
this.textTypeRef = textTypeRef;
@@ -567,8 +590,8 @@
/**
- * Get the type reference of the text as a string.
- *
+ * Get the type reference of the text as a string.
+ *
* @return The type reference of the text as a string.
*/
public String getTextTypeRef () {
@@ -577,8 +600,8 @@
/**
- * Get the column of the text as a string.
- *
+ * Get the column of the text as a string.
+ *
* @return The column of the text as a string.
*/
public String getTextColumn () {
@@ -587,9 +610,10 @@
/**
- * Set the column of the text as a string.
- *
- * @param textColumn The column of the text as a string.
+ * Set the column of the text as a string.
+ *
+ * @param textColumn
+ * The column of the text as a string.
*/
public void setTextColumn (String textColumn) {
this.textColumn = textColumn;
@@ -597,8 +621,8 @@
/**
- * Get the domain of the text as a string.
- *
+ * Get the domain of the text as a string.
+ *
* @return The domain of the text as a string.
*/
public String getTextDomain () {
@@ -607,9 +631,10 @@
/**
- * Set the domain of the text as a string.
- *
- * @param textDomain The domain of the text as a string.
+ * Set the domain of the text as a string.
+ *
+ * @param textDomain
+ * The domain of the text as a string.
*/
public void setTextDomain (String textDomain) {
this.textDomain = textDomain;
@@ -617,8 +642,8 @@
/**
- * Get the license of the text as a string.
- *
+ * Get the license of the text as a string.
+ *
* @return The license of the text as a string.
*/
public String getLicense () {
@@ -627,9 +652,10 @@
/**
- * Set the license of the text as a string.
- *
- * @param license The license of the text as a string.
+ * Set the license of the text as a string.
+ *
+ * @param license
+ * The license of the text as a string.
*/
public void setLicense (String license) {
this.license = license;
@@ -637,8 +663,8 @@
/**
- * Get the page numbers of the text as a string.
- *
+ * Get the page numbers of the text as a string.
+ *
* @return The page numbers of the text as a string.
*/
public String getPages () {
@@ -647,9 +673,10 @@
/**
- * Set the page numbers of the text as a string.
- *
- * @param pages The page numbers of the text as a string.
+ * Set the page numbers of the text as a string.
+ *
+ * @param pages
+ * The page numbers of the text as a string.
*/
public void setPages (String pages) {
this.pages = pages;
@@ -657,8 +684,8 @@
/**
- * Get the file edition statement of the text as a string.
- *
+ * Get the file edition statement of the text as a string.
+ *
* @return The file edition statement of the text as a string.
*/
public String getFileEditionStatement () {
@@ -667,10 +694,11 @@
/**
- * Set the file edition statement of the text as a string.
- *
- * @param fileEditionStatement The file edition statement
- * of the text as a string.
+ * Set the file edition statement of the text as a string.
+ *
+ * @param fileEditionStatement
+ * The file edition statement
+ * of the text as a string.
*/
public void setFileEditionStatement (String fileEditionStatement) {
this.fileEditionStatement = fileEditionStatement;
@@ -678,9 +706,10 @@
/**
- * Get the bibliograhic edition statement of the text as a string.
- *
- * @return The bibliograhic edition statement of the text as a string.
+ * Get the bibliographic edition statement of the text as a string.
+ *
+ * @return The bibliographic edition statement of the text as a
+ * string.
*/
public String getBiblEditionStatement () {
return this.biblEditionStatement;
@@ -688,10 +717,11 @@
/**
- * Set the bibliograhic edition statement of the text as a string.
- *
- * @param biblEditionStatement The bibliograhic edition statement
- * of the text as a string.
+ * Set the bibliographic edition statement of the text as a string.
+ *
+ * @param biblEditionStatement
+ * The bibliographic edition statement
+ * of the text as a string.
*/
public void setBiblEditionStatement (String biblEditionStatement) {
this.biblEditionStatement = biblEditionStatement;
@@ -699,8 +729,8 @@
/**
- * Get the reference of the text as a string.
- *
+ * Get the reference of the text as a string.
+ *
* @return The reference of the text as a string.
*/
public String getReference () {
@@ -709,9 +739,10 @@
/**
- * Set the reference of the text as a string.
- *
- * @param reference The reference of the text as a string.
+ * Set the reference of the text as a string.
+ *
+ * @param reference
+ * The reference of the text as a string.
*/
public void setReference (String reference) {
this.reference = reference;
@@ -719,8 +750,8 @@
/**
- * Get the language of the text as a string.
- *
+ * Get the language of the text as a string.
+ *
* @return The language of the text as a string.
*/
public String getLanguage () {
@@ -729,9 +760,10 @@
/**
- * Set the language of the text as a string.
- *
- * @param language The language of the text as a string.
+ * Set the language of the text as a string.
+ *
+ * @param language
+ * The language of the text as a string.
*/
public void setLanguage (String language) {
this.language = language;
@@ -739,8 +771,8 @@
/**
- * Get the corpus title of the text as a string.
- *
+ * Get the corpus title of the text as a string.
+ *
* @return The corpus title of the text as a string.
*/
public String getCorpusTitle () {
@@ -749,9 +781,10 @@
/**
- * Set the corpus title of the text as a string.
- *
- * @param corpusTitle The corpus title of the text as a string.
+ * Set the corpus title of the text as a string.
+ *
+ * @param corpusTitle
+ * The corpus title of the text as a string.
*/
public void setCorpusTitle (String corpusTitle) {
this.corpusTitle = corpusTitle;
@@ -759,8 +792,8 @@
/**
- * Get the corpus subtitle of the text as a string.
- *
+ * Get the corpus subtitle of the text as a string.
+ *
* @return The corpus subtitle of the text as a string.
*/
public String getCorpusSubTitle () {
@@ -769,10 +802,11 @@
/**
- * Set the corpus subtitle of the text as a string.
- *
- * @param corpusSubTitle The corpus subtitle of the
- * text as a string.
+ * Set the corpus subtitle of the text as a string.
+ *
+ * @param corpusSubTitle
+ * The corpus subtitle of the
+ * text as a string.
*/
public void setCorpusSubTitle (String corpusSubTitle) {
this.corpusSubTitle = corpusSubTitle;
@@ -780,8 +814,8 @@
/**
- * Get the corpus author of the text as a string.
- *
+ * Get the corpus author of the text as a string.
+ *
* @return The corpus author of the text as a string.
*/
public String getCorpusAuthor () {
@@ -790,8 +824,8 @@
/**
- * Set the corpus author of the text as a string.
- *
+ * Set the corpus author of the text as a string.
+ *
* @return The corpus author of the text as a string.
*/
public void setCorpusAuthor (String corpusAuthor) {
@@ -800,8 +834,8 @@
/**
- * Get the corpus editor of the text as a string.
- *
+ * Get the corpus editor of the text as a string.
+ *
* @return The corpus editor of the text as a string.
*/
public String getCorpusEditor () {
@@ -810,9 +844,10 @@
/**
- * Set the corpus editor of the text as a string.
- *
- * @param corpusEditor The corpus editor of the text as a string.
+ * Set the corpus editor of the text as a string.
+ *
+ * @param corpusEditor
+ * The corpus editor of the text as a string.
*/
public void setCorpusEditor (String corpusEditor) {
this.corpusEditor = corpusEditor;
@@ -820,18 +855,20 @@
/**
- * Get the document title of the text as a string.
- *
+ * Get the document title of the text as a string.
+ *
* @return The document title of the text as a string.
*/
public String getDocTitle () {
return this.docTitle;
};
+
/**
- * Set the document title of the text as a string.
- *
- * @param docTitle The document title of the text as a string.
+ * Set the document title of the text as a string.
+ *
+ * @param docTitle
+ * The document title of the text as a string.
*/
public void setDocTitle (String docTitle) {
this.docTitle = docTitle;
@@ -839,8 +876,8 @@
/**
- * Get the subtitle of the document of the text as a string.
- *
+ * Get the subtitle of the document of the text as a string.
+ *
* @return The subtitle of the document of the text as a string.
*/
public String getDocSubTitle () {
@@ -849,10 +886,11 @@
/**
- * Set the subtitle of the document of the text as a string.
- *
- * @param docSubTitle The subtitle of the document of the
- * text as a string.
+ * Set the subtitle of the document of the text as a string.
+ *
+ * @param docSubTitle
+ * The subtitle of the document of the
+ * text as a string.
*/
public void setDocSubTitle (String docSubTitle) {
this.docSubTitle = docSubTitle;
@@ -860,8 +898,8 @@
/**
- * Get the author of the document of the text as a string.
- *
+ * Get the author of the document of the text as a string.
+ *
* @return The author of the document of the text as a string.
*/
public String getDocAuthor () {
@@ -870,9 +908,10 @@
/**
- * Set the author of the document of the text as a string.
- *
- * @param docAuthor The author of the document of the text as a string.
+ * Set the author of the document of the text as a string.
+ *
+ * @param docAuthor
+ * The author of the document of the text as a string.
*/
public void setDocAuthor (String docAuthor) {
this.docAuthor = docAuthor;
@@ -880,8 +919,8 @@
/**
- * Get the editor of the document of the text as a string.
- *
+ * Get the editor of the document of the text as a string.
+ *
* @return The editor of the document of the text as a string.
*/
public String getDocEditor () {
@@ -890,9 +929,10 @@
/**
- * Set the editor of the document of the text as a string.
- *
- * @param docEditor The editor of the document of the text as a string.
+ * Set the editor of the document of the text as a string.
+ *
+ * @param docEditor
+ * The editor of the document of the text as a string.
*/
public void setDocEditor (String docEditor) {
this.docEditor = docEditor;
@@ -900,8 +940,8 @@
/**
- * Get the keywords of the text as a string.
- *
+ * Get the keywords of the text as a string.
+ *
* @return The keywords of the text as a string.
*/
public String getKeywords () {
@@ -910,9 +950,10 @@
/**
- * Set the keywords of the text as a string.
- *
- * @param keywords The keywords of the text as a string.
+ * Set the keywords of the text as a string.
+ *
+ * @param keywords
+ * The keywords of the text as a string.
*/
public void setKeywords (String keywords) {
this.keywords = keywords;
@@ -921,8 +962,8 @@
/**
* Get information about the source of tokenization
- * as a string.
- *
+ * as a string.
+ *
* @return The tokenization information as a string.
*/
public String getTokenSource () {
@@ -932,9 +973,10 @@
/**
* Set information about the source of tokenization
- * as a string.
- *
- * @param tokenSource The tokenization information as a string.
+ * as a string.
+ *
+ * @param tokenSource
+ * The tokenization information as a string.
*/
public void setTokenSource (String tokenSource) {
this.tokenSource = tokenSource;
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 70d183f..dc16ccf 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -30,7 +30,7 @@
* FieldDocument represents a simple API to create documents
* for storing with KrillIndex. <i>Field</i> in the name resembles
* the meaning of Lucene index fields.
- *
+ *
* @author diewald
*/
@JsonIgnoreProperties(ignoreUnknown = true)
@@ -40,9 +40,9 @@
public Document doc = new Document();
- private FieldType tvField = new FieldType(TextField.TYPE_STORED);
+ private FieldType tvField = new FieldType(TextField.TYPE_STORED);
private FieldType tvNoField = new FieldType(TextField.TYPE_NOT_STORED);
- private FieldType keywords = new FieldType(TextField.TYPE_STORED);
+ private FieldType keywords = new FieldType(TextField.TYPE_STORED);
{
tvField.setStoreTermVectors(true);
@@ -62,71 +62,86 @@
keywords.setIndexOptions(IndexOptions.DOCS_ONLY);
};
+
// see http://www.cowtowncoder.com/blog/archives/2011/07/entry_457.html
public void addInt (String key, int value) {
doc.add(new IntField(key, value, Field.Store.YES));
};
+
public void addInt (String key, String value) {
this.addInt(key, Integer.parseInt(value));
};
+
public void addText (String key, String value) {
doc.add(new TextField(key, value, Field.Store.YES));
};
+
public void addKeyword (String key, String value) {
doc.add(new Field(key, value, keywords));
};
+
public void addString (String key, String value) {
doc.add(new StringField(key, value, Field.Store.YES));
};
+
public void addStored (String key, String value) {
doc.add(new StoredField(key, value));
};
+
public void addStored (String key, int value) {
doc.add(new StoredField(key, value));
};
+
public void addTV (String key, String value, String tsString) {
this.addTV(key, value, new MultiTermTokenStream(tsString));
};
+
public void addTV (String key, String tsString) {
this.addTV(key, new MultiTermTokenStream(tsString));
};
+
public void addTV (String key, String value, MultiTermTokenStream ts) {
- Field textField = new Field( key, value, tvField );
- textField.setTokenStream( ts );
+ Field textField = new Field(key, value, tvField);
+ textField.setTokenStream(ts);
doc.add(textField);
};
+
public void addTV (String key, MultiTermTokenStream ts) {
- Field textField = new Field( key, ts, tvNoField );
+ Field textField = new Field(key, ts, tvNoField);
doc.add(textField);
};
+
public String toString () {
return doc.toString();
};
+
public MultiTermTokenStream newMultiTermTokenStream (String ts) {
return new MultiTermTokenStream(ts);
};
+
public MultiTermTokenStream newMultiTermTokenStream () {
return new MultiTermTokenStream();
};
+
/**
* Deserialize token stream data.
*/
- public void setData (Map<String,Object> node) {
+ public void setData (Map<String, Object> node) {
this.setPrimaryData((String) node.get("text"));
String fieldName = (String) node.get("name");
@@ -134,7 +149,8 @@
MultiTermTokenStream mtts = this.newMultiTermTokenStream();
// Iterate over all tokens in stream
- for (ArrayList<String> token : (ArrayList<ArrayList<String>>) node.get("stream")) {
+ for (ArrayList<String> token : (ArrayList<ArrayList<String>>) node
+ .get("stream")) {
try {
// Initialize MultiTermToken
@@ -143,7 +159,7 @@
// Add rest of the list
for (String term : token) {
mtt.add(term);
- };
+ };
// Add MultiTermToken to stream
mtts.addMultiTermToken(mtt);
@@ -170,20 +186,22 @@
this.setTokenSource((String) node.get("tokenSource"));
};
+
/**
* Deserialize token stream data (LEGACY).
*/
public void setFields (ArrayList<Map<String, Object>> fields) {
-
- Map<String,Object> primary = fields.remove(0);
+
+ Map<String, Object> primary = fields.remove(0);
this.setPrimaryData((String) primary.get("primaryData"));
- for (Map<String,Object> field : fields) {
+ for (Map<String, Object> field : fields) {
String fieldName = (String) field.get("name");
MultiTermTokenStream mtts = this.newMultiTermTokenStream();
- for (ArrayList<String> token : (ArrayList<ArrayList<String>>) field.get("data")) {
+ for (ArrayList<String> token : (ArrayList<ArrayList<String>>) field
+ .get("data")) {
try {
MultiTermToken mtt = new MultiTermToken(token.remove(0));
@@ -219,36 +237,42 @@
};
};
+
@Override
public void setTextClass (String textClass) {
super.setTextClass(textClass);
this.addKeyword("textClass", textClass);
};
+
@Override
public void setTitle (String title) {
super.setTitle(title);
this.addText("title", title);
};
+
@Override
public void setSubTitle (String subTitle) {
super.setSubTitle(subTitle);
this.addText("subTitle", subTitle);
};
+
@Override
public void setAuthor (String author) {
super.setAuthor(author);
this.addText("author", author);
};
+
@Override
public void setPubPlace (String pubPlace) {
super.setPubPlace(pubPlace);
this.addString("pubPlace", pubPlace);
};
+
@JsonProperty("pubDate")
@Override
public KrillDate setPubDate (String pubDate) {
@@ -257,6 +281,7 @@
return date;
};
+
@JsonProperty("creationDate")
@Override
public KrillDate setCreationDate (String creationDate) {
@@ -265,6 +290,7 @@
return date;
};
+
// No longer supported
@Override
public void setCorpusID (String corpusID) {
@@ -272,6 +298,7 @@
this.addString("corpusID", corpusID);
};
+
// No longer supported
@Override
public void setID (String ID) {
@@ -279,12 +306,14 @@
this.addString("ID", ID);
};
+
@Override
public void setUID (int ID) {
super.setUID(ID);
this.addString("UID", new Integer(ID).toString());
};
+
// No longer supported
@Override
public void setLayerInfo (String layerInfo) {
@@ -292,168 +321,196 @@
this.addStored("layerInfo", layerInfo);
};
+
@Override
public void setLayerInfos (String layerInfos) {
super.setLayerInfos(layerInfos);
this.addStored("layerInfos", layerInfos);
};
+
@Override
public void setTextSigle (String textSigle) {
super.setTextSigle(textSigle);
this.addString("textSigle", textSigle);
};
+
@Override
public void setDocSigle (String docSigle) {
super.setDocSigle(docSigle);
this.addString("docSigle", docSigle);
};
+
@Override
public void setCorpusSigle (String corpusSigle) {
super.setCorpusSigle(corpusSigle);
this.addString("corpusSigle", corpusSigle);
};
+
@Override
public void setPublisher (String publisher) {
super.setPublisher(publisher);
this.addStored("publisher", publisher);
};
+
@Override
public void setEditor (String editor) {
super.setEditor(editor);
this.addStored("editor", editor);
};
+
@Override
public void setTextType (String textType) {
super.setTextType(textType);
this.addString("textType", textType);
};
+
@Override
public void setTextTypeArt (String textTypeArt) {
super.setTextTypeArt(textTypeArt);
this.addString("textTypeArt", textTypeArt);
};
+
@Override
public void setTextTypeRef (String textTypeRef) {
super.setTextTypeRef(textTypeRef);
this.addString("textTypeRef", textTypeRef);
};
+
@Override
public void setTextColumn (String textColumn) {
super.setTextColumn(textColumn);
this.addString("textColumn", textColumn);
};
+
@Override
public void setTextDomain (String textDomain) {
super.setTextDomain(textDomain);
this.addString("textDomain", textDomain);
};
+
@Override
public void setLicense (String license) {
super.setLicense(license);
this.addString("license", license);
};
+
@Override
public void setPages (String pages) {
super.setPages(pages);
this.addStored("pages", pages);
};
+
@Override
public void setFileEditionStatement (String fileEditionStatement) {
super.setFileEditionStatement(fileEditionStatement);
this.addStored("fileEditionStatement", fileEditionStatement);
};
+
@Override
public void setBiblEditionStatement (String biblEditionStatement) {
super.setBiblEditionStatement(biblEditionStatement);
this.addStored("biblEditionStatement", biblEditionStatement);
};
+
@Override
public void setReference (String reference) {
super.setReference(reference);
this.addStored("reference", reference);
};
+
@Override
public void setLanguage (String language) {
super.setLanguage(language);
this.addString("language", language);
};
+
@Override
public void setDocTitle (String docTitle) {
super.setDocTitle(docTitle);
this.addText("docTitle", docTitle);
};
+
@Override
public void setDocSubTitle (String docSubTitle) {
super.setDocSubTitle(docSubTitle);
this.addText("docSubTitle", docSubTitle);
};
+
@Override
public void setDocAuthor (String docAuthor) {
super.setDocAuthor(docAuthor);
this.addText("docAuthor", docAuthor);
};
+
@Override
public void setDocEditor (String docEditor) {
super.setDocEditor(docEditor);
this.addStored("docEditor", docEditor);
};
+
@Override
public void setCorpusTitle (String corpusTitle) {
super.setCorpusTitle(corpusTitle);
this.addText("corpusTitle", corpusTitle);
};
+
@Override
public void setCorpusSubTitle (String corpusSubTitle) {
super.setCorpusSubTitle(corpusSubTitle);
this.addText("corpusSubTitle", corpusSubTitle);
};
+
@Override
public void setCorpusAuthor (String corpusAuthor) {
super.setCorpusAuthor(corpusAuthor);
this.addText("corpusAuthor", corpusAuthor);
};
+
@Override
public void setCorpusEditor (String corpusEditor) {
super.setCorpusEditor(corpusEditor);
this.addStored("corpusEditor", corpusEditor);
};
+
@Override
public void setKeywords (String keywords) {
super.setKeywords(keywords);
this.addKeyword("keywords", keywords);
};
+
@Override
public void setTokenSource (String tokenSource) {
super.setTokenSource(tokenSource);
this.addStored("tokenSource", tokenSource);
};
+
@Override
public void setFoundries (String foundries) {
super.setFoundries(foundries);
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 5af92e5..6eb8099 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -1,4 +1,5 @@
package de.ids_mannheim.korap.index;
+
import java.util.*;
import java.io.*;
import org.apache.lucene.store.MMapDirectory;
@@ -12,7 +13,7 @@
* is using the standalone server system,
* this tool may be more suitable for your needs
* (especially as it is way faster).
- *
+ *
* Usage: java -jar Krill-X.XX.jar [propfile] [directories]*
*/
public class Indexer {
@@ -22,25 +23,26 @@
int commitCount;
// Init logger
- private final static Logger log =
- LoggerFactory.getLogger(KrillIndex.class);
+ private final static Logger log = LoggerFactory.getLogger(KrillIndex.class);
/**
* Construct a new indexer object.
- *
- * @param prop A {@link Properties} object with
- * at least the following information:
- * <tt>krill.indexDir</tt>.
+ *
+ * @param prop
+ * A {@link Properties} object with
+ * at least the following information:
+ * <tt>krill.indexDir</tt>.
* @throws IOException
*/
public Indexer (Properties prop) throws IOException {
this.indexDir = prop.getProperty("krill.indexDir");
System.out.println("Index to " + this.indexDir);
-
+
// Default to 1000 documents till the next commit
- String commitCount = prop.getProperty("krill.index.commit.count", "1000");
+ String commitCount = prop.getProperty("krill.index.commit.count",
+ "1000");
// Create a new index object based on the directory
this.index = new KrillIndex(new MMapDirectory(new File(indexDir)));
@@ -48,11 +50,13 @@
this.commitCount = Integer.parseInt(commitCount);
};
+
/**
* Parse a directory for document files.
- *
- * @param dir The {@link File} directory containing
- * documents to index.
+ *
+ * @param dir
+ * The {@link File} directory containing
+ * documents to index.
*/
public void parse (File dir) {
for (String file : dir.list()) {
@@ -80,6 +84,7 @@
};
};
+
/**
* Commit changes to the index.
*/
@@ -95,12 +100,14 @@
System.out.println("done.");
};
+
/**
* Main method.
- *
- * @param argv Argument list,
- * expecting the properties file
- * and a list of directories
+ *
+ * @param argv
+ * Argument list,
+ * expecting the properties file
+ * and a list of directories
* @throws IOException
*/
public static void main (String[] argv) throws IOException {
@@ -110,11 +117,9 @@
if (argv.length < 2) {
String jar = new File(Indexer.class.getProtectionDomain()
- .getCodeSource()
- .getLocation()
- .getPath()).getName();
- System.out.println("Usage: java -jar " + jar +
- " [propfile] [directories]*");
+ .getCodeSource().getLocation().getPath()).getName();
+ System.out.println("Usage: java -jar " + jar
+ + " [propfile] [directories]*");
return;
};
@@ -129,7 +134,7 @@
System.out.println();
// Iterate over list of directories
- for (String arg : Arrays.copyOfRange(argv, 1, argv.length)) {
+ for (String arg : Arrays.copyOfRange(argv, 1, argv.length)) {
File f = new File(arg);
if (f.isDirectory())
ki.parse(f);
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index 33a81cf..1f74f93 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -13,30 +13,36 @@
* Don't use ByteBuffer!
*/
/**
- * A MultiTerm represents a single term (e.g. a word, an annotation, a relation)
+ * A MultiTerm represents a single term (e.g. a word, an annotation, a
+ * relation)
* that can be part of a MultiTermToken.
- *
- * A MultiTerm consists of a term representation string, optional character offset
- * information that matches the term to the character stream of the input text,
+ *
+ * A MultiTerm consists of a term representation string, optional
+ * character offset
+ * information that matches the term to the character stream of the
+ * input text,
* and an arbitrary payload.
- *
+ *
* There is a simple string representation of MultiTerms supported:
* The string is the first sequence of characters.
- * Offsets are written as an appended and dash separated pair of integers.
+ * Offsets are written as an appended and dash separated pair of
+ * integers.
* Payloads are written following a dollar sign.
- * Payload segments can be typed as being a short (s), an integer (i), or a long (l)
+ * Payload segments can be typed as being a short (s), an integer (i),
+ * or a long (l)
* value in leading angular brackets.
- * All other (untyped) payloads are treated as being UTF-8 characer sequences.
- *
+ * All other (untyped) payloads are treated as being UTF-8 character
+ * sequences.
+ *
* <blockquote><pre>
- * MultiTerm test1 = new MultiTerm("test");
- * MultiTerm test2 = new MultiTerm("test#0-4");
- * MultiTerm test3 = new MultiTerm("test#0-4$Example");
- * MultiTerm test4 = new MultiTerm("test#0-4$<i>1278");
+ * MultiTerm test1 = new MultiTerm("test");
+ * MultiTerm test2 = new MultiTerm("test#0-4");
+ * MultiTerm test3 = new MultiTerm("test#0-4$Example");
+ * MultiTerm test4 = new MultiTerm("test#0-4$<i>1278");
* </pre></blockquote>
- *
+ *
* <strong>Warning</strong>: Strings that are malformed fail silently!
- *
+ *
* @author diewald
*/
public class MultiTerm implements Comparable<MultiTerm> {
@@ -52,7 +58,8 @@
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
- private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+ private final Logger log = LoggerFactory
+ .getLogger(MultiTermTokenStream.class);
/**
@@ -65,28 +72,31 @@
/**
* Construct a new MultiTerm object.
- *
- * @param term The term surface (see synopsis).
+ *
+ * @param term
+ * The term surface (see synopsis).
*/
public MultiTerm (String term) throws CorpusDataException {
_fromString(term);
};
-
+
/**
* Construct a new MultiTerm object.
- *
+ *
* In addition to the normal surface representation,
* this supports a prefix notation.
* The following expressions are equal:
- *
+ *
* <blockquote><pre>
- * MultiTerm test1 = new MultiTerm('a', "bcd");
- * MultiTerm test2 = new MultiTerm("a:bcd");
+ * MultiTerm test1 = new MultiTerm('a', "bcd");
+ * MultiTerm test2 = new MultiTerm("a:bcd");
* </pre></blockquote>
- *
- * @param prefix A special prefix for the term.
- * @param term The term surface (see synopsis).
+ *
+ * @param prefix
+ * A special prefix for the term.
+ * @param term
+ * The term surface (see synopsis).
*/
public MultiTerm (char prefix, String term) throws CorpusDataException {
StringBuilder sb = new StringBuilder();
@@ -96,18 +106,19 @@
/**
* Get the term value of the MultiTerm.
- *
+ *
* @return The term as a string.
*/
public String getTerm () {
return this.term;
};
-
-
+
+
/**
* Set the term value of the MultiTerm.
- *
- * @param term The term as a string.
+ *
+ * @param term
+ * The term as a string.
* @return The {@link MultIterm} object for chaining.
*/
public MultiTerm setTerm (String term) {
@@ -118,66 +129,71 @@
/**
* Get the payload.
- *
+ *
* @return The payload as a BytesRef.
*/
public BytesRef getPayload () {
return this.payload;
};
-
+
/**
* Set the payload as a {@link Byte} value.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (Byte pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
+ this.payload = new BytesRef(ByteBuffer.allocate(1).put(pl).array());
return this;
};
-
+
/**
* Set the payload as a short value.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (short pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
+ this.payload = new BytesRef(ByteBuffer.allocate(2).putShort(pl).array());
return this;
};
/**
* Set the payload as an integer value.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (int pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
+ this.payload = new BytesRef(ByteBuffer.allocate(4).putInt(pl).array());
return this;
};
-
+
/**
* Set the payload as a long value.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (long pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
+ this.payload = new BytesRef(ByteBuffer.allocate(8).putLong(pl).array());
return this;
};
/**
* Set the payload as a string value.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (String pl) {
@@ -188,8 +204,9 @@
/**
* Set the payload as a byte array.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (byte[] pl) {
@@ -200,8 +217,9 @@
/**
* Set the payload as a {@link BytesRef} object.
- *
- * @param pl The payload.
+ *
+ * @param pl
+ * The payload.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setPayload (BytesRef pl) {
@@ -212,7 +230,7 @@
/**
* Get the start position.
- *
+ *
* @return The start position.
*/
public int getStart () {
@@ -222,8 +240,9 @@
/**
* Set the start position.
- *
- * @param start The start position.
+ *
+ * @param start
+ * The start position.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setStart (int start) {
@@ -234,7 +253,7 @@
/**
* Get the end position.
- *
+ *
* @return The end position.
*/
public int getEnd () {
@@ -244,8 +263,9 @@
/**
* Set the end position.
- *
- * @param end The end position.
+ *
+ * @param end
+ * The end position.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm setEnd (int end) {
@@ -256,7 +276,7 @@
/**
* Check if there are offsets stored.
- *
+ *
* @return Boolean value indicating that the term
* contains stored offsets.
*/
@@ -267,9 +287,10 @@
/**
* Set the flag for stored offsets, in case they are relevant.
- *
- * @param value Boolean value indicating that the term
- * contains stored offsets.
+ *
+ * @param value
+ * Boolean value indicating that the term
+ * contains stored offsets.
* @return The {@link MultiTerm} object for chaining.
*/
public MultiTerm hasStoredOffsets (boolean value) {
@@ -283,16 +304,13 @@
* Offsets are attached following a hash sign,
* payloads are attached following a dollar sign.
* All payloads are written as UTF-8 character sequences.
- *
+ *
* @see #toStringShort().
*/
public String toString () {
StringBuilder sb = new StringBuilder(this.term);
if (this.start != this.end) {
- sb.append('#')
- .append(this.start)
- .append('-')
- .append(this.end);
+ sb.append('#').append(this.start).append('-').append(this.end);
};
if (this.payload != null) {
@@ -301,14 +319,15 @@
sb.append(this.payload.utf8ToString());
}
catch (AssertionError e) {
- sb.append("<?>")
- .append(this.payload.toString().replace(' ', ','));
+ sb.append("<?>").append(
+ this.payload.toString().replace(' ', ','));
};
};
return sb.toString();
};
+
@Override
public int compareTo (MultiTerm o) {
if (this.payload == null || o.payload == null)
@@ -330,7 +349,7 @@
* Payloads are attached following a dollar sign.
* All payloads are written as UTF-8 character sequences.
* Offsets are neglected.
- *
+ *
* Offsets are ignored.
*
* @see #toString().
@@ -343,8 +362,8 @@
sb.append(this.payload.utf8ToString());
}
catch (AssertionError e) {
- sb.append("<?>")
- .append(this.payload.toString().replace(' ', ','));
+ sb.append("<?>").append(
+ this.payload.toString().replace(' ', ','));
};
};
return sb.toString();
@@ -377,32 +396,32 @@
// Resize the bytebuffer
if ((bb.capacity() - l) < 8) {
- bb = ByteBuffer.allocate(bb.capacity() + 8).
- put(bb.array());
+ bb = ByteBuffer.allocate(bb.capacity() + 8).put(
+ bb.array());
bb.position(l);
};
switch (pls[i]) {
- case "<b>": // byte
- bb.put(Byte.parseByte(pls[i+1]));
- l++;
- break;
- case "<s>": // short
- bb.putShort(Short.parseShort(pls[i+1]));
- l+=2;
- break;
- case "<i>": // integer
- bb.putInt(Integer.parseInt(pls[i+1]));
- l+=4;
- break;
- case "<l>": // long
- bb.putLong(Long.parseLong(pls[i+1]));
- l+=8;
- break;
+ case "<b>": // byte
+ bb.put(Byte.parseByte(pls[i + 1]));
+ l++;
+ break;
+ case "<s>": // short
+ bb.putShort(Short.parseShort(pls[i + 1]));
+ l += 2;
+ break;
+ case "<i>": // integer
+ bb.putInt(Integer.parseInt(pls[i + 1]));
+ l += 4;
+ break;
+ case "<l>": // long
+ bb.putLong(Long.parseLong(pls[i + 1]));
+ l += 8;
+ break;
};
- i+=2;
+ i += 2;
};
-
+
byte[] bytes = new byte[l];
System.arraycopy(bb.array(), 0, bytes, 0, l);
this.payload = new BytesRef(bytes);
@@ -418,7 +437,7 @@
this.payload = new BytesRef(payloadStr);
};
};
-
+
// Parse offset information
stringOffset = termSurface[0].split("\\#", 2);
@@ -426,26 +445,22 @@
// Split start and end position of the offset
String[] offset = stringOffset[1].split("\\-", 2);
-
+
// Start and end is given
if (offset.length == 2 && offset[0].length() > 0) {
try {
this.start = Integer.parseInt(offset[0]);
- this.end = Integer.parseInt(offset[1]);
-
+ this.end = Integer.parseInt(offset[1]);
+
}
catch (NumberFormatException e) {
- throw new CorpusDataException(
- 952,
- "Given offset information is not numeric"
- );
+ throw new CorpusDataException(952,
+ "Given offset information is not numeric");
};
}
else {
- throw new CorpusDataException(
- 953,
- "Given offset information is incomplete"
- );
+ throw new CorpusDataException(953,
+ "Given offset information is incomplete");
};
};
this.term = stringOffset[0];
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
index c3ef720..68ba827 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
@@ -10,18 +10,18 @@
/**
- *
+ *
* A MultiTermToken represents a set of {@link MultiTerm MultiTerms}
* starting at the same position, i.e. represents a segment
* in a {@link MultiTermTokenStream}.
- *
+ *
* <blockquote><pre>
- * MultiTermToken mtt = new MultiTermToken("t:test", "a:abbruch");
- * mtt.add("b:banane");
- * System.err.println(mtt.toString());
- * // [t:test|a:abbruch|b:banane]
+ * MultiTermToken mtt = new MultiTermToken("t:test", "a:abbruch");
+ * mtt.add("b:banane");
+ * System.err.println(mtt.toString());
+ * // [t:test|a:abbruch|b:banane]
* </pre></blockquote>
- *
+ *
* @author diewald
*/
public class MultiTermToken {
@@ -31,18 +31,22 @@
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
- private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+ private final Logger log = LoggerFactory
+ .getLogger(MultiTermTokenStream.class);
+
/**
* Construct a new MultiTermToken by passing a stream of
* {@link MultiTerm MultiTerms}.
- *
- * @param terms Take at least one {@link MultiTerm} object for a token.
+ *
+ * @param terms
+ * Take at least one {@link MultiTerm} object for a
+ * token.
*/
public MultiTermToken (MultiTerm terms, MultiTerm ... moreTerms) {
this.terms = new ArrayList<MultiTerm>(16);
-
- this.terms.add( terms );
+
+ this.terms.add(terms);
// Further elements on same position
for (i = 0; i < moreTerms.length; i++) {
@@ -54,9 +58,11 @@
/**
* Construct a new MultiTermToken by passing a {@link MultiTerm}
* represented as a prefixed string.
- *
- * @param prefix The term prefix.
- * @param surface The term surface.
+ *
+ * @param prefix
+ * The term prefix.
+ * @param surface
+ * The term surface.
* @see MultiTerm
*/
public MultiTermToken (char prefix, String surface) {
@@ -67,32 +73,35 @@
MultiTerm term = new MultiTerm(prefix, surface);
// First word element
- terms.add( term );
+ terms.add(term);
}
catch (CorpusDataException cde) {
log.error("{}: {}", cde.getErrorCode(), cde.getMessage());
};
};
-
+
/**
* Construct a new MultiTermToken by passing a stream of
* {@link MultiTerm MultiTerms} represented as strings.
- *
- * @param terms Take at least one {@link MultiTerm} string for a token.
+ *
+ * @param terms
+ * Take at least one {@link MultiTerm} string for a
+ * token.
*/
- public MultiTermToken (String terms, String ... moreTerms) throws CorpusDataException {
+ public MultiTermToken (String terms, String ... moreTerms)
+ throws CorpusDataException {
this.terms = new ArrayList<MultiTerm>(16);
MultiTerm term = new MultiTerm(terms);
try {
// First word element
- this.terms.add( term );
+ this.terms.add(term);
// Further elements on same position
for (i = 0; i < moreTerms.length; i++) {
- term = new MultiTerm( moreTerms[i] );
+ term = new MultiTerm(moreTerms[i]);
this.terms.add(term);
};
}
@@ -101,11 +110,12 @@
};
};
-
+
/**
* Add a new {@link MultiTerm} to the MultiTermToken.
- *
- * @param term A {@link MultiTerm} object.
+ *
+ * @param term
+ * A {@link MultiTerm} object.
* @return The {@link MultiTermToken} object for chaining.
*/
public MultiTermToken add (MultiTerm term) {
@@ -117,8 +127,9 @@
/**
* Add a new {@link MultiTerm} to the MultiTermToken.
- *
- * @param term A MultiTerm represented as a surface string.
+ *
+ * @param term
+ * A MultiTerm represented as a surface string.
* @return The {@link MultiTermToken} object for chaining.
*/
public MultiTermToken add (String term) throws CorpusDataException {
@@ -138,9 +149,11 @@
/**
* Add a new {@link MultiTerm} to the MultiTermToken.
- *
- * @param prefix A MultiTerm prefix.
- * @param term A MultiTerm represented as a surface string.
+ *
+ * @param prefix
+ * A MultiTerm prefix.
+ * @param term
+ * A MultiTerm represented as a surface string.
* @return The {@link MultiTermToken} object for chaining.
*/
public MultiTermToken add (char prefix, String term) {
@@ -160,9 +173,10 @@
/**
* Get a {@link MultiTerm} by index.
- *
- * @param index The index position of a {@link MultiTerm}
- * in the {@link MultiTermToken}.
+ *
+ * @param index
+ * The index position of a {@link MultiTerm} in the
+ * {@link MultiTermToken}.
* @return A {@link MultiTerm}.
*/
public MultiTerm get (int index) {
@@ -171,11 +185,11 @@
/**
- * Get the number of {@link MultiTerm MultiTerms}
- * in the MultiTermToken.
- *
- * @return The number of {@link MultiTerm MultiTerms}
- * in the MultiTermToken.
+ * Get the number of {@link MultiTerm MultiTerms} in the
+ * MultiTermToken.
+ *
+ * @return The number of {@link MultiTerm MultiTerms} in the
+ * MultiTermToken.
*/
public int getSize () {
return this.terms.size();
@@ -184,7 +198,7 @@
/**
* Sort the {@link MultiTerm MultiTerms} in the correct order.
- *
+ *
* @return The {@link MultiTermToken} object for chaining.
*/
public MultiTermToken sort () {
@@ -199,7 +213,7 @@
/**
* Serialize the MultiTermToken to a string.
- *
+ *
* @return A string representation of the MultiTermToken,
* with leading offset information.
*/
@@ -211,7 +225,7 @@
sb.append(this.terms.get(i).toString()).append('|');
};
sb.append(this.terms.get(i).toString()).append(']');
-
+
return sb.toString();
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index bae942a..de974c3 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -26,15 +26,15 @@
*/
/**
- * MultiTermTokenStream extends Lucenes {@link TokenStream}
- * to work with {@link MultiTermToken MultiTermTokens}.
- *
+ * MultiTermTokenStream extends Lucene's {@link TokenStream} to work
+ * with {@link MultiTermToken MultiTermTokens}.
+ *
* <blockquote><pre>
- * MultiTermTokenStream mtts = new MultiTermTokenStream(
- * "[s:den#0-3|i:den|p:DET|l:der|m:c:acc|m:n:sg|m:masc]"
- * );
+ * MultiTermTokenStream mtts = new MultiTermTokenStream(
+ * "[s:den#0-3|i:den|p:DET|l:der|m:c:acc|m:n:sg|m:masc]"
+ * );
* </pre></blockquote>
- *
+ *
* @author diewald
* @see TokenStream
*/
@@ -43,33 +43,35 @@
private PositionIncrementAttribute posIncrAttr;
private PayloadAttribute payloadAttr;
- private static final Pattern pattern =
- Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
+ private static final Pattern pattern = Pattern
+ .compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
- private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+ private final Logger log = LoggerFactory
+ .getLogger(MultiTermTokenStream.class);
private List<MultiTermToken> multiTermTokens;
- private int mttIndex = 0,
- mtIndex = 0;
+ private int mttIndex = 0, mtIndex = 0;
private short i = 0;
+
/**
* Construct a new MultiTermTokenStream object.
*/
public MultiTermTokenStream () {
- this.charTermAttr = this.addAttribute(CharTermAttribute.class);
- this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
- this.payloadAttr = this.addAttribute(PayloadAttribute.class);
+ this.charTermAttr = this.addAttribute(CharTermAttribute.class);
+ this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
+ this.payloadAttr = this.addAttribute(PayloadAttribute.class);
this.multiTermTokens = new ArrayList<MultiTermToken>(100);
};
/**
* Construct a new MultiTermTokenStream object
- *
- * @param stream The stream as a string representation.
+ *
+ * @param stream
+ * The stream as a string representation.
*/
public MultiTermTokenStream (String stream) {
this();
@@ -85,13 +87,14 @@
/**
* Construct a new MultiTermTokenStream object
- *
- * @param stream The stream as a {@link Reader} object.
+ *
+ * @param stream
+ * The stream as a {@link Reader} object.
* @throws IOException
*/
public MultiTermTokenStream (Reader stream) throws IOException {
this();
-
+
StringBuilder sb = new StringBuilder(4096);
char[] buf = new char[128];
@@ -111,8 +114,9 @@
/**
* Append a {@link MultiTermToken} to the MultiTermTokenStream.
- *
- * @param mtt A {@link MultiTermToken}.
+ *
+ * @param mtt
+ * A {@link MultiTermToken}.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMultiTermToken (MultiTermToken mtt) {
@@ -124,12 +128,13 @@
/**
* Append a {@link MultiTermToken} to the MultiTermTokenStream
* by means of a set of {@link MultiTerm MultiTerms}.
- *
- * @param mts A list of {@link MultiTerm} objects.
+ *
+ * @param mts
+ * A list of {@link MultiTerm} objects.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
- public MultiTermTokenStream addMultiTermToken
- (MultiTerm mts, MultiTerm ... moreTerms) {
+ public MultiTermTokenStream addMultiTermToken (MultiTerm mts,
+ MultiTerm ... moreTerms) {
return this.addMultiTermToken(new MultiTermToken(mts, moreTerms));
};
@@ -137,9 +142,12 @@
/**
* Append a {@link MultiTermToken} to the MultiTermTokenStream
* by means of a single {@link MultiTerm} as a prefixed term.
- *
- * @param prefix A prefix character of a surface form of a {@link MultiTerm}.
- * @param surface A surface string of a {@link MultiTerm}.
+ *
+ * @param prefix
+ * A prefix character of a surface form of a
+ * {@link MultiTerm}.
+ * @param surface
+ * A surface string of a {@link MultiTerm}.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMultiTermToken (char prefix, String surface) {
@@ -151,12 +159,13 @@
* Append a {@link MultiTermToken} to the MultiTermTokenStream
* by means of {@link MultiTerm MultiTerm} represented as a set
* of terms represented as strings.
- *
- * @param surface At least one surface string of a {@link MultiTerm}.
+ *
+ * @param surface
+ * At least one surface string of a {@link MultiTerm}.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
- public MultiTermTokenStream addMultiTermToken
- (String surface, String ... moreTerms) {
+ public MultiTermTokenStream addMultiTermToken (String surface,
+ String ... moreTerms) {
try {
this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
}
@@ -168,14 +177,15 @@
-
/**
* Add meta information to the MultiTermTokenStream.
- *
+ *
* <strong>This is experimental!</strong>
- *
- * @param key A string for denoting the meta information.
- * @param value The value of the meta key as a string.
+ *
+ * @param key
+ * A string for denoting the meta information.
+ * @param value
+ * The value of the meta key as a string.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMeta (String key, String value) {
@@ -193,11 +203,13 @@
/**
* Add meta information to the MultiTermTokenStream.
- *
+ *
* <strong>This is experimental!</strong>
- *
- * @param key A string for denoting the meta information.
- * @param value The value of the meta key as a byte array.
+ *
+ * @param key
+ * A string for denoting the meta information.
+ * @param value
+ * The value of the meta key as a byte array.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMeta (String key, byte[] value) {
@@ -215,11 +227,13 @@
/**
* Add meta information to the MultiTermTokenStream.
- *
+ *
* <strong>This is experimental!</strong>
- *
- * @param key A string for denoting the meta information.
- * @param value The value of the meta key as a short value.
+ *
+ * @param key
+ * A string for denoting the meta information.
+ * @param value
+ * The value of the meta key as a short value.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMeta (String key, short value) {
@@ -237,11 +251,13 @@
/**
* Add meta information to the MultiTermTokenStream.
- *
+ *
* <strong>This is experimental!</strong>
- *
- * @param key A string for denoting the meta information.
- * @param value The value of the meta key as a long value.
+ *
+ * @param key
+ * A string for denoting the meta information.
+ * @param value
+ * The value of the meta key as a long value.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMeta (String key, long value) {
@@ -259,11 +275,13 @@
/**
* Add meta information to the MultiTermTokenStream.
- *
+ *
* <strong>This is experimental!</strong>
- *
- * @param key A string for denoting the meta information.
- * @param value The value of the meta key as a integer value.
+ *
+ * @param key
+ * A string for denoting the meta information.
+ * @param value
+ * The value of the meta key as an integer value.
* @return The {@link MultiTermTokenStream} object for chaining.
*/
public MultiTermTokenStream addMeta (String key, int value) {
@@ -282,9 +300,10 @@
/**
* Get a {@link MultiTermToken} by index.
- *
- * @param index The index position of a {@link MultiTermToken}
- * in the {@link MultiTermTokenStream}.
+ *
+ * @param index
+ * The index position of a {@link MultiTermToken} in
+ * the {@link MultiTermTokenStream}.
* @return A {@link MultiTermToken}.
*/
public MultiTermToken get (int index) {
@@ -293,11 +312,11 @@
/**
- * Get the number of {@link MultiTermToken MultiTermTokens}
- * in the stream.
- *
- * @return The number of {@link MultiTermToken MultiTermTokens}
- * in the stream.
+ * Get the number of {@link MultiTermToken MultiTermTokens} in the
+ * stream.
+ *
+ * @return The number of {@link MultiTermToken MultiTermTokens} in
+ * the stream.
*/
public int getSize () {
return this.multiTermTokens.size();
@@ -306,13 +325,13 @@
/**
* Serialize the MultiTermTokenStream to a string.
- *
+ *
* @return The MultiTermTokenStream as a string.
*/
public String toString () {
StringBuffer sb = new StringBuffer();
for (MultiTermToken mtt : this.multiTermTokens) {
- sb.append( mtt.toString() );
+ sb.append(mtt.toString());
};
return sb.toString();
};
@@ -324,7 +343,7 @@
while (matcher.find()) {
String[] seg = matcher.group(1).split("\\|");
- MultiTermToken mtt = new MultiTermToken( seg[0] );
+ MultiTermToken mtt = new MultiTermToken(seg[0]);
for (i = 1; i < seg.length; i++)
mtt.add(seg[i]);
@@ -349,7 +368,7 @@
};
// Get current token
- MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
+ MultiTermToken mtt = this.multiTermTokens.get(this.mttIndex);
// Sort the MultiTermToken
mtt.sort();
@@ -367,7 +386,7 @@
// Get last token
else {
- mtt = this.multiTermTokens.get( this.mttIndex );
+ mtt = this.multiTermTokens.get(this.mttIndex);
};
};
@@ -375,16 +394,17 @@
MultiTerm mt = mtt.terms.get(this.mtIndex);
// Set the relative position to the former term
- posIncrAttr.setPositionIncrement( this.mtIndex == 0 ? 1 : 0 );
+ posIncrAttr.setPositionIncrement(this.mtIndex == 0 ? 1 : 0);
charTermAttr.setEmpty();
- charTermAttr.append( mt.term );
+ charTermAttr.append(mt.term);
BytesRef payload = new BytesRef();
// There is offset information
if (mt.start != mt.end) {
if (DEBUG)
- log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
+ log.trace("MultiTerm with payload offset: {}-{}", mt.start,
+ mt.end);
// Add offsets to BytesRef payload
payload.append(new BytesRef(int2byte(mt.start)));
@@ -397,7 +417,7 @@
if (DEBUG)
log.trace("Create payload[1] {}", payload.toString());
};
-
+
// There is payload in the current token to index
if (payload.length > 0) {
payloadAttr.setPayload(payload);
@@ -413,7 +433,7 @@
sb.append('$').append(payload.toString());
sb.append(']');
sb.append(" with increment ").append(this.mtIndex == 0 ? 1 : 0);
-
+
log.trace(sb.toString());
};
@@ -421,9 +441,10 @@
return true;
};
+
@Override
public void reset () {
this.mttIndex = 0;
- this.mtIndex = 0;
+ this.mtIndex = 0;
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
index 3bd0d74..0440286 100644
--- a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
+++ b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
@@ -20,235 +20,239 @@
private AtomicReaderContext atomic;
private boolean processed = false;
private Integer[] pair;
- private static ByteBuffer bbOffset =
- ByteBuffer.allocate(8);
+ private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
HashSet<PositionsToOffsetArray> positions;
HashMap<PositionsToOffsetArray, Integer[]> offsets;
- private final static Logger log =
- LoggerFactory.getLogger(PositionsToOffset.class);
+ private final static Logger log = LoggerFactory
+ .getLogger(PositionsToOffset.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
private class PositionsToOffsetArray {
- public int docID;
- public int pos;
-
- public PositionsToOffsetArray (int docID, int pos) {
- this.docID = docID;
- this.pos = pos;
- };
+ public int docID;
+ public int pos;
- public int hashCode(){
- long hashCode;
- hashCode = (docID * Integer.MAX_VALUE) - Integer.MAX_VALUE + pos;
- return new Long(hashCode).hashCode();
- };
-
- public boolean equals(Object obj){
- if (obj instanceof PositionsToOffsetArray) {
- PositionsToOffsetArray ptoa = (PositionsToOffsetArray) obj;
- return (ptoa.docID == this.docID && ptoa.pos == this.pos);
- };
- return false;
- };
+
+ public PositionsToOffsetArray (int docID, int pos) {
+ this.docID = docID;
+ this.pos = pos;
+ };
+
+
+ public int hashCode () {
+ long hashCode;
+ hashCode = (docID * Integer.MAX_VALUE) - Integer.MAX_VALUE + pos;
+ return new Long(hashCode).hashCode();
+ };
+
+
+ public boolean equals (Object obj) {
+ if (obj instanceof PositionsToOffsetArray) {
+ PositionsToOffsetArray ptoa = (PositionsToOffsetArray) obj;
+ return (ptoa.docID == this.docID && ptoa.pos == this.pos);
+ };
+ return false;
+ };
};
+
public PositionsToOffset (AtomicReaderContext atomic, String field) {
- this.field = field;
- this.atomic = atomic;
- this.positions = new HashSet<>(64);
- this.offsets = new HashMap<>(64);
+ this.field = field;
+ this.atomic = atomic;
+ this.positions = new HashSet<>(64);
+ this.offsets = new HashMap<>(64);
};
+
public void clear () {
- this.positions.clear();
- this.offsets.clear();
- this.bbOffset.clear();
- this.processed = false;
+ this.positions.clear();
+ this.offsets.clear();
+ this.bbOffset.clear();
+ this.processed = false;
};
+
public void add (int docID, int pos) {
- this.add(new PositionsToOffsetArray(docID, pos));
+ this.add(new PositionsToOffsetArray(docID, pos));
};
+
public void add (PositionsToOffsetArray ptoa) {
- if (DEBUG)
- log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
- if (ptoa.pos < 0)
- return;
+ if (DEBUG)
+ log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
+ if (ptoa.pos < 0)
+ return;
- if (this.processed && this.exists(ptoa))
- return;
+ if (this.processed && this.exists(ptoa))
+ return;
- if (DEBUG)
- log.trace("Reopen processing");
+ if (DEBUG)
+ log.trace("Reopen processing");
- this.positions.add(ptoa);
- this.processed = false;
+ this.positions.add(ptoa);
+ this.processed = false;
};
+
public boolean exists (int docID, int pos) {
- return this.offsets.containsKey(new PositionsToOffsetArray(docID, pos));
+ return this.offsets.containsKey(new PositionsToOffsetArray(docID, pos));
};
+
public boolean exists (PositionsToOffsetArray ptoa) {
- return this.offsets.containsKey(ptoa);
+ return this.offsets.containsKey(ptoa);
};
+
public int start (int docID, int pos) {
- return this.start(new PositionsToOffsetArray(docID, pos));
+ return this.start(new PositionsToOffsetArray(docID, pos));
};
+
public int start (PositionsToOffsetArray ptoa) {
- if (ptoa.pos < 0)
- return 0;
+ if (ptoa.pos < 0)
+ return 0;
- if (!processed)
- this.offsets();
+ if (!processed)
+ this.offsets();
- Integer[] pair = this.offsets.get(ptoa);
+ Integer[] pair = this.offsets.get(ptoa);
- if (pair == null)
- return 0;
+ if (pair == null)
+ return 0;
- return pair[0];
+ return pair[0];
};
+
public int end (int docID, int pos) {
- return this.end(new PositionsToOffsetArray(docID, pos));
+ return this.end(new PositionsToOffsetArray(docID, pos));
};
+
public int end (PositionsToOffsetArray ptoa) {
- if (ptoa.pos < 0)
- return -1;
+ if (ptoa.pos < 0)
+ return -1;
- if (!processed)
- this.offsets();
+ if (!processed)
+ this.offsets();
- Integer[] pair = this.offsets.get(ptoa);
- if (pair == null)
- return -1;
+ Integer[] pair = this.offsets.get(ptoa);
+ if (pair == null)
+ return -1;
- return pair[1];
+ return pair[1];
};
+
public Integer[] span (int docID, int pos) {
- return this.span(new PositionsToOffsetArray(docID, pos));
+ return this.span(new PositionsToOffsetArray(docID, pos));
};
+
public Integer[] span (PositionsToOffsetArray ptoa) {
- if (!processed)
- this.offsets();
- return this.offsets.get(ptoa);
+ if (!processed)
+ this.offsets();
+ return this.offsets.get(ptoa);
};
- public void addOffset (int docID,
- int pos,
- int startOffset,
- int endOffset) {
- offsets.put(
- new PositionsToOffsetArray(docID, pos),
- new Integer[]{startOffset, endOffset}
- );
+
+ public void addOffset (int docID, int pos, int startOffset, int endOffset) {
+ offsets.put(new PositionsToOffsetArray(docID, pos), new Integer[] {
+ startOffset, endOffset });
};
+
public HashMap<PositionsToOffsetArray, Integer[]> offsets () {
- if (processed)
- return offsets;
+ if (processed)
+ return offsets;
- if (DEBUG)
- log.trace("Process offsets");
+ if (DEBUG)
+ log.trace("Process offsets");
- StringBuilder sb = new StringBuilder().append('_');
+ StringBuilder sb = new StringBuilder().append('_');
- try {
- Terms terms = atomic.reader().fields().terms(field);
+ try {
+ Terms terms = atomic.reader().fields().terms(field);
- if (terms != null) {
- // TODO: Maybe reuse a termsEnum!
+ if (terms != null) {
+ // TODO: Maybe reuse a termsEnum!
- final TermsEnum termsEnum = terms.iterator(null);
+ final TermsEnum termsEnum = terms.iterator(null);
- for (PositionsToOffsetArray posDoc : positions) {
- if (this.exists(posDoc))
- continue;
+ for (PositionsToOffsetArray posDoc : positions) {
+ if (this.exists(posDoc))
+ continue;
- int docID = posDoc.docID;
+ int docID = posDoc.docID;
- /*
- int pos = posDoc[1];
- Integer[] posDoc2 = new Integer[2];
- posDoc2[0] = docID;
- posDoc2[1] = pos;
- */
+ /*
+ int pos = posDoc[1];
+ Integer[] posDoc2 = new Integer[2];
+ posDoc2[0] = docID;
+ posDoc2[1] = pos;
+ */
- sb.append(posDoc.pos);
+ sb.append(posDoc.pos);
- Term term = new Term(field, sb.toString());
- sb.setLength(1);
-
- // Set the position in the iterator to the term that is seeked
- if (termsEnum.seekExact(term.bytes())) {
-
- if (DEBUG)
- log.trace("Search for {} in doc {} with pos {}",
- term.toString(),
- posDoc.docID,
- posDoc.pos);
+ Term term = new Term(field, sb.toString());
+ sb.setLength(1);
- // Start an iterator to fetch all payloads of the term
- DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
- null,
- null,
- DocsAndPositionsEnum.FLAG_PAYLOADS
- );
+ // Set the position in the iterator to the term that is seeked
+ if (termsEnum.seekExact(term.bytes())) {
- if (docs.advance(docID) == docID) {
- docs.nextPosition();
+ if (DEBUG)
+ log.trace("Search for {} in doc {} with pos {}",
+ term.toString(), posDoc.docID, posDoc.pos);
- BytesRef payload = docs.getPayload();
+ // Start an iterator to fetch all payloads of the term
+ DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
+ null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
- if (payload.length == 8) {
- bbOffset.clear();
- bbOffset.put(payload.bytes, payload.offset, 8);
- bbOffset.rewind();
- Integer[] offsetArray = new Integer[2];
- offsetArray[0] = bbOffset.getInt();
- offsetArray[1] = bbOffset.getInt();
- offsets.put(posDoc, offsetArray);
+ if (docs.advance(docID) == docID) {
+ docs.nextPosition();
- if (DEBUG)
- log.trace("Found {}-{} for {}",
- offsetArray[0],
- offsetArray[1],
- term.toString());
- }
+ BytesRef payload = docs.getPayload();
- else {
- log.error(
- "Doc {} has no offsets stored for {}",
- docID,
- term.toString()
- );
- };
- };
- };
- };
- };
- }
- catch (IOException e) {
- log.warn(e.getLocalizedMessage());
- };
+ if (payload.length == 8) {
+ bbOffset.clear();
+ bbOffset.put(payload.bytes, payload.offset, 8);
+ bbOffset.rewind();
+ Integer[] offsetArray = new Integer[2];
+ offsetArray[0] = bbOffset.getInt();
+ offsetArray[1] = bbOffset.getInt();
+ offsets.put(posDoc, offsetArray);
- processed = true;
- positions.clear();
- return offsets;
+ if (DEBUG)
+ log.trace("Found {}-{} for {}",
+ offsetArray[0], offsetArray[1],
+ term.toString());
+ }
+
+ else {
+ log.error(
+ "Doc {} has no offsets stored for {}",
+ docID, term.toString());
+ };
+ };
+ };
+ };
+ };
+ }
+ catch (IOException e) {
+ log.warn(e.getLocalizedMessage());
+ };
+
+ processed = true;
+ positions.clear();
+ return offsets;
};
+
public AtomicReaderContext getAtomicReader () {
- return this.atomic;
+ return this.atomic;
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java b/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java
index b8d5ea5..5c797ce 100644
--- a/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java
@@ -1,9 +1,9 @@
package de.ids_mannheim.korap.index;
+
import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.index.PositionsToOffset;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -11,7 +11,7 @@
public class SpanInfo {
ArrayList<TermInfo> terms;
- HashMap<Integer,Integer> startChar, endChar;
+ HashMap<Integer, Integer> startChar, endChar;
PositionsToOffset pto;
int localDocID;
@@ -20,14 +20,16 @@
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
+
public SpanInfo (PositionsToOffset pto, int localDocID) {
- this.terms = new ArrayList<TermInfo>(64);
- this.startChar = new HashMap<Integer,Integer>(16);
- this.endChar = new HashMap<Integer,Integer>(16);
- this.pto = pto;
+ this.terms = new ArrayList<TermInfo>(64);
+ this.startChar = new HashMap<Integer, Integer>(16);
+ this.endChar = new HashMap<Integer, Integer>(16);
+ this.pto = pto;
this.localDocID = localDocID;
};
+
public void add (TermInfo info) {
info.analyze();
if (info.getType() != "pos") {
@@ -39,6 +41,7 @@
};
};
+
public ArrayList<TermInfo> getTerms () {
// Sort terms (this will also analyze them!)
Collections.sort(this.terms);
@@ -48,7 +51,8 @@
// missing this information
for (TermInfo t : this.terms) {
if (DEBUG)
- log.trace("Check offsets for {} and {}", t.getStartPos(), t.getEndPos());
+ log.trace("Check offsets for {} and {}", t.getStartPos(),
+ t.getEndPos());
found = true;
if (t.getStartChar() == -1) {
if (this.startChar.containsKey(t.getStartPos()))
@@ -62,15 +66,11 @@
else
found = false;
};
-
+
// Add this to found offsets
if (found && t.getStartPos() == t.getEndPos())
- this.pto.addOffset(
- this.localDocID,
- t.getStartPos(),
- t.getStartChar(),
- t.getEndChar()
- );
+ this.pto.addOffset(this.localDocID, t.getStartPos(),
+ t.getStartChar(), t.getEndChar());
else {
if (DEBUG)
log.trace("{} can't be found!", t.getAnnotation());
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index 2835be3..6354f82 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -24,23 +24,23 @@
private ByteBuffer payload;
private boolean analyzed = false;
- private int startChar = -1,
- endChar = -1,
- startPos = -1,
- endPos = -1;
+ private int startChar = -1, endChar = -1, startPos = -1, endPos = -1;
private byte depth = (byte) 0;
-
- private Pattern prefixRegex = Pattern.compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?");
+
+ private Pattern prefixRegex = Pattern
+ .compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?");
private Matcher matcher;
+
public TermInfo (String term, int pos, ByteBuffer payload) {
- this.term = term;
+ this.term = term;
this.startPos = pos;
- this.endPos = pos;
- this.payload = payload;
+ this.endPos = pos;
+ this.payload = payload;
};
+
public TermInfo analyze () {
if (analyzed)
return this;
@@ -51,39 +51,40 @@
this.payload.rewind();
switch (tterm.charAt(0)) {
- case '<':
- // "<>:mate/l:..."
- if (tterm.charAt(1) == '>') {
- // span
- this.type = "span";
- tterm = tterm.substring(3);
- ttype = 2;
- }
- // rel-target
- else {
- this.type = "relTarget";
+ case '<':
+ // "<>:mate/l:..."
+ if (tterm.charAt(1) == '>') {
+ // span
+ this.type = "span";
+ tterm = tterm.substring(3);
+ ttype = 2;
+ }
+ // rel-target
+ else {
+ this.type = "relTarget";
+ tterm = tterm.substring(2);
+ ttype = 3;
+ }
+ ;
+ break;
+
+ case '>':
+ // rel-src
+ this.type = "relSrc";
tterm = tterm.substring(2);
ttype = 3;
- };
- break;
+ break;
- case '>':
- // rel-src
- this.type = "relSrc";
- tterm = tterm.substring(2);
- ttype = 3;
- break;
-
- case '_':
- // pos
- this.type = "pos";
- ttype = 1;
- tterm = tterm.substring(1);
- break;
+ case '_':
+ // pos
+ this.type = "pos";
+ ttype = 1;
+ tterm = tterm.substring(1);
+ break;
- default:
- // term
- this.type = "term";
+ default:
+ // term
+ this.type = "term";
};
// Analyze term value
@@ -97,8 +98,8 @@
this.foundry = matcher.group(1);
else
this.foundry = "base";
- this.layer = matcher.group(2);
- this.value = matcher.group(3);
+ this.layer = matcher.group(2);
+ this.value = matcher.group(3);
};
}
@@ -106,77 +107,90 @@
else {
this.value = tterm;
this.startChar = this.payload.getInt();
- this.endChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
};
-
+
// for spans
if (ttype == 2) {
this.startChar = this.payload.getInt();
- this.endChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
};
// for spans and relations
if (ttype > 1)
// Unsure if this is correct
- this.endPos = this.payload.getInt() -1;
-
+ this.endPos = this.payload.getInt() - 1;
+
if (ttype == 2 && this.payload.position() < lastPos) {
this.depth = this.payload.get();
};
-
+
// payloads can have different meaning
analyzed = true;
return this;
};
+
public String getType () {
return this.type;
};
+
public int getStartChar () {
return this.startChar;
};
+
public void setStartChar (int pos) {
this.startChar = pos;
};
+
public int getEndChar () {
return this.endChar;
};
+
public void setEndChar (int pos) {
this.endChar = pos;
};
+
public int getStartPos () {
return this.startPos;
};
+
public int getEndPos () {
return this.endPos;
};
+
public byte getDepth () {
return this.depth;
};
+
public String getFoundry () {
return this.foundry;
};
+
public String getLayer () {
return this.layer;
};
+
public String getValue () {
return this.value;
};
+
public String getAnnotation () {
return this.annotation;
};
+
public String toString () {
this.analyze();
@@ -198,6 +212,7 @@
return sb.toString();
};
+
@Override
public int compareTo (TermInfo obj) {
this.analyze();
diff --git a/src/main/java/de/ids_mannheim/korap/index/TimeOutThread.java b/src/main/java/de/ids_mannheim/korap/index/TimeOutThread.java
index df24db1..ae09567 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TimeOutThread.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TimeOutThread.java
@@ -1,4 +1,5 @@
package de.ids_mannheim.korap.index;
+
import org.apache.lucene.util.Counter;
import java.lang.*;
import java.lang.InterruptedException.*;
@@ -14,31 +15,35 @@
private volatile boolean stop = false;
private Counter counter;
+
public TimeOutThread () {
- super("TimeOutThread");
- counter = Counter.newCounter(true);
+ super("TimeOutThread");
+ counter = Counter.newCounter(true);
};
+
@Override
- public void run() {
- while (!stop) {
- counter.addAndGet(resolution);
- try {
- Thread.sleep( resolution );
- }
- catch (InterruptedException ie) {
- throw new ThreadInterruptedException(ie);
- };
- };
+ public void run () {
+ while (!stop) {
+ counter.addAndGet(resolution);
+ try {
+ Thread.sleep(resolution);
+ }
+ catch (InterruptedException ie) {
+ throw new ThreadInterruptedException(ie);
+ };
+ };
};
+
// Get miliseconds
public long getTime () {
- return counter.get();
+ return counter.get();
};
-
+
+
// Stops the timer thread
public void stopTimer () {
- stop = true;
+ stop = true;
};
};