A generic SRU web service for WebLicht,
currently only configured for KorAP.
diff --git a/src/main/java/de/mannheim/ids/sruws/Corpus.java b/src/main/java/de/mannheim/ids/sruws/Corpus.java
new file mode 100644
index 0000000..b980e15
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/Corpus.java
@@ -0,0 +1,42 @@
+package de.mannheim.ids.sruws;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** A corpus of an SRU endpoint: its name, its pid and the tokenized sentences collected for it. */
+public class Corpus {
+    String name;
+    String pid;
+    List<String[]> tokenizedSentences;
+
+    public Corpus(String pid, String name) {
+        this.pid = pid;
+        this.name = name;
+        this.tokenizedSentences = new ArrayList<String[]>();
+    }
+
+    /**
+     * Splits the sentence on whitespace and stores the tokens as one array.
+     * The sentence is trimmed first because String.split keeps a leading empty string
+     * when the input starts with a delimiter; a blank sentence is not stored at all.
+     *
+     * @param sentence the sentence text, must not be null
+     */
+    public void addSentence(String sentence) {
+        String trimmed = sentence.trim();
+        if (!trimmed.isEmpty()) {
+            tokenizedSentences.add(trimmed.split("\\s+"));
+        }
+    }
+
+    public String getName() { return name; }
+    public void setName(String name) { this.name = name; }
+    public String getPid() { return pid; }
+    public void setPid(String pid) { this.pid = pid; }
+
+    /** Returns the stored list itself (not a copy): one token array per added sentence. */
+    public List<String[]> getTokenizedSentences() { return tokenizedSentences; }
+    public void setTokenizedSentences(List<String[]> tokenizedSentences) {
+        this.tokenizedSentences = tokenizedSentences;
+    }
+}
diff --git a/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java b/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java
new file mode 100644
index 0000000..9b713b9
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java
@@ -0,0 +1,80 @@
+package de.mannheim.ids.sruws;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler for an SRU scan response. Each sru:term element produces one Corpus whose
+ * pid is the text of the nested sru:value and whose name is the text of sru:displayTerm.
+ */
+public class SRUScanResonseHandler extends DefaultHandler{
+    private static final Logger logger = LoggerFactory.getLogger(SRUScanResonseHandler.class);
+
+    private Map<String, Corpus> corpora;
+    private boolean isSruTerm, isSruValue, isSruDisplayTerm;
+    // characters() may deliver one text node in several chunks, so the text is buffered.
+    private final StringBuilder pid = new StringBuilder();
+    private final StringBuilder name = new StringBuilder();
+
+    public SRUScanResonseHandler() {
+        corpora = new HashMap<String, Corpus>();
+    }
+
+    /** Raises the flag of the element that was opened; other elements are ignored. */
+    @Override
+    public void startElement(String uri, String localName, String qName,
+            Attributes attributes) throws SAXException {
+        if (qName.equalsIgnoreCase("sru:term")){
+            isSruTerm = true;
+        }
+        else if (qName.equalsIgnoreCase("sru:value")){
+            isSruValue = true;
+        }
+        else if (qName.equalsIgnoreCase("sru:displayTerm")){
+            isSruDisplayTerm = true;
+        }
+    }
+
+    /** Lowers the flag of the closed element; a closing sru:term also stores its corpus. */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (qName.equalsIgnoreCase("sru:term")){
+            isSruTerm = false;
+            // Trim so that whitespace around the element text cannot leak into the map key.
+            String corpusPid = pid.toString().trim();
+            String corpusName = name.toString().trim();
+            logger.info("Corpus {} pid {}", corpusName, corpusPid);
+            corpora.put(corpusPid, new Corpus(corpusPid, corpusName));
+            pid.setLength(0);
+            name.setLength(0);
+        }
+        else if (qName.equalsIgnoreCase("sru:value")){
+            isSruValue = false;
+        }
+        else if (qName.equalsIgnoreCase("sru:displayTerm")){
+            isSruDisplayTerm = false;
+        }
+    }
+
+    /** Buffers text found inside sru:value (pid) or sru:displayTerm (name) of an open sru:term. */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (isSruTerm && isSruValue){
+            pid.append(ch, start, length);
+        }
+        else if (isSruTerm && isSruDisplayTerm){
+            name.append(ch, start, length);
+        }
+    }
+
+    public Map<String, Corpus> getCorpora() { return corpora; }
+    public void setCorpora(Map<String, Corpus> corpora) { this.corpora = corpora; }
+}
diff --git a/src/main/java/de/mannheim/ids/sruws/SRU_WS.java b/src/main/java/de/mannheim/ids/sruws/SRU_WS.java
new file mode 100644
index 0000000..9748aeb
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/SRU_WS.java
@@ -0,0 +1,373 @@
+package de.mannheim.ids.sruws;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import javax.xml.stream.events.XMLEvent;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.utils.URIBuilder;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import eu.clarin.weblicht.wlfxb.io.WLDObjector;
+import eu.clarin.weblicht.wlfxb.io.WLFormatException;
+import eu.clarin.weblicht.wlfxb.tc.api.MatchedCorpus;
+import eu.clarin.weblicht.wlfxb.tc.api.MatchesLayer;
+import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer;
+import eu.clarin.weblicht.wlfxb.tc.api.Token;
+import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer;
+import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored;
+import eu.clarin.weblicht.wlfxb.xb.WLData;
+
+/**
+ * @author margaretha
+ * */
+
+public class SRU_WS extends HttpServlet{
+
+ private String endpointUri; // URL of the endpoint selected by the current request; assigned in doPost
+ private String endpointBase = "de.mannheim.ids.sruws.endpoint."; // prefix of the servlet init-param names that hold the endpoint URLs
+ private String DUMMY_PID = "Wikipedia"; // corpusList key that readKwicInput tries when a response pid is unknown
+ // Namespace URIs and prefixes of the XML elements looked for in searchRetrieve responses.
+ private static final String HITS_NS = "http://clarin.eu/fcs/dataview/hits";
+ private static final String HITS_PREFIX = "hits";
+ private static final String FCS_NS = "http://clarin.eu/fcs/1.0";
+ private static final String FCS_PREFIX = "fcs";
+ //private static final String FCS_KWIC_NS = "http://clarin.eu/fcs/1.0/kwic";
+ //private static final String FCS_KWIC_PREFIX = "kwic";
+ // Passed to TextCorpusStored (CORPUS_LANG) and createMatchesLayer (QUERY_LANG) in writeTCL.
+ private static String CORPUS_LANG = "de";
+ private static String QUERY_LANG = "CQL";
+ private Map<String,Corpus> corpusList = new HashMap<String,Corpus>();
+ // NOTE(review): endpointUri and corpusList are reassigned on every doPost call; if the container serves concurrent requests through this one instance they can overwrite each other - confirm.
+ private Logger logger = (Logger) LoggerFactory.getLogger(SRU_WS.class);
+ private SAXParserFactory factory = SAXParserFactory.newInstance();
+
+    /**
+     * Replaces corpusList with the corpora from the endpoint's SRU scan response
+     * (scanClause fcs.resource=root). An unparsable response is logged and leaves it empty.
+     *
+     * @throws IOException if endpointUri is malformed or the HTTP exchange fails
+     */
+    private void generateCorpusList() throws IOException{
+        corpusList = new HashMap<String, Corpus>();
+        URI requestURI;
+        try {
+            URIBuilder builder = new URIBuilder(endpointUri);
+            builder.setParameter("operation", "scan");
+            builder.setParameter("version", "1.2");
+            builder.setParameter("scanClause", "fcs.resource=root");
+            requestURI = builder.build();
+        } catch (URISyntaxException e) {
+            // Without a valid URI there is nothing to request; report it with its cause.
+            throw new IOException("Invalid endpoint URI: " + endpointUri, e);
+        }
+        // try-with-resources closes the stream, the response and the client on every path.
+        try (CloseableHttpClient httpClient = HttpClients.createDefault();
+                CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(requestURI));
+                InputStream bis = new BufferedInputStream(httpResponse.getEntity().getContent())) {
+            SRUScanResonseHandler handler = new SRUScanResonseHandler();
+            factory.newSAXParser().parse(bis, handler);
+            corpusList = handler.getCorpora();
+        } catch (ParserConfigurationException | SAXException e) {
+            logger.error("Cannot parse the scan response of " + endpointUri, e);
+        }
+    }
+
+    /**
+     * Sends the query to the chosen SRU endpoint and writes the hits to the response via
+     * writeTCL. Answers 400 if endpoint, query or maximumRecords is missing or invalid.
+     */
+    @Override
+    protected void doPost(HttpServletRequest req, HttpServletResponse resp)
+            throws ServletException, IOException {
+
+        String endpoint = req.getParameter("endpoint");
+        String query = req.getParameter("query");
+        String max = req.getParameter("maximumRecords");
+
+        if (endpoint == null || endpoint.isEmpty()){
+            resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+                    "Endpoint parameter cannot be empty.");
+        }
+        else if (getServletConfig().getInitParameter(endpointBase+endpoint) == null){
+            resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+                    "Endpoint parameter is incorrect.");
+        }
+        else if (query == null || query.isEmpty()){
+            resp.sendError(HttpServletResponse.SC_BAD_REQUEST, "Query cannot be empty.");
+        }
+        else if (max == null || !max.matches("0*[1-9][0-9]*")){
+            // The pattern admits positive decimal integers only (no sign, text or zero).
+            resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+                    "The maximumRecords must be specified greater than 0.");
+        }
+        else{
+            endpointUri = getServletConfig().getInitParameter(endpointBase+endpoint);
+            logger.info("Endpoint URL {} ",endpointUri);
+
+            generateCorpusList();
+
+            String text;
+            // try-with-resources closes the stream, the response and the client on every path.
+            try (CloseableHttpClient httpClient = HttpClients.createDefault();
+                    CloseableHttpResponse httpResponse =
+                            httpClient.execute(createSearchRetrieveRequest(query, max));
+                    InputStream bis = new BufferedInputStream(
+                            httpResponse.getEntity().getContent())) {
+                text = readKwicInput(bis);
+            } catch (XMLStreamException e) {
+                // Keep the cause so the parse failure can be diagnosed.
+                throw new ServletException("XML streaming error", e);
+            }
+
+            writeTCL(resp.getOutputStream(), text, query);
+        }
+    }
+
+    /**
+     * Writes the text and the given sentences as a text corpus with tokens, sentences and
+     * matches layers; all matches are filed under the fixed corpus "Wikipedia"/"WikiPID".
+     * A token is a match when it equals the query, ignoring case.
+     */
+    public void writeTCL(OutputStream out, String text, List<String[]> tokenizedSentences, String query){
+        TextCorpusStored textCorpus = new TextCorpusStored(CORPUS_LANG);
+        textCorpus.createTextLayer().addText(text);
+
+        TokensLayer tokensLayer = textCorpus.createTokensLayer();
+        SentencesLayer sentencesLayer = textCorpus.createSentencesLayer();
+        MatchesLayer matchesLayer = textCorpus.createMatchesLayer(QUERY_LANG, query);
+        MatchedCorpus matchedCorpus = matchesLayer.addCorpus("Wikipedia", "WikiPID");
+
+        for (String[] sentence : tokenizedSentences){
+            List<Token> sentenceTokens = new ArrayList<Token>();
+            List<Token> matchedTokens = new ArrayList<Token>();   // fresh per sentence, never reused
+            for (String tokenString : sentence) {
+                Token token = tokensLayer.addToken(tokenString);
+                sentenceTokens.add(token);
+                if (tokenString.equalsIgnoreCase(query)){
+                    matchedTokens.add(token);
+                }
+            }
+            sentencesLayer.addSentence(sentenceTokens);
+            matchesLayer.addItem(matchedCorpus, matchedTokens);
+        }
+        try {
+            WLDObjector.write(new WLData(textCorpus), out);
+        } catch (WLFormatException e) {
+            logger.error("Cannot write the text corpus", e);
+        }
+    }
+
+    /**
+     * Writes the text plus the sentences stored in corpusList as a text corpus with tokens,
+     * sentences and matches layers. Corpora without sentences are left out; a token is a
+     * match when it equals the query, ignoring case.
+     *
+     * @param out   stream that receives the serialized corpus
+     * @param text  content of the text layer
+     * @param query query string, recorded in the matches layer and used to find matches
+     */
+    public void writeTCL(OutputStream out, String text, String query){
+        TextCorpusStored textCorpus = new TextCorpusStored(CORPUS_LANG);
+        textCorpus.createTextLayer().addText(text);
+
+        TokensLayer tokensLayer = textCorpus.createTokensLayer();
+        SentencesLayer sentencesLayer = textCorpus.createSentencesLayer();
+        MatchesLayer matchesLayer = textCorpus.createMatchesLayer(QUERY_LANG, query);
+
+        // Iterate the values directly instead of looking every key up again.
+        for (Corpus c : corpusList.values()){
+            List<String[]> tokenizedSentences = c.getTokenizedSentences();
+            if (tokenizedSentences.isEmpty()) continue;
+
+            MatchedCorpus matchedCorpus = matchesLayer.addCorpus(c.getName(), c.getPid());
+            for (String[] sentence : tokenizedSentences){
+                List<Token> sentenceTokens = new ArrayList<Token>();
+                List<Token> matchedTokens = new ArrayList<Token>();   // fresh per sentence, never reused
+                for (String tokenString : sentence) {
+                    Token token = tokensLayer.addToken(tokenString);
+                    sentenceTokens.add(token);
+                    if (tokenString.equalsIgnoreCase(query)){
+                        matchedTokens.add(token);
+                    }
+                }
+                sentencesLayer.addSentence(sentenceTokens);
+                matchesLayer.addItem(matchedCorpus, matchedTokens);
+            }
+        }
+
+        try {
+            WLDObjector.write(new WLData(textCorpus), out);
+        } catch (WLFormatException e) {
+            logger.error("Cannot write the text corpus", e);
+        }
+    }
+
+    /**
+     * Reads an SRU searchRetrieve response and stores every kwic line in the corpus that the
+     * enclosing Resource element (FCS namespace) names through its pid attribute.
+     *
+     * @param is the XML response
+     * @return all normalized kwic lines, concatenated in document order
+     * @throws XMLStreamException if the XML is malformed or a c/kw element is not text-only
+     * @throws UnsupportedEncodingException declared for URLDecoder.decode(String, "UTF-8")
+     */
+    private String readKwicInput(InputStream is) throws XMLStreamException, UnsupportedEncodingException{
+        StringBuilder sb = null;   // text of the kwic element being read; null outside of one
+        StringBuilder kwicCollector = new StringBuilder();
+
+        XMLInputFactory f = XMLInputFactory.newInstance();
+        // The document comes from a remote endpoint: never resolve external entities.
+        f.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
+        XMLStreamReader reader = f.createXMLStreamReader(is);
+
+        Corpus c = null;
+        while (reader.hasNext()) {
+            int e = reader.next();
+            if (e == XMLEvent.START_ELEMENT) {
+                String element = reader.getLocalName();
+                // Only the namespace identifies the element; its prefix is arbitrary or absent.
+                if (element.equals("Resource") && FCS_NS.equals(reader.getNamespaceURI())){
+                    for (int i=0; i<reader.getAttributeCount(); i++){
+                        if (reader.getAttributeLocalName(i).equals("pid")){
+                            String pid = URLDecoder.decode(reader.getAttributeValue(i),"UTF-8");
+                            logger.info("Corpus: " + pid);
+                            c = resolveCorpus(pid);
+                        }
+                    }
+                }
+                else if (element.equals("kwic")){
+                    sb = new StringBuilder();
+                }
+                else if (sb != null && (element.equals("c") || element.equals("kw"))){
+                    // getElementText also handles empty elements and text split into several events.
+                    sb.append(reader.getElementText());
+                    sb.append(" ");
+                }
+            }
+            else if (e == XMLEvent.END_ELEMENT && sb != null
+                    && reader.getLocalName().equals("kwic")){
+                String kwic = normalizeString(sb.toString());
+                sb = null;
+                if (c == null) c = resolveCorpus(DUMMY_PID);   // no Resource pid seen so far
+                c.addSentence(kwic);
+                kwicCollector.append(kwic);
+            }
+        }
+        return kwicCollector.toString();
+    }
+
+    /**
+     * Returns the corpus registered under pid, else the one under DUMMY_PID; when neither
+     * exists, a placeholder corpus named after the pid is registered and returned.
+     */
+    private Corpus resolveCorpus(String pid) {
+        Corpus corpus = corpusList.get(pid);
+        if (corpus == null) corpus = corpusList.get(DUMMY_PID);
+        if (corpus == null) {
+            corpus = new Corpus(pid, pid);
+            corpusList.put(pid, corpus);
+        }
+        return corpus;
+    }
+ /** Extracts the text of every hits:Result element; its only call, in doPost, is commented out. */
+ private String readInput(InputStream is, List<String[]> tokenizedSentences)
+ throws XMLStreamException{
+ // Adds one token array per Result to tokenizedSentences and returns all normalized text.
+ String kwic;
+ StringBuilder sb;
+ StringBuilder kwicCollector = new StringBuilder();
+
+ XMLInputFactory f = XMLInputFactory.newInstance();
+ XMLStreamReader reader = f.createXMLStreamReader(is);
+
+ while (reader.hasNext()) {
+ if (reader.next() == XMLEvent.START_ELEMENT &&
+ reader.getLocalName().equals("Result") &&
+ reader.getPrefix().equals(HITS_PREFIX) &&
+ reader.getNamespaceURI().equals(HITS_NS)){
+ // NOTE(review): the fixed next() calls assume the events text, <Hit>, text, </Hit>, text - confirm.
+ sb = new StringBuilder();
+
+ reader.next();
+ sb.append(reader.getText());
+ sb.append(" "); // left context
+
+ reader.next();
+ if (reader.getLocalName().equals("Hit") &&
+ reader.getPrefix().equals(HITS_PREFIX) &&
+ reader.getNamespaceURI().equals(HITS_NS)){
+
+ reader.next();
+ sb.append(reader.getText());
+ sb.append(" ");
+ }
+ reader.next(); // end element Hit
+ reader.next();
+ sb.append(reader.getText()); // right context
+
+ kwic = normalizeString(sb.toString());
+ tokenizedSentences.add(kwic.split("\\s+"));
+ kwicCollector.append(kwic);
+ }
+ }
+ return kwicCollector.toString();
+ }
+
+    /** Inserts a space before every character that is not a letter, a digit or an apostrophe. */
+    private String normalizeString(String text){
+        StringBuilder sb = new StringBuilder();
+        // Step by code point: inserting a space between two surrogates would corrupt the character.
+        for (int j = 0; j < text.length(); ){
+            int cp = text.codePointAt(j);
+            j += Character.charCount(cp);
+            if (!Character.isLetterOrDigit(cp) && cp != '\'') sb.append(' ');
+            sb.appendCodePoint(cp);
+        }
+        return sb.toString();
+    }
+
+    /** Builds the SRU 1.2 searchRetrieve GET request (startRecord 1) for endpointUri. */
+    private HttpGet createSearchRetrieveRequest(String query, String maximumRecords)
+            throws ServletException{
+        try {
+            URIBuilder builder = new URIBuilder(endpointUri);
+            builder.setParameter("operation", "searchRetrieve");
+            builder.setParameter("version", "1.2");
+            builder.setParameter("query", query);
+            builder.setParameter("startRecord", "1");
+            builder.setParameter("maximumRecords", maximumRecords);
+            return new HttpGet(builder.build());
+        } catch (URISyntaxException e) {
+            // Name the rejected URI and keep the cause so the failure can be diagnosed.
+            throw new ServletException("Invalid endpoint URI: " + endpointUri, e);
+        }
+    }
+}
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
new file mode 100644
index 0000000..022a9df
--- /dev/null
+++ b/src/main/resources/log4j.properties
@@ -0,0 +1,7 @@
+# Root logger: everything at DEBUG and above goes to the console appender.
+log4j.rootLogger=DEBUG, STDOUT
+# Per-logger keys need the "log4j.logger." prefix; STDOUT is inherited from the root logger.
+log4j.logger.de.mannheim.ids.sruws.SRU_WS=INFO
+log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
+log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
+log4j.appender.STDOUT.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
diff --git a/src/main/webapp/WEB-INF/web.xml b/src/main/webapp/WEB-INF/web.xml
new file mode 100644
index 0000000..27ff16c
--- /dev/null
+++ b/src/main/webapp/WEB-INF/web.xml
@@ -0,0 +1,57 @@
+<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="SRU_WEB_SERVICE">
+ <display-name>SRU Web-service</display-name>
+
+ <servlet>
+ <servlet-name>SRU_WS</servlet-name>
+ <servlet-class>de.mannheim.ids.sruws.SRU_WS</servlet-class>
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.korap</param-name>
+ <param-value>http://clarin.ids-mannheim.de/korapsru</param-value>
+ </init-param>
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.cosmas</param-name>
+ <param-value>http://clarin.ids-mannheim.de/cosmassru</param-value>
+ </init-param>
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.leipzig</param-name>
+ <param-value>http://clarinws.informatik.uni-leipzig.de:8080/CQL</param-value>
+ </init-param>
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.tuebingen</param-name>
+ <param-value>http://weblicht.sfs.uni-tuebingen.de/rws/sru/</param-value>
+ </init-param>
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.stuttgart</param-name>
+ <param-value>http://clarin01.ims.uni-stuttgart.de/SRUCQIBridge</param-value>
+ </init-param>
+ <!-- same case like goethe, pid is sub-resource -->
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.muenchen</param-name>
+ <param-value>https://clarin.phonetik.uni-muenchen.de/BASSRU/</param-value>
+ </init-param>
+ <!-- weird pid -->
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.mpi</param-name>
+ <param-value>http://cqlservlet.mpi.nl/</param-value>
+ </init-param>
+ <!-- doesn't provide scan -->
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.saarland</param-name>
+ <param-value>http://fedora.clarin-d.uni-saarland.de/sru2</param-value>
+ </init-param>
+ <!-- empty pid -->
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.berlin</param-name>
+ <param-value>http://dspin.dwds.de:8088/DDC-Endpoint/sru</param-value>
+ </init-param>
+ <!-- pid doesn't match -->
+ <init-param>
+ <param-name>de.mannheim.ids.sruws.endpoint.hamburg</param-name>
+ <param-value>http://virt-fedora.multilingua.uni-hamburg.de:8080/HZSKsru/</param-value>
+ </init-param>
+ </servlet>
+ <servlet-mapping>
+ <servlet-name>SRU_WS</servlet-name>
+ <url-pattern>/*</url-pattern>
+ </servlet-mapping>
+</web-app>
\ No newline at end of file
diff --git a/src/main/webapp/index.jsp b/src/main/webapp/index.jsp
new file mode 100644
index 0000000..c38169b
--- /dev/null
+++ b/src/main/webapp/index.jsp
@@ -0,0 +1,5 @@
+<html>
+<body>
+<h2>Hello World!</h2>
+</body>
+</html>