A generic SRU web service for WebLicht,
currently only configured for KorAP.
diff --git a/src/main/java/de/mannheim/ids/sruws/Corpus.java b/src/main/java/de/mannheim/ids/sruws/Corpus.java
new file mode 100644
index 0000000..b980e15
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/Corpus.java
@@ -0,0 +1,76 @@
+package de.mannheim.ids.sruws;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A corpus offered by an SRU endpoint, identified by its persistent identifier,
+ * together with the tokenized sentences collected for it.
+ */
+public class Corpus {
+	String name;
+	String pid;
+	List<String[]> tokenizedSentences;
+
+	/**
+	 * Creates a corpus with an empty sentence list.
+	 *
+	 * @param pid persistent identifier of the corpus
+	 * @param name display name of the corpus
+	 */
+	public Corpus(String pid, String name) {
+		this.pid = pid;
+		this.name = name;
+		tokenizedSentences = new ArrayList<String[]>();
+	}
+
+	/**
+	 * Splits a sentence on whitespace and stores its tokens.
+	 * <p>
+	 * Surrounding whitespace is removed first, because splitting a string that
+	 * starts with whitespace would yield an empty first token. A sentence that
+	 * contains no token at all is not stored.
+	 *
+	 * @param sentence the sentence text; must not be {@code null}
+	 */
+	public void addSentence(String sentence) {
+		String trimmed = sentence.trim();
+		if (trimmed.isEmpty()) {
+			return;
+		}
+		tokenizedSentences.add(trimmed.split("\\s+"));
+	}
+
+	public String getName() {
+		return name;
+	}
+
+	public void setName(String name) {
+		this.name = name;
+	}
+
+	public String getPid() {
+		return pid;
+	}
+
+	public void setPid(String pid) {
+		this.pid = pid;
+	}
+
+	/** @return the stored sentences, one token array per sentence */
+	public List<String[]> getTokenizedSentences() {
+		return tokenizedSentences;
+	}
+
+	/**
+	 * Replaces the stored sentences.
+	 *
+	 * @param tokenizedSentences the new sentences; {@code null} resets the
+	 *            corpus to an empty sentence list
+	 */
+	public void setTokenizedSentences(List<String[]> tokenizedSentences) {
+		this.tokenizedSentences = (tokenizedSentences != null)
+				? tokenizedSentences
+				: new ArrayList<String[]>();
+	}
+}
diff --git a/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java b/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java
new file mode 100644
index 0000000..9b713b9
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/SRUScanResonseHandler.java
@@ -0,0 +1,95 @@
+package de.mannheim.ids.sruws;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that collects one {@link Corpus} per {@code sru:term} element of
+ * an SRU scan response. The text of the nested {@code sru:value} element is
+ * used as pid, the text of {@code sru:displayTerm} as name. Elements are
+ * recognized by their qualified name, ignoring case.
+ */
+public class SRUScanResonseHandler extends DefaultHandler{
+
+	private static final String TERM = "sru:term";
+	private static final String VALUE = "sru:value";
+	private static final String DISPLAY_TERM = "sru:displayTerm";
+
+	private final Logger logger = LoggerFactory.getLogger(SRUScanResonseHandler.class);
+
+	private Map<String, Corpus> corpora = new HashMap<String, Corpus>();
+	private boolean insideTerm, insideValue, insideDisplayTerm;
+	// Text collected for the term that is currently being read.
+	private final StringBuilder pidText = new StringBuilder();
+	private final StringBuilder nameText = new StringBuilder();
+
+	public SRUScanResonseHandler() {
+		super();
+	}
+
+	/** Remembers that one of the tracked elements has been entered. */
+	@Override
+	public void startElement(String uri, String localName, String qName,
+			Attributes attributes) throws SAXException {
+		track(qName, true);
+	}
+
+	/**
+	 * Remembers that a tracked element has been left. At the end of a term the
+	 * collected pid and name are stored as a corpus under the pid, and the
+	 * collected text is reset for the next term.
+	 */
+	@Override
+	public void endElement(String uri, String localName, String qName)
+			throws SAXException {
+		track(qName, false);
+		if (!qName.equalsIgnoreCase(TERM)) {
+			return;
+		}
+		String pid = pidText.toString();
+		String name = nameText.toString();
+		logger.info("Corpus {} pid {}",name,pid);
+		corpora.put(pid, new Corpus(pid, name));
+		pidText.setLength(0);
+		nameText.setLength(0);
+	}
+
+	/** Appends text found in a term's value to the pid, in its display term to the name. */
+	@Override
+	public void characters(char[] ch, int start, int length)
+			throws SAXException {
+		if (!insideTerm) {
+			return;
+		}
+		if (insideValue) {
+			pidText.append(ch, start, length);
+		} else if (insideDisplayTerm) {
+			nameText.append(ch, start, length);
+		}
+	}
+
+	/** Sets the flag of the tracked element with the given qualified name, if any. */
+	private void track(String qName, boolean inside) {
+		if (qName.equalsIgnoreCase(TERM)) {
+			insideTerm = inside;
+		} else if (qName.equalsIgnoreCase(VALUE)) {
+			insideValue = inside;
+		} else if (qName.equalsIgnoreCase(DISPLAY_TERM)) {
+			insideDisplayTerm = inside;
+		}
+	}
+
+	public Map<String, Corpus> getCorpora() {
+		return corpora;
+	}
+
+	public void setCorpora(Map<String, Corpus> corpora) {
+		this.corpora = corpora;
+	}
+}
diff --git a/src/main/java/de/mannheim/ids/sruws/SRU_WS.java b/src/main/java/de/mannheim/ids/sruws/SRU_WS.java
new file mode 100644
index 0000000..9748aeb
--- /dev/null
+++ b/src/main/java/de/mannheim/ids/sruws/SRU_WS.java
@@ -0,0 +1,426 @@
+package de.mannheim.ids.sruws;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import javax.xml.stream.events.XMLEvent;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.utils.URIBuilder;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import eu.clarin.weblicht.wlfxb.io.WLDObjector;
+import eu.clarin.weblicht.wlfxb.io.WLFormatException;
+import eu.clarin.weblicht.wlfxb.tc.api.MatchedCorpus;
+import eu.clarin.weblicht.wlfxb.tc.api.MatchesLayer;
+import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer;
+import eu.clarin.weblicht.wlfxb.tc.api.Token;
+import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer;
+import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored;
+import eu.clarin.weblicht.wlfxb.xb.WLData;
+
+/**
+ * Servlet that sends a query to one of the SRU endpoints configured as servlet
+ * init parameters and writes the KWIC hits of the response to the client as a
+ * {@link WLData} text corpus with tokens, sentences and matches layers.
+ * <p>
+ * Expected POST parameters: {@code endpoint} (the init parameter
+ * {@code de.mannheim.ids.sruws.endpoint.<endpoint>} must exist), {@code query}
+ * and {@code maximumRecords} (an integer greater than 0).
+ *
+ * @author margaretha
+ */
+public class SRU_WS extends HttpServlet{
+
+	private static final long serialVersionUID = 1L;
+
+	/** Prefix of the init parameters that map endpoint names to endpoint URLs. */
+	private static final String ENDPOINT_BASE = "de.mannheim.ids.sruws.endpoint.";
+	/** Pid of the corpus that receives hits of resources missing from the scan result. */
+	private static final String DUMMY_PID = "Wikipedia";
+	private static final String FCS_NS = "http://clarin.eu/fcs/1.0";
+	private static final String CORPUS_LANG = "de";
+	private static final String QUERY_LANG = "CQL";
+
+	private static final Logger logger = LoggerFactory.getLogger(SRU_WS.class);
+
+	/**
+	 * Corpora of the most recently handled request. Requests are processed on
+	 * a request-local map so that concurrent requests cannot mix their hits;
+	 * this field only backs {@link #writeTCL(OutputStream, String, String)}.
+	 */
+	private volatile Map<String,Corpus> corpusList = new HashMap<String,Corpus>();
+
+	/**
+	 * Validates the request parameters, fetches the corpus list and the search
+	 * result from the selected endpoint and writes the hits to the response.
+	 * Invalid parameters are answered with HTTP 400.
+	 */
+	@Override
+	protected void doPost(HttpServletRequest req, HttpServletResponse resp)
+			throws ServletException, IOException {
+
+		String endpoint = req.getParameter("endpoint");
+		String query = req.getParameter("query");
+		int max = parsePositiveInt(req.getParameter("maximumRecords"));
+
+		if (endpoint == null || endpoint.isEmpty()){
+			resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+					"Endpoint parameter cannot be empty.");
+			return;
+		}
+		String endpointUri = getServletConfig().getInitParameter(ENDPOINT_BASE+endpoint);
+		if (endpointUri == null){
+			resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+					"Endpoint parameter is incorrect.");
+			return;
+		}
+		if (query == null || query.isEmpty()){
+			resp.sendError(HttpServletResponse.SC_BAD_REQUEST, "Query cannot be empty.");
+			return;
+		}
+		if (max < 1){
+			resp.sendError(HttpServletResponse.SC_BAD_REQUEST,
+					"The maximumRecords must be specified greater than 0.");
+			return;
+		}
+		logger.info("Endpoint URL {} ",endpointUri);
+
+		Map<String,Corpus> corpora;
+		String text;
+		// Client, response and stream are closed as soon as the hits are read.
+		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+			corpora = generateCorpusList(httpClient, endpointUri);
+			HttpGet getReq = createSearchRetrieveRequest(endpointUri, query,
+					String.valueOf(max));
+			try (CloseableHttpResponse httpResponse = httpClient.execute(getReq)) {
+				HttpEntity httpEntity = httpResponse.getEntity();
+				if (httpEntity == null){
+					throw new ServletException("The endpoint returned no content.");
+				}
+				try (InputStream is = new BufferedInputStream(httpEntity.getContent())) {
+					text = readKwicInput(is, corpora);
+				}
+			}
+		} catch (XMLStreamException e) {
+			throw new ServletException("XML streaming error", e);
+		}
+
+		corpusList = corpora;
+		writeMatches(resp.getOutputStream(), text, query, corpora);
+	}
+
+	/**
+	 * Writes the given sentences as a text corpus in which every token equal
+	 * to the query (ignoring case) is marked as a match of a corpus named
+	 * "Wikipedia".
+	 *
+	 * @param out stream the document is written to
+	 * @param text content of the text layer
+	 * @param tokenizedSentences one token array per sentence
+	 * @param query the query, stored in the matches layer
+	 */
+	public void writeTCL(OutputStream out, String text, List<String[]> tokenizedSentences, String query){
+		TextCorpusStored textCorpus = new TextCorpusStored(CORPUS_LANG);
+		textCorpus.createTextLayer().addText(text);
+
+		TokensLayer tokensLayer = textCorpus.createTokensLayer();
+		SentencesLayer sentencesLayer = textCorpus.createSentencesLayer();
+		MatchesLayer matchesLayer = textCorpus.createMatchesLayer(QUERY_LANG, query);
+		MatchedCorpus matchedCorpus = matchesLayer.addCorpus("Wikipedia", "WikiPID");
+
+		addSentences(tokensLayer, sentencesLayer, matchesLayer, matchedCorpus,
+				tokenizedSentences, query);
+		writeDocument(textCorpus, out);
+	}
+
+	/**
+	 * Writes the sentences of the corpora collected by the most recently
+	 * handled request as a text corpus.
+	 *
+	 * @param out stream the document is written to
+	 * @param text content of the text layer
+	 * @param query the query, stored in the matches layer
+	 */
+	public void writeTCL(OutputStream out, String text, String query){
+		writeMatches(out, text, query, corpusList);
+	}
+
+	/** Writes one matched corpus per corpus that has at least one sentence. */
+	private void writeMatches(OutputStream out, String text, String query,
+			Map<String,Corpus> corpora){
+		TextCorpusStored textCorpus = new TextCorpusStored(CORPUS_LANG);
+		textCorpus.createTextLayer().addText(text);
+
+		TokensLayer tokensLayer = textCorpus.createTokensLayer();
+		SentencesLayer sentencesLayer = textCorpus.createSentencesLayer();
+		MatchesLayer matchesLayer = textCorpus.createMatchesLayer(QUERY_LANG, query);
+
+		for (Corpus c : corpora.values()){
+			List<String[]> tokenizedSentences = c.getTokenizedSentences();
+			if (tokenizedSentences == null || tokenizedSentences.isEmpty()) continue;
+
+			MatchedCorpus matchedCorpus = matchesLayer.addCorpus(c.getName(), c.getPid());
+			addSentences(tokensLayer, sentencesLayer, matchesLayer, matchedCorpus,
+					tokenizedSentences, query);
+		}
+		writeDocument(textCorpus, out);
+	}
+
+	/**
+	 * Adds every sentence to the tokens and sentences layers and registers the
+	 * tokens equal to the query (ignoring case) as one match item per sentence.
+	 */
+	private void addSentences(TokensLayer tokensLayer, SentencesLayer sentencesLayer,
+			MatchesLayer matchesLayer, MatchedCorpus matchedCorpus,
+			List<String[]> tokenizedSentences, String query){
+		for (String[] sentence : tokenizedSentences){
+			List<Token> sentenceTokens = new ArrayList<Token>();
+			// A fresh list per sentence: the layer may keep a reference to it.
+			List<Token> itemToken = new ArrayList<Token>();
+			for (String tokenString : sentence){
+				Token token = tokensLayer.addToken(tokenString);
+				sentenceTokens.add(token);
+				if (tokenString.equalsIgnoreCase(query)){
+					itemToken.add(token);
+				}
+			}
+			sentencesLayer.addSentence(sentenceTokens);
+			matchesLayer.addItem(matchedCorpus, itemToken);
+		}
+	}
+
+	/** Serializes the text corpus; a format error is logged, not thrown. */
+	private void writeDocument(TextCorpusStored textCorpus, OutputStream out){
+		try {
+			WLDObjector.write(new WLData(textCorpus), out);
+		} catch (WLFormatException e) {
+			logger.error("Could not write the text corpus.", e);
+		}
+	}
+
+	/**
+	 * Requests the corpus list of the endpoint with an SRU scan operation.
+	 * A scan response that cannot be parsed yields an empty map, because the
+	 * hits can still be assigned by the pid reported with each resource.
+	 *
+	 * @return a new, modifiable map from pid to corpus
+	 * @throws ServletException if the configured endpoint URL is invalid
+	 * @throws IOException if the endpoint cannot be reached or read
+	 */
+	private Map<String,Corpus> generateCorpusList(CloseableHttpClient httpClient,
+			String endpointUri) throws IOException, ServletException{
+		URI requestURI;
+		try {
+			URIBuilder builder = new URIBuilder(endpointUri);
+			builder.setParameter("operation", "scan");
+			builder.setParameter("version", "1.2");
+			builder.setParameter("scanClause", "fcs.resource=root");
+			requestURI = builder.build();
+		} catch (URISyntaxException e) {
+			throw new ServletException("Invalid endpoint URL: " + endpointUri, e);
+		}
+
+		Map<String,Corpus> corpora = new HashMap<String,Corpus>();
+		try (CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(requestURI))) {
+			HttpEntity httpEntity = httpResponse.getEntity();
+			if (httpEntity == null) return corpora;
+
+			try (InputStream is = new BufferedInputStream(httpEntity.getContent())) {
+				// A factory per call: SAXParserFactory is not guaranteed to be thread-safe.
+				SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
+				SRUScanResonseHandler handler = new SRUScanResonseHandler();
+				saxParser.parse(is, handler);
+				corpora.putAll(handler.getCorpora());
+			}
+		} catch (ParserConfigurationException | SAXException e) {
+			logger.warn("Could not parse the scan response of " + endpointUri, e);
+		}
+		return corpora;
+	}
+
+	/**
+	 * Reads the KWIC hits of a searchRetrieve response. The text of the
+	 * {@code c} and {@code kw} elements inside each {@code kwic} element forms
+	 * one sentence, which is added to the corpus of the most recently started
+	 * FCS {@code Resource} element that carries a pid.
+	 *
+	 * @param is the response body
+	 * @param corpora corpora by pid; corpora for unknown pids may be added
+	 * @return the concatenation of all sentences that were added to a corpus
+	 */
+	private String readKwicInput(InputStream is, Map<String,Corpus> corpora)
+			throws XMLStreamException, UnsupportedEncodingException{
+
+		StringBuilder kwicCollector = new StringBuilder();
+		StringBuilder sb = null; // non-null only while inside a kwic element
+		Corpus c = null;
+
+		XMLInputFactory f = XMLInputFactory.newInstance();
+		// The response comes from a remote host; never resolve external entities.
+		f.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
+		XMLStreamReader reader = f.createXMLStreamReader(is);
+		try {
+			while (reader.hasNext()) {
+				int e = reader.next();
+
+				if (e == XMLEvent.START_ELEMENT){
+					String localName = reader.getLocalName();
+					// Matched by namespace only: the prefix is the sender's choice.
+					if (localName.equals("Resource") &&
+							FCS_NS.equals(reader.getNamespaceURI())){
+						c = findCorpus(reader, corpora, c);
+					}
+					else if (localName.equals("kwic")){
+						sb = new StringBuilder();
+					}
+					else if (sb != null &&
+							(localName.equals("c") || localName.equals("kw"))){
+						sb.append(readElementText(reader));
+						sb.append(" ");
+					}
+				}
+				else if (e == XMLEvent.END_ELEMENT && sb != null &&
+						reader.getLocalName().equals("kwic")){
+					if (c == null){
+						logger.warn("Skipping a hit outside of any resource with a pid.");
+					}
+					else {
+						String kwic = normalizeString(sb.toString());
+						c.addSentence(kwic);
+						kwicCollector.append(kwic);
+					}
+					sb = null;
+				}
+			}
+		} finally {
+			reader.close();
+		}
+		return kwicCollector.toString();
+	}
+
+	/**
+	 * Returns the corpus for the pid attribute of the current element, or
+	 * {@code current} if the element has no pid attribute. An unknown pid is
+	 * mapped to the dummy corpus if the endpoint listed one, otherwise a new
+	 * corpus named after the pid is registered.
+	 */
+	private Corpus findCorpus(XMLStreamReader reader, Map<String,Corpus> corpora,
+			Corpus current) throws UnsupportedEncodingException{
+		Corpus c = current;
+		for (int i=0; i<reader.getAttributeCount(); i++){
+			if (!reader.getAttributeLocalName(i).equals("pid")) continue;
+
+			String pid = reader.getAttributeValue(i);
+			try {
+				pid = URLDecoder.decode(pid, "UTF-8");
+			} catch (IllegalArgumentException ignored) {
+				// Not URL-encoded (stray '%'): use the attribute value as it is.
+			}
+			logger.info("Corpus: " + pid);
+			c = corpora.get(pid);
+			if (c == null) c = corpora.get(DUMMY_PID);
+			if (c == null){
+				c = new Corpus(pid, pid);
+				corpora.put(pid, c);
+			}
+		}
+		return c;
+	}
+
+	/**
+	 * Returns all text inside the current element, including text of nested
+	 * elements, and leaves the reader on the matching end tag. Unlike a single
+	 * {@code getText()} call this also works for empty elements and for text
+	 * that the parser reports in several chunks.
+	 */
+	private String readElementText(XMLStreamReader reader) throws XMLStreamException{
+		StringBuilder text = new StringBuilder();
+		int depth = 1;
+		while (depth > 0 && reader.hasNext()) {
+			int e = reader.next();
+			if (e == XMLEvent.START_ELEMENT) depth++;
+			else if (e == XMLEvent.END_ELEMENT) depth--;
+			else if (e == XMLEvent.CHARACTERS || e == XMLEvent.CDATA
+					|| e == XMLEvent.SPACE){
+				text.append(reader.getText());
+			}
+		}
+		return text.toString();
+	}
+
+	/**
+	 * Puts a space in front of every character that is neither a letter, a
+	 * digit nor an apostrophe, so that splitting on whitespace separates
+	 * punctuation from words.
+	 */
+	private String normalizeString(String text){
+		StringBuilder sb = new StringBuilder();
+		for (int j = 0; j < text.length(); j++){
+			char temp = text.charAt(j);
+			if (!Character.isLetterOrDigit(temp) && temp != '\''){
+				sb.append(' ');
+			}
+			sb.append(temp);
+		}
+		return sb.toString();
+	}
+
+	/**
+	 * Builds the SRU searchRetrieve request for the given endpoint.
+	 *
+	 * @throws ServletException if the configured endpoint URL is invalid
+	 */
+	private HttpGet createSearchRetrieveRequest(String endpointUri, String query,
+			String maximumRecords) throws ServletException{
+		try {
+			URIBuilder builder = new URIBuilder(endpointUri);
+			builder.setParameter("operation", "searchRetrieve");
+			builder.setParameter("version", "1.2");
+			builder.setParameter("query", query);
+			builder.setParameter("startRecord", "1");
+			builder.setParameter("maximumRecords", maximumRecords);
+			return new HttpGet(builder.build());
+		}
+		catch (URISyntaxException e) {
+			throw new ServletException("Invalid endpoint URL: " + endpointUri, e);
+		}
+	}
+
+	/** Returns the value as an int if it is an integer greater than 0, else -1. */
+	private static int parsePositiveInt(String value){
+		if (value == null) return -1;
+		try {
+			int parsed = Integer.parseInt(value.trim());
+			return parsed > 0 ? parsed : -1;
+		} catch (NumberFormatException e) {
+			return -1;
+		}
+	}
+}
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
new file mode 100644
index 0000000..022a9df
--- /dev/null
+++ b/src/main/resources/log4j.properties
@@ -0,0 +1,7 @@
+# SRU_WS logs at INFO; its output reaches STDOUT through the root logger.
+log4j.rootLogger=DEBUG, STDOUT
+log4j.logger.de.mannheim.ids.sruws.SRU_WS=INFO
+
+log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
+log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
+log4j.appender.STDOUT.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
diff --git a/src/main/webapp/WEB-INF/web.xml b/src/main/webapp/WEB-INF/web.xml
new file mode 100644
index 0000000..27ff16c
--- /dev/null
+++ b/src/main/webapp/WEB-INF/web.xml
@@ -0,0 +1,57 @@
+<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="SRU_WEB_SERVICE">
+  <display-name>SRU Web-service</display-name>
+  
+  <servlet>    
+    <servlet-name>SRU_WS</servlet-name>        
+    <servlet-class>de.mannheim.ids.sruws.SRU_WS</servlet-class>
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.korap</param-name>
+        <param-value>http://clarin.ids-mannheim.de/korapsru</param-value>
+    </init-param>
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.cosmas</param-name>
+        <param-value>http://clarin.ids-mannheim.de/cosmassru</param-value>
+    </init-param>
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.leipzig</param-name>
+        <param-value>http://clarinws.informatik.uni-leipzig.de:8080/CQL</param-value>
+    </init-param>
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.tuebingen</param-name>
+        <param-value>http://weblicht.sfs.uni-tuebingen.de/rws/sru/</param-value>
+    </init-param>
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.stuttgart</param-name>
+        <param-value>http://clarin01.ims.uni-stuttgart.de/SRUCQIBridge</param-value>
+    </init-param>
+    <!-- same case like goethe, pid is sub-resource -->
+    <init-param> 
+        <param-name>de.mannheim.ids.sruws.endpoint.muenchen</param-name>
+        <param-value>https://clarin.phonetik.uni-muenchen.de/BASSRU/</param-value>
+    </init-param>
+    <!-- weird pid -->
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.mpi</param-name>
+        <param-value>http://cqlservlet.mpi.nl/</param-value>
+    </init-param>   
+    <!-- doesn't provide scan -->
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.saarland</param-name>
+        <param-value>http://fedora.clarin-d.uni-saarland.de/sru2</param-value>
+    </init-param>
+    <!-- empty pid -->
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.berlin</param-name>
+        <param-value>http://dspin.dwds.de:8088/DDC-Endpoint/sru</param-value>
+    </init-param>
+    <!-- pid doesn't match -->
+    <init-param>
+        <param-name>de.mannheim.ids.sruws.endpoint.hamburg</param-name>
+        <param-value>http://virt-fedora.multilingua.uni-hamburg.de:8080/HZSKsru/</param-value>
+    </init-param>			 
+  </servlet>
+  <servlet-mapping>
+    <servlet-name>SRU_WS</servlet-name>
+    <url-pattern>/*</url-pattern>
+  </servlet-mapping>
+</web-app>
\ No newline at end of file
diff --git a/src/main/webapp/index.jsp b/src/main/webapp/index.jsp
new file mode 100644
index 0000000..c38169b
--- /dev/null
+++ b/src/main/webapp/index.jsp
@@ -0,0 +1,5 @@
+<html>
+<body>
+<h2>Hello World!</h2>
+</body>
+</html>