Add gzip support

Change-Id: I91895689a2204d43672b5638a9a293807b437a59
diff --git a/Changes b/Changes
index 9c5628a..840bb56 100755
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.1.0 2023-03-07
+        - Add gzip support
+        - Add indexer tests
+
 0.0.1 2023-02-23
         - Update dependencies for security reasons.
         - Add dependabot
\ No newline at end of file
diff --git a/README.md b/README.md
index 398a0c1..f4101ff 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,9 @@
 
 ## Running
 
-The binary can be started without prerequisites. The `templates` folder has to be kept in the root directory.
+The binary can be started without prerequisites.
+The `templates` folder has to be kept in the root directory.
+A `db` folder contains the database.
 
 Registration of the plugin in Kalamar is not yet officially supported -
 but it works by passing the JSON blob generated at `/plugin.json`
@@ -44,6 +46,7 @@
 "WPD11/A00/00005","Wikipedia","http://de.wikipedia.org/wiki/Ang_Lee"
 ```
 
+The CSV file may also be gzip-compressed; this is detected by a `.gz` or `.csvz` file extension.
 Then run the indexation with:
 
 ```shell
diff --git a/service.go b/service.go
index 55f312e..2fac3c1 100644
--- a/service.go
+++ b/service.go
@@ -1,12 +1,14 @@
 package main
 
 import (
+	"compress/gzip"
 	"encoding/csv"
 	"encoding/json"
 	"io"
 	"log"
 	"net/http"
 	"os"
+	"path/filepath"
 
 	badger "github.com/dgraph-io/badger/v3"
 	"github.com/gin-gonic/gin"
@@ -48,29 +50,71 @@
 	}
 }
 
-func add(corpusID, docID, textID string, provider string, url string) error {
-	err := db.Update(func(txn *badger.Txn) error {
+func add(dbx *badger.DB, corpusID, docID, textID string, provider string, url string) error {
+	err := dbx.Update(func(txn *badger.Txn) error {
 		err := txn.Set([]byte(corpusID+"/"+docID+"/"+textID), []byte(provider+","+url))
 		return err
 	})
 	return err
 }
 
-func initDB(dir string) {
+func InitDB(dir string) {
 	if db != nil {
 		return
 	}
-	var err error
-	db, err = badger.Open(badger.DefaultOptions(dir))
+	db = initDB(dir)
+}
+
+func initDB(dir string) *badger.DB {
+	dbx, err := badger.Open(badger.DefaultOptions(dir))
 	if err != nil {
 		log.Fatal(err)
 	}
+	return dbx
 }
 
 func closeDB() {
 	db.Close()
 }
 
+// IndexDB indexes the CSV data read from ri into the package-level db.
+func IndexDB(ri io.Reader) error {
+	return indexDB(ri, db)
+}
+
+// indexDB reads CSV records from ri and stores each one in dbx: the first
+// field is the key, the second and third field joined by a comma the value.
+// A transaction reported as too big is committed and the record retried.
+// Read and intermediate commit errors are fatal; the last commit is returned.
+func indexDB(ri io.Reader, dbx *badger.DB) error {
+	r := csv.NewReader(ri)
+	txn := dbx.NewTransaction(true)
+	i := 0 // inserts since the last commit; only used for the log line
+	for {
+		record, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			log.Fatal(err)
+		}
+		if err := txn.Set([]byte(record[0]), []byte(record[1]+","+record[2])); err == badger.ErrTxnTooBig {
+			log.Println("Commit", record[0], "after", i, "inserts")
+			i = 0
+			err = txn.Commit()
+			if err != nil {
+				log.Fatal("Unable to commit")
+			}
+			// Continue on dbx rather than the package-level db, which is
+			// not necessarily the database this function was called with.
+			txn = dbx.NewTransaction(true)
+			_ = txn.Set([]byte(record[0]), []byte(record[1]+","+record[2]))
+		}
+		i++
+	}
+	return txn.Commit()
+}
+
 func setupRouter() *gin.Engine {
 	r := gin.Default()
 	r.LoadHTMLGlob("templates/*")
@@ -140,7 +184,7 @@
 		log.Println(".env file not loaded.")
 	}
 
-	initDB("db")
+	InitDB("db")
 	defer closeDB()
 
 	// Index csv file
@@ -150,40 +194,24 @@
 		if err != nil {
 			log.Fatal(err)
 		}
-		r := csv.NewReader(file)
 
-		txn := db.NewTransaction(true)
+		fileExt := filepath.Ext(os.Args[1])
 
-		i := 0
-
-		for {
-			record, err := r.Read()
-			if err == io.EOF {
-				break
-			}
+		if fileExt == ".gz" || fileExt == ".csvz" {
+			var gzipr io.Reader
+			gzipr, err = gzip.NewReader(file)
 			if err != nil {
-				log.Fatal(err)
+				log.Fatal("Unable to open gzip file")
+			} else {
+				err = IndexDB(gzipr)
 			}
-
-			if err := txn.Set([]byte(record[0]), []byte(record[1]+","+record[2])); err == badger.ErrTxnTooBig {
-				log.Println("Commit", record[0], "after", i, "inserts")
-				i = 0
-				err = txn.Commit()
-				if err != nil {
-					log.Fatal("Unable to commit")
-				}
-				txn = db.NewTransaction(true)
-				_ = txn.Set([]byte(record[0]), []byte(record[1]+","+record[2]))
-			}
-			i++
+		} else {
+			err = IndexDB(file)
 		}
-		err = txn.Commit()
 
 		if err != nil {
 			log.Fatal("Unable to commit")
 		}
-
-		return
 	}
 	r := setupRouter()
 
diff --git a/service_test.go b/service_test.go
index 34e58b2..65fb8c5 100644
--- a/service_test.go
+++ b/service_test.go
@@ -1,6 +1,8 @@
 package main
 
 import (
+	"compress/gzip"
+	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -13,7 +15,7 @@
 
 	dir := t.TempDir()
 
-	initDB(dir)
+	InitDB(dir)
 	defer closeDB()
 	router := setupRouter()
 
@@ -25,7 +27,7 @@
 	assert.Equal(t, http.StatusNotFound, w.Code)
 	assert.Equal(t, "No entry found", w.Body.String())
 
-	assert.Nil(t, add("s11", "s12", "s13", "sueddeutsche", "http://example.org"))
+	assert.Nil(t, add(db, "s11", "s12", "s13", "sueddeutsche", "http://example.org"))
 
 	w = httptest.NewRecorder()
 	req, _ = http.NewRequest(http.MethodGet, "/s11/s12/s13", nil)
@@ -102,5 +104,55 @@
 	assert.Contains(t, w.Header().Get("Content-Type"), "application/json")
 	assert.Contains(t, w.Body.String(), "permissions")
 	assert.Contains(t, w.Body.String(), "/plugin/fun")
+}
+
+// TestIndexer indexes a plain and a gzipped CSV fixture into a temporary
+// database and verifies that entries can be read back afterwards.
+func TestIndexer(t *testing.T) {
+	dbx := initDB(t.TempDir())
+	defer dbx.Close()
+
+	// Test index plain
+	file, err := os.Open("testdata/sz_mapping_example1.csv")
+	assert.Nil(t, err)
+	defer file.Close()
+	assert.Nil(t, indexDB(file, dbx))
+
+	// Test index gzip
+	file, err = os.Open("testdata/sz_mapping_example2.csv.gz")
+	assert.Nil(t, err)
+	defer file.Close()
+	var gzipr io.Reader
+	gzipr, err = gzip.NewReader(file)
+	assert.Nil(t, err)
+	assert.Nil(t, indexDB(gzipr, dbx))
+
+	txn := dbx.NewTransaction(true)
+	defer txn.Discard()
+
+	item, err := txn.Get([]byte("U92/JAN/00001"))
+	assert.Nil(t, err)
+	err = item.Value(func(val []byte) error {
+		assert.Equal(t, "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800000", string(val))
+		return nil
+	})
+	assert.Nil(t, err)
+
+	item, err = txn.Get([]byte("U92/JAN/00003"))
+	assert.Nil(t, err)
+	err = item.Value(func(val []byte) error {
+		assert.Equal(t, "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800010", string(val))
+		return nil
+	})
+	assert.Nil(t, err)
+
+	// Not part of the plain fixture; presumably from the gzipped one.
+	item, err = txn.Get([]byte("U92/FEB/00003"))
+	assert.Nil(t, err)
+	err = item.Value(func(val []byte) error {
+		assert.Equal(t, "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A806912", string(val))
+		return nil
+	})
+	assert.Nil(t, err)
 
 }
diff --git a/testdata/sz_mapping_example1.csv b/testdata/sz_mapping_example1.csv
new file mode 100644
index 0000000..b64ca3a
--- /dev/null
+++ b/testdata/sz_mapping_example1.csv
@@ -0,0 +1,3 @@
+"U92/JAN/00001","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800000"
+"U92/JAN/00002","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800001"
+"U92/JAN/00003","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800010"
diff --git a/testdata/sz_mapping_example2.csv.gz b/testdata/sz_mapping_example2.csv.gz
new file mode 100644
index 0000000..d13829a
--- /dev/null
+++ b/testdata/sz_mapping_example2.csv.gz
Binary files differ