Add gzip support
Change-Id: I91895689a2204d43672b5638a9a293807b437a59
diff --git a/Changes b/Changes
index 9c5628a..840bb56 100755
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.1.0 2023-03-07
+ - Add gzip support
+ - Add indexer tests
+
0.0.1 2023-02-23
- Update dependencies for security reasons.
- Add dependabot
\ No newline at end of file
diff --git a/README.md b/README.md
index 398a0c1..f4101ff 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,9 @@
## Running
-The binary can be started without prerequisites. The `templates` folder has to be kept in the root directory.
+The binary can be started without prerequisites.
+The `templates` folder has to be kept in the root directory.
+A `db` folder contains the database.
Registration of the plugin in Kalamar is not yet officially supported -
but it works by passing the JSON blob generated at `/plugin.json`
@@ -44,6 +46,7 @@
"WPD11/A00/00005","Wikipedia","http://de.wikipedia.org/wiki/Ang_Lee"
```
+These files can also be gzip-compressed; the input is read as gzip when the file name ends in `.gz` or `.csvz`.
Then run the indexation with:
```shell
diff --git a/service.go b/service.go
index 55f312e..2fac3c1 100644
--- a/service.go
+++ b/service.go
@@ -1,12 +1,14 @@
package main
import (
+ "compress/gzip"
"encoding/csv"
"encoding/json"
"io"
"log"
"net/http"
"os"
+ "path/filepath"
badger "github.com/dgraph-io/badger/v3"
"github.com/gin-gonic/gin"
@@ -48,29 +50,71 @@
}
}
-func add(corpusID, docID, textID string, provider string, url string) error {
- err := db.Update(func(txn *badger.Txn) error {
+// add stores one mapping in dbx inside a single update transaction.
+// The key is "corpusID/docID/textID" and the value is "provider,url".
+// It returns the error reported by the update, if any.
+func add(dbx *badger.DB, corpusID, docID, textID string, provider string, url string) error {
+ err := dbx.Update(func(txn *badger.Txn) error {
err := txn.Set([]byte(corpusID+"/"+docID+"/"+textID), []byte(provider+","+url))
return err
})
return err
}
-func initDB(dir string) {
+// InitDB opens the database in dir via initDB and stores the handle in
+// the package-level db variable. It does nothing if db is already set.
+func InitDB(dir string) {
if db != nil {
return
}
- var err error
- db, err = badger.Open(badger.DefaultOptions(dir))
+ db = initDB(dir)
+}
+
+// initDB opens a badger database in dir with badger's default options
+// and returns the handle. If opening fails, the error is logged and the
+// process exits via log.Fatal.
+func initDB(dir string) *badger.DB {
+ dbx, err := badger.Open(badger.DefaultOptions(dir))
if err != nil {
log.Fatal(err)
}
+ return dbx
}
+// closeDB closes the package-level db handle. The result of Close is
+// not checked.
func closeDB() {
db.Close()
}
+// IndexDB indexes the CSV data read from ri into the package-level db
+// by delegating to indexDB, and returns indexDB's error.
+func IndexDB(ri io.Reader) error {
+ return indexDB(ri, db)
+}
+
+// indexDB reads CSV records from ri and writes them to dbx. Each record
+// needs at least three fields: the first is used as the key and the
+// second and third are stored as the value "second,third".
+//
+// All writes go into one transaction. When badger reports ErrTxnTooBig,
+// that transaction is committed and the record is retried in a fresh one.
+// The final transaction is committed after the reader is exhausted, and
+// that commit's error is returned.
+//
+// A CSV read error, a record with fewer than three fields, or a failed
+// intermediate commit terminates the process via log.Fatal.
+func indexDB(ri io.Reader, dbx *badger.DB) error {
+	r := csv.NewReader(ri)
+
+	txn := dbx.NewTransaction(true)
+	// txn is reassigned below, so defer through a closure to discard
+	// whichever transaction is current when the function returns.
+	defer func() { txn.Discard() }()
+
+	// i counts inserts since the last commit; it is only used for logging.
+	i := 0
+
+	for {
+		record, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			log.Fatal(err)
+		}
+		// Guard the indexing below; a short record would otherwise panic.
+		if len(record) < 3 {
+			log.Fatal("Record with fewer than 3 fields: ", record)
+		}
+
+		key := []byte(record[0])
+		val := []byte(record[1] + "," + record[2])
+
+		// NOTE(review): Set errors other than ErrTxnTooBig are still
+		// ignored here, as before - confirm that skipping such records
+		// is intended.
+		if err := txn.Set(key, val); err == badger.ErrTxnTooBig {
+			log.Println("Commit", record[0], "after", i, "inserts")
+			i = 0
+			if err := txn.Commit(); err != nil {
+				log.Fatal("Unable to commit")
+			}
+			// Continue on the handle that was passed in, not on the
+			// package-level db, which may be nil or a different database.
+			txn = dbx.NewTransaction(true)
+			if err := txn.Set(key, val); err != nil {
+				return err
+			}
+		}
+		i++
+	}
+	return txn.Commit()
+}
+
func setupRouter() *gin.Engine {
r := gin.Default()
r.LoadHTMLGlob("templates/*")
@@ -140,7 +184,7 @@
log.Println(".env file not loaded.")
}
- initDB("db")
+ InitDB("db")
defer closeDB()
// Index csv file
@@ -150,40 +194,24 @@
if err != nil {
log.Fatal(err)
}
- r := csv.NewReader(file)
- txn := db.NewTransaction(true)
+ fileExt := filepath.Ext(os.Args[1])
- i := 0
-
- for {
- record, err := r.Read()
- if err == io.EOF {
- break
- }
+ if fileExt == ".gz" || fileExt == ".csvz" {
+ var gzipr io.Reader
+ gzipr, err = gzip.NewReader(file)
if err != nil {
- log.Fatal(err)
+ log.Fatal("Unable to open gzip file")
+ } else {
+ err = IndexDB(gzipr)
}
-
- if err := txn.Set([]byte(record[0]), []byte(record[1]+","+record[2])); err == badger.ErrTxnTooBig {
- log.Println("Commit", record[0], "after", i, "inserts")
- i = 0
- err = txn.Commit()
- if err != nil {
- log.Fatal("Unable to commit")
- }
- txn = db.NewTransaction(true)
- _ = txn.Set([]byte(record[0]), []byte(record[1]+","+record[2]))
- }
- i++
+ } else {
+ err = IndexDB(file)
}
- err = txn.Commit()
if err != nil {
log.Fatal("Unable to commit")
}
-
- return
}
r := setupRouter()
diff --git a/service_test.go b/service_test.go
index 34e58b2..65fb8c5 100644
--- a/service_test.go
+++ b/service_test.go
@@ -1,6 +1,8 @@
package main
import (
+ "compress/gzip"
+ "io"
"net/http"
"net/http/httptest"
"os"
@@ -13,7 +15,7 @@
dir := t.TempDir()
- initDB(dir)
+ InitDB(dir)
defer closeDB()
router := setupRouter()
@@ -25,7 +27,7 @@
assert.Equal(t, http.StatusNotFound, w.Code)
assert.Equal(t, "No entry found", w.Body.String())
- assert.Nil(t, add("s11", "s12", "s13", "sueddeutsche", "http://example.org"))
+ assert.Nil(t, add(db, "s11", "s12", "s13", "sueddeutsche", "http://example.org"))
w = httptest.NewRecorder()
req, _ = http.NewRequest(http.MethodGet, "/s11/s12/s13", nil)
@@ -102,5 +104,55 @@
assert.Contains(t, w.Header().Get("Content-Type"), "application/json")
assert.Contains(t, w.Body.String(), "permissions")
assert.Contains(t, w.Body.String(), "/plugin/fun")
+}
+
+// TestIndexer indexes one plain and one gzip-compressed CSV fixture into
+// a temporary database and checks that entries from both files can be
+// read back with the expected "provider,url" value.
+func TestIndexer(t *testing.T) {
+
+	dir := t.TempDir()
+
+	dbx := initDB(dir)
+	defer dbx.Close()
+
+	// Test index plain
+	file, err := os.Open("testdata/sz_mapping_example1.csv")
+	if !assert.Nil(t, err) {
+		return
+	}
+	defer file.Close()
+	assert.Nil(t, indexDB(file, dbx))
+
+	// Test index gzip
+	gzfile, err := os.Open("testdata/sz_mapping_example2.csv.gz")
+	if !assert.Nil(t, err) {
+		return
+	}
+	defer gzfile.Close()
+	var gzipr io.ReadCloser
+	gzipr, err = gzip.NewReader(gzfile)
+	if !assert.Nil(t, err) {
+		return
+	}
+	defer gzipr.Close()
+	assert.Nil(t, indexDB(gzipr, dbx))
+
+	// The lookups below only read, so a read-only transaction suffices.
+	txn := dbx.NewTransaction(false)
+	defer txn.Discard()
+
+	expected := []struct{ key, value string }{
+		{"U92/JAN/00001", "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800000"},
+		{"U92/JAN/00003", "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800010"},
+		{"U92/FEB/00003", "Süddeutsche Zeitung,https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A806912"},
+	}
+
+	for _, e := range expected {
+		item, err := txn.Get([]byte(e.key))
+		// Skip the value check when the lookup failed; item would be nil.
+		if !assert.Nil(t, err, e.key) {
+			continue
+		}
+		err = item.Value(func(val []byte) error {
+			assert.Equal(t, e.value, string(val))
+			return nil
+		})
+		assert.Nil(t, err, e.key)
+	}
}
diff --git a/testdata/sz_mapping_example1.csv b/testdata/sz_mapping_example1.csv
new file mode 100644
index 0000000..b64ca3a
--- /dev/null
+++ b/testdata/sz_mapping_example1.csv
@@ -0,0 +1,3 @@
+"U92/JAN/00001","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800000"
+"U92/JAN/00002","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800001"
+"U92/JAN/00003","Süddeutsche Zeitung","https://archiv.szarchiv.de/Portal/restricted/Start.act?articleId=A800010"
diff --git a/testdata/sz_mapping_example2.csv.gz b/testdata/sz_mapping_example2.csv.gz
new file mode 100644
index 0000000..d13829a
--- /dev/null
+++ b/testdata/sz_mapping_example2.csv.gz
Binary files differ