Add access to count-based collocations

Change-Id: I47b3176285974b65ec8cc2e5871892ef0d104e7b
diff --git a/Artifacts.toml b/Artifacts.toml
index 6e07db6..003db48 100644
--- a/Artifacts.toml
+++ b/Artifacts.toml
@@ -1,6 +1,6 @@
 [wpd19_10000]
-git-tree-sha1 = "b4e4b99c090cdf72d6c8d1377f2d99259f8184ac"
+git-tree-sha1 = "6da73dbb0ceae824285c607ed25d3903552c6d43"
 
     [[wpd19_10000.download]]
-    sha256 = "c305219849482ffc7f01471bd8a23e16c321a35768f7568c6cbd35fece0e0171"
-    url = "https://korap.ids-mannheim.de/data/wpd19_10000.tar.bz2"
+    sha256 = "5df533a2a1031a0a9b3a2de5a5e7b02690f06bb1c057ae05b01724b9d3b17bb1"
+    url = "https://korap.ids-mannheim.de/data/wpd19_10000.tar.gz"
diff --git a/Manifest.toml b/Manifest.toml
index 9a44136..18082a7 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.8.0-rc3"
 manifest_format = "2.0"
-project_hash = "af668ce292bd766579fbcfeae7d9729d80ba7ba5"
+project_hash = "41cd7bc09cb3ff13d255588879730bbece439f06"
 
 [[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
diff --git a/Project.toml b/Project.toml
index 980c784..f2dc2a9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,6 +9,7 @@
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
 PCRE2 = "c9310f65-a42c-5928-aca3-d34f64192029"
diff --git a/README.md b/README.md
index 72c51f7..8e713ad 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,8 @@
 cos_sim(model, model2, "also")
 
 kld(model2, model)
+
+c = get_collocates(model, "also")
 ```
 
 ## License
diff --git a/src/DerekoVecs.jl b/src/DerekoVecs.jl
index f53b99c..2a93825 100644
--- a/src/DerekoVecs.jl
+++ b/src/DerekoVecs.jl
@@ -1,5 +1,5 @@
 module DerekoVecs
-export load, knn, cos_sim, d2vmodel, kld, kldResult
+export load, knn, cos_sim, d2vmodel, kld, kldResult, get_collocates
 
 using Mmap
 using DelimitedFiles
@@ -9,6 +9,9 @@
 using Pkg.Artifacts
 using StatsBase
 using Base.PCRE
+using Libdl
+# Located once when the module is defined; `find_library` yields "" if none of the directories has it.
+const libcdb = Libdl.find_library("libcollocatordb", [".", "/usr/local/lib64", "/usr/lib64", "/usr/local/lib", "/usr/lib"])
 
 struct d2vmodel
     M::Matrix{Float32}
@@ -18,6 +21,7 @@
     vocab::Array{String}
     freqs::Array{Float64}
     total_tokens::Int64
+    cdb::Union{Ptr{Nothing},Nothing}
 end
 
 struct kldResult
@@ -30,11 +34,41 @@
     kld::Float64
 end
 
+# Entry of the native array wrapped by `get_collocates`; the layout must mirror the library's struct.
+struct collocate
+    w2::UInt32 # 0 is treated as the end-of-list marker by `get_collocates`
+    f2::UInt64
+    raw::UInt64
+    pmi::Float64 # NOTE(review): this and the other Float64 fields look like association scores — confirm
+    npmi::Float64
+    llr::Float64
+    lfmd::Float64
+    md::Float64
+    left_raw::UInt64
+    right_raw::UInt64
+    left_pmi::Float64
+    right_pmi::Float64
+    dice::Float64
+    logdice::Float64
+    ldaf::Float64 # used by the test suite to compare collocates
+    window::Int32
+    af_window::Int32
+end
+
 function load(modelfn)::d2vmodel
+    cdb = nothing
     if (occursin(r".vecs$", modelfn))
         (n, d) = map(s -> parse(Int, s), split(readline(modelfn), " "))
         vocabfn = replace(modelfn, ".vecs" => ".vocab")
         file = readdlm(vocabfn, ' ', String, dims=(n, 2), quotes=false)
+
+        # `isdir` checks that the database directory exists; `isdirpath` would only
+        # test for a trailing path separator and therefore always succeed here.
+        rocksdbfn = replace(modelfn, ".vecs" => "")
+        if (isdir(rocksdbfn * ".rocksdb") && libcdb != "")
+            println("opening " * rocksdbfn)
+            cdb = open_collocatordb(rocksdbfn)
+        end
     else
         delim = ('\t' in readline(modelfn) ? '\t' : ' ')
         file = readdlm(modelfn, delim, String, quotes=false)
@@ -52,15 +86,15 @@
     else
         sum(map(x -> parse(Int64, x), file[:, 2]))
     end
-    
+
     freqs = map(x -> parse(Float64, x) / total, file[:, 2])
     vocabdict = Dict{String,Int64}(zip(vocab, 1:n))
     vecsfn = "$(modelfn).vecs"
     if (occursin(r".vecs$", modelfn) && isfile(vecsfn))
         M = Mmap.mmap(vecsfn, Matrix{Float32}, (d, n))
-        d2vmodel(M, d, n, vocabdict, vocab, freqs, total)
+        d2vmodel(M, d, n, vocabdict, vocab, freqs, total, cdb)
     else
-        d2vmodel(Matrix{Float32}(undef, 2, 0), 0, n, vocabdict, vocab, freqs, total)
+        d2vmodel(Matrix{Float32}(undef, 2, 0), 0, n, vocabdict, vocab, freqs, total, cdb)
     end
 end
 
@@ -156,5 +190,30 @@
 
 kld(targetfn::String, bgfn::String)::kldResult = kld(load(targetfn), load(bgfn))
 
+function open_collocatordb(path::String)::Ptr{Cvoid}
+    return ccall((:open_collocatordb, libcdb), Ptr{Cvoid}, (Cstring,), path) # opaque native handle
 end
 
+function get_collocates(cdb::Ptr{Nothing}, node::Int64)::Vector{collocate}
+    # Return the collocates that libcollocatordb reports for vocabulary id `node`.
+    # `cdb` is the opaque handle from `open_collocatordb`, hence `Ptr{Cvoid}` below.
+    res = @ccall libcdb.get_collocators(cdb::Ptr{Cvoid}, node::Cuint)::Ptr{collocate}
+    res == C_NULL && return collocate[]
+    # Scan at most 1000 entries for the terminator (`w2 == 0`); Julia does not own the memory.
+    buf = unsafe_wrap(Vector{collocate}, res, 1000, own=false)
+    n = something(findfirst(c -> c.w2 == 0, buf), length(buf) + 1) - 1
+    # Keep all `n` entries before the terminator; an empty result yields an empty vector.
+    return unsafe_wrap(Vector{collocate}, res, n, own=false)
+end
+
+function get_collocates(dv::d2vmodel, node::Int)::Vector{collocate}
+    # A model loaded without a collocation database carries `cdb === nothing`.
+    isnothing(dv.cdb) && throw(ArgumentError("model has no collocation database"))
+    return get_collocates(dv.cdb, node)
+end
+# Look the word up in the model vocabulary (unknown words raise `KeyError`), then delegate.
+get_collocates(dv::d2vmodel, node::String)::Vector{collocate} = get_collocates(dv, dv.vocabdict[node])
+
+end
+
+# cdb = open_collocatordb("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/word2vec/models/dereko-2017-ii")
diff --git a/test/Artifacts.toml b/test/Artifacts.toml
index 6e07db6..003db48 100644
--- a/test/Artifacts.toml
+++ b/test/Artifacts.toml
@@ -1,6 +1,6 @@
 [wpd19_10000]
-git-tree-sha1 = "b4e4b99c090cdf72d6c8d1377f2d99259f8184ac"
+git-tree-sha1 = "6da73dbb0ceae824285c607ed25d3903552c6d43"
 
     [[wpd19_10000.download]]
-    sha256 = "c305219849482ffc7f01471bd8a23e16c321a35768f7568c6cbd35fece0e0171"
-    url = "https://korap.ids-mannheim.de/data/wpd19_10000.tar.bz2"
+    sha256 = "5df533a2a1031a0a9b3a2de5a5e7b02690f06bb1c057ae05b01724b9d3b17bb1"
+    url = "https://korap.ids-mannheim.de/data/wpd19_10000.tar.gz"
diff --git a/test/runtests.jl b/test/runtests.jl
index e0ff466..bc29554 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -41,4 +41,14 @@
         @test wpd19.total_tokens == wpd19_freqlist.total_tokens
         @test isapprox(kld(wpd19_freqlist, wpd19).kld, 0)
     end
+
+    @testset "DerekoVecs.jl: collocation analysis" begin
+        if (!isnothing(wpd19.cdb)) # skipped unless `load` opened a collocation database
+            coll = get_collocates(wpd19, "werden")
+            @test length(coll) >= 3
+            @test coll[1].ldaf > 10
+            @test coll[1].ldaf > coll[3].ldaf
+        end
+    end
+
 end