# blob: e71d01e973b861f16beda0fbbc68ef0e138baf99 [file] [log] [blame]
module DerekoVecs
export load, knn, cos_sim, d2vmodel, kld, kldResult, get_collocates, collocate
using Mmap
using DelimitedFiles
using LinearAlgebra
using Distances
using DataFrames
using Pkg.Artifacts
using StatsBase
using Base.PCRE
using Libdl
# Native collocation database library, looked up under the name "libcollocatordb" in the
# current directory and the listed system library directories. `load` treats an empty
# string here as "library not available" and then skips opening a collocation database.
const libcdb = Libdl.find_library("libcollocatordb", [".", "/usr/local/lib64", "/usr/lib64", "/usr/local/lib", "/usr/lib"])
"""
    d2vmodel

Model as returned by `load`: a vocabulary with relative frequencies and, when available,
the word vectors and a handle to a native collocation database.
"""
struct d2vmodel
# Word vectors, one column per vocabulary entry (columns are addressed by word index).
# `load` memory-maps this with dimensions (m, n), or stores an empty 2×0 matrix when no
# vector file is found.
M::Matrix{Float32}
# Vector dimensionality as read from the model header by `load`; 0 when no vectors are loaded.
m::Int64
# Number of vocabulary entries (`length(vocab)`).
n::Int64
# Maps each word to its 1-based position in `vocab`.
vocabdict::Dict{String,Int64}
# Words in file order.
vocab::Array{String}
# Relative frequency of each word: its count divided by `total_tokens`.
freqs::Array{Float64}
# Token total used for normalisation: read from a `.size` file if present, otherwise the
# sum of all counts.
total_tokens::Int64
# Handle returned by `open_collocatordb`, or `nothing` when no database was opened.
cdb::Union{Ptr{Nothing},Nothing}
end
"""
    kldResult

Result of `kld`: per-type contributions plus summary statistics over the types shared by
the compared frequency distributions.
"""
struct kldResult
# One row per shared type with columns `type`, `pkld` (contribution), `freq` and `rank`.
df::DataFrame
# Token totals of the compared distributions, as passed to `kld`.
token_count::Array{Int64}
# Number of types present in all compared distributions.
common_type_count::Int64
# Smallest per-distribution token mass covered by the shared types.
common_token_count::Int64
# Shared types as a percentage of the smallest vocabulary.
common_type_share::Float64
# `common_token_count` as a percentage of the smallest token total.
common_token_share::Float64
# Sum of the `pkld` column.
kld::Float64
end
"""
    collocate

One collocate record as laid out in the memory returned by the native `get_collocators`
call. `get_collocates` wraps that memory directly as `Vector{collocate}`, so the field
order and field types here must not be changed independently of the native library.
"""
struct collocate
# 0-based vocabulary index of the collocate (callers add 1 before indexing `vocab`).
w2::UInt32
# NOTE(review): presumably the corpus frequency of the collocate — confirm against the native header.
f2::UInt64
# NOTE(review): presumably the raw co-occurrence count — confirm.
raw::UInt64
# NOTE(review): the following fields look like association scores (PMI, normalised PMI,
# log-likelihood ratio, ...); their exact definitions are not visible in this file.
pmi::Float64
npmi::Float64
llr::Float64
lfmd::Float64
md::Float64
# NOTE(review): presumably counts/scores restricted to the left and right context — confirm.
left_raw::UInt64
right_raw::UInt64
left_pmi::Float64
right_pmi::Float64
dice::Float64
logdice::Float64
ldaf::Float64
# `get_collocates` treats a change of this value between consecutive records as the end
# of the valid result list.
window::Int32
af_window::Int32
end
"""
    load(modelfn)::d2vmodel

Load a model from `modelfn`.

- If `modelfn` ends in `.vecs`, its first line is parsed as `n d` (vocabulary size and
  vector dimensionality), the vocabulary is read from the sibling `.vocab` file
  (space-separated `word count` rows) and, if a `<stem>.rocksdb/CURRENT` file exists and
  the native library was found, the collocation database at `<stem>` is opened.
- Otherwise `modelfn` itself is read as a `word count` table, tab-separated if its first
  line contains a tab and space-separated otherwise.

The token total is taken from the second line of `<stem>.size` when that file exists and
is the sum of all counts otherwise. `<stem>` is `modelfn` without its final extension.
"""
function load(modelfn)::d2vmodel
    cdb = nothing
    isvecs = endswith(modelfn, ".vecs")
    # Strip only the final extension; dots elsewhere in the path (e.g. "./" or
    # "run.1/") must be left alone.
    stem = first(splitext(modelfn))
    if isvecs
        (n, d) = map(s -> parse(Int, s), split(readline(modelfn), " "))
        file = readdlm(stem * ".vocab", ' ', String, dims=(n, 2), quotes=false)
        if isfile(stem * ".rocksdb/CURRENT") && libcdb != ""
            cdb = open_collocatordb(stem)
        end
    else
        delim = ('\t' in readline(modelfn) ? '\t' : ' ')
        file = readdlm(modelfn, delim, String, quotes=false)
    end
    vocab = file[:, 1]
    counts = file[:, 2]
    n = length(vocab)
    sizefn = stem * ".size"
    total = if isfile(sizefn) # .size-file with corrected token count?
        open(sizefn) do io
            readline(io) # the first line is skipped; the total is on the second line
            parse(Int, readline(io))
        end
    else
        sum(map(x -> parse(Int64, x), counts))
    end
    freqs = map(x -> parse(Float64, x) / total, counts)
    vocabdict = Dict{String,Int64}(zip(vocab, 1:n))
    # NOTE(review): the vectors are mapped from `modelfn` with a further ".vecs" appended
    # (i.e. "<name>.vecs.vecs"); kept as in the original — confirm this is intended.
    vecsfn = "$(modelfn).vecs"
    if isvecs && isfile(vecsfn)
        M = Mmap.mmap(vecsfn, Matrix{Float32}, (d, n))
        return d2vmodel(M, d, n, vocabdict, vocab, freqs, total, cdb)
    end
    # No vectors available: keep vocabulary and frequencies with an empty matrix.
    return d2vmodel(Matrix{Float32}(undef, 2, 0), 0, n, vocabdict, vocab, freqs, total, cdb)
end
"""
    cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64, w2index::Int64)

Return the dot product of column `w1index` of `m1.M` and column `w2index` of `m2.M`.
This is the cosine similarity only if the stored vectors have unit length
(NOTE(review): presumably they do — confirm against the model writer).

Any error while computing the product (for example an index outside the matrix) yields
`-1.0` instead of propagating.
"""
function cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64, w2index::Int64)
    try
        # Views avoid copying both columns on every call; bounds are still checked.
        return dot(view(m1.M, :, w1index), view(m2.M, :, w2index))
    catch
        return -1.0
    end
end
# Convenience methods: all of them resolve their arguments to a pair of models and a pair
# of word indices and delegate to the four-argument index method.

# Two words within a single model.
function cos_sim(m::d2vmodel, w1::String, w2::String)
    return cos_sim(m, m, w1, w2)
end

# Two words, each looked up in its own model's vocabulary.
function cos_sim(m1::d2vmodel, m2::d2vmodel, w1::String, w2::String)
    i1 = m1.vocabdict[w1]
    i2 = m2.vocabdict[w2]
    return cos_sim(m1, m2, i1, i2)
end

# The same word compared across two models.
function cos_sim(m1::d2vmodel, m2::d2vmodel, w::String)
    i1 = m1.vocabdict[w]
    i2 = m2.vocabdict[w]
    return cos_sim(m1, m2, i1, i2)
end

# The same index compared across two models.
function cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64)
    return cos_sim(m1, m2, w1index, w1index)
end

# Two indices within a single model.
function cos_sim(m::d2vmodel, w1index::Int64, w2index::Int64)
    return cos_sim(m, m, w1index, w2index)
end
"""
    minus(m::d2vmodel, w1::String, w2::String)

Return the vocabulary word whose vector scores highest against the normalised difference
of the vectors of `w1` and `w2`.
"""
function minus(m::d2vmodel, w1::String, w2::String)
    v1 = m.M[:, m.vocabdict[w1]]
    v2 = m.M[:, m.vocabdict[w2]]
    direction = normalize(v1 - v2)
    return knn(m, direction, 1)[1]
end
"""
    knn(m::d2vmodel, v::Array{Float32}, k)

Return the `k` vocabulary words whose vectors have the largest dot product with `v`,
best match first. If `k` exceeds the vocabulary size, all words are returned.
"""
function knn(m::d2vmodel, v::Array{Float32}, k)
    # Score every column against `v`; views avoid copying each column.
    scores = map(i -> dot(v, view(m.M, :, i)), 1:m.n)
    order = sortperm(scores, rev=true)
    nearest = order[1:min(k, length(order))]
    return m.vocab[nearest]
end
"""
    knn(m::d2vmodel, w1index::Int64, k::Int)

Return the `k` nearest vocabulary words to the vector stored in column `w1index`.
"""
function knn(m::d2vmodel, w1index::Int64, k::Int)
    query = m.M[:, w1index]
    return knn(m, query, k)
end
"""
    knn(m::d2vmodel, w1::String, k)

Return the `k` nearest vocabulary words to `w1`. Throws a `KeyError` if `w1` is not in
the model's vocabulary.
"""
function knn(m::d2vmodel, w1::String, k)
    index = m.vocabdict[w1]
    return knn(m, index, k)
end
"""
    kldc(p, q)

Pointwise contribution `p * log(p / q)` to the Kullback-Leibler divergence.
By the usual convention `0 * log(0 / q)` is taken to be `0` rather than `NaN`.
"""
function kldc(p, q)
    # Without this guard 0 * log(0) evaluates to 0 * -Inf == NaN and poisons any sum.
    iszero(p) && return zero(float(promote_type(typeof(p), typeof(q))))
    return p * log(p / q)
end
"""
    kld(dictp::Array{Dict}, total::Array{Int64})::kldResult

Compute per-type contributions to the Kullback-Leibler divergence between relative
frequency distributions, restricted to the types that occur in every dictionary.

# Arguments
- `dictp`: at least two dictionaries mapping a type to its relative frequency. The first
  is the target and the last is the background; with more than two dictionaries only the
  first and the last are compared, while all of them determine the shared types.
- `total`: token total of each distribution, in the same order as `dictp`.

# Returns
A `kldResult` whose data frame has the columns `type`, `pkld` (contribution), `freq`
(background relative frequency) and `rank` (competition rank by descending `freq`).

# Throws
- `ArgumentError` if fewer than two dictionaries are given.
- `DimensionMismatch` if `total` and `dictp` differ in length.
"""
function kld(dictp::Array{Dict}, total::Array{Int64})::kldResult
    nmodels = length(dictp)
    nmodels >= 2 ||
        throw(ArgumentError("kld needs at least two frequency dictionaries, got $nmodels"))
    length(total) == nmodels ||
        throw(DimensionMismatch("got $nmodels dictionaries but $(length(total)) token totals"))

    min_vocab_size = minimum(map(length, dictp))
    min_token_size = minimum(total)
    common_types = collect(reduce(intersect, map(keys, dictp)))
    ntypes = length(common_types)
    common_type_share = ntypes * 100 / min_vocab_size

    # p[i, j]: relative frequency of shared type j in distribution i.
    p = Array{Float64,2}(undef, nmodels, ntypes)
    # Accumulate in floating point: frequency * total is only approximately an integer,
    # so adding it to an integer accumulator can raise an InexactError.
    common_tokens = zeros(Float64, nmodels)
    for i in 1:nmodels, j in 1:ntypes
        p[i, j] = get(dictp[i], common_types[j], 0)
        common_tokens[i] += p[i, j] * total[i]
    end
    common_token_count = round(Int64, minimum(common_tokens))
    common_token_share = common_token_count * 100.0 / min_token_size

    background = nmodels
    contributions = Float64[kldc(p[1, k], p[background, k]) for k in 1:ntypes]
    background_freq = p[background, :]

    df = DataFrame(type = Vector{String}(common_types), pkld = contributions, freq = background_freq)
    transform!(df, :freq => (x -> competerank(x, rev=true)) => :rank)
    return kldResult(df, total, ntypes, common_token_count, common_type_share, common_token_share, sum(df.pkld))
end
"Calculate contributions to the Kullback-Leibler divergence from the target language model to the background language model"
function kld(target::d2vmodel, bg::d2vmodel)::kldResult
    # Word => relative frequency for each model, target first and background second,
    # which is the order the dictionary-based method expects.
    dictp = Dict[
        Dict(zip(target.vocab, target.freqs)),
        Dict(zip(bg.vocab, bg.freqs)),
    ]
    return kld(dictp, [target.total_tokens, bg.total_tokens])
end
# Load both models from their files with `load` and compare them (target first,
# background second).
function kld(targetfn::String, bgfn::String)::kldResult
    target = load(targetfn)
    background = load(bgfn)
    return kld(target, background)
end
"""
    open_collocatordb(path::String)::Ptr{Cvoid}

Call the native `open_collocatordb` function of `libcdb` with `path` passed as a C string
and return the pointer it yields. The pointer is an opaque handle for later native calls.
"""
function open_collocatordb(path::String)::Ptr{Cvoid}
    handle = @ccall libcdb.open_collocatordb(path::Cstring)::Ptr{Cvoid}
    return handle
end
"""
    get_collocates(cdb::Ptr{Nothing}, node::Int64, max_vocab_index::Int64, max::Int64)::Vector{collocate}

Ask the native library for the collocates of `node` and return the leading run of valid
records, at most `max` of them.

# Arguments
- `cdb`: handle obtained from `open_collocatordb`.
- `node`: index of the node word as expected by the native library.
- `max_vocab_index`: largest `w2` value that is still considered valid.
- `max`: upper bound on the number of records inspected and returned.

The list ends at the first record whose `w2` exceeds `max_vocab_index` or whose `window`
differs from that of the first record. An empty vector is returned when the native call
yields a null pointer or when `max` is not positive.

The result wraps memory owned by the native library (`own = false`).
NOTE(review): this assumes the native buffer holds at least `max` records and stays
valid while the result is in use — confirm against the library.
"""
function get_collocates(cdb::Ptr{Nothing}, node::Int64, max_vocab_index::Int64, max::Int64)::Vector{collocate}
    max > 0 || return Vector{collocate}()
    # The database handle is an opaque pointer, so it is passed as Ptr{Cvoid}.
    res = @ccall libcdb.get_collocators(cdb::Ptr{Cvoid}, node::Cuint)::Ptr{collocate}
    if res == C_NULL
        return Vector{collocate}()
    end
    candidates = unsafe_wrap(Vector{collocate}, res, max, own=false)
    window = candidates[1].window
    # `w2` is unsigned, so only the upper bound needs checking.
    stop = findfirst(c -> c.w2 > max_vocab_index || c.window != window, candidates)
    count = stop === nothing ? max : stop - 1
    return unsafe_wrap(Vector{collocate}, res, count, own=false)
end
"""
    get_collocates(dv::d2vmodel, node::Int64, max = 200)::DataFrame

Return up to `max` collocates of the word with 1-based vocabulary index `node` as a data
frame with one column per `collocate` field plus a `collocate` column holding the word.
The `w2` column is converted to 1-based vocabulary indices.

Throws an `ArgumentError` if the model has no collocation database.
"""
function get_collocates(dv::d2vmodel, node::Int64, max = 200)::DataFrame
    dv.cdb === nothing &&
        throw(ArgumentError("model has no collocation database attached"))
    # The native side uses 0-based indices, so the node is shifted down by one and the
    # largest valid collocate index is length(vocab) - 1.
    collocates = get_collocates(dv.cdb, node - 1, length(dv.vocab) - 1, max)
    df = DataFrame(collocates)
    df.w2 = map(x -> x + 1, df.w2)
    df.collocate = map(x -> dv.vocab[x], df.w2)
    return df
end
"""
    get_collocates(dv::d2vmodel, node::String, max = 200)::DataFrame

Look up `node` in the model's vocabulary and return its collocates. Throws a `KeyError`
if the word is not in the vocabulary.
"""
function get_collocates(dv::d2vmodel, node::String, max = 200)::DataFrame
    index = dv.vocabdict[node]
    return get_collocates(dv, index, max)
end
end
# cdb = open_collocatordb("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/word2vec/models/dereko-2017-ii")