| module DerekoVecs |
| export load, knn, cos_sim, d2vmodel, kld, kldResult, get_collocates, collocate |
| |
| using Mmap |
| using DelimitedFiles |
| using LinearAlgebra |
| using Distances |
| using DataFrames |
| using Pkg.Artifacts |
| using StatsBase |
| using Base.PCRE |
| using Libdl |
| |
# Locate the optional collocator-database C library at module load time.
# `find_library` returns "" when the library is absent, in which case the
# collocation features are simply not enabled (checked in `load`).
const libcdb = Libdl.find_library("libcollocatordb", [".", "/usr/local/lib64", "/usr/lib64", "/usr/local/lib", "/usr/lib"])
| |
# A loaded embedding model together with its vocabulary statistics.
struct d2vmodel
    M::Matrix{Float32}            # embedding matrix, one column per vocabulary item (m × n)
    m::Int64                      # vector dimensionality (0 when no vectors were loaded)
    n::Int64                      # vocabulary size
    vocabdict::Dict{String,Int64} # word => 1-based column / vocabulary index
    vocab::Array{String}          # 1-based index => word
    freqs::Array{Float64}         # relative frequency of each vocabulary item
    total_tokens::Int64           # corpus token count the frequencies are based on
    cdb::Union{Ptr{Nothing},Nothing} # collocator-db handle, or `nothing` when unavailable
end
| |
# Result of a Kullback-Leibler divergence comparison between language models.
struct kldResult
    df::DataFrame               # per-type contributions: columns type, pkld, freq, rank
    token_count::Array{Int64}   # total token counts of the compared corpora
    common_type_count::Int64    # number of types shared by all vocabularies
    common_token_count::Int64   # tokens covered by shared types (minimum over corpora)
    common_type_share::Float64  # shared types as % of the smallest vocabulary
    common_token_share::Float64 # shared tokens as % of the smallest corpus
    kld::Float64                # summed per-type KLD contributions
end
| |
# One collocation record as returned by the C collocator library.
# Field order and types mirror the C struct layout — do not reorder or retype.
struct collocate
    w2::UInt32         # vocabulary id of the collocate (0-based on the C side)
    f2::UInt64         # frequency of the collocate
    raw::UInt64        # raw co-occurrence count
    pmi::Float64       # pointwise mutual information
    npmi::Float64      # normalized PMI
    llr::Float64       # log-likelihood ratio
    lfmd::Float64      # log-frequency biased mutual dependency
    md::Float64        # mutual dependency
    left_raw::UInt64   # raw count with the collocate left of the node
    right_raw::UInt64  # raw count with the collocate right of the node
    left_pmi::Float64  # PMI restricted to left co-occurrences
    right_pmi::Float64 # PMI restricted to right co-occurrences
    dice::Float64      # Dice coefficient
    logdice::Float64   # logDice score
    ldaf::Float64      # NOTE(review): presumably logDice within the af window — confirm against the C API
    window::Int32      # collocation window size
    af_window::Int32   # NOTE(review): meaning of "af" window not visible here — confirm against the C API
end
| |
| |
"""
    load(modelfn)::d2vmodel

Load a word-embedding model. If `modelfn` ends in `.vecs`, the vocabulary is
read from the companion `.vocab` file, the vectors are memory-mapped, and a
collocator database is attached when one is found next to the model.
Otherwise `modelfn` itself is read as a delimited word/frequency list and no
vectors are loaded.
"""
function load(modelfn)::d2vmodel
    cdb = nothing
    if occursin(r"\.vecs$", modelfn)   # escaped dot: match the extension, not e.g. "Xvecs"
        # First line of the .vecs file holds "<vocab-size> <vector-dim>".
        (n, d) = map(s -> parse(Int, s), split(readline(modelfn), " "))
        vocabfn = replace(modelfn, ".vecs" => ".vocab")
        file = readdlm(vocabfn, ' ', String, dims=(n, 2), quotes=false)

        # Attach the collocator database when present and the C library was found.
        rocksdbfn = replace(modelfn, ".vecs" => "")
        if (isfile(rocksdbfn * ".rocksdb/CURRENT") && libcdb != "")
            cdb = open_collocatordb(rocksdbfn)
        end
    else
        # Plain vocabulary file: sniff the delimiter from the first line.
        delim = ('\t' in readline(modelfn) ? '\t' : ' ')
        file = readdlm(modelfn, delim, String, quotes=false)
    end

    vocab = file[:, 1]
    n = length(vocab)

    # Anchor at end-of-string so only the final extension is replaced;
    # the unanchored pattern rewrote every dotted segment of the path
    # (Julia's `replace` substitutes all matches by default).
    sizefn = replace(modelfn, r"\.[^.]+$" => s".size")
    total = if (isfile(sizefn)) # .size-file with corrected token count?
        open(sizefn) do io
            readline(io)               # skip header line
            parse(Int, readline(io))   # second line holds the token count
        end
    else
        sum(map(x -> parse(Int64, x), file[:, 2]))
    end

    freqs = map(x -> parse(Float64, x) / total, file[:, 2])
    vocabdict = Dict{String,Int64}(zip(vocab, 1:n))
    # NOTE(review): this probes "<modelfn>.vecs", i.e. "*.vecs.vecs" when
    # modelfn already ends in .vecs — confirm the double extension is intended.
    vecsfn = "$(modelfn).vecs"
    if (occursin(r"\.vecs$", modelfn) && isfile(vecsfn))
        # Memory-map the binary vectors: d rows (dimensions) × n columns (words).
        M = Mmap.mmap(vecsfn, Matrix{Float32}, (d, n))
        d2vmodel(M, d, n, vocabdict, vocab, freqs, total, cdb)
    else
        # No vectors available: empty placeholder matrix, dimensionality 0.
        d2vmodel(Matrix{Float32}(undef, 2, 0), 0, n, vocabdict, vocab, freqs, total, cdb)
    end
end
| |
"""
    cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64, w2index::Int64)

Dot product between column `w1index` of `m1.M` and column `w2index` of
`m2.M` — the cosine similarity, assuming the embedding columns are unit
vectors (TODO confirm normalization). Returns -1.0 when the lookup or the
dot product fails (e.g. an out-of-bounds index).
"""
function cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64, w2index::Int64)
    try
        dot(m1.M[:, w1index], m2.M[:, w2index])
    catch   # was `catch error`, which shadowed Base.error inside the handler
        -1.0
    end
end
| |
# Convenience methods: all resolve words to indices (via each model's
# vocabdict) and/or duplicate a single model/index, then delegate to the
# core cos_sim(m1, m2, i1, i2) method above.
cos_sim(m::d2vmodel, w1::String, w2::String) = cos_sim(m, m, w1, w2)
cos_sim(m1::d2vmodel, m2::d2vmodel, w1::String, w2::String) = cos_sim(m1, m2, m1.vocabdict[w1], m2.vocabdict[w2])
cos_sim(m1::d2vmodel, m2::d2vmodel, w::String) = cos_sim(m1, m2, m1.vocabdict[w], m2.vocabdict[w])
cos_sim(m1::d2vmodel, m2::d2vmodel, w1index::Int64) = cos_sim(m1, m2, w1index, w1index)
cos_sim(m::d2vmodel, w1index::Int64, w2index::Int64) = cos_sim(m, m, w1index, w2index)
| |
# Analogy-style query: the nearest neighbour of the normalized difference
# vector between the embeddings of `w1` and `w2`.
function minus(m::d2vmodel, w1::String, w2::String)
    diff = m.M[:, m.vocabdict[w1]] - m.M[:, m.vocabdict[w2]]
    first(knn(m, normalize(diff), 1))
end
| |
"""
    knn(m::d2vmodel, v::Array{Float32}, k)

Return the `k` vocabulary items whose embedding columns score highest by dot
product against `v` (cosine nearest neighbours, assuming unit-normalized
columns — TODO confirm normalization).
"""
function knn(m::d2vmodel, v::Array{Float32}, k)
    # One BLAS matvec instead of n separate column dot products, and a
    # partial sort of the top k instead of fully sorting all n scores.
    scores = m.M' * v
    top = partialsortperm(scores, 1:k, rev=true)
    m.vocab[top]
end
| |
# Nearest neighbours of the word stored at column `w1index`.
knn(m::d2vmodel, w1index::Int64, k::Int) = knn(m, m.M[:, w1index], k)

# Nearest neighbours of `w1`, looked up through the vocabulary index.
knn(m::d2vmodel, w1::String, k) = knn(m, m.vocabdict[w1], k)
| |
# Single-term contribution to the Kullback-Leibler divergence: p·log(p/q).
function kldc(p, q)
    return p * log(p / q)
end
| |
"""
    kld(dictp::Array{Dict}, total::Array{Int64})::kldResult

Compute per-type contributions to the Kullback-Leibler divergence between
relative-frequency dictionaries, restricted to their common vocabulary.
`dictp[i]` maps type => relative frequency and `total[i]` is the token count
of corpus `i`. Returns a `kldResult` whose DataFrame holds, per common type,
the KLD contribution (`pkld`), the background frequency (`freq`) and its
competition rank (`rank`).
"""
function kld(dictp::Array{Dict}, total::Array{Int64})::kldResult
    min_vocab_size = minimum(map(length, dictp))
    min_token_size = minimum(total)

    common_types = collect(reduce(intersect, map(keys, dictp)))

    common_type_share = length(common_types) * 100 / min_vocab_size

    # Relative frequencies of the common types, one row per input dictionary.
    # (Row count was hard-coded to 2, which indexed out of bounds for more
    # than two inputs.)
    nmodels = length(dictp)
    p = Array{Float64,2}(undef, nmodels, length(common_types))
    common_tokens = Array{Int64}(undef, nmodels)
    for i in 1:nmodels
        acc = 0.0
        for j in 1:length(common_types)
            p[i, j] = get(dictp[i], common_types[j], 0)
            acc += p[i, j] * total[i]
        end
        # Accumulate in Float64 and round once: freq * total is generally not
        # exactly integral, and assigning a non-integral Float64 into an
        # Int64 array raises InexactError.
        common_tokens[i] = round(Int64, acc)
    end

    common_token_share = minimum(common_tokens) * 100.0 / min_token_size

    kld = Array{Float64}(undef, length(common_types))
    frq = Array{Float64}(undef, length(common_types))
    for i in 1:(nmodels-1)
        for j in (i+1):nmodels
            for k in 1:length(common_types)
                # Use row i as the target distribution (was hard-coded to
                # row 1; identical for the usual two-model case).
                # NOTE(review): later pairs overwrite earlier ones — only the
                # last (i, j) pair survives; with two models that is (1, 2).
                kld[k] = kldc(p[i, k], p[j, k])
                frq[k] = get(dictp[j], common_types[k], 0)
            end
        end
    end

    # Build typed columns directly instead of going through an Any-typed
    # hcat matrix and re-converting each column afterwards.
    df = DataFrame(type = Vector{String}(common_types), pkld = kld, freq = frq)
    transform!(df, :freq => (x -> competerank(x, rev=true)) => :rank)
    kldResult(df, total, length(common_types), minimum(common_tokens),
              common_type_share, common_token_share, sum(df.pkld))
end
| |
"Calculate contributions to the Kullback-Leibler divergence from the target language model to the background language model"
function kld(target::d2vmodel, bg::d2vmodel)::kldResult
    # `Dict[...]` keeps the abstract eltype Vector{Dict} required by the
    # kld(::Array{Dict}, ::Array{Int64}) method (Julia arrays are invariant).
    probs = Dict[
        Dict(zip(target.vocab, target.freqs)),
        Dict(zip(bg.vocab, bg.freqs)),
    ]
    kld(probs, [target.total_tokens, bg.total_tokens])
end
| |
# Convenience: load both models from file names, then compare them.
kld(targetfn::String, bgfn::String)::kldResult = kld(load(targetfn), load(bgfn))
| |
# Open the collocator database rooted at `path` via the C library; returns
# an opaque handle consumed by `get_collocates`.
function open_collocatordb(path::String)::Ptr{Cvoid}
    @ccall libcdb.open_collocatordb(path::Cstring)::Ptr{Cvoid}
end
| |
"""
    get_collocates(cdb, node, max_vocab_index, max)

Fetch collocates of the (0-based) token id `node` from the collocator
database handle `cdb`. The C call returns a pointer into an internal buffer
of up to `max` `collocate` records; the result is the valid prefix, cut off
at the first record whose `w2` exceeds `max_vocab_index` or whose `window`
differs from the first record's (presumably the end-of-data marker — confirm
against the C API). Returns an empty vector when the C call yields NULL.
"""
function get_collocates(cdb::Ptr{Nothing}, node::Int64, max_vocab_index::Int64, max::Int64)::Vector{collocate}
    res = @ccall libcdb.get_collocators(cdb::Ptr{collocate}, node::Cuint)::Ptr{collocate}
    if res == Ptr{collocate}(C_NULL)
        return Vector{collocate}()
    end

    # Wrap the C buffer without taking ownership; count the valid prefix.
    buffer = unsafe_wrap(Vector{collocate}, res, max, own=false)
    window = buffer[1].window
    valid = 0
    for c in buffer
        # c.w2 is UInt32, so the former `c.w2 < 0` test could never fire;
        # out-of-range ids are caught by the upper bound alone.
        if (c.w2 > max_vocab_index || c.window != window)
            break
        end
        valid += 1
    end
    unsafe_wrap(Vector{collocate}, res, valid, own = false)
end
| |
"""
    get_collocates(dv::d2vmodel, node::Int64, max = 200)::DataFrame

Collocates of the vocabulary item at (1-based) index `node`, as a DataFrame
of collocation scores plus a `collocate` column with the surface strings.
Throws `ArgumentError` when the model has no collocator database attached.
"""
function get_collocates(dv::d2vmodel, node::Int64, max = 200)::DataFrame
    # Fail with a clear message instead of the opaque MethodError that a
    # `nothing` handle used to produce.
    dv.cdb === nothing &&
        throw(ArgumentError("model has no collocator database attached"))
    # The C side uses 0-based token ids.
    collocates = get_collocates(dv.cdb, node - 1, length(dv.vocab), max)
    df = DataFrame(collocates)
    df.w2 = df.w2 .+ 1              # back to 1-based vocabulary indices
    df.collocate = map(x -> dv.vocab[x], df.w2)
    df
end
| |
# Look `node` up in the vocabulary, then delegate to the index-based method.
get_collocates(dv::d2vmodel, node::String, max = 200)::DataFrame =
    get_collocates(dv, dv.vocabdict[node], max)
| |
| end |
| |
| # cdb = open_collocatordb("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/word2vec/models/dereko-2017-ii") |