# blob: 1604ab3589499ecc1d4704226be594871e2aa765 [file] [log] [blame]
using Artifacts
using DerekoVecs
using DataFrames
using Test
@testset "DerekoVecs.jl" begin
wpd19 = load(joinpath(artifact"wpd19_10000", "wpd19_10000", "wpd19_10000.vecs"))
# Checks the `m` and `n` fields of the model loaded above.
# NOTE(review): presumably vector dimensionality and vocabulary size — confirm.
@testset "DerekoVecs.jl: loading" begin
    m, n = wpd19.m, wpd19.n
    @test m == 200
    @test n ≥ 10000
end
# Exercises `cos_sim` through its one-model and two-model call forms, with
# entries addressed both by word and by integer index.
@testset "DerekoVecs.jl: similarities" begin
    # A word compared with itself (module-qualified call).
    @test DerekoVecs.cos_sim(wpd19, "war", "war") ≈ 1

    # Swapping the two words must yield exactly the same value (`==`, not `≈`).
    @test cos_sim(wpd19, "war", "wurden") == cos_sim(wpd19, "wurden", "war")

    # "wurde" scores higher against "wurden" than against "ich".
    @test cos_sim(wpd19, "wurde", "ich") < cos_sim(wpd19, "wurde", "wurden")

    # The two-model form, given the same model twice, must match the one-model
    # form exactly for the swapped word pair.
    @test cos_sim(wpd19, wpd19, "war", "wurden") == cos_sim(wpd19, "wurden", "war")

    # Self-comparison: two-model form by word and by index, one-model form by index.
    @test cos_sim(wpd19, wpd19, "war") ≈ 1
    @test cos_sim(wpd19, wpd19, 50) ≈ 1
    @test cos_sim(wpd19, 50, 50) ≈ 1
end
# For each (query, expected) pair, `expected` must be contained in the result
# of `knn(wpd19, query, 3)`.
# NOTE(review): the third argument is presumably the neighbour count — confirm.
@testset "DerekoVecs.jl: knn" begin
    for (query, expected) in (("der", "dieser"), ("wurde", "wurden"))
        @test expected ∈ knn(wpd19, query, 3)
    end
end
# Compares the model with itself via `kld`: every type and token must be
# reported as common, and the divergence must be zero.
@testset "DerekoVecs.jl: kld" begin
    mykld = kld(wpd19, wpd19)

    # Type-level overlap: all vocabulary entries are shared, i.e. a share of 100.
    @test mykld.common_type_count == length(wpd19.vocabdict)
    @test isapprox(mykld.common_type_share, 100)

    # With the default tolerances `isapprox(x, 0)` degenerates to `x == 0`
    # (the default `atol` is zero), so an explicit absolute tolerance is given
    # to keep the check robust against floating-point round-off.
    @test isapprox(mykld.kld, 0; atol=1e-8)

    # Token-level overlap.
    @test wpd19.total_tokens == mykld.common_token_count
    @test isapprox(mykld.common_token_share, 100)
end
# Loads the `.vocab` file that sits next to the `.vecs` file in the same
# artifact and checks that it agrees with the full model.
@testset "DerekoVecs.jl: load freq list only" begin
    wpd19_freqlist = load(joinpath(artifact"wpd19_10000", "wpd19_10000", "wpd19_10000.vocab"))
    @test wpd19.total_tokens == wpd19_freqlist.total_tokens

    # `isapprox(x, 0)` with default tolerances is an exact `x == 0` test
    # (default `atol` is zero); pass an absolute tolerance so round-off in the
    # divergence cannot cause a spurious failure.
    @test isapprox(kld(wpd19_freqlist, wpd19).kld, 0; atol=1e-8)
end
# Collocation checks run only when the model carries a `cdb`; when `cdb` is
# `nothing` this testset executes no tests at all.
@testset "DerekoVecs.jl: collocation analysis" begin
    if wpd19.cdb !== nothing
        collocs = get_collocates(wpd19, "werden")

        # Top-ranked collocate and its `ldaf` score; the first `ldaf` value must
        # exceed both 10 and the second one.
        @test collocs.collocate[1] == "kann"
        top_ldaf, second_ldaf = collocs.ldaf[1], collocs.ldaf[2]
        @test top_ldaf > 10
        @test top_ldaf > second_ldaf

        # Index-based lookup: a last argument of `k` must yield `k` rows.
        for k in 1:2
            @test nrow(get_collocates(wpd19, 3, k)) == k
        end
    end
end
# Checks the row count of the data frame returned by `DerekoVecs.tsne` for both
# call forms used here.
@testset "DerekoVecs.jl: TSne plot" begin
    # Vector-of-models form with two target words and a last argument of 5:
    # 10 rows expected.
    df = DerekoVecs.tsne([wpd19], ["werden", "kann"], 5)
    @test nrow(df) == 10

    # Single-model, single-word form with a last argument of 5: 5 rows expected.
    # (Previously written as `df2 = df = ...`, which also overwrote `df`.)
    df2 = DerekoVecs.tsne(wpd19, "werden", 5)
    @test nrow(df2) == 5
end
end