blob: 1604ab3589499ecc1d4704226be594871e2aa765 [file] [log] [blame]
Marc Kupietz41b0f682022-07-21 15:32:45 +02001using Artifacts
Marc Kupietz7101f9e2022-07-21 08:58:19 +02002using DerekoVecs
Marc Kupietz370dcb52022-07-30 18:41:10 +02003using DataFrames
Marc Kupietz7101f9e2022-07-21 08:58:19 +02004using Test
5
Marc Kupietz41b0f682022-07-21 15:32:45 +02006
# Test suite for DerekoVecs: loads the `wpd19_10000` artifact once and exercises
# loading, similarity, nearest-neighbour, KLD, frequency-list, collocation and
# t-SNE functionality against it.
@testset "DerekoVecs.jl" begin

    # Shared fixture for all nested test sets below.
    wpd19 = load(joinpath(artifact"wpd19_10000", "wpd19_10000", "wpd19_10000.vecs"))

    # Absolute tolerance for comparisons against zero. `isapprox(x, 0)` with the
    # default tolerances (atol = 0) degenerates to `x == 0`, so an explicit
    # `atol` is required for a meaningful "approximately zero" check.
    zero_atol = 1e-8

    @testset "DerekoVecs.jl: loading" begin
        # NOTE(review): `m` / `n` are presumably the vector dimensionality and
        # the vocabulary size -- confirm against the DerekoVecs documentation.
        @test wpd19.m == 200
        @test wpd19.n >= 10000
    end

    @testset "DerekoVecs.jl: similarities" begin
        # Self-similarity must be 1 (up to rounding).
        @test isapprox(DerekoVecs.cos_sim(wpd19, "war", "war"), 1)
        # Symmetry; compared with `≈` because the results are floating point.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, "war", "wurden")
        @test cos_sim(wpd19, "wurde", "wurden") > cos_sim(wpd19, "wurde", "ich")
        # The two-model method applied to the same model twice must agree with
        # the single-model method.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, wpd19, "war", "wurden")
        @test isapprox(cos_sim(wpd19, wpd19, "war"), 1)
        # Same checks, addressing the entry by integer instead of by string.
        @test isapprox(cos_sim(wpd19, wpd19, 50), 1)
        @test isapprox(cos_sim(wpd19, 50, 50), 1)
    end

    @testset "DerekoVecs.jl: knn" begin
        @test "dieser" in knn(wpd19, "der", 3)
        @test "wurden" in knn(wpd19, "wurde", 3)
    end

    @testset "DerekoVecs.jl: kld" begin
        # Comparing a model with itself: everything is shared, divergence is 0.
        mykld = kld(wpd19, wpd19)
        @test mykld.common_type_count == length(wpd19.vocabdict)
        @test isapprox(mykld.common_type_share, 100)
        @test isapprox(mykld.kld, 0; atol=zero_atol)
        @test wpd19.total_tokens == mykld.common_token_count
        @test isapprox(mykld.common_token_share, 100)
    end

    @testset "DerekoVecs.jl: load freq list only" begin
        wpd19_freqlist = load(
            joinpath(artifact"wpd19_10000", "wpd19_10000", "wpd19_10000.vocab")
        )
        @test wpd19.total_tokens == wpd19_freqlist.total_tokens
        @test isapprox(kld(wpd19_freqlist, wpd19).kld, 0; atol=zero_atol)
    end

    @testset "DerekoVecs.jl: collocation analysis" begin
        # NOTE(review): `cdb` is presumably the collocation database handle,
        # `nothing` when unavailable -- confirm against DerekoVecs.
        if !isnothing(wpd19.cdb)
            df = get_collocates(wpd19, "werden")
            @test df.collocate[1] == "kann"
            @test df.ldaf[1] > 10
            @test df.ldaf[1] > df.ldaf[2]
            @test nrow(get_collocates(wpd19, 3, 1)) == 1
            @test nrow(get_collocates(wpd19, 3, 2)) == 2
        else
            # Record the skip in the test summary instead of passing silently.
            @test_skip get_collocates(wpd19, "werden").collocate[1] == "kann"
        end
    end

    @testset "DerekoVecs.jl: TSne plot" begin
        # Two target words with 5 requested each -> 10 rows expected.
        df = DerekoVecs.tsne([wpd19], ["werden", "kann"], 5)
        @test nrow(df) == 10
        # Single-model / single-word method -> 5 rows expected.
        df2 = DerekoVecs.tsne(wpd19, "werden", 5)
        @test nrow(df2) == 5
    end
end