using Artifacts
using DerekoVecs
using Test

# Test suite for DerekoVecs. All testsets share one sample model (`wpd19`) that is
# loaded once from the `wpd19_10000` artifact.
@testset "DerekoVecs.jl" begin

    # Directory inside the artifact that holds the sample files used below.
    datadir = joinpath(artifact"wpd19_10000", "wpd19_10000")
    wpd19 = load(joinpath(datadir, "wpd19_10000.vecs"))

    # `isapprox(x, 0)` with default tolerances degenerates to `x == 0`, so
    # comparisons against zero need an explicit absolute tolerance.
    zero_atol = 1e-8

    @testset "DerekoVecs.jl: loading" begin
        # NOTE(review): `m`/`n` are presumably the vector dimensionality and the
        # vocabulary size -- confirm against the DerekoVecs type definition.
        @test wpd19.m == 200
        @test wpd19.n >= 10000
    end

    @testset "DerekoVecs.jl: similarities" begin
        # A word compared with itself must have similarity 1.
        @test isapprox(DerekoVecs.cos_sim(wpd19, "war", "war"), 1)
        # Argument order must not matter; compare floats with a tolerance.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, "war", "wurden")
        @test cos_sim(wpd19, "wurde", "wurden") > cos_sim(wpd19, "wurde", "ich")
        # The two-model method, given the same model twice, must agree with the
        # single-model method.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, wpd19, "war", "wurden")
        # Self-similarity via the two-model methods (word and integer argument)
        # and via the single-model integer method.
        @test isapprox(cos_sim(wpd19, wpd19, "war"), 1)
        @test isapprox(cos_sim(wpd19, wpd19, 50), 1)
        @test isapprox(cos_sim(wpd19, 50, 50), 1)
    end

    @testset "DerekoVecs.jl: knn" begin
        @test "dieser" in knn(wpd19, "der", 3)
        @test "wurden" in knn(wpd19, "wurde", 3)
    end

    @testset "DerekoVecs.jl: kld" begin
        # Comparing a model with itself: every type and token is shared and the
        # divergence is zero.
        mykld = kld(wpd19, wpd19)
        @test mykld.common_type_count == length(wpd19.vocabdict)
        @test isapprox(mykld.common_type_share, 100)
        @test isapprox(mykld.kld, 0; atol=zero_atol)
        @test wpd19.total_tokens == mykld.common_token_count
        @test isapprox(mykld.common_token_share, 100)
    end

    @testset "DerekoVecs.jl: load freq list only" begin
        # Loading only the `.vocab` file must yield the same token total as the
        # full model and a zero divergence from it.
        wpd19_freqlist = load(joinpath(datadir, "wpd19_10000.vocab"))
        @test wpd19.total_tokens == wpd19_freqlist.total_tokens
        @test isapprox(kld(wpd19_freqlist, wpd19).kld, 0; atol=zero_atol)
    end

    @testset "DerekoVecs.jl: collocation analysis" begin
        if isnothing(wpd19.cdb)
            # Make the skip visible instead of silently reporting an empty testset.
            @info "wpd19.cdb is nothing; skipping collocation analysis tests"
        else
            df = get_collocates(wpd19, "werden")
            @test df.collocate[1] == "kann"
            @test df.ldaf[1] > 10
            @test df.ldaf[1] > df.ldaf[3]
        end
    end

end