using Artifacts
using DerekoVecs
using DataFrames
using Test

# Test suite for DerekoVecs.jl. Every nested testset works on the same model,
# loaded once from the `wpd19_10000` artifact.
@testset "DerekoVecs.jl" begin
    # Directory inside the artifact holding both the `.vecs` and `.vocab` files.
    datadir = joinpath(artifact"wpd19_10000", "wpd19_10000")
    wpd19 = load(joinpath(datadir, "wpd19_10000.vecs"))

    # `isapprox(x, 0)` with default tolerances degenerates to `x == 0` (the
    # default `atol` is zero and the relative tolerance scales with the
    # operands), so comparisons against zero need an explicit absolute tolerance.
    zero_atol = 1e-8

    @testset "DerekoVecs.jl: loading" begin
        @test wpd19.m == 200
        @test wpd19.n >= 10000
    end

    @testset "DerekoVecs.jl: similarities" begin
        # Self-similarity is expected to be 1.
        @test cos_sim(wpd19, "war", "war") ≈ 1
        # Swapping the two words must not change the similarity; compare
        # approximately rather than bit-for-bit because the values are floats.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, "war", "wurden")
        @test cos_sim(wpd19, "wurde", "wurden") > cos_sim(wpd19, "wurde", "ich")
        # The two-model method applied to the same model twice should agree
        # with the single-model method.
        @test cos_sim(wpd19, "wurden", "war") ≈ cos_sim(wpd19, wpd19, "war", "wurden")
        @test cos_sim(wpd19, wpd19, "war") ≈ 1
        # The same checks with an integer (50) in place of a word string.
        @test cos_sim(wpd19, wpd19, 50) ≈ 1
        @test cos_sim(wpd19, 50, 50) ≈ 1
    end

    @testset "DerekoVecs.jl: knn" begin
        @test "dieser" in knn(wpd19, "der", 3)
        @test "wurden" in knn(wpd19, "wurde", 3)
    end

    @testset "DerekoVecs.jl: kld" begin
        # Comparing a model with itself: everything is shared and the
        # divergence should vanish.
        mykld = kld(wpd19, wpd19)
        @test mykld.common_type_count == length(wpd19.vocabdict)
        @test mykld.common_type_share ≈ 100
        @test isapprox(mykld.kld, 0; atol=zero_atol)
        @test wpd19.total_tokens == mykld.common_token_count
        @test mykld.common_token_share ≈ 100
    end

    @testset "DerekoVecs.jl: load freq list only" begin
        # Load only the `.vocab` file and compare it with the full model.
        wpd19_freqlist = load(joinpath(datadir, "wpd19_10000.vocab"))
        @test wpd19.total_tokens == wpd19_freqlist.total_tokens
        @test isapprox(kld(wpd19_freqlist, wpd19).kld, 0; atol=zero_atol)
    end

    @testset "DerekoVecs.jl: collocation analysis" begin
        if isnothing(wpd19.cdb)
            # These tests only run when `wpd19.cdb` is set. Record the skip in
            # the test summary (reported as Broken, not as a failure) instead
            # of passing silently with zero assertions.
            @test_skip !isnothing(wpd19.cdb)
        else
            df = get_collocates(wpd19, "werden")
            @test df.collocate[1] == "kann"
            @test df.ldaf[1] > 10
            # The first row must have a higher `ldaf` than the second.
            @test df.ldaf[1] > df.ldaf[2]
            # The third argument bounds the number of returned rows.
            @test nrow(get_collocates(wpd19, 3, 1)) == 1
            @test nrow(get_collocates(wpd19, 3, 2)) == 2
        end
    end

    @testset "DerekoVecs.jl: TSne plot" begin
        df = DerekoVecs.tsne([wpd19], ["werden", "kann"], 5)
        # Two query words with 5 requested each are expected to give 10 rows.
        @test nrow(df) == 10
        # Smoke test: no assertion on the result; an exception thrown here is
        # recorded as an error by the enclosing testset.
        plotTsne([wpd19], ["werden", "kann"], 5)
    end
end