Marc Kupietz | bf9bac0 | 2022-04-11 21:16:47 +0200 | [diff] [blame] | 1 | use utf8; |
| 2 | package IDS::DeReKoVecs::Read; |
Marc Kupietz | 3576c62 | 2023-11-05 08:51:58 +0100 | [diff] [blame] | 3 | use LWP::Simple; |
Marc Kupietz | bf9bac0 | 2022-04-11 21:16:47 +0200 | [diff] [blame] | 4 | use strict; |
| 5 | use warnings; |
| 6 | use Config; |
| 7 | |
# Path of the C source that is handed to Inline below. It is declared here
# and assigned in the BEGIN block because the "use Inline" statement is
# evaluated at compile time and needs the value by then.
my $src_file = undef;

# Word indices greater than this value are shifted down by it in
# getClassicCollocatorsCached.
our $mergedEnd=0;
our %cache;
our %cccache; # classic collocator cache
our %spcache; # similar profile cache
our $opt_p = 5676;    # NOTE(review): presumably a port number - confirm
# When true, getClassicCollocatorsCached ignores %cccache and recomputes.
our $opt_C;

BEGIN {
    # The C source is expected next to this module: replace the trailing
    # "Read.pm" of this file's path with the C file name. The dot is escaped
    # and the match anchored so that only the file name itself is rewritten.
    $src_file = __FILE__;
    $src_file =~ s/Read\.pm\z/derekovecs-server.c/;
}

# Compile and bind the C source. The extra compiler flags are appended to
# Perl's own ccflags; the leading space keeps "-Wall" from being glued onto
# the last flag contained in $Config{ccflags}.
use Inline C => "$src_file" => CLEAN_AFTER_BUILD => 0, BUILD_NOISY => 1, ccflags => $Config{ccflags} . " -Wall -Wno-unused-result -O4 -mno-avx2 -I/usr/local/include", libs => "-L/usr/local/lib64 -L/usr/local/lib -lcollocatordb";
Marc Kupietz | bf9bac0 | 2022-04-11 21:16:47 +0200 | [diff] [blame] | 23 | #use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9"; |
| 24 | #use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 "; |
| 25 | |
| 26 | use Mojo::JSON qw(decode_json encode_json to_json); |
| 27 | use Exporter qw(import); |
| 28 | |
Marc Kupietz | 0ab9739 | 2024-12-10 16:16:32 +0100 | [diff] [blame^] | 29 | our @EXPORT = qw(init_net load_sprofiles getVocabSize getDowntimeCalendar getCollocationAssociation getClassicCollocatorsCached getSimilarProfiles getSimilarProfilesCached getBiggestMergedDifferences filter_garbage get_neighbours getWordNumber dump_vecs dump_for_numpy cos_similarity_as_json get_version getPosWiseW2VCollocators); |
Marc Kupietz | bf9bac0 | 2022-04-11 21:16:47 +0200 | [diff] [blame] | 30 | |
# Fetch the downtime calendar located at $url.
#
# Returns the empty string when $url is empty or consists of whitespace
# only; otherwise returns whatever LWP::Simple::get yields for $url.
sub getDowntimeCalendar {
    my $url = shift;

    # Nothing configured: do not attempt a request.
    return "" if $url =~ m/^\s*$/;

    my $body = LWP::Simple::get($url);
    return $body;
}
Marc Kupietz | bf9bac0 | 2022-04-11 21:16:47 +0200 | [diff] [blame] | 39 | |
# Look up the association scores for a word/collocate pair.
#
# Parameters:
#   $c         - first argument; accepted but not used by this function.
#   $word      - passed through to getCollocationScores.
#   $collocate - passed through to getCollocationScores.
#
# Returns the result of getCollocationScores($word, $collocate) unchanged.
sub getCollocationAssociation {
    my $c         = shift;
    my $word      = shift;
    my $collocate = shift;

    return getCollocationScores($word, $collocate);
}
| 44 | |
# Return the classic collocators of $word as JSON text, with a "delta" entry
# added to every element of the "collocates" array.
#
# Parameters:
#   $c          - controller object; only $c->app->log is used here.
#   $word       - numeric word index. A value greater than $mergedEnd is
#                 reduced by $mergedEnd before the lookup.
#   $compare_to - optional base URL of another server. When given (and the
#                 response is usable), delta is the local "ld" minus the
#                 remote "ld" of the same collocate; otherwise delta is 0.
#
# Returns: the JSON text produced by encode_json.
# Dies if the locally computed collocator data cannot be decoded as JSON.
sub getClassicCollocatorsCached {
    my ($c, $word, $compare_to) = @_;

    if ($word > $mergedEnd) {
        $word -= $mergedEnd;
    }

    # The remote request is started here and only read after the local
    # lookup, so the child process runs while the local work is done.
    my $pipe;
    if (defined $compare_to && $compare_to ne "") {
        my $url = "$compare_to/getClassicCollocators?w=$word";
        $c->app->log->info("comparing syn neighbours to: $url");
        # List-form open runs lwp-request directly without a shell, so
        # nothing in the URL is interpreted as shell syntax.
        if (!open($pipe, '-|', 'lwp-request', $url)) {
            $c->app->log->warn("Cannot run lwp-request for $url: $!");
            undef $pipe;
        }
    }

    if ($opt_C || !$cccache{$word}) {
        $c->app->log->info("Getting classic collocates of $word.");
        $cccache{$word} = getClassicCollocators($word);
        # Bare nan/inf values are not valid JSON: quote them as strings.
        $cccache{$word} =~ s/:(-?)(nan|inf)/:"${1}${2}"/g;
        # Three consecutive double quotes: escape the middle one.
        $cccache{$word} =~ s/"""/"\\""/g;
    }
    else {
        $c->app->log->info("Getting classic collocates for $word from cache.");
    }

    my $remote_json = "";
    if (defined $pipe) {
        $remote_json = join '', <$pipe>;
        # For a piped handle close also reports a failing child process.
        close($pipe)
            or $c->app->log->warn("lwp-request failed (status $?) for word $word");
    }

    my $d1 = decode_json($cccache{$word});

    # Responses of 2000 characters or less are not used for the comparison.
    # A longer response that is not a JSON object is ignored as well instead
    # of aborting the whole request.
    my $d2;
    if (length($remote_json) > 2000) {
        $d2 = eval { decode_json($remote_json) };
        if (ref($d2) ne 'HASH') {
            $c->app->log->warn("Ignoring unusable comparison data for word $word");
            undef $d2;
        }
    }

    if ($d2) {
        # Index the remote "ld" values by word and track their minimum,
        # starting from an initial bound of 14.
        my %d2ld;
        my $minLd = 14;
        foreach my $i (@{$d2->{collocates}}) {
            $d2ld{$i->{word}} = $i->{ld};
            $minLd = $i->{ld} if ($i->{ld} < $minLd);
        }
        # Collocates missing on the remote side are compared against a
        # value slightly below the remote minimum.
        foreach my $i (@{$d1->{collocates}}) {
            my $w = $i->{word};
            $i->{delta} = $i->{ld} - (defined $d2ld{$w} ? $d2ld{$w} : $minLd-0.1);
        }
    }
    else {
        foreach my $i (@{$d1->{collocates}}) {
            $i->{delta} = 0;
        }
    }

    return encode_json($d1);
}
| 96 | |
# Return the similar profiles of $word, computing them at most once.
#
# Parameters:
#   $c    - controller object; only $c->app->log is used here.
#   $word - key passed to getSimilarProfiles and used for %spcache.
#
# Returns the value stored in %spcache for $word. A stored value that is
# false is treated like a missing one and recomputed.
sub getSimilarProfilesCached {
    my ($c, $word) = @_;

    if ($spcache{$word}) {
        $c->app->log->info("Getting similar profiles for $word from cache:");
    }
    else {
        $spcache{$word} = getSimilarProfiles($word);
    }

    return $spcache{$word};
}
| 106 | |
| 107 | return 1; |