blob: 1d9f0a828208d90a397de47ad32a032ae87bd55e [file] [log] [blame]
Marc Kupietzbf9bac02022-04-11 21:16:47 +02001use utf8;
2package IDS::DeReKoVecs::Read;
Marc Kupietz3576c622023-11-05 08:51:58 +01003use LWP::Simple;
Marc Kupietzbf9bac02022-04-11 21:16:47 +02004use strict;
5use warnings;
6use Config;
7
# Path of the C source that Inline::C compiles; set by the BEGIN block below.
my $src_file = undef;

our $mergedEnd=0;   # word ids above this value are shifted down by it in getClassicCollocatorsCached
our %cache;
our %cccache; # classic collocator cache
our %spcache; # similar profile cache
our $opt_p = 5676;  # NOTE(review): presumably a port number -- confirm against the caller
our $opt_C;         # when true, getClassicCollocatorsCached recomputes instead of using %cccache

BEGIN {
  # Must run at compile time: the "use Inline" statement below is also
  # processed at compile time and needs $src_file to be set already.
  $src_file = __FILE__;
  # Escape the dot and anchor at the end so that only the trailing file name
  # is rewritten, never a look-alike directory component earlier in the path.
  $src_file =~ s/Read\.pm\z/derekovecs-server.c/;
}

# Note the leading space before -Wall: without it the first extra flag is
# glued onto the last flag of $Config{ccflags}.
use Inline C => "$src_file" => CLEAN_AFTER_BUILD => 0, BUILD_NOISY => 1, ccflags => $Config{ccflags} . " -Wall -Wno-unused-result -O4 -mno-avx2 -I/usr/local/include", libs => "-L/usr/local/lib64 -L/usr/local/lib -lcollocatordb";
Marc Kupietzbf9bac02022-04-11 21:16:47 +020023#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
24#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
25
26use Mojo::JSON qw(decode_json encode_json to_json);
27use Exporter qw(import);
28
Marc Kupietz9f6a19d2024-11-15 16:22:47 +010029our @EXPORT = qw(init_net load_sprofiles getVocabSize getDowntimeCalendar getCollocationAssociation getClassicCollocatorsCached getSimilarProfiles getSimilarProfilesCached getBiggestMergedDifferences filter_garbage get_neighbours getWordNumber dump_vecs dump_for_numpy cos_similarity_as_json get_version);
Marc Kupietzbf9bac02022-04-11 21:16:47 +020030
# Fetch the downtime calendar from a URL.
#
# Parameters:
#   $url - address to fetch; may be undefined, empty or whitespace-only.
#
# Returns: the empty string when no usable URL is given, otherwise whatever
# LWP::Simple::get returns for that URL.
# NOTE(review): LWP::Simple::get is documented to return undef on failure,
# so callers should be prepared for an undefined result -- confirm.
sub getDowntimeCalendar {
  my ($url) = @_;

  # Treat a missing URL like a blank one instead of warning about an
  # uninitialized value in the pattern match.
  if (!defined $url || $url =~ m/^\s*$/) {
    return "";
  }

  my $calendar = LWP::Simple::get($url);
  return $calendar;
}
Marc Kupietzbf9bac02022-04-11 21:16:47 +020039
# Look up the association scores of a word/collocate pair.
#
# Parameters:
#   $c         - first positional argument; accepted but not used here.
#   $word      - passed through as first argument of getCollocationScores.
#   $collocate - passed through as second argument of getCollocationScores.
#
# Returns: whatever getCollocationScores returns for the pair.
sub getCollocationAssociation {
  my $c = shift;                  # not needed for the lookup itself
  my ($word, $collocate) = @_;
  return getCollocationScores($word, $collocate);
}
44
# Return the classic collocators of a word as JSON text, caching the locally
# computed result in %cccache and optionally comparing it with the result of
# another server.
#
# Parameters:
#   $c          - object providing $c->app->log->info for logging.
#   $word       - numeric word id; ids above $mergedEnd are shifted down by
#                 $mergedEnd before the lookup.
#   $compare_to - optional base URL of another server. When non-empty,
#                 "$compare_to/getClassicCollocators?w=$word" is fetched with
#                 the external lwp-request command.
#
# Returns: JSON text of the local result in which every entry of
# "collocates" has a "delta" field: local "ld" minus the remote "ld" for the
# same word when a usable remote answer exists, otherwise 0.
#
# Dies if the local result cannot be decoded as JSON.
sub getClassicCollocatorsCached {
  my ($c, $word, $compare_to) = @_;

  if ($word > $mergedEnd) {
    $word -= $mergedEnd;
  }

  # Start the remote request before the local lookup so that the child
  # process can work while the local result is being computed.
  my $pipe;
  if (defined $compare_to && $compare_to ne "") {
    my $url = "$compare_to/getClassicCollocators?w=$word";
    $c->app->log->info("comparing syn neighbours to: $url");
    # List-form pipe open: the URL is handed to lwp-request as one argument
    # and never passes through a shell, so "?", "&" or ";" in it are harmless.
    unless (open($pipe, '-|', 'lwp-request', $url)) {
      $c->app->log->info("Could not run lwp-request for $url: $!");
      undef $pipe;
    }
  }

  if ($opt_C || !$cccache{$word}) {
    $c->app->log->info("Getting classic collocates of $word.");
    $cccache{$word} = getClassicCollocators($word);
    # Bare nan/inf values are not valid JSON: turn them into strings.
    $cccache{$word} =~ s/:(-?)(nan|inf)/:"${1}${2}"/g;
    # A string consisting of a single double quote arrives as """; escape it.
    $cccache{$word} =~ s/"""/"\\""/g;
  } else {
    $c->app->log->info("Getting classic collocates for $word from cache.");
  }

  my $remote_json = "";
  if (defined $pipe) {
    $remote_json = join("", <$pipe>);
    close($pipe);
  }

  my $local = decode_json($cccache{$word});

  # Responses of 2000 characters or less are ignored.
  # NOTE(review): presumably a heuristic to skip error pages -- confirm.
  my $remote;
  if (length($remote_json) > 2000) {
    # A broken remote answer must not take down the local request.
    $remote = eval { decode_json($remote_json) };
    unless (defined $remote) {
      $c->app->log->info("Ignoring undecodable response from $compare_to: $@");
    }
  }

  if (ref($remote) eq 'HASH') {
    my %remote_ld;
    my $min_ld = 14;   # starting value for the minimum search below
    foreach my $entry (@{$remote->{collocates}}) {
      $remote_ld{$entry->{word}} = $entry->{ld};
      $min_ld = $entry->{ld} if ($entry->{ld} < $min_ld);
    }
    foreach my $entry (@{$local->{collocates}}) {
      my $w = $entry->{word};
      # Words missing from the remote result are compared against a value
      # slightly below the smallest remote "ld".
      $entry->{delta} = $entry->{ld} - (defined $remote_ld{$w} ? $remote_ld{$w} : $min_ld - 0.1);
    }
  } else {
    foreach my $entry (@{$local->{collocates}}) {
      $entry->{delta} = 0;
    }
  }

  return(encode_json($local));
}
96
# Return the similar profiles of a word, computing them at most once per
# word as long as the stored result is a true value.
#
# Parameters:
#   $c    - object providing $c->app->log->info; only used on a cache hit.
#   $word - key for %spcache and argument for getSimilarProfiles.
#
# Returns: the value stored in %spcache for $word.
sub getSimilarProfilesCached {
  my ($c, $word) = @_;

  # Cache hit: log it and hand back the stored value.
  if ($spcache{$word}) {
    $c->app->log->info("Getting similar profiles for $word from cache:");
    return $spcache{$word};
  }

  # Cache miss (or a false cached value): compute, store and return.
  $spcache{$word} = getSimilarProfiles($word);
  return $spcache{$word};
}
106
107return 1;