| use utf8; |
| package IDS::DeReKoVecs::Read; |
| use LWP::Simple; |
| use strict; |
| use warnings; |
| use Config; |
| |
# Path of the C source file handed to Inline::C further down. It is filled in
# at compile time by the BEGIN block below. Note that the "= undef" part of
# this statement is an ordinary run-time assignment: it executes only after
# the BEGIN block and the `use Inline` statement have already used the value.
my $src_file = undef;

our $mergedEnd = 0;   # word indices above this value are shifted down by it
                      # (see getClassicCollocatorsCached)
our %cache;
our %cccache;         # classic collocator cache
our %spcache;         # similar profile cache
our $opt_p = 5676;
our $opt_C;           # when true, getClassicCollocatorsCached ignores %cccache

BEGIN {
    # The C source is addressed via this module's own path with the file name
    # swapped. The dot is escaped and the match anchored to the end of the
    # path so that only the trailing "Read.pm" can ever be replaced.
    $src_file = __FILE__;
    $src_file =~ s/Read\.pm\z/derekovecs-server.c/;
}
| |
# Compile and bind the C code found at $src_file.
# The extra compiler flags are appended to Perl's own ccflags; the leading
# space in the literal keeps "-Wall" from fusing with the last flag of
# $Config{ccflags}.
use Inline C => "$src_file" =>
    CLEAN_AFTER_BUILD => 0,
    BUILD_NOISY       => 1,
    ccflags           => $Config{ccflags} . " -Wall -Wno-unused-result -fno-rtti -O4 -I/usr/local/include",
    libs              => "-L/usr/local/lib64 -L/usr/local/lib -lcollocatordb";
# Alternative configurations, kept for reference:
#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
| |
| use Mojo::JSON qw(decode_json encode_json to_json); |
| use Exporter qw(import); |
| |
# Names exported by default into every package that uses this module.
our @EXPORT = qw(
    init_net
    load_sprofiles
    getVocabSize
    getDowntimeCalendar
    getCollocationAssociation
    getClassicCollocatorsCached
    getSimilarProfiles
    getSimilarProfilesCached
    getBiggestMergedDifferences
    filter_garbage
    get_neighbours
    getWordNumber
    dump_vecs
    dump_for_numpy
    cos_similarity_as_json
    get_version
);
| |
# Fetch the downtime calendar published at $url.
#
# Parameters:
#   $url - address of the calendar; undef, an empty string or a
#          whitespace-only string all mean "no calendar".
# Returns:
#   "" when no usable URL was given; otherwise the result of
#   LWP::Simple::get($url), which per the LWP::Simple documentation is
#   undef when the document could not be fetched.
sub getDowntimeCalendar {
    my ($url) = @_;

    # An undefined URL is handled like an empty one instead of triggering an
    # "uninitialized value" warning in the pattern match.
    if (!defined $url || $url =~ m/^\s*$/) {
        return "";
    }

    my $calendar = LWP::Simple::get($url);
    return $calendar;
}
| |
# Thin wrapper around getCollocationScores().
#
# Parameters:
#   (first)    - accepted for call compatibility, not used here
#   $word      - forwarded as first argument
#   $collocate - forwarded as second argument
# Returns:
#   whatever getCollocationScores($word, $collocate) returns, unmodified.
sub getCollocationAssociation {
    my (undef, $word, $collocate) = @_;

    return getCollocationScores($word, $collocate);
}
| |
# Return the classic collocates of a word as JSON text, adding a "delta"
# field to every collocate and optionally comparing against another server.
#
# Parameters:
#   $c          - object whose $c->app->log is used for logging
#   $word       - numeric word index; a value above $mergedEnd is reduced by
#                 $mergedEnd before the lookup
#   $compare_to - base URL of a server to compare with; undef or "" disables
#                 the comparison
# Returns:
#   encode_json() of the decoded result of getClassicCollocators($word).
#   Each entry of {collocates} gets a "delta": its own "ld" minus the remote
#   "ld" of the same word, or minus (smallest remote ld - 0.1) when the remote
#   list lacks the word. When there is no usable remote answer, every delta
#   is 0.
# Dies:
#   if the local collocator JSON cannot be decoded.
sub getClassicCollocatorsCached {
    my ($c, $word, $compare_to) = @_;

    if ($word > $mergedEnd) {
        $word -= $mergedEnd;
    }

    # The remote request is started first so that the child process runs
    # while the local lookup below is being computed.
    my $pipe;
    if (defined $compare_to && $compare_to ne "") {
        my $url = "$compare_to/getClassicCollocators?w=$word";
        $c->app->log->info("comparing syn neighbours to: $url");
        # List-form pipe open: no shell is involved, so nothing inside
        # $compare_to or $word can be interpreted as shell syntax.
        if (!open($pipe, '-|', 'lwp-request', $url)) {
            # assumes $c->app->log is a Mojo::Log-style logger offering warn()
            $c->app->log->warn("Cannot run lwp-request for $url: $!");
            $pipe = undef;
        }
    }

    if ($opt_C || !$cccache{$word}) {
        $c->app->log->info("Getting classic collocates of $word.");
        $cccache{$word} = getClassicCollocators($word);
        # Quote bare nan/inf values so that the text is valid JSON.
        $cccache{$word} =~ s/:(-?)(nan|inf)/:"${1}${2}"/g;
        # Turn a string consisting of a single unescaped quote into "\"".
        $cccache{$word} =~ s/"""/"\\""/g;
    } else {
        $c->app->log->info("Getting classic collocates for $word from cache.");
    }

    my $remote_json = "";
    if (defined $pipe) {
        $remote_json = join("", <$pipe>);
        close($pipe);
    }

    my $local = decode_json($cccache{$word});

    # Only answers longer than 2000 characters are considered for comparison.
    my $remote;
    if (length($remote_json) > 2000) {
        $remote = eval { decode_json($remote_json) };
        my $problem = $@ || (ref($remote) eq 'HASH' ? "" : "not a JSON object");
        if ($problem) {
            # A broken remote answer must not break the local result.
            $c->app->log->warn("Ignoring comparison data from $compare_to: $problem");
            $remote = undef;
        }
    }

    if ($remote) {
        my %remote_ld;
        # Starting value for the minimum search; presumably above any real
        # ld score -- TODO confirm.
        my $min_ld = 14;
        foreach my $entry (@{ $remote->{collocates} }) {
            $remote_ld{ $entry->{word} } = $entry->{ld};
            $min_ld = $entry->{ld} if ($entry->{ld} < $min_ld);
        }
        foreach my $entry (@{ $local->{collocates} }) {
            my $other_ld = $remote_ld{ $entry->{word} };
            # Words unknown to the remote side are rated just below its
            # weakest collocate.
            $entry->{delta} = $entry->{ld} - (defined $other_ld ? $other_ld : $min_ld - 0.1);
        }
    } else {
        foreach my $entry (@{ $local->{collocates} }) {
            $entry->{delta} = 0;
        }
    }

    return encode_json($local);
}
| |
# Memoizing front end for getSimilarProfiles().
#
# Parameters:
#   $c    - object whose $c->app->log is used to report cache hits
#   $word - argument for getSimilarProfiles(); also the key into %spcache
# Returns:
#   the %spcache entry for $word, which is computed with
#   getSimilarProfiles($word) whenever the stored entry is missing or false.
sub getSimilarProfilesCached {
    my ($c, $word) = @_;

    # Cache miss (or a false cached value): compute and remember the result.
    unless ($spcache{$word}) {
        $spcache{$word} = getSimilarProfiles($word);
        return $spcache{$word};
    }

    $c->app->log->info("Getting similar profiles for $word from cache:");
    return $spcache{$word};
}
| |
| return 1; |