package Ernad::Learn::Mocla; use strict; use warnings; use Carp qw(cluck longmess shortmess croak confess); use Data::Dumper; use File::Basename; use File::Slurper; use File::Path; #use List::Util qw(shuffle); #use Storable; use URI::Escape; use Ernad::Erimp; use Ernad::Dates; use Ernad::Files; use Ernad::Learn::Lag; use Ernad::Learn::Isink; use Ernad::Learn::Weights; use Ernad::Learn::Dokli; my $exfit_ext='.txt'; binmode(STDOUT,':utf8'); ## constructor sub new { my $this=shift; my $class=ref($this) || $this; my $m={}; bless $m, $class; my $params=shift; ## copy parameters into the object foreach my $key (keys %{$params}) { $m->{$key}=$params->{$key}; } if(not defined($m->{'impna'})) { if(not defined($m->{'e'})) { confess "I need an impna here."; } $m->{'impna'}=$m->{'e'}->{'impna'}; } if(not defined($m->{'e'})) { $m->{'e'}=Ernad::Erimp->new({'impna' => $m->{'impna'}, 'repcode'=> $m->{'repcode'}, 'verbose' => $m->{'verbose'}}); } if(not defined($m->{'d'})) { $m->{'d'}=Ernad::Learn::Dokli->new({'impna' => $m->{'impna'}, 'e'=> $m->{'e'}, 'repcode'=> $m->{'repcode'}, 'verbose' => $m->{'verbose'}}); } &Ernad::Learn::Common::set_basic($m); $m->init(); return $m; } ### constructor #sub new { # my $this=shift; # my $class=ref($this) || $this; # my $d={}; # bless $d, $class; # my $params=shift; # ## copy parameters into the object # foreach my $key (keys %{$params}) { # $m->{$key}=$params->{$key}; # } # if(not defined($params->{'impna'})) { # if(defined($m->{'e'}->{'impna'})) { # $m->{'impna'}=$m->{'e'}->{'impna'}; # } # else { # confess "fatal: no impna parameter\n"; # } # } # if(not $m->{'impna'}) { # confess "fatal: empty impna parameter\n"; # } # my $impna=$m->{'impna'}; # if(not defined($m->{'e'})) { # $m->{'e'}=Ernad::Erimp->new({'impna'=>$impna, 'no_reports'=>1}); # } # ## import the train_limit_by_days # my $train_limit=$m->{'e'}->{'conf'}->{'train_limit_by_days'} // ''; # if($train_limit) { # $m->{'ernad_limit_by_days'}=$train_limit; # $m->{'e'}->echo(__LINE__,"train limit by days set to $train_limit",4); # } # else { # $m->{'ernad_limit_by_days'}=0; # } # ## import the svm_flags # my $svm_train_flags=$m->{'e'}->{'conf'}->{'svm_train_flags'} // ''; # if($svm_train_flags) { # $m->{'svm_train_flags'}=$svm_train_flags; # $m->{'e'}->echo(__LINE__,"svm_train_flags set to $svm_train_flags",4); # } # else { # confess "I need svm_train_flags in the ernad configuration."; # } # ## set the time. it could be given at invocation # if(not defined($m->{'time'})) { # $m->{'time'}=time; # } # $m->set_report_training_limits(); # $m->set_issuedates(); # $m->set_dirs(); # $m->set_lisig(); # # # $m->set_dokli(); # $m->check_weighing_function(); # return $f; #} sub check_weighing_function { my $m=shift; my $scheme=$m->{'e'}->{'conf'}->{'weighing_scheme'} or confess "I need a weighing scheme in the ernad configuration."; my $weigh=$Ernad::Weights::weigh->{$scheme} or confess "The weighing scheme $scheme appears not to be defined."; ## this line is only there to prevent a warning of the type ## Name "Ernad::Weights::weigh" used only once: possible typo at Mocla.pm my $weigh_dump = Dumper $Ernad::Weights::weigh->{$scheme}; $m->{'weigh'}=$weigh; } sub set_report_training_limits { my $m=shift; my $e=$m->{'e'}; foreach my $repcode ($e->list_repcodes()) { ## list_repcodes should not have the allport_repcode, this is a ## temporary check if($repcode eq $e->get_allport_repcode()) { next; } my $etd=$e->{'report'}->{$repcode}->{'earliest_train_date'} // 0; $m->{'earliest_train_date'}->{$repcode}=$etd; } } sub init { my $m=shift; my $svm_train_flags=$m->{'e'}->{'conf'}->{'svm_train_flags'} // ''; if($svm_train_flags) { $m->{'svm_train_flags'}=$svm_train_flags; $m->{'e'}->echo(__LINE__,"svm_train_flags set to $svm_train_flags",4); } else { confess "I need svm_train_flags in the ernad configuration."; } } sub set_mocla_file_and_dates { my $m=shift; ## does it for a repcode, or for allport my $repcode=shift // ''; if(not defined($m->{'issuedates'})) { confess "I need $m->{'issuedates'} defined here."; } ## I implement report training limits through a separate training dates set ## train_dates are the same as for the allport, initially foreach my $issuedate (keys %{$m->{'issuedates'}}) { if($repcode) { $m->{'train_dates'}->{$repcode}->{$issuedate}=1; } else { $m->{'train_dates'}->{$issuedate}=1; } } if(not defined($repcode)) { $m->{'e'}->echo(__LINE__,"I set general training dates.",1); return; } ## If there is a start limit set, we cut the train dates my $report_train_start=$m->{'earliest_train_date'}->{$repcode}; if($report_train_start) { foreach my $date (keys %{$m->{'issuedates'}}) { if(&Ernad::Common::compare_date($report_train_start,$date) < 0) { $m->{'e'}->echo(__LINE__,"I unselect $date, before report train start $report_train_start",3); next; } delete $m->{'train_dates'}->{$repcode}->{$date}; } } if(not scalar keys %{$m->{'train_dates'}->{$repcode}}) { $m->{'e'}->echo(__LINE__,"I have no dates to train for $repcode",1); return 0; } my $ext=&Ernad::Common::find_extreme_issuedates($m->{'train_dates'}->{$repcode}); my $version=$ext->{'min'}.'_'.$ext->{'max'}; my $time; if(defined($m->{'dokli_time'})) { $time=$m->{'dokli_time'}; } else { $m->find_dokli(); $time=$m->{'dokli_time'} or confess "I need dokli_time here."; } $m->{'train_version'}->{$repcode}=$version; $m->{'train_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".train"; $m->{'model_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".model"; $m->{'lock_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".lock"; $m->{'out_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".out"; $m->{'err_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".err"; $m->{'time_file'}->{$repcode}=$m->{'mocla_dir'}.'/'.$repcode.'/'.$version.'_'.$time.".time"; return $version; } sub get_dokli_object { my $m=shift; my $fitport; #if($m->{'d'} and $m->{'repcode'}) { # return $m->{'d'}->{$m->{'repcode'}}; #} if($m->{'fitport'}) { $fitport=$m->{'fitport'}; } elsif($m->{'e'}->{'fitport'}) { $fitport=$m->{'e'}->{'fitport'}; } else { if(not $m->{'repcode'}) { confess "I have no repcode."; } $fitport=&Ernad::Learn::Common::get_fitport($m,$m->{'repcode'}); } if($m->{'e'}->{'conf'}->{'separate_doklis'}) { if(not $m->{'repcode'}) { confess "I have no repcode."; } } if(not defined($m->{'d'}->{$fitport})) { if($m->{'repcode'}) { $m->{'e'}->echo(__LINE__,"\$m->{repcode} is ".$m->{'repcode'}); } else { $m->{'e'}->echo(__LINE__,"I have no \$m->{repcode}."); } if($m->{'e'}->{'repcode'}) { $m->{'e'}->echo(__LINE__,"\$m->{e}->{repcode} is ".$m->{'e'}->{'repcode'}); } else { $m->{'e'}->echo(__LINE__,"I have no \$m->{'e'}->{repcode}."); } $m->{'d'}->{$fitport}=Ernad::Learn::Dokli->new({'impna' => $m->{'impna'}, 'time' => $m->{'time'}, 'e'=>$m->{'e'}, 'verbose' => $m->{'verbose'}, 'repcode' => $m->{'repcode'}, 'report' => $m->{'repcode'}}); } ## set it as a state $m->{'dokli'}=$m->{'d'}->{$fitport}; } sub find_dokli { my $m=shift; $m->get_dokli_object(); my $d=$m->{'dokli'} or confess "I need a dokli object here."; $m->{'e'}->echo(__LINE__,"I call set_dokli"); $d->set_dokli(); my $dokli_file=$d->{'dokli_file'} or confess "I need a dokli_file here"; $m->{'e'}->echo(__LINE__,"the dokli_file is $dokli_file"); if(not -f $dokli_file) { $d->build_dokli(); } $m->{'dokli_file'}=$d->{'dokli_file'} or confess "I need a dokli_file here"; if(not -f $m->{'dokli_file'}) { confess "I can not open ".$m->{'dokli_file'}; } $m->{'dokli_time'}=$d->{'dokli_time'} or confess "I need a dokli_time here"; } ## best in here means the most recent sub find_most_recent_model { my $m=shift; my $repcode=shift // confess "I need a repcode here."; my $fitport=&Ernad::Learn::Common::get_fitport($m,$repcode); $m->get_dokli_object(); #if(not defined($m->{'d'}->{$fitport})) { # $m->{'d'}->{$fitport}=Ernad::Learn::Dokli->new({'impna' => $m->{'impna'}, # 'report' => $m->{'fitport'}, # 'time' => $m->{'time'}, # 'verbose' => $m->{'verbose'}}); #} my $d=$m->{'d'} // confess "I need a dockli here."; # my $d=$m->{'d'}->{$fitport} or confess "I need a fitport here."; if(not defined($m->{'mocla_dir'})) { confess "I need a mocla_dir here"; } my $mocla_dir=$m->{'mocla_dir'}.'/'.$repcode; if(not $mocla_dir) { confess "I need mocla_dir set here"; } if(not -d $mocla_dir) { mkpath($mocla_dir); } opendir( my $moclas, $mocla_dir) or confess "I can't open the mocla_dir $mocla_dir."; my $best_time=0; my $best_period=0; my $best_file; $m->{'e'}->echo(__LINE__,"I go looking for model files in $mocla_dir.",3); while (my $file = readdir $moclas ) { if(not $file=~m|(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})_(\d+)\.model$|) { $m->{'e'}->echo(__LINE__,"I'm skipping file '$file' when looking for model files.",4); next; } $m->{'e'}->echo(__LINE__,"I found $repcode model $file"); my $model_fufi="$mocla_dir/$file"; ## from an incomplete run, it may be zero if(-z $model_fufi) { unlink $model_fufi; next; } my $start=$1; my $end=$2; my $time=$3; ## We look at the period here, but this seems my $period=&Ernad::Dates::diff_dates($start,$end); if(not $d->check_if_tidafs_exists($time)) { $m->{'e'}->echo(__LINE__,"found $repcode model for time $time, but there is no dokli",4); next; } if($time > $best_time) { $best_file=$file; $best_time=$time; $best_period=$period; next; } if($time == $best_time) { if($period > $best_period) { $best_file=$file; $best_period=$period; } } } if(not defined($best_file)) { $m->{'e'}->echo(__LINE__,"I can not find a model for $repcode."); return ''; } my $best_model=$mocla_dir.'/'.$best_file; $m->{'e'}->echo(__LINE__,"$best_model is the most recent model for $repcode",1); return $best_model; } ## this is the general approach ## but sometimes this does not work, as individual ## handles may be under other issuedates. sub build_test_file_by_issuedate { my $m=shift; my $e=$main::e // confess "I need an erimp here."; my $repcode=shift // confess "I need a repcode here."; my $issuedate=shift // confess "I need an issuedate here."; my $out=''; my $model_file=$m->find_most_recent_model($repcode); if(not $model_file) { $m->{'e'}->echo(__LINE__,"I have no model for $repcode. I start to build one."); $m->model_report($repcode); $model_file=$m->find_most_recent_model($repcode); } my $model_time=&Ernad::Common::get_time_from_file_name($model_file) or confess "I can not find the time in '$model_file'"; $m->{'time'}=$model_time; $m->{'d'}->{'time'}=$model_time; $m->{'d'}->{'dokli_time'}=$model_time; $m->{'model_file'}->{$repcode}=$model_file; if(not defined($m->{'fitport'})) { confess "I need a fitport here."; } ## if we don't truncate, we take the test lines from the dokli. my $glob=$m->{'dokli_dir'}.'/*_'.$model_time.'.txt'; $e->echo(__LINE__,"I am looking for $glob"); my @files=glob($glob); ## dokli per report --> redol my $redol_file=$files[0]; my $test_file=$m->{'dokli_dir'}.'/'.$issuedate.'_'.$model_time.'.test'; if(not &Ernad::Common::does_file_need_renewal($test_file,$redol_file)) { $m->{'e'}->echo(__LINE__,"I skip the renewal of $test_file, it needs no renewal.",2); $m->{'test_file'}=$test_file; return $test_file; } $m->get_dokli_object(); $m->{'dokli'}->set_dokli_for_model($redol_file); $m->find_dokli(); if(not $e->{'conf'}->{'truncate'}) { ## we use the dokli $m->{'e'}->echo(__LINE__,"The redol_file is $redol_file, I look for $issuedate."); if(not $m->{'dokli'}->is_date_in_dokli($issuedate,$redol_file)) { $m->{'e'}->echo(__LINE__,"$issuedate IS NOT IN dokli_file $redol_file",2); $m->{'dokli'}->update_dokli($redol_file); } else { $m->{'e'}->echo(__LINE__,"$issuedate IS IN dokli_file $redol_file",2); } $model_time=&Ernad::Common::get_time_from_file_name($redol_file); $m->{'e'}->echo(__LINE__,"I get vemlis for $issuedate from $redol_file at $model_time"); my $vemlis=$m->{'dokli'}->get_vemlis_by_date($issuedate, $redol_file, $model_time) // ''; if(not $vemlis) { confess "I see no vemlis for $issuedate for report $repcode at $model_time in $redol_file"; } foreach my $line (split("\n",$vemlis)) { $out.="0 $line\n"; } if($out=~m| \d+ |) { confess "bad $out"; } open(F,"> $test_file"); print F $out; close F; $m->{'e'}->echo(__LINE__,"I write $test_file."); return $test_file; } ## the case when we don't have a dokli entry $m->{'e'}->echo(__LINE__,"I have been configured to truncate. I build the test file without dockli."); $test_file=$m->build_test_file_without_dokli(); return $test_file; } ;; sub build_test_file_without_dokli { my $m=shift; my $e=$main::e // confess "I need an erimp here."; my $issuedate=$e->{'issuedate'} // confess "I need an issuedate here."; my $repcode=$e->{'repcode'} // confess "I need a repcode here."; my $rerc=$e->{'report'}->{$repcode} // confess "I don't know about $repcode."; my $d=$m->{'d'} // confess "I need a dokli here"; ## build the exfit file my $doc=$rerc->{'amf_doc'}; if(not &Ernad::Common::count_texts_in_rif($doc)) { confess "Your amf_doc seems to contain no documents."; } my $namex_file=$e->{'dir'}->{'learn'}.'/namex/'.$issuedate.'_term.txt'; if(not -f $namex_file) { $namex_file=$d->build_exfit('term',$issuedate,$doc,'namex'); } $e->echo(__LINE__,"done"); my $dokli_time=$m->{'dokli_time'} or confess "I need dokli_time here."; $d->make_test_file_with_terms_only($namex_file); } ## this is the general approach ## but sometimes this does not work, as individual ## handles may be under other issuedates. # sub build_test_file_by_issuedate_old { # my $m=shift; # my $repcode=shift // confess "I need a repcode here."; # my $issuedate=shift // confess "I need an issuedate here."; # my $out=''; # my $model_file=$m->find_most_recent_model($repcode); # if(not $model_file) { # $m->{'e'}->echo(__LINE__,"I have no model for $repcode. I start to build one."); # $m->model_report($repcode); # $model_file=$m->find_most_recent_model($repcode); # } # my $model_time=&Ernad::Common::get_time_from_file_name($model_file) # or confess "I can not find the time in '$model_file'"; # $m->{'time'}=$model_time; # $m->{'d'}->{'time'}=$model_time; # $m->{'d'}->{'dokli_time'}=$model_time; # $m->{'model_file'}->{$repcode}=$model_file; # if(not defined($m->{'fitport'})) { # confess "I need a fitport here."; # } # my $glob=$m->{'dokli_dir'}.'/*_'.$model_time.'.txt'; # my @files=glob($glob); # ## dokli per report --> redol # my $redol_file=$files[0]; # my $test_file=$m->{'dokli_dir'}.'/'.$issuedate.'_'.$model_time.'.test'; # if(not &Ernad::Common::does_file_need_renewal($test_file,$redol_file)) { # $m->{'e'}->echo(__LINE__,"I skip the renewal of $test_file, it needs no renewal.",2); # $m->{'test_file'}=$test_file; # return $test_file; # } # $m->get_dokli_object(); # $m->{'dokli'}->set_dokli_for_model($redol_file); # $m->find_dokli(); # if(not $m->{'dokli'}->is_date_in_dokli($issuedate,$redol_file)) { # $m->{'e'}->echo(__LINE__,"$issuedate IS NOT IN dokli_file $redol_file",2); # $m->{'dokli'}->update_dokli($redol_file); # } # else { # $m->{'e'}->echo(__LINE__,"$issuedate IS IN dokli_file $redol_file",2); # } # $model_time=&Ernad::Common::get_time_from_file_name($redol_file); # $m->{'e'}->echo(__LINE__,"I get vemlis for $issuedate from $redol_file at $model_time"); # my $vemlis=$m->{'dokli'}->get_vemlis_by_date($issuedate, $redol_file, $model_time) // ''; # if(not $vemlis) { # confess "I see no vemlis for $issuedate for report $repcode at $model_time in $redol_file"; # } # foreach my $line (split("\n",$vemlis)) { # $out.="0 $line\n"; # } # if($out=~m| \d+ |) { # confess "bad $out"; # } # open(F,"> $test_file"); # print F $out; # close F; # return $test_file; # } ## this is the general approach ## but sometimes this does not work, as individual ## handles may be under other issuedates. # sub build_test_file_via_temporary_exfit { # my $m=shift; # my $e=$main::e // confess "I need a repcode here."; # my $repcode=$e->{'repcode'} // confess "I need a repcode here."; # my $issuedate=$e->{'issuedate'} // confess "I need an issuedate here."; # my $out=''; # my $model_file=$m->find_most_recent_model($repcode); # if(not $model_file) { # $m->{'e'}->echo(__LINE__,"I have no model for $repcode. I start to build one."); # $m->model_report($repcode); # $model_file=$m->find_most_recent_model($repcode); # } # my $model_time=&Ernad::Common::get_time_from_file_name($model_file) # or confess "I can not find the time in '$model_file'"; # $m->{'time'}=$model_time; # $m->{'d'}->{'time'}=$model_time; # $m->{'d'}->{'dokli_time'}=$model_time; # $m->{'model_file'}->{$repcode}=$model_file; # if(not defined($m->{'fitport'})) { # confess "I need a fitport here."; # } # my $glob=$m->{'dokli_dir'}.'/*_'.$model_time.'.txt'; # my @files=glob($glob); # ## dokli per report --> redol # my $redol_file=$files[0]; # my $test_file=$m->{'dokli_dir'}.'/'.$issuedate.'_'.$model_time.'.test'; # if(not &Ernad::Common::does_file_need_renewal($test_file,$redol_file)) { # $m->{'e'}->echo(__LINE__,"I skip the renewal of $test_file, it needs no renewal.",2); # $m->{'test_file'}=$test_file; # return $test_file; # } # $m->get_dokli_object(); # $m->{'dokli'}->set_dokli_for_model($redol_file); # $m->find_dokli(); # if(not $m->{'dokli'}->is_date_in_dokli($issuedate,$redol_file)) { # $m->{'e'}->echo(__LINE__,"$issuedate IS NOT IN dokli_file $redol_file",2); # $m->{'dokli'}->update_dokli($redol_file); # } # else { # $m->{'e'}->echo(__LINE__,"$issuedate IS IN dokli_file $redol_file",2); # } # $model_time=&Ernad::Common::get_time_from_file_name($redol_file); # $m->{'e'}->echo(__LINE__,"I get vemlis for $issuedate from $redol_file at $model_time"); # my $vemlis=$m->{'dokli'}->get_vemlis_by_date($issuedate, $redol_file, $model_time) // ''; # if(not $vemlis) { # confess "I see no vemlis for $issuedate for report $repcode at $model_time in $redol_file"; # } # foreach my $line (split("\n",$vemlis)) { # $out.="0 $line\n"; # } # if($out=~m| \d+ |) { # confess "bad $out"; # } # open(F,"> $test_file"); # print F $out; # close F; # return $test_file; # } # # # sub complement_test_file { # my $m=shift; # my $test_file=shift; # my $handle=shift // confess "I need an issuedate here."; # my $out=''; # my $model_file=$m->find_most_recent_model($repcode); # if(not $model_file) { # $m->{'e'}->echo(__LINE__,"I have no model for $repcode. I start to build one."); # $m->model_report($repcode); # $model_file=$m->find_most_recent_model($repcode); # } # my $model_time=&Ernad::Common::get_time_from_file_name($model_file) # or confess "I can not find the time in '$model_file'"; # $m->{'time'}=$model_time; # $m->{'model_file'}->{$repcode}=$model_file; # if(not defined($m->{'fitport'})) { # confess "I need a fitport here."; # } # my $glob=$m->{'dokli_dir'}.'/*_'.$model_time.'.txt'; # my @files=glob($glob); # ## dokli per report --> redol # my $redol_file=$files[0]; # if(not &Ernad::Common::does_file_need_renewal($test_file,$redol_file)) { # $m->{'e'}->echo(__LINE__,"I skip the renewal of $test_file, it needs no renewal.",2); # $m->{'test_file'}=$test_file; # return $test_file; # } # $m->get_dokli_object(); # $m->{'dokli'}->set_dokli_for_model($redol_file); # if(not $m->{'dokli'}->is_date_in_dokli($issuedate,$redol_file)) { # $m->{'e'}->echo(__LINE__,"$issuedate is not in dokli_file $redol_file",2); # $m->{'dokli'}->update_dokli($redol_file); # } # my $handles=$m->get_allport_sent_handles # # ;; # # my $vemlis=$m->{'dokli'}->get_vemlis_by_date($issuedate, $redol_file) // ''; # if(not $vemlis) { # confess "I see no vemlis for $issuedate for report $repcode at $model_time in $redol_file"; # } # foreach my $line (split("\n",$vemlis)) { # $out.="0 $line\n"; # } # if($out=~m| \d+ |) { # confess "bad $out"; # } # open(F,"> $test_file"); # print F $out; # close F; # return $test_file; # } sub predict { my $m=shift; my $repcode=shift or confess 'I need a repcode here'; my $issuedate=shift or confess 'I need an issuedate here'; my $flags=shift // ''; my $e=$main::e // confess "I need an erimp here"; my $test_file; #if(not $e->{'conf'}->{'separate_doklis'}) { $test_file=$m->build_test_file_by_issuedate($repcode,$issuedate); #} #else { # die "here."; #} ## this should fix the test file if the previous approach fails $m->check_test_file($test_file); my $model_file=$m->{'model_file'}->{$repcode} or confess "I can't see a model file"; my $model_dir=dirname($model_file); my $class_file=$test_file; $class_file=~s|\.test$|\.class|; my $class_file_name=basename($class_file); $class_file=$model_dir.'/'.$class_file_name; my $out_file=$class_file; $out_file=~s|\.class|.out|; my $err_file=$class_file; $err_file=~s|\.class|.err|; $flags.=" -b 1"; if(not &Ernad::Common::does_file_need_renewal($class_file,$model_file,$test_file)) { $m->{'e'}->echo(__LINE__,"I skip the renewal of $class_file, it needs no renewal.",2); $m->{'class_file'}->{$repcode}->{$issuedate}=$class_file; } else { ## request probababilites my $s="svm-predict $flags $test_file $model_file $class_file > $out_file 2> $err_file"; $m->{'e'}->echo(__LINE__,"I run $s"); system($s); } if(not -z $err_file) { my $error=&File::Slurper::read_text($err_file); chomp $error; confess "smv_predict gives me an error '$error'"; } ## conduct a check if the number of lines in the out_file my $count_lines_in_test=&Ernad::Common::count_lines_in_file($test_file); my $count_lines_in_class=&Ernad::Common::count_lines_in_file($class_file); if($count_lines_in_test != $count_lines_in_class -1) { confess "$test_file has $count_lines_in_test but $class_file has $count_lines_in_class"; } ## used for a check in the caller $m->{'class_file'}->{$repcode}->{$issuedate}="$class_file"; ## this is a debugging tool my $weights=$m->write_side_file($class_file,$test_file); return $weights; } sub write_side_file { my $m=shift; my $class_file=shift; my $test_file=shift; my @class_lines=&File::Slurper::read_lines($class_file); ## first line is a comment my $first_line=shift @class_lines; chomp $first_line; if($first_line eq 'labels -1 1') { map ~s|\S+\s+\S+\s+(\S+)|$1|g, @class_lines; } elsif($first_line eq 'labels 1 -1') { map ~s|\S+\s+(\S+)\s+\S+|$1|g, @class_lines; } else { confess "bad first line $first_line"; } my $side_file=$class_file; $side_file=~s|\.class$|\.side|; my @test_lines=&File::Slurper::read_lines($test_file); #print Dumper @test_lines; #print "\n\n"; map ~s|[^#]+#\s*||g, @test_lines; #print Dumper @test_lines; my $count=0; my $out; my $weights; foreach my $line (@test_lines) { chomp $line; my $handle=uri_unescape($line); my $weight=$class_lines[$count]; $out.="$weight\t$line\n"; $weights->{$handle}=$weight; $count++; } &File::Slurper::write_text($side_file,$out); return $weights; } ## callable externally sub model_report { my $m=shift; my $repcode=shift // ''; my $e=$m->{'e'} // confess 'I need an erimp here.'; if(not $repcode) { $repcode=$e->{'repcode'} // confess "I need a repcode here."; } ## do an extra check to prnit terlis separate my $type_treli_separate=1; if(not $m->{'e'}->can_the_report_be_modelled($repcode)) { $m->{'e'}->echo(__LINE__,"I can't model the report $repcode because I have no accepted papers.",1); $e->{'report'}->{$repcode}->{'modelled'}=0; return 0; } $m->set_mocla_file_and_dates($repcode); my $train_file=$m->{'train_file'}->{$repcode} or confess "I need a train file here"; $m->{'e'}->echo(__LINE__,"I have set the train_file to $train_file"); my $model_file=$m->{'model_file'}->{$repcode} or confess "I need a model file here"; $m->{'e'}->echo(__LINE__,"I have set the model_file to $model_file"); if(-f $model_file) { $m->{'e'}->echo(__LINE__,"I found the model_file $model_file."); my $model_age=(-M $model_file) * 60 * 60 * 24; ## Check is the model is recent. If it is, we should skip remodelling. my $lagger=$Ernad::Learn::Lag::lag->{$m->{'impna'}}; if(not defined($lagger)) { confess "I need a lagger here."; } my $required_age=&{$lagger}($repcode) // confess "I need a required age here."; if($model_age < $required_age) { $m->{'e'}->echo(__LINE__, "The model $model_file is too young for me to redo it."); $e->{'report'}->{$repcode}->{'modelled'}=1; return 0; } ## find the training file that the model is based on if(not -f $train_file and -f $model_file) { confess "A model file $model_file exists without a training file $train_file."; } if(-f $train_file) { $m->{'e'}->echo(__LINE__,"I found the train_file is $train_file."); if(-M $train_file < -M $model_file) { confess "The train file $train_file is more recent than the model_file $model_file."; } my $sent_dir=$m->{'e'}->{'report'}->{$repcode}->{'dir'}->{'sent'} // confess 'I need a sent_dir here'; if(not &Ernad::Common::does_file_need_renewal($train_file, $sent_dir)) { $m->{'e'}->echo(__LINE__,"I found $model_file. It is up to date, ending build_model."); $e->{'report'}->{$repcode}->{'modelled'}=1; return $train_file; } } } if(not -f $model_file) { $m->{'e'}->echo(__LINE__,"I don't see model_file $model_file. I need to build it."); } #my $dokli_file=$m->{'dokli_file'} // ''; #if(not $dokli_file or not -f $dokli_file) { # $m->{'e'}->echo(__LINE__,"I am building dokli $dokli_file."); # $m->get_dokli_object(); # my $d=$m->{'dokli'} or confess "I need a dokli object here."; # $d->build_dokli('add'); #} $m->find_dokli(); if(not -f $m->{'dokli_file'}) { confess "building the dokli appears to have failed"; } my $lock_file=$m->{'lock_file'}->{$repcode} or confess "I need a lock file here"; $m->{'e'}->echo(__LINE__,"lock_file is $lock_file.",7); if(-f $lock_file) { ## remove the lock after more than one day my $lock_days_old=-M $lock_file; if($lock_days_old > 1) { $m->{'e'}->echo(__LINE__,"I delete an $lock_days_old lock file $lock_file",1); unlink $lock_file; } else { $m->{'e'}->echo(__LINE__,"I find $lock_file, I don't build $model_file",1); $e->{'report'}->{$repcode}->{'modelled'}=0; return 0 ; } } else { $m->{'e'}->echo(__LINE__,"I don't see that file, so I move on.",7); } my $trained=$m->build_train_file($repcode,$m->{'dokli_file'}); if(not $trained) { $m->{'e'}->echo(__LINE__,"I could not build the train file.",1); $e->{'report'}->{$repcode}->{'modelled'}=0; return 0; } if(not &Ernad::Common::does_file_need_renewal($model_file, $train_file)) { $m->{'e'}->echo(__LINE__,"I skip the renewal of $model_file, it's newer than the train file $train_file.",1); unlink $lock_file; return $train_file; } ## only for reporting my $out_file=$m->{'out_file'}->{$repcode} or confess "I need a out file here"; my $err_file=$m->{'err_file'}->{$repcode} or confess "I need a err file here"; my $time_file=$m->{'time_file'}->{$repcode} or confess "I need a err file here"; ## -b 1 is required to get probablities my $flags="-b 1" . ' '. $m->{'svm_train_flags'}; my $train_bin='/usr/bin/svm-train'; my $time_bin='/usr/bin/time'; my $s="$time_bin -o $time_file -p $train_bin $flags $train_file $model_file > $out_file 2> $err_file"; my $start=time; $m->{'e'}->echo(__LINE__,"command is $s",1); system("touch $lock_file"); system($s); my $end=time; unlink $lock_file; my $error_text=&File::Slurper::read_text($err_file); if(not $error_text=~m|^\s*$|) { confess "Training has finished with an error '$error_text'."; } my $run_time=$end-$start; $m->{'e'}->echo(__LINE__,"done in $run_time",1); delete $m->{'train_file'}->{$repcode}; delete $m->{'isink_files'}->{$repcode}; $e->{'report'}->{$repcode}->{'modelled'}=1; return $train_file; } ## only called by build_model, but in an else, so ## let's put it in a separate method. sub build_train_file { my $m=shift; my $repcode=shift or confess "I need a repcode here."; my $train_file=$m->{'train_file'}->{$repcode} or confess "I need a training file here"; $m->{'e'}->echo(__LINE__,"I am pondering the renewal of train_file is $train_file",7); my $verbose=$m->{'verbose'} // confess 'I need verbosity set here.'; if(not defined($m->{'i'})) { $m->{'i'}=Ernad::Learn::Isink->new({'impna' => $m->{'impna'}, 'repcode'=> $repcode, 'verbose' => $m->{'verbose'}, 'e' => $m->{'e'}}); } my $i=$m->{'i'} or confess "I need an isink object here."; ## traing can be long-run, so we always list the isinks rather than ## relying on an earlier listing. $i->list_isink_files($repcode,'isink'); my $isink_files=$i->{'file'}->{'isink'}; if(not defined($isink_files->{$repcode})) { $m->{'e'}->echo(__LINE__,"I can't find isink_file for $repcode. I can't build the train file without them."); return 0; } my $iskin_dir=$m->{'isink_dir'}.'/'.$repcode; ## from an incomplete run, it may be zero if(-z $train_file) { unlink $train_file; } if(not &Ernad::Common::does_file_need_renewal($train_file,$iskin_dir,$m->{'dokli_file'})) { $m->{'e'}->echo(__LINE__,"I skip the renewal of $train_file, it needs no renewal.",1); $m->{'train_file'}->{$repcode}=$train_file; return 1; } if(not defined($m->{'train_dates'}->{$repcode})) { confess "I need to have train_dates defined here"; } &Ernad::Files::prepare_for_file($train_file); my $train_fh; open($train_fh,"> $train_file") or confess "I could not open $train_file."; foreach my $issuedate (sort keys %{$i->{'file'}->{'isink'}->{$repcode}}) { my $is_still_in_vedex=$m->make_trelis($repcode,$issuedate,$train_fh); if($is_still_in_vedex) { close $train_fh; unlink $train_file; confess "Some handles are still in the vedex: $is_still_in_vedex."; } } close $train_fh; delete $m->{'train_dates'}->{$repcode}; #$m->{'train_file'}->{$repcode}=$train_file; $m->{'e'}->echo(__LINE__,"I have written $train_file"); } sub make_trelis { my $m=shift; my $repcode=shift // confess; my $issuedate=shift // confess; my $train_fh=shift; my $i=$m->{'i'}; my $e=$m->{'e'} // confess "I need an erimp here."; my $rerc=$e->{'report'}->{$repcode} // confess "I don't know about the report $repcode."; ## if we do it without learnable_dates if(not defined($rerc->{'learnable_issuedates'})) { if(not defined($m->{'train_dates'}->{$repcode}->{$issuedate})) { $m->{'e'}->echo(__LINE__,"I skip date $issuedate. It does not go into the train_file for $repcode",1); return; } if(not defined($m->{'issuedates'}->{$issuedate})) { $m->{'e'}->echo(__LINE__,"I skip date $issuedate. It does has not valid allport date.",1); next; } } else { $m->{'issuedates'}=$rerc->{'learnable_issuedates'}; } $m->{'e'}->echo(__LINE__,"getting vemlis for $repcode issuedate $issuedate",1); my $vemlis=$m->{'dokli'}->get_vemlis_by_date($issuedate) // ''; if(not $vemlis) { # # $m->{'e'}->echo(__LINE__,"WARN: no vemlis for $issuedate for report $repcode",0); confess " no vemlis for $issuedate for report $repcode"; # # next; } my $isink_file=$i->{'file'}->{'isink'}->{$repcode}->{$issuedate}; if(not -f $isink_file) { confess "I can't see the isink_file $isink_file."; } my $isink_text=&File::Slurper::read_text($isink_file); my $vedex=$m->index_vemlis($vemlis); foreach my $isili (split("\n",$isink_text)) { #print "$isili\n"; my $treli=$m->make_treli($isili,$vedex) or next; print $train_fh $treli; } ## check we have nothing in the vedex left my $is_still_in_vedex=join(' ', keys %{$vedex}); return $is_still_in_vedex; } sub make_treli { my $m=shift; my $isili=shift; my $vedex=shift; $isili=~m|^([-+]1)\s+#\s+(\S+)\s*$| or confess "bad isili $isili"; my $indic=$1; my $handle=$2; my $vem=$vedex->{$handle}; if(not defined($vem)) { $m->{'e'}->echo(__LINE__,"WARN: I don't have a vedex entry for $handle."); return ''; } my $treli="$indic $vem # " . uri_escape($handle) . "\n"; delete $vedex->{$handle}; return $treli; } sub index_vemlis { my $m=shift; my $vemlis=shift; my $vedex={}; foreach my $vemli (split("\n",$vemlis)) { $vemli=~m|^([^#]+)\s+#\s+(\S+)\s*$| or confess "bad vemli $vemli"; my $vem=$1; my $handle=uri_unescape($2); $vedex->{$handle}=$vem; } return $vedex; } ## only check for the reappearance of dates, this is obsolete 2017-05-08 #sub check_train_file { # my $m=shift; # my $in_file=shift; # open(F,"< $in_file") or cluck "I can't open $in_file"; # my $line; # my $tl; # my $old_date; # my $count=0; # while ($line=) { # chomp $line; # if(not $line=~m|([-+]1)[^#]+# (\d{4}-\d{2}-\d{2})\s(\S+)|) { # print "bad line $line\n"; # next; # } # my $stat=$1; # my $date=$2; # my $escaped_handle=$3; # my $handle=uri_unescape($escaped_handle); # $tl->{$count}->{'stat'}=$stat; # $tl->{$count}->{'date'}=$date; # $tl->{$count}->{'escaped_handle'}=$escaped_handle; # $tl->{$count}->{'handle'}=$handle; # if(defined($tl->{$handle})) { # croak "handle $handle has appeared twice"; # } # $tl->{$handle}=1; # $tl->{$date}++; # ## check for reappearance of dates # ## first line # $count++; # if(not defined($old_date)) { # $old_date=$date; # $tl->{'old_dates'}->{$date}=1; # next; # } # if($date eq $old_date) { # next; # } # if(defined($tl->{'old_dates'}->{$date})) { # croak "date $date has come back"; # } # } #} sub type_train_file { my $m=shift; my $repcode=shift; if(not $repcode) { confess "I need a repcode to find its test file."; } if(not $m->{'e'}->{'report'}->{$repcode}) { confess "I don't know the report '$repcode'.\n"; } ## returns the train file my $train_file=$m->model_report($repcode); $m->{'d'}->type_vemli_file($train_file); } sub type_test_file { my $m=shift; my $repcode=shift; if(not $repcode) { confess "I need a repcode to find its test file."; } my $issuedate=shift // ''; if(not $issuedate) { confess "I need an issuedate to find its test file."; } my $test_file=$m->build_test_file($repcode,$issuedate); $m->check_test_file($test_file); $m->{'d'}->type_vemli_file($test_file); } ## counts the lines and sees if they correspond ## to the lines in the allport_sent file sub check_test_file { my $m=shift; my $file=shift // confess "I need a test file here."; $m->{'e'}->echo(__LINE__,"I am checking the test file $file."); if(not -f $file) { confess "I don't see the file $file"; } ## find the sent_handle my $issuedate=&Ernad::Common::find_issuedate_from_file($file); my $namf_file=$m->{'e'}->{'file'}->{'namf'} // ''; my $sent_handles; if($namf_file) { if( not -f $namf_file) { confess "I can't access the namf_file $namf_file"; } $sent_handles=&Ernad::Common::get_handles_from_rif($namf_file,'hash'); } else { $sent_handles=$m->get_allport_sent_handles($issuedate,'hash'); } my $test_handles=$m->get_test_handles($file,'hash'); my $missing_in_test; foreach my $s_h (keys %{$sent_handles}) { if(not defined($test_handles->{$s_h})) { $missing_in_test->{$s_h}=1; } } my $missing_in_sent; foreach my $t_h (keys %{$test_handles}) { if(not defined($sent_handles->{$t_h})) { $missing_in_sent->{$t_h}=1; } } my $count_missing_in_sent=scalar(keys %{$missing_in_sent}); my $count_missing_in_test=scalar(keys %{$missing_in_test}); if(not $count_missing_in_sent and not $count_missing_in_test) { return 1; } if($count_missing_in_sent) { confess "Fixme, I can't handle this case yet."; return 0; } ## remaining case: something missing in the test. $m->{'e'}->echo(__LINE__,"I miss $count_missing_in_test missing in my test."); $m->{'e'}->echo(__LINE__,"I will try to get them from the dokli."); my $time=&Ernad::Common::find_time_from_file($file); ## check if we have a dokli my $dokli_file=$m->{'d'}->get_dokli_by_time($time); my $out; foreach my $handle (keys %{$missing_in_test}) { my $vemli=$m->{'d'}->get_vemli_by_handle($handle,$dokli_file,$issuedate); $vemli=~s|^\d{4}-\d{2}-\d{2}\s*||; if(not $vemli) { $m->{'e'}->echo(__LINE__,"WARN: I could not get a vemli for $handle by handle."); } $out.="0 $vemli\n"; } if($out=~m| \d+ |) { confess "This is bad vemli output: '$out'."; } open(T,">> $file"); print T $out; close T; return 1; } sub get_test_handles { my $m=shift; my $file=shift // confess "I need a test file here."; my $how=shift // confess "I need to know how."; if(not -f $file) { confess "I don't see the file $file"; } if(($how ne 'array') and ($how ne 'hash')) { confess "I don't know how, '$how'?"; } open(F,"< $file"); my $count=0; my $handles; my $line; my $count_lines=0; while($line=) { chomp $line; $count_lines++; if(not $line=~m|[^#]+#\s(\S+)\s*$|) { confess "I have a bad line number $count_lines in file $file\n$line"; } my $handle=uri_unescape($1); if($how eq 'array') { $handles->[$count++]=$handle; } else { $handles->{$handle}=1; } } close F; return $handles; } sub get_allport_sent_handles { my $m=shift; my $issuedate=shift // confess "I need an issuedate here."; my $how=shift // ''; if(not $issuedate=~m|^\d{4}-\d{2}-\d{2}$|) { print "Your issuedate '$issuedate' does not look like an issuedate to me\n"; } if(($how ne 'array' and $how ne 'hash')) { confess "I don't know how, '$how'?"; } my $source_dir=$m->{'e'}->{'dir'}->{'allport_sent'}; if(not $source_dir) { confess "I need a source_dir here."; } if(not -d $source_dir) { confess "I don't see the directory $source_dir"; } my $rif=&Ernad::Common::get_latest_rif($source_dir,$issuedate); if(not $rif) { confess "I could not find the sent rif for $issuedate in $source_dir"; } my $handles=&Ernad::Common::get_handles_from_rif($rif,'hash'); return $handles; } 1;