package Ernad::Learn::Dokli; use strict; use warnings; use Carp qw(cluck longmess shortmess croak confess); use Data::Dumper; use File::Basename; use File::Slurper; use File::Path; use IO::File; use List::Util qw(shuffle); use Storable; use URI::Escape; use Ernad::Dates; use Ernad::Erimp; use Ernad::Files; use Ernad::Learn::Weights; use Ernad::Learn::Common; use Ernad::Store; #use Ernad::Seed; my $exfit_ext='.txt'; binmode(STDOUT,':utf8'); ## constructor sub new { my $this=shift; my $class=ref($this) || $this; my $d={}; bless $d, $class; my $params=shift; ## copy parameters into the object foreach my $key (keys %{$params}) { $d->{$key}=$params->{$key}; } my $e; if(not defined($d->{'e'})) { if(defined($main::e)) { $d->{'e'}=$main::e; $d->{'impna'}=$d->{'e'}->{'impna'}; $e=$main::e; } } elsif($d->{'e'}) { $e=$d->{'e'}; } else { $e=$main::e; } if(not $e) { confess "I need an erimp here."; } if(not defined($d->{'impna'})) { $d->{'impna'}=$d->{'e'}->{'impna'} // confess "I need an impna here."; } if(not defined($d->{'verbose'})) { $d->{'verbose'}=$d->{'e'}->{'verbose'} // confess "I need an verbosity here."; } ## legacy code #if(not defined($d->{'e'})) { # $d->{'e'}=Ernad::Erimp->new({'impna' => $d->{'impna'}, # 'verbose' => $d->{'verbose'}}); #} if(not $d->{'verbose'}) { confess "You need to set the verbosity."; } if($d->{'e'}->{'conf'}->{'separate_doklis'}) { if(not defined($e->{'repcode'})) { confess "I need a repcode here."; } } &Ernad::Learn::Common::set_basic($d); $d->init(); return $d; } sub check_weighing_function { my $d=shift; my $scheme=$d->{'e'}->{'conf'}->{'weighing_scheme'} or confess "I need a weighing scheme in the ernad configuration."; my $weigh=$Ernad::Learn::Weights::weigh->{$scheme} or confess "The weighing scheme $scheme appears not to be defined."; $d->{'weigh'}=$weigh; } sub init { my $d=shift; ## feature classes $d->{'fitclas'}=['term','frin','frex']; ## if we set the no_frex option, we don't do frex foreach my $fitcla (@{$d->{'fitclas'}}) { 
$d->{$fitcla}=1; } my $e=$d->{'e'}; ## import the train_limit_by_days my $train_limit=$d->{'e'}->{'conf'}->{'train_limit_by_days'} // ''; if($train_limit) { $d->{'ernad_limit_by_days'}=$train_limit; $d->{'e'}->echo(__LINE__,"train limit by days set to $train_limit",4); } else { $d->{'ernad_limit_by_days'}=0; } ## import the no_frex, if not given in the invocation parameters if(not defined($d->{'no_frex'})) { my $no_frex=$d->{'e'}->{'conf'}->{'no_frex'} // ''; $d->{'no_frex'}=$no_frex; } ## import the no_frin, if not given in the invocation parameters if(not defined($d->{'no_frin'})) { my $no_frin=$d->{'e'}->{'conf'}->{'no_frin'} // ''; $d->{'no_frin'}=$no_frin; } ## set the time. it could be given at invocation if(not defined($d->{'time'})) { $d->{'time'}=time; } if(not $d->{'e'}->{'conf'}->{'no_allport'}) { $d->{'source_dir'}=$e->{'dir'}->{'allport_sent'}; } elsif(defined($e->{'repcode'})) { $d->{'source_dir'}=$e->{'report'}->{$e->{'repcode'}}->{'dir'}->{'source'}; } $d->init_fitcla_functions(); $d->set_fitport(); if($e->{'seedable'}) { &set_report_past_issuedates($e->{'repcode'}); } else { &Ernad::Learn::Common::set_issuedates($d); } #&Ernad::Learn::Common::set_dirs($d); $d->set_dirs(); $d->set_lisig(); $d->check_weighing_function(); $d->find_or_write_lisig(); ## if we have a matching lisig, use its time. 
if($d->{'lisig_time'}) { $d->{'time'}=$d->{'lisig_time'}; } ## reset the basics with this time &Ernad::Learn::Common::set_basic($d,$d->{'lisig_time'}); } sub set_dirs { my $d=shift; my $arg_time=shift // ''; my $exfits_dir; my $fitport=$d->{'fitport'} or confess "I need a fitport here"; my $learn_dir=$d->{'learn_dir'} or confess "I need a learn_dir here"; $d->{'exfit_dir'}=$learn_dir.'/exfit/'.$fitport; my $exfit_dir=$d->{'exfit_dir'}; my $fitar_dir=$d->{'exfit_dir'} or confess "I don't have an exfit_dir."; $fitar_dir=~s|/exfit/|/fitar/|; $d->{'fitar_dir'}=$fitar_dir; my $fidek_dir=$d->{'exfit_dir'}; $fidek_dir=~s|/exfit/|/fidek/|; $d->{'fidek_dir'}=$fidek_dir; my $time; if($arg_time) { $time=$arg_time; } elsif($d->{'update_time'}) { $time=$d->{'update_time'}; } elsif(defined($d->{'time'})) { $time=$d->{'time'}; } else { $d->{'time'}=time; } my $file_version=$d->{'version'}.'_'.$time; $d->{'file_version'}=$file_version; foreach my $fitcla (@{$d->{'fitclas'}}) { $d->{'fitar_file'}->{$fitcla}=$d->{'fitar_dir'}.'/'.$file_version.'_'.$fitcla.'.txt'; $d->{'all_fitar_file'}->{$fitcla}=$d->{'fitar_dir'}.'/'.$file_version.'.txt'; $d->{'fidek_file'}->{$fitcla}=$d->{'fidek_dir'}.'/'.$file_version.'_'.$fitcla.'.dump'; } my $dokli_dir=$d->{'exfit_dir'}; $dokli_dir=~s|/exfit/|/dokli/|; $d->{'dokli_dir'}=$dokli_dir; $d->{'all_dokli_glob'}=$d->{'dokli_dir'}.'/*.txt'; $d->{'fitrank_file'}=$d->{'fitar_dir'}.'/fitrank_'.$time.'.dump'; } sub init_fitcla_functions { my $d=shift; $d->{'treat_fit'}->{'term'} = sub { my $t=shift; if($t=~m|^[[:upper:]][[:lower:]]*$|) { $t=lc($t); } return $t; }; $d->{'treat_fit'}->{'frin'} = sub { my $t=shift; $t=~s|^\s||; $t=~s|\s$||; return $t; }; $d->{'treat_fit'}->{'frex'} = sub { my $t=shift; my $rc=shift; ## first process like a frin $t=&{$d->{'treat_fit'}->{'frin'}}($t); $t=~s|\s+| |; $t=~s|\s+$||; $t=~s|^\s+||; if(not $t=~m| |) { return ''; } ## remove terms in brackets, they are usually abreviations $t=~s|\(.*\)||g; ## these are separators. 
Easiest just to delete the lines that ## contain them and assume the components are covered elsewhere my @seps=(':',',','/','&',' - ','"','$'); foreach my $sep (@seps) { if($t=~m|\Q$sep\E|) { if($rc eq 'read') { $d->{'e'}->echo(__LINE__,"I skip line '$t' because it contains a separator",2); return ''; } elsif($rc eq 'check') { $t=~s|\Q$sep\E||; } else { confess "unknown rc"; } } } $t=~s|\n$||; $t=~s|\.$||; $t=~s|^\.\s*||; $t=~s|\x{2019}|'|g; $t=~s|^'||g; $t=~s|'$||g; ## only uppercase #if($t=~m|^[A-Z ]+$|) { $t=lc($t); #} return $t; }; $d->{'split_line'}->{'term'} = sub { my $line=shift; my @fits; foreach my $term (split(/[\s\p{punct}]/,$line)) { ## function returns nothing when it is not finding a worthy term my $fit=&{$d->{'treat_fit'}->{'term'}}($term,'read') or next; push(@fits,$fit); } return @fits; }; $d->{'split_line'}->{'frin'} = sub { my $line=shift; my @fits; ## It is assumed that the entire line is a feature. my $fit=&{$d->{'treat_fit'}->{'frin'}}($line,'read') or return; push(@fits,$fit); return @fits; }; $d->{'split_line'}->{'frex'} = sub { my $line=shift; my @fits; ## It is assumed that the entire line is a feature. my $fit=&{$d->{'treat_fit'}->{'frex'}}($line,'read') or return; push(@fits,$fit); return @fits; }; } ## set the fitport sub set_fitport { my $d=shift; my $e=$d->{'e'}; my $given_report=$d->{'report'} // ''; ## fitport my $allport=$d->{'e'}->get_allport_repcode(); if(defined($d->{'fitport'}) and $d->{'fitport'}) { $d->{'e'}->echo(__LINE__,"fitport is already defined as '". $d->{'fitport'}."'",2); return; } if(not $given_report) { $d->{'fitport'}=$allport; return; } ## for a fitport other than the allport, a terms exfit must be ## represent. 
It is that one that has to be checked ## in the case of no_allport, the fitex files should be a link my $exfit_xslt_file=$d->find_exfit_xslt_file('term',$given_report); if(-f $exfit_xslt_file and not $->{'conf'}->{'no_allport'} and not -l $exfit_xslt_file) { $d->{'e'}->echo(__LINE__,"I will be using a non-allport fitport ".$d->{'fitport'},2); $d->{'fitport'}=$given_report; return; } ## no implementation actually uses the fitport yet $d->{'fitport'}=$allport; } sub find_exfit_xslt_file { my $d=shift; my $fitcla=shift; my $repcode=shift; my $exfit_xslt_dir=$d->{'e'}->{'dir'}->{'exfit_xslt'}; my $exfit_file; ## in the case of no allport if($d->{'e'}->{'conf'}->{'no_allport'}) { $exfit_file=$fitcla.$d->{'e'}->{'const'}->{'xsl_ext'}; return $exfit_xslt_dir.'/'.$exfit_file; } ## otherwise my $ext='_'.$fitcla.$d->{'e'}->{'const'}->{'xsl_ext'}; $exfit_file=$exfit_xslt_dir.'/'.$repcode.$ext; return $exfit_file; } sub add_fit_for_fitar { my $d=shift; my $fitcla=shift // confess "I need a fitcla here"; my $fit=shift // confess "I need fits here"; $d->{'e'}->echo(__LINE__,"I add fit $fit to fitar for $fitcla",10); if(not defined($d->{'fre'}->{$fitcla}->{$fit})) { $d->{'fre'}->{$fitcla}->{$fit}=1; return; } $d->{'fre'}->{$fitcla}->{$fit}++; } sub add_fit_for_fidek { my $d=shift; my $fitcla=shift // confess "I need a fitcla here."; my $fit=shift // confess "I need a fit here."; my $handle=shift // confess "I need a handle here."; if(not defined($d->{'fidek'}->{$fitcla}->{'h'}->{$handle})) { $d->{'fidek'}->{$fitcla}->{'h'}->{$handle}->{$fit}=1; return; } $d->{'fidek'}->{$fitcla}->{'h'}->{$handle}->{$fit}++; } sub set_fitrank_time { my $d=shift; if(defined($d->{'dokli_time'})) { my $dokli_time=$d->{'dokli_time'}; $d->{'fitrank_time'}=$dokli_time; $d->{'e'}->echo(__LINE__,"I set fitrank_time to dokli_time $dokli_time",10); return $dokli_time; } if(defined($d->{'time'})) { my $time=$d->{'time'}; $d->{'fitrank_time'}=$time; $d->{'e'}->echo(__LINE__,"I set fitrank_time to d->{time} 
$time",10); return $time; } confess "I don't know how to set the fitrank time."; } sub add_fit_for_fisto { my $d=shift; my $fitcla=shift // confess "I need a fitcla here."; my $fit=shift // confess "I need a fit here."; my $handle=shift // confess "I need a handle here."; my $issuedate=shift // confess "I need an issuedate here."; if(not defined($d->{'fitrank'})) { confess "I must have f->{fitrank} defined here"; } my $count_fit=0; my $time=$d->{'fitrank_time'} or confess "I must have f->{fitrank_time} defined here."; if(not defined($d->{'fitrank'}->{$time}->{$fit})) { $d->{'e'}->echo(__LINE__,"No l->{fitrank}->{$time} is defined for $fit, presumed unknown",7); return; } $d->{'e'}->echo(__LINE__,"I found the feauture $fit",7); if(not defined($d->{'fisto'}->{$issuedate}->{$fitcla}->{'h'}->{$handle}->{$fit})) { $d->{'fisto'}->{$issuedate}->{$fitcla}->{'h'}->{$handle}->{$fit}=1; return; } $d->{'fisto'}->{$issuedate}->{$fitcla}->{'h'}->{$handle}->{$fit}++; } sub add_lines_for_fitar { my $d=shift; my $fitcla=shift; my $file=shift; my @lines=&File::Slurper::read_lines($file); my $count_lines=0; my $count_fits=0; foreach my $line (@lines) { $count_lines++; if($d->is_informational_exfit_line($line)) { next; } my @fits=&{$d->{'split_line'}->{$fitcla}}($line) or next; foreach my $fit (@fits) { $count_fits++; $d->add_fit_for_fitar($fitcla,$fit); $d->{'e'}->echo(__LINE__,"I'm adding fit $fit from $file.",7); } } if(not $count_fits) { $d->{'e'}->echo(__LINE__,"WARN: I added no fits from $file.",7); } else { $d->{'e'}->echo(__LINE__,"I added $count_fits fits for file $file.",7); } } sub add_lines_for_fidek { my $d=shift; my $fitcla=shift; my $file=shift; my @lines=&File::Slurper::read_lines($file); my $handle; foreach my $line (@lines) { if($d->is_informational_exfit_line($line)) { $handle=$d->{'handle'} // ''; $d->{'e'}->echo(__LINE__,"setting handle to $handle",20); next; } if(not $handle) { $d->{'e'}->echo(__LINE__,"I have no handle set, moving forward"); next; } 
$d->{'e'}->echo(__LINE__,"I am adding $line",20); my @fits=&{$d->{'split_line'}->{$fitcla}}($line) or next; foreach my $fit (@fits) { $d->add_fit_for_fidek($fitcla,$fit,$handle); } } } sub add_lines_for_fisto { my $d=shift; my $fitcla=shift; my $file=shift; my $issuedate=shift; ## an optional limit for debugging my $limit=shift // ''; my @lines=&File::Slurper::read_lines($file); my $handle; my $count_lines=0; my $count_papers=0; foreach my $line (@lines) { if($d->is_informational_exfit_line($line)) { chomp $line; $handle=$d->{'handle'} // ''; $d->{'e'}->echo(__LINE__,"setting handle to $handle",2); $d->{'e'}->echo(__LINE__,"'$line' is informational",2); $count_papers++; next; } if(not $handle) { $d->{'e'}->echo(__LINE__,"I have no handle set, moving forward"); next; } my @fits=&{$d->{'split_line'}->{$fitcla}}($line) or next; foreach my $fit (@fits) { $d->add_fit_for_fisto($fitcla,$fit,$handle,$issuedate); } if($limit and $count_papers > $limit) { last; } } } ## an attempt to save more # sub add_lines_for_fisto_single { # my $d=shift; # my $fitcla=shift; # my $file=shift; # my $issuedate=shift; # if(not -f $file) { # confess "I can't see the file $file."; # } # if(not defined($d->{'fh'}->{$file})) { # my $fh = IO::File->new(); # $fh->open("< $file"); # $fh->binmode('utf8'); # $d->{'fh'}->{$file}=$fh; # } # my $fh=$d->{'fh'}->{$file}; # while($line=<$fh>) # if($d->is_informational_exfit_line($line)) { # chomp $line; # $handle=$d->{'handle'} // ''; # $d->{'e'}->echo(__LINE__,"setting handle to $handle",2); # $d->{'e'}->echo(__LINE__,"'$line' is informational",2); # next; # } # if(not $handle) { # $d->{'e'}->echo(__LINE__,"I have no handle set, moving forward"); # next; # } # my @fits=&{$d->{'split_line'}->{$fitcla}}($line) or next; # foreach my $fit (@fits) { # $d->add_fit_for_fisto($fitcla,$fit,$handle,$issuedate); # } # if(not defined($old_handle)) { # $old_handle=$handle; # next; # } # if($old_handle ne $handle) { # print Dumper $d->{'fisto'}; # die; # } # 
$old_handle=$handle; # } # } sub is_informational_exfit_line { my $d=shift; my $line=shift; if($line=~m|^\s*$|) { return 1; } chomp $line; if(not $line=~s|^ ||) { $d->{'e'}->echo(__LINE__,"handle: $line",10); $d->{'handle'}=$line; return 1; } $d->{'handle'}=undef; return 0; } sub build_exfits_for_all_fitclas { my $d=shift; foreach my $fitcla (@{$d->{'fitclas'}}) { $d->{'e'}->echo(__LINE__,"fitcla is $fitcla"); $d->build_exfits($fitcla); } } sub echo { my $d=shift; my $line_number=shift; my $text=shift or confess "I need something to echo"; my $verbosity=shift // 0; if(not $verbosity=~m|^\d+|) { confess "I don't like the verbosity $verbosity"; } if(not defined($d->{'verbose'}) or ($verbosity < $d->{'verbose'})) { print "$line_number | $text", "\n"; } } sub build_exfits { my $d=shift; my $e=$d->{'e'}; my $rerc; if($e->{'conf'}->{'separate_doklis'}) { if(not defined($e->{'repcode'})) { confess "I need a repcode here."; } $rerc=$e->{'report'}->{$e->{'repcode'}} // confess "I need a rerc here."; } my $fitcla=shift or confess "no fitcla to build"; if($fitcla ne 'term' and $fitcla ne 'frin' and $fitcla ne 'frex') { confess "fitcla must be 'term' 'frin' or 'frex', it can not be $fitcla."; } if($e->{'conf'}->{'no_frex'} and $fitcla eq 'frex') { return; } if($e->{'conf'}->{'no_frin'} and $fitcla eq 'frin') { return; } my $rifs; if(not $e->{'conf'}->{'no_allport'}) { $rifs=&Ernad::Common::get_latest_rif_from_each_issue_in_dir($d->{'source_dir'}); } else { $rifs=&Ernad::Common::list_source_issues($d->{'source_dir'}); } $e->echo(__LINE__,"I start to build exfits."); ## was shuffle foreach my $rif (sort @$rifs) { $e->echo(__LINE__,"rif is $rif",2); my $issuedate=&Ernad::Common::find_issuedate_from_file($rif); if(not $issuedate) { $e->echo(__LINE__,"I did not find an issuedate on file $rif"); next; } ## check if the issue is sent if($e->{'conf'}->{'separate_doklis'}) { my $sent_dir=$rerc->{'dir'}->{'sent'} // confess "I need a sent_dir here."; if(not 
&Ernad::Common::is_issue_in_dir($sent_dir,$issuedate)) { $e->echo(__LINE__,"$issuedate is not issued yet, it's not going into the dokli."); next; } } ## we need to parse all the exfits for frex if(not $e->{'conf'}->{'separate_doklis'}) { if(not $fitcla eq 'frex') { if(not $d->{'issuedates'}->{$issuedate}) { $e->echo(__LINE__,"issuedate $issuedate is too old or not existant."); next; } } if(not &Ernad::Learn::Common::check_range($d,$issuedate)) { $e->echo(__LINE__,"issuedate $issuedate is out of range"); next; } } ## at Tania's 05-24 #if(not $d->is_date_to_be_added($issuedate)) { # $d->{'e'}->echo(__LINE__,"I skip $issuedate because it's already in the dokli.",2); # next; #} $e->echo(__LINE__,"Let's check the exfit for $fitcla at $issuedate"); $d->build_exfit($fitcla,$issuedate,$rif); $e->echo(__LINE__,"I checked the exfit for $issuedate",); } $e->echo(__LINE__,"build_exfit for $fitcla done"); } sub build_exfit { my $d=shift; my $fitcla=shift or confess "no fitcla to build"; my $issuedate=shift or confess "no date to build exfit for"; my $rif=shift or confess "no rif build exfit from"; my $type=shift // 'exfit'; if($fitcla ne 'term' and $fitcla ne 'frin' and $fitcla ne 'frex') { confess "fitcla must be 'term' 'frin' or 'frex', it can not be $fitcla."; } my $e=$d->{'e'} or confess "no erimp"; my $ext='_'.$fitcla.$e->{'const'}->{'xsl_ext'}; my $exfit_xslt_file=$d->find_exfit_xslt_file($fitcla,$d->{'fitport'}); if(not ( -f $exfit_xslt_file or -l $exfit_xslt_file)) { confess "no such exfit_file '$exfit_xslt_file'"; } $d->{'e'}->echo(__LINE__,"exfit_xslt_file is $exfit_xslt_file",3); $d->{'exfit_dir'}=$e->{'dir'}->{'learn'}.'/exfit/'.$d->{'fitport'}; $d->{'namex_dir'}=$e->{'dir'}->{'learn'}.'/namex'; my $out_file=$d->{'exfit_dir'}.'/'.$issuedate.'_'.$fitcla.$exfit_ext; ## namex case my $doc; &Ernad::Files::prepare_for_file($out_file); if(ref $rif eq 'XML::LibXML::Document') { ## this should then be the case, but we better check if($type eq 'namex') { 
$out_file=$d->{'namex_dir'}.'/'.$issuedate.'_'.$fitcla.$exfit_ext; $e->{'file'}->{'namex'}=$out_file; my $namf=$e->{'namf'} // confess "I need this defined here."; ## check if we have one namex made. if(not &Ernad::Common::does_file_need_renewal($out_file,$namf,$exfit_xslt_file)) { $d->{'e'}->echo(__LINE__,"I skip the renewal of $out_file, it needs no renewal.",2); return $out_file; } } $doc=$rif; } elsif(-f $rif) { if(not &Ernad::Common::does_file_need_renewal($out_file,$rif,$exfit_xslt_file)) { $d->{'e'}->echo(__LINE__,"I skip the renewal of $out_file, it needs no renewal.",2); return $out_file; } $doc=&Ernad::Common::load_and_return_xml("$rif") or confess "I could not parse the file $rif."; } else { confess "I don't know what to do with your \$rif argument."; } ## check that there are n if(not &Ernad::Common::count_texts_in_rif($doc)) { confess "Your rif $rif seems to contain no documents."; } # $e->echo(__LINE__,"I transform\n".$doc->toString()); $e->echo(__LINE__,"I start building $out_file ..."); my $text=$e->transform_to_chars($doc,$exfit_xslt_file); $e->echo(__LINE__,"I transformed $rif with $exfit_xslt_file.",20); if(not $text) { print $doc->toString; confess "I have an empty transformation result here. 
$@"; } &Ernad::Common::prepare_for_file($out_file); &File::Slurper::write_text($out_file,$text,'utf8'); $d->{'e'}->echo(__LINE__,"I saved $out_file."); return $out_file; } ## reads the features from a fitar sub get_fit { my $d=shift; my $fitcla=shift; my $fitter=$d->{'fit'}->{$fitcla}; if(defined($fitter)) { return $fitter; } ## will build the fitar again $d->read_fit($fitcla); return $d->{'fit'}->{$fitcla}; } ## reads the features from a fitar sub read_fit { my $d=shift; my $fitcla=shift; my $fitar_file=$d->{'fitar_file'}->{$fitcla} or confess "no fitar file for $fitcla"; if(not -f $fitar_file) { $d->build_fitar($fitcla); } if(not -f $fitar_file) { confess "build_fitar did not produce a fitar $fitar_file"; } my @lines=&File::Slurper::read_lines($fitar_file); my $fitter=$d->{'fit'}->{$fitcla}; foreach my $line (@lines) { chomp $line; $line=~m|([^\t]+)+\t([^\t]+)+\t(.+)| or confess "invalid line '$line' in $fitar_file"; $fitter->{$3}=$1; } $d->{'fit'}->{$fitcla}=$fitter; } sub is_date_to_be_added { my $d=shift // "I need a Dokli here."; my $date=shift // confess "I need a date here."; if(not defined($d->{'dates_to_add'})) { $d->{'e'}->echo(__LINE__, "I have no \$dates_to_add" ,3); return 1; } if(defined($d->{'dates_to_add'}->{$date})) { return 1; } return 0; } sub set_dates_to_add { my $d=shift; my $dokli_file=shift // $d->{'dokli_file'}; if(not defined($dokli_file)) { confess "I need a dokli_file here."; } if(not -f $dokli_file) { confess "I can't open dokli_file $dokli_file."; } ## f->{'issuedates'} has all the issuedates of the allport my $e=$main::e // confess "I need an erimp here."; if(not $e->{'conf'}->{'separate_doklis'}) { foreach my $issuedate (keys %{$d->{'issuedates'}}) { $d->{'e'}->echo(__LINE__,"set_dates_to_add considers $issuedate"); if($d->is_date_in_dokli($issuedate,$dokli_file)) { $d->{'e'}->echo(__LINE__,"date $issuedate is in dokli_file $dokli_file"); next; } $d->{'e'}->echo(__LINE__,"set_dates_to_add requests to add $issuedate"); 
$d->{'dates_to_add'}->{$issuedate}=1; } } else { ## use the learnable_issuedates my $repcode=$e->{'repcode'} // confess "I need a repcode here."; my $rerc=$e->{'report'}->{$repcode} // confess "I don't know the report $repcode."; foreach my $date (keys %{$rerc->{'learnable_issuedates'}}) { $d->{'dates_to_add'}->{$date}=1; } } } sub renew_fidek { my $d=shift; my $fitcla=shift // confess "I need a fitcla here."; #my $e=$d->{'e'}; #if($e->{'conf'}->{'no_frex'} and $fitcla eq 'frex') { # return; #} #if($e->{'conf'}->{'no_frin'} and $fitcla eq 'frin') { # return; #} if($fitcla ne 'term' and $fitcla ne 'frin' and $fitcla ne 'frex') { confess "I can't deal with fitla $fitcla."; } my $fidek_file=$d->{'fidek_file'}->{$fitcla} or confess "I can't find a fidek file for $fitcla"; my $exfit_files=$d->list_exfit_files($fitcla); $d->{'e'}->echo(__LINE__,"renewing fidek of $fitcla"); if($fitcla ne 'frex') { $d->{'e'}->echo(__LINE__,"Before building fidek for $fitcla"); $d->{'e'}->echo(__LINE__,"I am deleting the checked_fitter"); delete $d->{'checked_fitter'}; if(not &Ernad::Common::does_file_need_renewal($fidek_file,@{$exfit_files})) { $d->{'e'}->echo(__LINE__,"I don't have to build $fidek_file"); return; } $d->{'e'}->echo(__LINE__,"I have to build a fidek file"); $d->fidek_from_exfits_without_scanner($fitcla); } elsif(not defined($d->{'no_frex'}) or not $d->{'no_frex'}) { $d->build_fidek_with_scanner($fitcla); } } sub build_fidek_with_scanner { my $d=shift; my $fitcla=shift or confess 'no fitcla'; my $fidek_file=$d->{'fidek_file'}->{$fitcla} or confess "I can't find a fidek file for $fitcla"; my $scan_file=$d->get_scan_file($fitcla) or confess "no scan_file defined"; $d->{'e'}->echo(__LINE__,"I get scan_file $scan_file ... "); my $exfit_files=$d->list_exfit_files($fitcla); my $lisig_file=$scan_file; ## use the lisig to see if we already have matching files. 
## search for lisig files my $dir=dirname($scan_file); if(&Ernad::Common::does_file_need_renewal($scan_file,@{$exfit_files})) { $d->build_fre_from_exfits($fitcla); $d->build_scanner($fitcla); } $d->{'e'}->echo(__LINE__,"done"); $d->get_scanner('frex'); if($d->{'verbose'}) { $d->{'e'}->echo(__LINE__,"done"); } my $scanner=$d->{'scanner'}->{'frex'}; #print $scanner; &Ernad::Files::prepare_for_file($fidek_file); $d->{'fidek_file'}->{$fitcla}=$fidek_file; ## check if we have to renew the exfits $d->build_exfits($fitcla); ## now build it ## $d->fidek_from_exfits($fitcla); #$d->scan_file_for_frex_fits($file,''); ## was shuffle my @files=sort(@{$d->list_exfit_files('term')}); foreach my $file (@files) { ## do it silently $d->{'e'}->echo(__LINE__,"scanning $file for frex fits ..."); $d->scan_file_for_frex_fits($file,'fidek'); $d->{'e'}->echo(__LINE__,"done"); } $d->store_fidek('frex'); delete $d->{'scanner'}; } ## sub build_fre_from_exfits { my $d=shift; my $fitcla=shift or confess 'no fitcla'; my $exfit_dir=$d->{'exfit_dir'}; if(not defined($exfit_dir)) { confess "The exfit_dir not defined. 
Have you updated the exfit files?"; } opendir( my $exfits, $exfit_dir) or confess "I can't open the exfit_dir $exfit_dir."; my $count_files=0; while (my $file = readdir $exfits ) { if(not $file=~m|(\d{4}-\d{2}-\d{2})_$fitcla\.txt$|) { $d->{'e'}->echo(__LINE__,"I'm skipping file $file when looking for $fitcla files.",2); next; } my $issuedate=$1; if($fitcla ne 'fre') { if(not $d->{'issuedates'}->{$issuedate}) { $d->{'e'}->echo(__LINE__,"issuedate $issuedate is too old or not existant.",3); next; } } $d->{'e'}->echo(__LINE__,"build_fre_from_exfits: I'm adding file $file",4); $d->add_lines_for_fitar($fitcla,"$exfit_dir/$file"); } } ## sub fidek_from_exfits_without_scanner { my $d=shift; my $fitcla=shift or confess 'no fitcla'; if(not (($fitcla eq 'term') or ($fitcla eq 'frin'))) { confess "fitcla must be 'term' or 'frin', not $fitcla"; } my $exfit_dir=$d->{'exfit_dir'}; if(not defined($exfit_dir)) { confess "The exfit_dir not defined. Have you updated the exfit files?"; } opendir( my $exfits, $exfit_dir) or confess "I can't open the exfit_dir $exfit_dir."; my $count_files=0; my $count_files_added=0; my $max_files_added=$d->{'max_fidek_files_add'} // 0; while (my $file = readdir $exfits ) { if(not $file=~m|(\d{4}-\d{2}-\d{2})_$fitcla\.txt$|) { $d->{'e'}->echo(__LINE__,"I'm skipping file $file when looking for $fitcla files.",2); next; } my $date=$1; $d->{'e'}->echo(__LINE__,"fidek_from_exfits_without_scanner: I'm adding file $file."); #$count_files_added++; #if($max_files_added and $count_files_added > $max_files_added) { # print "read last file to add because of extrenal limit\n"; #} my $file_read_time=$d->{'fidek'}->{$fitcla}->{'f'}->{$file} // 0; ## save time by checking the if($file_read_time) { my $mtime=&Ernad::Files::mtime($file); if($mtime < $file_read_time) { $d->{'e'}->echo(__LINE__,"file $file has been read earlier"); next; } } $d->{'fidek'}->{$fitcla}->{'f'}->{$file}=time; my $fufi="$exfit_dir/$file"; $d->{'e'}->echo(__LINE__,"I am adding $fufi to my 
fidek."); $d->add_lines_for_fidek($fitcla,"$fufi"); } $d->store_fidek($fitcla); } ## sub fisto_from_exfits_without_scanner { my $d=shift; my $e=$main::e // confess 'I need an erimp here'; my $fitcla=shift or confess 'no fitcla'; if(not (($fitcla eq 'term') or ($fitcla eq 'frin'))) { confess "fitcla must be 'term' or 'frin', not $fitcla"; } if(not defined($d->{'dates_to_add'})) { confess "You must set dates_to_add before calling fisto"; } my $exfit_dir=$d->{'exfit_dir'}; if(not defined($exfit_dir)) { confess "The exfit_dir not defined. Have you updated the exfit files?"; } $d->build_exfits($fitcla); opendir( my $exfits, $exfit_dir) or confess "I can't open the exfit_dir $exfit_dir."; $d->set_fitrank_time(); foreach my $issuedate (keys %{$d->{'dates_to_add'}}) { my $file=$exfit_dir.'/'.$issuedate.'_'.$fitcla.'.txt'; if(not -f $file) { ## this is normal in the case of no_allport, since there is ## no allport issue available if($e->{'conf'}->{'no_allport'}) { if($issuedate eq $e->{'issuedate'}) { my $rerc=$e->{'report'}->{$e->{'repcode'}} // confess "I need a rerc here."; my $amf_doc=$rerc->{'amf_doc'} // confess 'I need an amf_doc here.'; $e->echo(__LINE__,"I have to build a test file without exfits"); die; #$d->build_exfits_in_memory(); next; } } else { confess "I can't find the exfit file $file to add to the fisto"; } } $d->add_lines_for_fisto($fitcla,$file,$issuedate); } } sub get_scan_file { my $d=shift; my $fitcla=shift // confess "I need a fitcla here."; my $scan_file=$d->{'scan_file'}->{$fitcla}; if(defined($scan_file)) { return $scan_file; } my $version=$d->{'version'} or confess 'I need a version here.'; my $fitar_file=$d->{'fitar_file'}->{$fitcla}; $d->{'fitar_file'}->{$fitcla}=$fitar_file; my $perl_file=$fitar_file; $perl_file=~s|\.txt$|.pl|; $d->{'scan_file'}->{$fitcla}=$perl_file; return $perl_file; } sub split_frex_into_terms { my $d=shift; my $in=shift; my $scanner_limit=$d->{'e'}->{'conf'}->{'scanner_limit'}; if(not $scanner_limit) { confess "I need a 
scanner_limit parameter, put one into ernad.conf"; } $in=~s|^\s+||; $in=~s|\s$||; $in=~s|\s+| |; #print "in is $in\n"; my @terms=split(/\s/,$in); if(scalar(@terms)>$scanner_limit) { $d->{'e'}->echo(__LINE__,"I skip the line '$in'. It is over the limit $scanner_limit.",7); return undef; } if(not scalar(@terms)) { $d->{'e'}->echo(__LINE__,"I skip the line '$in'. I can't get terms from it.",1); return undef; } return \@terms; } sub build_scanner { my $d=shift; my $fitcla=shift; ## only used internally and in get_scanner my $name='fit'; ## we have an exception for fre here my $in=$d->{'fre'}->{$fitcla} or confess "no fre for $fitcla"; my $out="package Scan;\n"; $out.="use utf8;\n"; $out.="our \$fit;\n"; my $scanner_limit=$d->{'e'}->{'conf'}->{'scanner_limit'}; if(not $scanner_limit) { confess "I need a scanner_limit parameter, put one into ernad.conf"; } foreach my $line (keys %{$in}) { $d->{'e'}->echo(__LINE__,"frex line '$line'",2); my $split=$d->split_frex_into_terms($line); my @words; if(ref $split eq 'ARRAY') { @words=@{$split}; } else { print "line is '$line'\n"; print "split is ", Dumper $split, "\n"; print "ref is " , ref $split, "\n"; next; } $out.='$fit'; foreach my $word (@words) { $d->{'e'}->echo(__LINE__,"word is $word",2); $out.='->{"'.$word.'"}'; } $out.="=".$in->{$line}.";\n"; } $out.="1;\n"; my $out_file=$d->{'scan_file'}->{$fitcla}; &Ernad::Files::prepare_for_file($out_file); open(F, "> $out_file") or confess "could not open $out_file"; binmode(F,":utf8"); print F $out; close F; ## free memory $d->{'e'}->echo(__LINE__,"I delete fre for $fitcla"); delete $d->{'fre'}->{$fitcla}; } sub get_scanner { my $d=shift; my $fitcla=shift; if(defined($d->{'scanner'}->{$fitcla})) { return $d->{'scanner'}->{$fitcla}; } $d->read_scanner($fitcla); return $d->{'scanner'}->{$fitcla}; } sub read_scanner { my $d=shift; my $fitcla=shift // confess "I need a fitcla here."; $d->get_scan_file($fitcla); my $scan_file=$d->{'scan_file'}->{$fitcla} or confess "no scan_file for 
$fitcla"; if(not -f $scan_file) { confess "I can't find the scan_file $scan_file."; } ## try to accelerate by using the dump my $scan_dump=$scan_file; $scan_dump=~s|\.pl$|.dump|; if(-f $scan_dump) { $d->{'e'}->echo(__LINE__,"I found a scan dump $scan_dump"); if(-M $scan_dump < -M $scan_file) { $d->{'e'}->echo(__LINE__,"using it."); my $s=&Ernad::Store::get_dump($scan_dump); $d->{'scanner'}->{$fitcla}=$s; my $out= $d->{'e'}->echo(__LINE__,join("","here is the scanner\n", Dumper $s), 7); return $s; } else { $d->{'e'}->echo(__LINE__,"not using it"); } } require $scan_file; my $s=$Scan::fit; $d->{'scanner'}->{$fitcla}=$s; store $Scan::fit, $scan_dump; $d->{'e'}->echo(__LINE__,"I delete fre for $fitcla"); delete $d->{'fre'}->{$fitcla}; return $s; } sub print_fits { my $d=shift; my $fitcla=shift; if(not defined($d->{'fre'}->{$fitcla})) { confess "I have no features for $fitcla to print."; } my $out_file=$d->{'fitar_file'}->{$fitcla} or confess 'I have no fitar file defined.'; my $fre=$d->{'fre'}->{$fitcla}; my @fits=sort {$fre->{$b} <=> $fre->{$a}} keys %$fre; my $rank=0; &Ernad::Files::prepare_for_file($out_file); open(O,"> $out_file") or confess "could not open out_file $out_file"; binmode(O,':utf8'); foreach my $fit (@fits) { if($fit=~m|\t|) { confess "The feature '$fit' contains a tab."; } $fit=~s|\x{FDD3}||g; print O $rank++, "\t", $fre->{$fit}, "\t", $fit, "\n"; } close O; $d->{'e'}->echo(__LINE__,"I wrote $out_file."); ## free memory, hopefully $d->{'e'}->echo(__LINE__,"I delete fre for $fitcla"); delete($d->{'fre'}->{$fitcla}); } sub remove_features_below_limit { my $d=shift; my $fitcla=shift; my $given_limit=shift; my $limit; ## a zero limit is no limit if(defined($given_limit)) { if(not $given_limit) { return; } $d->{'e'}->echo(__LINE__,"using feature limit $limit"); } else { $limit=1; } if(not defined($d->{'fre'}->{$fitcla})) { confess 'nothing to remove singulars from'; } my $fre=$d->{'fre'}->{$fitcla}; ## clear from frinfre all singular terms foreach my $fit 
(keys %$fre) { if($fre->{$fit} <= $limit) { delete $fre->{$fit}; $d->{'e'}->echo(__LINE__,"singular features $fit removed from $fitcla features"); } } } sub list_exfit_files { my $d=shift; my $fitcla=shift or confess "no fitcla"; my $exfit_dir=$d->{'exfit_dir'}; opendir( my $exfits, $exfit_dir) or confess "can't open $exfit_dir"; my $count_files=0; my @files; while (my $file = readdir $exfits) { if(not $file=~m|(\d{4}-\d{2}-\d{2})_$fitcla\.txt$|) { $d->{'e'}->echo(__LINE__,"skipping file $file in list_exfit_files",3); next; } my $issuedate=$1; $d->{'exfit_files'}->{$fitcla}->[$count_files++]="$exfit_dir/$file"; } return $d->{'exfit_files'}->{$fitcla}; } sub get_checked_fitter { my $d=shift; my $fitcla=shift; if(defined($d->{'checked_fitter'}->{$fitcla})) { return $d->{'checked_fitter'}->{$fitcla}; } $d->read_checked_fitter_scanner($fitcla); return $d->{'checked_fitter'}->{$fitcla}; } sub read_checked_fitter_scanner { my $d=shift; my $fitcla=shift; my $scanner=$d->get_scanner($fitcla); if(not defined($scanner)) { confess "I could not get a scanner for $fitcla."; } else { $d->{'e'}->echo(__LINE__,"I got a scanner here for $fitcla"); } ## this builds the fitar again, but the ## date will be the current date my $fitter=$d->get_fit($fitcla); my $strict=shift // 0; $d->{'e'}->echo(__LINE__,"starting read_checked_fitter_scanner"); foreach my $fit (keys %{$fitter}) { $d->{'e'}->echo(__LINE__,"fit is $fit",2); my $t=$d->split_frex_into_terms($fit) // next; my @terms=@{$t} // "I need this defined here."; my $line=join(' ', @terms); my $sub_scanner=$scanner; #print "looking at $line\n"; foreach my $term (@terms) { #print "term is $term\n"; if(not defined($sub_scanner->{$term})) { if($strict) { confess "fit '$fit' is not in scanner"; } $d->{'e'}->echo(__LINE__,"fit $fit is not in scanner",2); } if(ref $sub_scanner->{$term}) { $sub_scanner=$sub_scanner->{$term}; } else { last; } } $d->{'checked_fitter'}->{$fitcla}->{$fit}=1; } $d->{'e'}->echo(__LINE__,"I am done with 
read_checked_fitter_scanner");
  ## just in case it was there
  #delete $d->{'fit'}->{$fitcla};
  #delete $d->{'fitter'}->{$fitcla};
  #delete $d->{'checked_fitter'}->{$fitcla};
  return $d->{'checked_fitter'}->{$fitcla};
}

## Count, per handle, the frex phrases found in the lines of $file.
##   $file    file to scan, read with File::Slurper::read_lines
##   $target  'fisto' or 'fidek'; anything else confesses
##   third argument: the issuedate, required when $target is 'fisto'
## Counts are added to $d->{'fisto'}->{$issuedate}->{'frex'}->{'h'} or
## to $d->{'fidek'}->{'frex'}->{'h'}, keyed by handle and then phrase.
## A phrase is counted when it is a key of the checked frex fitter.
## Phrases are built from every term position of a treated line by
## appending up to five following terms; the scanner of class frex
## is used to stop extending a phrase early.
sub scan_file_for_frex_fits {
  my $d=shift;
  my $file=shift or confess "I can't see a file to scan";
  my $target=shift or confess "I can't see a target to scan to";
  my $fitcla='frex';
  my $issuedate;
  if($target eq 'fisto') {
    $issuedate=shift // confess "I need an issuedate here.";
  }
  $d->{'e'}->echo(__LINE__,"I scan $file for frex fits",1);
  my $fitter=$d->get_checked_fitter($fitcla)
    or confess "I could not get the fitter";
  my $s=$d->{'scanner'}->{$fitcla}
    or confess "I could not get the scanner for $fitcla";
  my @lines=&File::Slurper::read_lines($file);
  ## this may run repeatedly, read existing data
  ## if we scan for fisto we need the issuedate
  my $o;
  if($target eq 'fisto') {
    $o=$d->{$target}->{$issuedate}->{'frex'}->{'h'};
  }
  elsif($target eq 'fidek') {
    $o=$d->{$target}->{'frex'}->{'h'};
  }
  else {
    confess "I don't know how to deal with target $target";
  }
  my $handle;
  foreach my $line (@lines) {
    ## an informational line switches to a new handle; the handle is
    ## read from $d->{'handle'}, presumably set by the call - TODO confirm
    if($d->is_informational_exfit_line($line)) {
      $handle=$d->{'handle'} // '';
      $d->{'e'}->echo(__LINE__,"setting handle to $handle",4);
      next;
    }
    if(not $handle) {
      $d->{'e'}->echo(__LINE__,"I have not handle set, moving forward",4);
      next;
    }
    my $treated_line=&{$d->{'treat_fit'}->{'frex'}}($line,'check');
    if(not $treated_line) {
      $d->{'e'}->echo(__LINE__,"line '$line' did not pass treat_fit. I skip it.",4);
      next;
    }
    $d->{'e'}->echo(__LINE__,"looking for frex in \"$treated_line,\"",4);
    my @terms=split(/\s+/,$treated_line);
    ## the counter starts at -1 because the loop condition tests the
    ## element before the post-increment; the body sees 0, 1, 2, ...
    my $count_terms=-1;
    my $ss;
    my $start_term;
    while(defined($terms[$count_terms++])) {
      ## nothing is to be scanned if first term
      ## does not match
      $start_term=$terms[$count_terms];
      if(not $start_term) {
        next;
      }
      $d->{'e'}->echo(__LINE__,"start_term is $start_term",4);
      if(not defined($s->{$start_term})) {
        $d->{'e'}->echo(__LINE__,"nothing starts with '$start_term'",4);
      }
      else {
        $d->{'e'}->echo(__LINE__,"something starts with '$start_term'",4);
      }
      ## NOTE(review): $old_ss is never read, and the "my $ss" below
      ## shadows the $ss declared outside of this loop
      my $old_ss=$ss // $s;
      my $ss=$s->{$start_term};
      ## if a first match is found, snip contains
      ## the following terms until the
      my $snip=$start_term;
      ## phrase is the current phrase
      my $phrase=$start_term;
      my $count_int=0;
      ## snip is only used in the control print below
      while($count_int++ < 5) {
        my $pos=$count_terms+$count_int;
        ## reached the of the line
        if(not defined($terms[$pos])) {
          last;
        }
        $snip.=' '.$terms[$pos];
      }
      $count_int=0;
      while($count_int++ < 5) {
        my $pos=$count_terms+$count_int;
        my $term=$terms[$pos];
        if(not $term) {
          last;
        }
        $phrase.=' '.$term;
        $d->{'e'}->echo(__LINE__,"count_int $count_int, term is '$term', phrase is '$phrase'",4);
        ## the scanner has no continuation with this term: give up on
        ## this start term
        if(ref($ss) and not defined($ss->{$term})) {
          ## control print
          $d->{'e'}->echo(__LINE__,"> $phrase\t$snip",4);
          $count_int=0;
          last;
        }
        ## found first term
        if(not defined($fitter->{$phrase})) {
          $d->{'e'}->echo(__LINE__,"phrase '$phrase' is not an endpoint",4);
          if(ref $ss) {
            $ss=$ss->{$term};
          }
          next;
        }
        $d->{'e'}->echo(__LINE__,"*** '$phrase' in $handle",4);
        if($handle) {
          if(not defined($o->{$handle}->{$phrase})) {
            $o->{$handle}->{$phrase}=1;
          }
          else {
            $o->{$handle}->{$phrase}++;
          }
        }
        else {
          confess "there is no handle defined";
        }
        ## reposition $ss;
        ## did we read the end of the phrase?
        if(not ref $ss) {
          $d->{'e'}->echo(__LINE__,"I reached the end point of the scannen",4);
          last;
        }
        $d->{'e'}->echo(__LINE__,"moving forward by term '$term'",4);
        $ss=$ss->{$term};
      }
      $d->{'e'}->echo(__LINE__,"count_int $count_int",4);
    }
  }
  ## write the counts back; needed when $o was undefined on entry and
  ## only came into being while counting
  if($target eq 'fisto') {
    $d->{$target}->{$issuedate}->{'frex'}->{'h'}=$o;
  }
  elsif($target eq 'fidek') {
    $d->{$target}->{'frex'}->{'h'}=$o;
  }
  else {
    confess "I don't know how to deal with target $target";
  }
}

## Store the fidek of class $fitcla in its fidek file, and a JSON copy
## next to it with .dump replaced by .json. Returns at once for frex
## or frin when the configuration switches that class off. Confesses
## if the fidek or its file name is undefined, if storing fails, or if
## the file name does not end in .dump.
sub store_fidek {
  my $d=shift;
  my $fitcla=shift;
  my $fidek=$d->{'fidek'}->{$fitcla};
  my $e=$d->{'e'} // confess "no erimp";
  if($e->{'conf'}->{'no_frex'} and $fitcla eq 'frex') {
    return;
  }
  if($e->{'conf'}->{'no_frin'} and $fitcla eq 'frin') {
    return;
  }
  if(not defined($fidek)) {
    confess "fidek for $fitcla is not defined\n";
  }
  my $fidek_file=$d->{'fidek_file'}->{$fitcla};
  if(not defined($fidek_file)) {
    confess "fidek file is not defined\n";
  }
  &Ernad::Files::prepare_for_file($fidek_file);
  $d->{'e'}->echo(__LINE__,"I am writing fidek $fidek_file");
  store $fidek, $fidek_file or confess "I could not store the fidek: $!";
  ## add a json store for the fidek
  my $fidek_json_file=$fidek_file;
  $fidek_json_file=~s|\.dump$|.json| or confess "The fidek file". $fidek_file."
should have ended in .dump";
  &Ernad::Common::save_to_file_utf8($fidek, $fidek_json_file);
  $d->{'e'}->echo(__LINE__,"I stored a copy of the fidek in $fidek_json_file");
  if($fitcla eq 'fre') {
    $d->{'e'}->echo(__LINE__,"I am deleting checked_fitter");
    delete $d->{'checked_fitter'};
    $d->{'e'}->echo(__LINE__,"I am deleting scanner");
    delete $d->{'scanner'};
  }
}

## Save $d->{'fitrank'} into the file named by $d->{'fitrank_file'}.
## Both have to be defined, and the store has to succeed, or we confess.
sub store_fitrank {
  my ($d)=@_;
  defined($d->{'fitrank_file'})
    or confess "fitrank_file is not defined, I can't store to it.";
  &Ernad::Files::prepare_for_file($d->{'fitrank_file'});
  defined($d->{'fitrank'})
    or confess "fitrank is not defined, I can't store it.";
  store($d->{'fitrank'}, $d->{'fitrank_file'})
    or confess "I could not store the fitrank $!";
}

## Load the fidek of class $fitcla from its file into
## $d->{'fidek'}->{$fitcla}. Without such a file the fidek is renewed
## through renew_fidek() instead.
sub read_fidek_from_file {
  my ($d,$fitcla)=@_;
  my $source=$d->{'fidek_file'}->{$fitcla};
  defined($source) or confess "fidek file is not defined\n";
  ## forget what we may have held before
  undef $d->{'fidek'}->{$fitcla};
  unless(-f $source) {
    $d->renew_fidek($fitcla);
    return;
  }
  my $loaded=retrieve $source;
  defined($loaded)
    or confess "I could not get the fidek of $fitcla from $source.\n";
  $d->{'fidek'}->{$fitcla}=$loaded;
}

## Load the fideks of all feature classes, one after the other.
sub read_fideks {
  my ($d)=@_;
  my @classes=@{$d->{'fitclas'}};
  for my $class (@classes) {
    $d->{'e'}->echo(__LINE__,"I am looking for the fidek for $class",1);
    $d->read_fidek_from_file($class);
  }
}

## simple subroutine
## Gives "? " followed by the sorted keys of $in->{$term} and a
## newline. A missing $term gives a bare newline, a $term whose value
## is not a reference gives nothing.
sub show_proposed {
  my ($in,$term)=@_;
  my $below=$in->{$term};
  if(not defined($below)) {
    #print Dumper $in;
    #die "I cant find input for term '$term'\n";
    return "\n";
  }
  return unless ref($below);
  return '?'.join('', map { ' '.$_ } sort keys %{$below})."\n";
}

## Debugging aid: picks the first handle, in sort order, of the fidek
## of class $fitcla. The prints that used it are commented out.
sub print_part_of_fidek {
  my ($d,$fitcla)=@_;
  my $fidek=$d->{'fidek'}->{$fitcla}
    or confess "undefined \$fidek for $fitcla\n";
  my $by_handle=$fidek->{'h'}
    or confess "undefined \$fidek->{'h'} for $fitcla\n";
  ## was shuffle
  my @sorted_handles=sort keys %{$by_handle};
  my $first_handle=$sorted_handles[0];
  #print "fitclas is $fitcla\n";
  #print "handle is $handle\n";
  #print Dumper $fidek->{'h'}->{$handle};
}

#sub prepare_for_dokli {
# my $d=shift;
#}

## True (1) when every sumda named in $d->{'sumdas'} is defined in the
## object, false (0) at the first one that is not.
sub is_fitrank_set {
  my ($d)=@_;
  for my $name (@{$d->{'sumdas'}}) {
    next if defined($d->{$name});
    $d->{'e'}->echo(__LINE__,"sumda $name is not set");
    return 0;
  }
  $d->{'e'}->echo(__LINE__,"all sumdas are set");
  return 1;
}

#sub are_sumdas_set {
# my $d=shift;
# foreach my $sumda (@{$d->{'sumdas'}}) {
# if(not defined($d->{$sumda})) {
# $d->{'e'}->echo(__LINE__,"sumda $sumda is not set");
# return 0;
# }
# }
# $d->{'e'}->echo(__LINE__,"all sumdas are set");
# return 1;
#}
##
#sub can_sumdas_be_read {
# my $d=shift;
# if(not defined($d->{'dokli_time'})) {
# return 0;
# }
#}

## Return the fitrank of $time. Sets $d->{'fitrank_file'} to the dump
## for that time in the fitar directory; without that file the result
## is ''. Otherwise the copy in $d->{'fitrank'}->{$time} is used if
## there is one, else the dump is read and kept there.
sub retrieve_fitrank {
  my $d=shift;
  my $time=shift // confess "retrieve_fitrank() needs a time argument.";
  my $dump=$d->{'fitar_dir'}.'/fitrank_'.$time.'.dump';
  $d->{'fitrank_file'}=$dump;
  return '' unless -f $dump;
  ## set them in a separate loop
  #
  #my $time=$d->{'dokli_time'} // $d->{'time'};
  my $held=$d->{'fitrank'}->{$time};
  if(defined($held)) {
    $d->{'e'}->echo(__LINE__,"fitrank of $time is defined");
    #$d->clear_other_fitranks($time);
    return $held;
  }
  defined($d->{'fitrank_file'})
    or confess 'I need fitrank_file defined here';
  #my $fitrank='';
  #eval {
  # $fitrank=&Ernad::Common::retrieve($fitrank_file);
  #};
  #if(not $fitrank) {
  # print "I can not retrieve $fitrank_file";
  #
  # }
  my $fresh=&Ernad::Store::get_dump($d->{'fitrank_file'});
  $d->{'fitrank'}->{$time}=$fresh;
  return $fresh;
}

sub clear_other_fitranks {
  my $d=shift;
  my $target_time=shift // confess "I need a time here";
  if(not defined($d->{'fitrank'}->{$target_time})) {
    confess "I need the fitrank for $target_time defined here";
  }
  foreach my $time (keys %{$d->{'fitrank'}}) {
    if($time != $target_time) {
      $d->{'e'}->echo(__LINE__,"I clear the fitrank of $time.",2);
      delete $d->{'fitrank'}->{$time};
    }
  }
  $d->{'e'}->echo(__LINE__,"I clear the fitrank of
$target_time.",2);
}

## a function to save memory
## Delete from $d->{$what} every entry whose key, a ten digit time, is
## not $except_time.
##   $what         name of the structure in the object, e.g. 'fitrank'
##   $except_time  the time to keep, ten digits
## Dies without arguments; confesses on a malformed time, on a key that
## is not a ten digit time, or if $d->{$what} is undefined.
sub clear_if_not_of_time {
  my $d=shift;
  my $what=shift or die;
  my $except_time=shift or die;
  if(not $except_time=~m|^\d{10}$|) {
    confess "bad time $except_time";
  }
  if(not defined($d->{$what})) {
    confess "no such what for $what";
  }
  ## walk the structure named by $what; the literal key 'what' that
  ## stood here never held anything, so nothing was ever cleared
  foreach my $time (keys %{$d->{$what}}) {
    if(not $time=~m|^\d{10}$|) {
      confess "bad time $time";
    }
    if($time == $except_time) {
      next;
    }
    delete $d->{$what}->{$time};
  }
}

## Print a dokli file to STDOUT with feature ranks replaced by the
## features themselves, see sema(). The file defaults to
## $d->{'dokli_file'}; $d->{'dokli_time'} has to be set.
sub print_dokli {
  my $d=shift;
  my $dokli_file=shift // $d->{'dokli_file'} // die;
  if(not -f $dokli_file) {
    confess "I can't see the file $dokli_file";
  }
  my $dokli_time=$d->{'dokli_time'};
  if(not defined($dokli_time)) {
    confess "I need a dokli_time here\n";
  }
  $d->retrieve_fitrank($dokli_time);
  $d->create_semaf($dokli_time);
  open(my $dokli_fh,'<',$dokli_file)
    or confess "I can not open the file $dokli_file";
  ## the readline operator had been lost from this loop
  while(defined(my $line=<$dokli_fh>)) {
    print $d->sema($line);
  }
  close($dokli_fh);
}

## Build $d->{'sema'}, the reverse of the fitrank of $time: it maps a
## rank to its feature. Confesses without a time or without a fitrank
## for that time.
sub create_semaf {
  my $d=shift;
  my $time=shift or confess "no time";
  my $fitrank=$d->{'fitrank'}->{$time}
    or confess "I need a fitrank defined here";
  foreach my $fit (keys %$fitrank) {
    my $rank=$fitrank->{$fit};
    $d->{'sema'}->{$rank}=$fit;
  }
}

## Translate one dokli line into readable form and return it.
##   $in    the line: an indicator, blank-separated "rank:weight"
##          parts, and an optional trailer "# [date] handle"
##   $mode  a string searched for the words 'indic' (prefix the
##          indicator), 'line' (keep everything on one line instead of
##          wrapping after 75 characters) and 'sema_only' (print the
##          feature without its weight); the default is 'full'
## Ranks are replaced using $d->{'sema'}; a rank without a feature, or a
## part that does not start with a rank, is shown as '/'. Features that
## contain whitespace are put in double quotes. Needs $d->{'sema'}.
sub sema {
  my $d=shift;
  my $in=shift;
  my $mode=shift // 'full';
  my $sema=$d->{'sema'} or confess "I need a sema here";
  #die $in;
  ## remove the date
  ## remove the indicator
  ## captures are only read after a successful match; $1 keeps its
  ## old value when a match fails
  my $indic='';
  if($in=~s|^(\S+)\s||) {
    $indic=$1;
  }
  my $handle;
  my $date;
  if($in=~s|\s+#\s+(\d{4}-\d{2}-\d{2})\s*(\S+)\s*$||) {
    $date=$1;
    $handle=$2;
  }
  elsif($in=~s|\s+#\s+(\S+)\s*$||) {
    $handle=$1;
  }
  my $out='';
  if($mode=~m|indic|) {
    $out.="$indic ";
  }
  if(defined($handle) and defined($date)) {
    $out.=$date.' '.uri_unescape($handle);
    if(not $mode=~m|line|) {
      $out.="\n ";
    }
    else {
      $out.=" ";
    }
  }
  elsif(defined($handle)) {
    $out.=uri_unescape($handle);
    if(not $mode=~m|line|) {
      $out.="\n ";
    }
    else {
      $out.=" ";
    }
  }
  my $line='';
  foreach my $part (split(/ /,$in)) {
    my $rank;
    if($part=~m|^(\d+):|) {
      $rank=$1;
    }
    my $fit='/';
    if(defined($rank)) {
      $fit=$sema->{$rank} // '/';
    }
    if($fit=~m|\s+|) {
      $fit="\"$fit\"";
    }
    if($mode=~m|sema_only|) {
      $part="$fit";
    }
    else {
      $part=~s|^(\d+):|$fit:|;
    }
    $out.=$part;
    $line.=$part;
    if(not $mode=~m|line| and length($line) > 75) {
      $line='';
      $out.="\n ";
    }
    else {
      $out.=' ';
      $line.=' ';
    }
  }
  $out.="\n";
  return $out;
}

##
## Store every sumda named in $d->{'sumdas'} in its sumda file. A
## sumda file that exists already is left alone.
sub write_sumdas {
  my $d=shift;
  foreach my $sumda (@{$d->{'sumdas'}}) {
    my $sumda_file=$d->{'sumda_file'}->{$sumda}
      or confess "You should have set the sumda file here";
    ## don't write the file again
    if(-f $sumda_file) {
      $d->{'e'}->echo(__LINE__,"I found the sumda file $sumda_file. I will not write it.");
      ## do what the message says; the file used to be rewritten
      next;
    }
    $d->{'e'}->echo(__LINE__,"I write the sumda file $sumda_file.");
    &Ernad::Files::prepare_for_file($sumda_file);
    store $d->{$sumda}, $sumda_file;
  }
}

##
## Build the fitrank of $d->{'time'} from the fideks, unless it is
## already in memory or can be retrieved with retrieve_fitrank().
sub summarize_fideks {
  my $d=shift;
  my $e=$d->{'e'};
  $d->{'e'}->echo(__LINE__,"I am summarizing the fideks.");
  my $time=$d->{'time'} or confess "I need an f->{time} here.";
  if(defined($d->{'fitrank'}->{$time})) {
    $d->{'e'}->echo(__LINE__,"I have the fitrank of $time, I don't need to summarize the fideks");
    return;
  }
  if($d->retrieve_fitrank($time)) {
    $d->{'e'}->echo(__LINE__,"I have retrieved the fitrank, I don't need to summarize the fideks");
    return;
  }
  my @fitclas=@{$d->{'fitclas'}};
  ## was shuffle
  foreach my $fitcla (sort @fitclas) {
    if($e->{'conf'}->{'no_frin'} and $fitcla eq 'frin') {
      next;
    }
    if($e->{'conf'}->{'no_frex'} and $fitcla eq 'frex') {
      next;
    }
    if(not $e->{'conf'}->{'separate_doklis'}) {
      ## do not read the fidek again!, don't know why
      if(not defined($d->{'fidek'}->{$fitcla})) {
        my $fidek_file=$d->{'fidek_file'}->{$fitcla};
        if(-f $fidek_file) {
$d->{'e'}->echo(__LINE__,"I'm reading the fidek from $fidek_file");
          ## NOTE(review): this confess makes the read below
          ## unreachable; it looks like a debugging trap - confirm
          ## before removing it
          confess "here $fidek_file";
          $d->read_fidek_from_file($fitcla);
        }
        else {
          $d->{'e'}->echo(__LINE__,"I have to renew the fidek");
          $d->renew_fidek($fitcla);
        }
      }
    }
    else {
      $d->{'e'}->echo(__LINE__,"Doklis are separate, I am told to renew the fidek.");
      $d->renew_fidek($fitcla);
    }
  }
  ## add up, over all handles and classes, how often each fit occurs
  foreach my $fitcla (@fitclas) {
    my $fidek=$d->{'fidek'}->{$fitcla}->{'h'};
    if(not $fidek) {
      ## NOTE(review): 'frex' is renewed whatever $fitcla is, and
      ## $fidek is not read again afterwards - confirm the intention
      if(not (($fitcla eq 'frex') and $d->{'no_frex'})) {
        $d->renew_fidek('frex');
      }
    }
    foreach my $handle (keys %${fidek}) {
      $d->{'e'}->echo(__LINE__,"handle is $handle",10);
      #$d->{'all_handles'}->{$handle}=1;
      foreach my $fit (keys %{$fidek->{$handle}}) {
        my $count_fit=$fidek->{$handle}->{$fit};
        if(not defined($d->{'count_fit'}->{$fit})) {
          $d->{'count_fit'}->{$fit}=$count_fit;
        }
        else {
          $d->{'count_fit'}->{$fit}+=$count_fit;
        }
      }
    }
  }
  ## delete singular fits
  foreach my $fit (keys %{$d->{'count_fit'}}) {
    if($d->{'count_fit'}->{$fit}<2) {
      delete $d->{'count_fit'}->{$fit};
      $d->{'deleted_fits'}->{$fit}=1;
    }
  }
  ## equal counts are ordered by the fit itself, so that the ranks do
  ## not depend on the order in which the hash hands out its keys
  @{$d->{'all_fits_ranked'}}=sort {
    $d->{'count_fit'}->{$b} <=> $d->{'count_fit'}->{$a} or $a cmp $b
  } keys %{$d->{'count_fit'}};
  my $rank=1;
  $d->{'e'}->echo(__LINE__,"I am deleting count_fit");
  delete $d->{'count_fit'};
  foreach my $fit (@{$d->{'all_fits_ranked'}}) {
    $d->{'fitrank'}->{$time}->{$fit}=$rank++;
  }
  $d->{'e'}->echo(__LINE__,"I am deleting all_fits_ranked");
  delete $d->{'all_fits_ranked'};
  if(not defined($d->{'fitrank_file'})) {
    confess 'I need to have the fitrank_file defined here';
  }
  $d->{'e'}->echo(__LINE__,"I am storing fitrank in $d->{'fitrank_file'}");
  #print Dumper $d->{'fitrank'};
  my $fitrank_time=$d->set_fitrank_time();
  &Ernad::Files::prepare_for_file($d->{'fitrank_file'});
  if(not ref($d->{'fitrank'}->{$fitrank_time})) {
    confess "I see no fitrank but ". Dumper keys %{$d->{'fitrank'}};
  }
  store($d->{'fitrank'}->{$fitrank_time}, $d->{'fitrank_file'});
  ## add a json store for the fitrank
  my $fitrank_json_file=$d->{'fitrank_file'};
  $fitrank_json_file=~s|\.dump$|.json|
    or confess "The fit rank file " .$d->{'fitrank_file'}." should have ended in .dump";
  &Ernad::Common::save_to_file_utf8($d->{'fitrank'}->{$fitrank_time},
                                    $fitrank_json_file);
  $d->{'e'}->echo(__LINE__,"I stored a copy of the fitrank in $fitrank_json_file");
}

## Summarize the fideks and create (or empty) the file named by
## $d->{'all_fitar_file'}. Nothing is written into that file here.
## Returns 1, dies if the file can not be opened.
sub build_all_fitar {
  my $d=shift;
  $d->summarize_fideks();
  my $all_fitar_file=$d->{'all_fitar_file'};
  &Ernad::Files::prepare_for_file($all_fitar_file);
  ## lexical handle, closed at once; the global handle F used to be
  ## left open here
  open(my $fitar_fh,'>',$all_fitar_file)
    or die "I can not open the all_fitar_file $all_fitar_file: $!";
  close($fitar_fh);
  return 1;
}

## Build the dokli file: the lisig, followed by one vemli line per
## handle of every issue date, the handles of a date in random order.
##   $force  optional; when true, build even if $d->{'current_dokli'}
##           says that the dokli is up to date
## Returns 1 when the dokli was built, 0 when it was up to date or when
## a lock file not older than a day was found in the dokli directory.
## Confesses if there is no lisig, no issue date, no handle for a
## date, or if more than half of the vemlis of a date are empty.
sub build_dokli {
  my $d=shift;
  my $e=$d->{'e'} // confess "I need an erimp here.";
  my $force=shift // '';
  if($d->{'current_dokli'}) {
    my $dokli_file=$d->{'current_dokli'};
    $d->{'e'}->echo(__LINE__,"The dokli $dokli_file is up to date.");
    if(not $force) {
      return 0;
    }
    $d->{'e'}->echo(__LINE__,"I am forced to continue.");
  }
  my $time=$d->{'time'} // time;
  ## again make sure we use the lisig_time if defined
  if(defined($d->{'lisig_time'})) {
    $d->{'time'}=$d->{'lisig_time'};
    $time=$d->{'lisig_time'};
  }
  my $dokli_lock=$d->{'dokli_dir'}.'/lock';
  if(-f $dokli_lock) {
    $d->{'e'}->echo(__LINE__,"I found a dokli_lock.");
    if(-M $dokli_lock > 1) {
      $d->{'e'}->echo(__LINE__,"I remove this old lock.");
      unlink $dokli_lock;
    }
    else {
      $d->{'e'}->echo(__LINE__,"I am leaving build_dokli because of the lock.");
      return 0;
    }
  }
  my $lisig=$d->{'lisig'} or confess "I need a lisig here.";
  foreach my $fitcla (@{$d->{'fitclas'}}) {
    $d->{'e'}->echo(__LINE__,"I build the exfits for $fitcla");
    $d->build_exfits($fitcla);
  }
  $d->{'e'}->echo(__LINE__,"end",10);
  $d->summarize_fideks();
  $d->{'e'}->echo(__LINE__,"I am deleting the checked_fitter");
  delete $d->{'checked_fitter'};
  my $fitrank_time=$d->set_fitrank_time();
  my $dokda=$d->get_dokda() or die;
  if(not defined($d->{'dokli_file'})) {
    $d->set_dokli();
  }
  &Ernad::Files::prepare_for_file($d->{'dokli_file'});
  ## lexical handle and three-argument open instead of the global F
  open(my $dokli_fh,'>',$d->{'dokli_file'})
    or confess "I can't open the dokli file ".$d->{'dokli_file'}.": $!";
  print {$dokli_fh} $lisig;
  my @issuedates;
  my $rerc;
  if($e->{'conf'}->{'separate_doklis'}) {
    if(not defined($e->{'repcode'})) {
      confess "I need a repcode here.";
    }
    $rerc=$e->{'report'}->{$e->{'repcode'}} // confess "I need a rerc here.";
    if(not keys %{$rerc->{'learnable_issuedates'}}) {
      $d->{'e'}->echo(__LINE__,"I resret the issuedates");
      $rerc->{'learnable_issuedates'}=&set_report_past_issuedates($e->{'repcode'});
      if(not scalar keys %{$rerc->{'learnable_issuedates'}}) {
        confess "I have no learnable issuedates.";
      }
    }
    @issuedates=sort keys %{$rerc->{'learnable_issuedates'}};
  }
  else {
    @issuedates=sort keys %{$d->{'issuedates'}};
  }
  if(not scalar(@issuedates)) {
    confess "I see no issuedates.";
  }
  foreach my $date (@issuedates) {
    $d->{'e'}->echo(__LINE__,"I found a date called $date",2);
    my $handle_ref=$dokda->{'handles'}->{$date};
    if($e->{'conf'}->{'separate_doklis'}) {
      my $repcode=$e->{'repcode'} // confess "I need a repcode here.";
      my $rerc=$e->{'report'}->{$e->{'repcode'}} // confess "I need a rerc here.";
      my $sent_dir=$rerc->{'dir'}->{'sent'} // confess "I need a sent_dir here.";
      if(not &Ernad::Common::is_issue_in_dir($sent_dir,$date)) {
        $e->echo(__LINE__,"$date is not issued yet, it's not going into the dokli.");
        next;
      }
    }
    if(not defined($handle_ref)) {
      confess "no handles defined for $date";
    }
    $d->{'e'}->echo(__LINE__,"build_dokli, date is $date");
    my $count_vemlis=0;
    my $count_empty_vemlis=0;
    foreach my $handle (shuffle @$handle_ref) {
      $d->{'e'}->echo(__LINE__,"build_dokli, handle is $handle",10);
      my $vemli=$d->prep_vemli($handle,$date);
      ## vemli may be empty if it is a duplicate handle
      $count_vemlis++;
      if(not $vemli) {
        $count_empty_vemlis++;
        next;
      }
      $d->{'e'}->echo(__LINE__,"vemli\n$vemli",5);
      print {$dokli_fh} $vemli
        or confess "I can't write to the ".$d->{'dokli_file'};
    }
    if($count_empty_vemlis > $count_vemlis / 2) {
      confess "I see more empty than full vemlis, something is wrong here.";
    }
  }
  close($dokli_fh)
    or confess "I can't close the ".$d->{'dokli_file'}.": $!";
  ## check if empty, if so, delete
  my $dokli_file=$d->{'dokli_file'};
  if(-z $dokli_file) {
    $d->{'e'}->echo(__LINE__,"WARN I am deleting the empty dokli_file $dokli_file");
    unlink($dokli_file);
  }
  #$d->{'e'}->echo(__LINE__,"I am deleting current_handle_fits");
  #delete $d->{'current_handle_fits'};
  $d->{'e'}->echo(__LINE__,"I am deleting fitrank");
  #delete $d->{'fitrank'};
  $d->get_dodex();
  $d->{'e'}->echo(__LINE__,"I have finished build_dokli.");
  unlink $dokli_lock;
  return 1;
}

## Return the name of the dokli file, $d->{'dokli_file'}. If the file
## does not exist, build_dokli() is called first.
sub get_dokli_file {
  my $d=shift;
  my $dokli_file=$d->{'dokli_file'}
    or confess "I can not find a dockli_file set.";
  if(not -f $dokli_file) {
    ## echo takes the line number first, it was missing here
    $d->{'e'}->echo(__LINE__,"I build dokli_file $dokli_file");
    $d->build_dokli();
  }
  return $dokli_file;
}

## Flatten $d->{'fisto'} into $d->{'fisto_handles'}: for every issue
## date and handle, the counts of the fits of all feature classes end
## up in one hash. A fit found in several classes keeps the count of
## the class that comes last in $d->{'fitclas'}. $d->{'fisto'} is
## deleted at the end.
sub summarize_fisto {
  my $d=shift;
  foreach my $issuedate (keys %{$d->{'fisto'}}) {
    foreach my $fitcla (@{$d->{'fitclas'}}) {
      my $by_handle=$d->{'fisto'}->{$issuedate}->{$fitcla}->{'h'};
      foreach my $handle (keys %{$by_handle}) {
        foreach my $fit (keys %{$by_handle->{$handle}}) {
          ## NOTE(review): there was a "+=" here that the assignment
          ## right after it overwrote; the effective behaviour, plain
          ## assignment, is kept - confirm that no sum was intended
          $d->{'fisto_handles'}->{$issuedate}->{$handle}->{$fit}
            =$by_handle->{$handle}->{$fit};
        }
      }
    }
  }
  delete $d->{'fisto'};
}

## Prepare the vemli, i.e. the dokli line, of $handle at $date. The
## result is '' when no feature can be gathered for the handle.
sub prep_vemli {
  my $d=shift;
  my $handle=shift;
  my $date=shift;
  $d->{'e'}->echo(__LINE__,"preparing vemli for $handle",10);
  $d->gather_features_from_various_fideks($handle);
  my $fits=$d->{'current_handle_fits'};
  #if(not defined($fits)) {
  # $d->{'e'}->echo(__LINE__,"WARN: I can't see the fits for $handle at $date";
  if(not defined($fits)) {
    $d->{'e'}->echo(__LINE__,"I can't see a feature for handle $handle, most likely it's a duplicate handle");
    return '';
  }
  my $fitrank_time=$d->set_fitrank_time();
  my $fitrank=$d->{'fitrank'}->{$fitrank_time}
    or confess "I need a fitrank here.";
  my $deleted_fits=$d->{'deleted_fits'};
  ### or confess "I need deleted fits here";
  my $wm=$d->{'weigh'} or confess "I need a weighing scheme";
  my $weights=&{$wm}($fits,$fitrank,$deleted_fits);
my $safe_handle=uri_escape($handle);
  my $out="$date$weights # ". "$safe_handle\n";
  $d->{'line'}=$out;
  return $out;
}

## The vemli of $handle at $date, made from the features gathered in
## $d->{'fisto_handles'} and weighed against the fitrank of
## $d->{'fitrank_time'}. It is also kept in $d->{'vemli'}.
sub prep_vemli_fisto {
  my ($d,$handle,$date)=@_;
  my $fits=$d->{'fisto_handles'}->{$date}->{$handle}
    or confess "I can't see feature for handle $handle\n";
  my $fitrank=$d->{'fitrank'}->{$d->{'fitrank_time'}}
    or confess "I need a fitrank here.";
  #my $deleted_fits=$d->{'deleted_fits'}
  # or confess "I need deleted fits here";
  my $wm=$d->{'weigh'} or confess "I need a weighing scheme";
  my $weights=$wm->($fits, $fitrank); # $deleted_fits);
  my $vemli="$date$weights # ".uri_escape($handle)."\n";
  $d->{'vemli'}=$vemli;
  return $vemli;
}

# my @fits=keys %{$fits};
# my $rank;
# my $total=0;
# my $norm=0;
# my $vemliner;
# if(not defined($d->{'fitrank'})) {
# confess "I need \$d->{fitrank} defined";
# }
# foreach my $fit (@fits) {
# $rank=$d->{'fitrank'}->{$d->{'fitrank_time'}}->{$fit};
# if(not $rank) {
# $d->{'e'}->echo(__LINE__,"I update. Fit '$fit' has no rank, I skip.");
# next;
# }
# my $count=$d->{'fisto_handles'}->{$date}->{$handle}->{$fit};
# if(not $count) {
# confess "I can't find a count for feature '$fit'." . Dumper $d->{'fisto_handles'}->{$date};
# next;
# }
# my $weight=sqrt($count);
# $vemliner->{$rank}=sqrt($count);
# $total+=$weight;
# }
# my $out;
# my $denominator=sqrt($norm);
# foreach my $rank (sort {$a <=> $b} keys %{$vemliner}) {
# my $weight=$vemliner->{$rank}/$total;
# $weight=sprintf("%6.4f",$weight);
# my $print_weight=substr($weight,1);
# $out.=" $rank:".$print_weight;
# }
# my $safe_handle=uri_escape($handle);
# $out="$date$out # ". "$safe_handle\n";
# # return $out;
#}

## Hand out the dokda, reading it through read_dokda() if the object
## does not hold one yet. The second argument is accepted but unused.
sub get_dokda {
  my ($d,$fitcla)=@_;
  return $d->{'dokda'} if defined($d->{'dokda'});
  $d->{'e'}->echo(__LINE__,"reading dokda");
  $d->read_dokda();
  return $d->{'dokda'};
}

## Fill $d->{'dokda'}->{'handles'}: for every file <date>_term.txt of
## the exfit directory, the list of its handles under that date. A
## handle is any non-empty line that does not begin with a blank.
sub read_dokda {
  my ($d)=@_;
  my $exfit_dir=$d->{'exfit_dir'};
  ## we use the exfit for term only
  $d->{'e'}->echo(__LINE__,"In read_dokda, I need to check the exfits for term.");
  $d->build_exfits('term');
  opendir( my $dir_handle, $exfit_dir)
    or confess "I can't open the exfit_dir $exfit_dir.";
  while (my $entry = readdir $dir_handle ) {
    unless($entry=~m|(\d{4}-\d{2}-\d{2})_term\.txt$|) {
      $d->{'e'}->echo(__LINE__,"I'm skipping file $entry when looking for dodka files.");
      next;
    }
    my $date=$1;
    my $path="$exfit_dir/$entry";
    my @lines=&File::Slurper::read_lines($path);
    $d->{'e'}->echo(__LINE__,"I look for dodka in $path");
    for my $line (@lines) {
      chomp $line;
      next if(not defined($line) or not $line);
      ## a line without a blank starts a new handle, skip others
      $d->{'e'}->echo(__LINE__,"dokda line is '$line'",10);
      if($line=~m|^ |) {
        $d->{'e'}->echo(__LINE__,"I go next",10);
        next;
      }
      $d->{'e'}->echo(__LINE__,"dokda line is '$line'",10);
      $d->{'e'}->echo(__LINE__,"I found handle $line in $path for dokda",10);
      push(@{$d->{'dokda'}->{'handles'}->{$date}},$line);
      #$d->{'dokda'}->{'date'}->{$handle}=$date;
    }
  }
}

sub read_dokda_old {
  my $d=shift;
  my $exfit_dir=$d->{'exfit_dir'};
  ## we use the exfits for term only
  $d->{'e'}->echo(__LINE__,"In read_dokda, I need to check the exfits for term.");
  $d->build_exfits('term');
  ## these are the lines that contain handles
  my $s="grep -vH '^ ' $exfit_dir/*term*";
  my @lines=`$s`;
  my $count_lines=0;
  foreach my $line (@lines) {
    chomp $line;
    $count_lines++;
    if(not defined($line) or not $line) {
      next;
    }
    $d->{'e'}->echo(__LINE__,"dokda line is '$line'");
    if(not $line=~m|^([^:]*):(.+)|) {
      $d->{'e'}->echo(__LINE__,"bad dodka line $count_lines\n$line");
      #exit;
      next;
    }
    my
$name=$1;
    my $handle=$2;
    $name=~m|/(\d{4}-\d{2}-\d{2})_term\.txt|;
    my $date=$1;
    if(not $date) {
      $d->{'e'}->echo(__LINE__,"I could not parse a date from '$line'");
    }
    if(not $handle) {
      next;
    }
    $d->{'e'}->echo(__LINE__,"I found handle $handle in at date $date in dokda",3);
    push(@{$d->{'dokda'}->{'handles'}->{$date}},$handle);
    #$d->{'dokda'}->{'date'}->{$handle}=$date;
  }
}

##
## Read the leading lines of a dokli file and return a hash reference
## with the issue dates found there as keys, each with value 1.
## Reading stops at the first line that does not contain a '#'
## followed by a date of the form yyyy-mm-dd.
## Confesses without a file argument or if the file can not be opened,
## dies if the file does not exist.
sub get_isssuedates_hash_from_dokli_file {
  my $d=shift;
  ## a missing argument used to turn into a message string that was
  ## then taken for the file name
  my $file=shift // confess "I need a file here.";
  if(not -f $file) {
    die "I can't open the file $file.";
  }
  open(my $dokli_fh,'<',$file)
    or confess "I can't open the file $file: $!";
  my $dates_in_dokli={};
  ## the readline operator had been lost from this loop
  while(defined(my $line=<$dokli_fh>)) {
    chomp $line;
    if($line=~m|\s*#\s*(\d{4}-\d{2}-\d{2})|) {
      $dates_in_dokli->{$1}=1;
    }
    else {
      last;
    }
  }
  close($dokli_fh);
  return $dates_in_dokli;
}

## used for bim
## Write a test file for the issue date of the erimp $main::e, with
## one line "0<weights> # <handle>" per handle of $namex_file, built
## from term features only and weighed against the fitrank that
## belongs to the current dokli. The test file lives next to the
## dokli. Returns its name; the file is left alone when
## does_file_need_renewal() says that it is current.
## Dies if the issue date is already in the dokli; confesses if the
## namex file is missing, the dokli file name can not be parsed, there
## is no weighing function, or the test file can not be written.
sub make_test_file_with_terms_only {
  my $d=shift;
  my $namex_file=shift;
  if(not -f $namex_file) {
    confess "I can't open $namex_file";
  }
  my $e=$main::e // confess "I need an erimp here";
  my $issuedate=$e->{'issuedate'};
  if(not defined($d->{'dokli_file'})) {
    $d->set_dokli();
  }
  if(not $d->{'fitrank_time'}) {
    $d->set_fitrank_time();
  }
  my $dokli_file=$d->{'dokli_file'} // confess "I need a dokli_file here.";
  my $dates_in_dokli=$d->get_isssuedates_hash_from_dokli_file($dokli_file);
  if(defined($dates_in_dokli->{$issuedate})) {
    die "I should not build a test file for a date $issuedate I have in my dokli $dokli_file.";
  }
  my $dokli_file_name=basename($dokli_file);
  ## Xmas 2017
  ## the time is either ten digits, or six characters that
  ## Krichel::Shoti::ekam() converts
  my $time;
  if($dokli_file_name=~m|\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_(\d{10})|) {
    $time=$1;
  }
  elsif($dokli_file_name=~m|\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_([0-9a-z]{6})|) {
    my $shoti=$1;
    $time=&Krichel::Shoti::ekam($shoti);
  }
  else {
    confess "I can't parse the dokli file name $dokli_file_name";
  }
  $d->{'dokli_time'}=$time;
  $d->{'update_time'}=$time;
  my $fitrank=$d->retrieve_fitrank($time);
  ## determine the test file name
  my $test_file=dirname($dokli_file).'/'.$issuedate.'_'.$time.'.test';
  if(not &Ernad::Common::does_file_need_renewal($test_file,$dokli_file,$namex_file)) {
    $e->echo(__LINE__,"I skip the renewal of $test_file, it needs no renewal.",2);
    $e->{'test_file'}=$test_file;
    return $test_file;
  }
  ## this open was unchecked, a failure went unnoticed
  open(my $test_fh,'>',$test_file)
    or confess "I can't open the test file $test_file: $!";
  #$m->get_dokli_object();
  ## d->{'sema'} is of the form number --> fit
  $d->create_semaf($time);
  ## the weighing functions
  my $wm=$d->{'weigh'} or confess "I need a weighing scheme";
  my $fisto_dump=dirname($dokli_file).'/'.$issuedate.'_'.$time.'.fisto.dump';
  my $fisto;
  if(-f $fisto_dump) {
    $fisto=retrieve($fisto_dump);
  }
  else {
    $d->add_lines_for_fisto('term',$namex_file,$issuedate);
    $fisto=$d->{'fisto'}->{$issuedate}->{'term'}->{'h'};
    store $fisto, $fisto_dump;
  }
  my @handles=shuffle keys %$fisto;
  foreach my $handle (@handles) {
    my $fits=$fisto->{$handle};
    my $weights=&{$wm}($fits,$fitrank);
    my $safe_handle=uri_escape($handle);
    my $out="0$weights # ". "$safe_handle\n";
    print {$test_fh} $out;
  }
  #print Dumper $fisto;
  #die;
  close($test_fh)
    or confess "I can't close the test file $test_file: $!";
  return $test_file;
}

## Append the issues that are not yet in the dokli $dokli_file to it.
## The time of the dokli is taken from its file name.
sub update_dokli {
  my $d=shift;
  my $dokli_file=shift or confess "I need a dokli_file here.";
  my $dokli_file_name=basename($dokli_file);
  if(not $dokli_file_name=~m|\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_(\d{10})|) {
    confess "I can't parse the dokli file name $dokli_file_name";
  }
  my $time=$1;
  $d->{'dokli_time'}=$time;
  $d->{'update_time'}=$time;
  ## make sure we have no_frex defined
  if(not defined($d->{'no_frex'})) {
    $d->{'no_frex'}='';
  }
  ## I am not sure we need that here
  #$d->set_dirs();
  if(not -f $dokli_file) {
    confess "I can't open the dokli_file $dokli_file";
  }
  ## these should take the time as argument
  $d->retrieve_fitrank($time);
  $d->create_semaf($time);
  foreach my $fitcla (@{$d->{'fitclas'}}) {
    if($fitcla eq 'frex' and $d->{'no_frex'}) {
      next;
    }
    if($fitcla eq 'frin' and $d->{'no_frin'}) {
      next;
    }
    my $glob=$d->{'fidek_dir'}.'/*'.$d->{'dokli_time'}.'_'.$fitcla.'.dump';
    my @files=glob($glob)
      or confess "no fidek dump for $fitcla at dokli ".$d->{'dokli_time'};
    my $file=$files[0];
    $d->{'e'}->echo(__LINE__,"I found a fidek
file for update $file");
    ## say that this is an update
    $d->{'update'}->{'fidek_file'}->{$fitcla}=$file;
    ## say that this is an update
    $d->{'fidek_file'}->{$fitcla}=$file;
  }
  ## load frexers
  if(not $d->{'no_frex'}) {
    foreach my $end ('pl','txt','dump') {
      my $glob=$d->{'fitar_dir'}.'/*'.$d->{'dokli_time'}.'_frex.'.$end;
      my @files=glob($glob)
        or confess "no frexer with end $end at dokli ".$d->{'dokli_time'};
      my $file=$files[0];
      if(not -f $file) {
        confess "no such file $file\n";
      }
      $d->{'e'}->echo(__LINE__,"I found a frexer $file");
      $d->{'update'}->{'frexer_file'}->{$end}=$file;
    }
  }
  my $dokda=$d->get_dokda();
  ## finds the current dates that are not in the dokli
  $d->set_dates_to_add($dokli_file);
  if(not scalar (keys %{$d->{'dates_to_add'}})) {
    $d->{'e'}->echo(__LINE__,"I can't see any issues to be added");
    return;
  }
  $d->fisto_from_exfits_without_scanner('term');
  ## NOTE(review): frin is processed here even when no_frin is set,
  ## unlike in the loop over the fideks above - confirm
  $d->fisto_from_exfits_without_scanner('frin');
  if(not $d->{'no_frex'}) {
    $d->fisto_from_exfits_with_scanner();
  }
  #$d->{'e'}->echo(__LINE__,Dumper $d->{'fisto'},3);
  foreach my $date (keys %{$d->{'dates_to_add'}}) {
    my $handle_ref=$dokda->{'handles'}->{$date};
    if(not defined($handle_ref)) {
      confess "no handles defined for $date";
    }
  }
  $d->summarize_fisto();
  if(not defined($d->{'dates_to_add'})) {
    confess "I need \$d->{dates_to_add} defined here.";
  }
  my $dokli_lock=$dokli_file;
  $dokli_lock=~s|\.txt$|.lock| or confess;
  if(-f $dokli_lock) {
    $d->{'e'}->echo(__LINE__,"I found a dokli_lock $dokli_lock");
    if(-M $dokli_lock > 1) {
      $d->{'e'}->echo(__LINE__,"I remove this old lock.");
      unlink $dokli_lock;
    }
    else {
      $d->{'e'}->echo(__LINE__,"I am leaving build_dokli because of the lock.");
      return 0;
    }
  }
  ## one test decides both whether the dokli is opened and whether it
  ## is written to. The file used to be opened when no_update was
  ## undefined or false, but written only when it was undefined.
  my $write_to_dokli=(not $d->{'no_update'});
  my $dokli_fh;
  if($write_to_dokli) {
    $d->{'e'}->echo(__LINE__,"I open $dokli_file");
    open($dokli_fh,'>>',$dokli_file)
      or confess "I can't open the dokli_file $dokli_file: $!";
  }
  foreach my $issuedate (sort keys %{$d->{'dates_to_add'}}) {
    $d->{'e'}->echo(__LINE__,"I am updating $issuedate");
    foreach my $handle (shuffle keys %{$d->{'fisto_handles'}->{$issuedate}}) {
      my $vemli=$d->prep_vemli_fisto($handle,$issuedate)
        or confess "I could not build a vemli";
      my $sema=$d->sema($vemli);
      if($write_to_dokli) {
        ## Rosa waiting bug
        #if($vemli=~m|^\d{4}-\d{2}-\d{2} \d+ |) {
        # confess "This is not a valid vemli: $vemli";
        #}
        chomp $vemli;
        $d->{'e'}->echo(__LINE__,"I am printing $vemli",4);
        print {$dokli_fh} "$vemli\n";
      }
      else {
        ## no update wanted, only show what would be added
        $d->{'e'}->echo(__LINE__, $sema);
      }
    }
  }
  $d->{'e'}->echo(__LINE__,"I am done with update_dokli of $dokli_file");
  ## only close what has been opened
  if($write_to_dokli) {
    close($dokli_fh)
      or confess "I can't close the dokli_file $dokli_file: $!";
  }
  ## the index of the dokli is out of date now
  if(defined($d->{'dodex_file'})) {
    $d->{'e'}->echo(__LINE__,"I delete ".$d->{'dodex_file'});
    unlink $d->{'dodex_file'};
  }
  my $dokli_bana=basename($dokli_file);
  delete $d->{'dodex'}->{$dokli_bana};
  delete $d->{'dodex_file'};
  delete $d->{'dates_to_add'};
  delete $d->{'fh'}->{$dokli_bana};
  unlink $dokli_lock;
}

## Fill the fisto with the frex features of all dates in
## $d->{'dates_to_add'}. The scanner is loaded from the frexer .pl
## file noted for the update; the file scanned for each date is the
## term exfit of that date. Confesses without a frexer .pl, without
## dates to add, or if an exfit file is missing.
sub fisto_from_exfits_with_scanner {
  my $d=shift;
  $d->{'scan_file'}->{'frex'}=$d->{'update'}->{'frexer_file'}->{'pl'}
    or confess "I don't see a frexer pl";
  $d->get_scanner('frex');
  $d->{'e'}->echo(__LINE__,"done");
  ## check if we have to renew the exfits
  $d->build_exfits('frex');
  if(not defined($d->{'dates_to_add'})) {
    confess "I have no dates to add";
  }
  foreach my $issuedate (sort keys %{$d->{'dates_to_add'}}) {
    ## my $file=$d->{'exfit_dir'}.'/'.$issuedate.'_frex.txt';
    my $file=$d->{'exfit_dir'}.'/'.$issuedate.'_term.txt';
    if(not -f $file) {
      confess "I can't find the exfit file $file to add to the fisto";
    }
    $d->scan_file_for_frex_fits($file,'fisto',$issuedate);
  }
}

## Build the fitar of class $fitcla: count the features from the
## exfits, call remove_features_below_limit() with limit 1 for the
## classes frin and term, and print what is left with print_fits().
sub build_fitar {
  my $d=shift;
  my $fitcla=shift or confess 'no fitcla';
  $d->build_fre_from_exfits($fitcla);
  if($fitcla eq 'frin' or $fitcla eq 'term') {
    $d->remove_features_below_limit($fitcla,1);
  }
  $d->print_fits($fitcla,'ar');
}

## Gather, in $d->{'current_handle_fits'}, the ranked features that
## the fideks of all feature classes hold for $handle, with their
## counts. What was gathered for an earlier handle is dropped first.
sub gather_features_from_various_fideks {
  my $d=shift;
  my $handle=shift // confess "I have no handle to gather fits for";
  if(defined $d->{'current_handle_fits'}) {
    $d->{'e'}->echo(__LINE__,"I am deleting current_handle_fits",3);
    delete $d->{'current_handle_fits'};
  }
  my
@fitclas=@{$d->{'fitclas'}}; my $ranker=$d->{'fitrank'}->{$d->{'fitrank_time'}} or confess "I don't seem to have fitrank at fitrank time"; if(not $d->{'fidek'}) { $d->read_fideks(); if(not $d->{'fidek'}) { confess "I don't see \$d->{'fidek'}"; } } ## we should at least have term fits if(not $d->{'fidek'}->{'term'}) { confess "I don't see \$d->{'fidek'}->{'term'}"; } ## we should at least have term fits if(not $d->{'fidek'}->{'term'}->{'h'}) { #print Dumper $d->{'fidek'}->{'term'}; confess "I don't see \$d->{'fidek'}->{'term'}->{'h'}\n"; } foreach my $fitcla (@fitclas) { if($d->{'e'}->{'conf'}->{'no_frex'} and $fitcla eq 'frex') { next; } if($d->{'e'}->{'conf'}->{'no_frin'} and $fitcla eq 'frin') { next; } my $fits=$d->{'fidek'}->{$fitcla}->{'h'}->{$handle}; my $d_fits=Dumper $fits; $d->{'e'}->echo(__LINE__,"I got \$fits $d_fits",10); foreach my $fit (keys %{$fits}) { my $rank=$ranker->{$fit}; if(not defined($rank)) { $d->{'e'}->echo(__LINE__,"skipping feature $fit without rank",4); next; } else { $d->{'e'}->echo(__LINE__,"keeping feature $fit with rank $rank",4); } if(defined($d->{'current_handle_fits'}->{$fit})) { $d->{'e'}->echo(__LINE__,"$fit is in several fitclas"); } my $count=$fits->{$fit}; $d->{'e'}->echo(__LINE__,"setting a count $count for fit $fit",4); $d->{'current_handle_fits'}->{$fit}=$count; } #print Dumper $d->{'current_handle_fits'}; #exit; ## save memory but better make sure the sumda is not saved $d->{'e'}->echo(__LINE__,"I am deleting the fidek for fitcla at $handle",3); delete $d->{'fidek'}->{$fitcla}->{'h'}->{$handle}; } } sub get_dodex { my $d=shift; my $dokli_file=shift; if(not defined($dokli_file)) { $dokli_file=$d->{'dokli_file'} or confess "I need a dokli_file set"; } if(not -f $dokli_file) { confess "I can't see the file $dokli_file"; } my $dodex_file=$dokli_file; $dodex_file=~s|\.txt|.index| or confess "I have a bad dockli_file name $dokli_file"; if(not &Ernad::Common::does_file_need_renewal($dodex_file,$dokli_file)) { 
$d->{'e'}->echo(__LINE__,"dodex $dodex_file needs no update."); $d->{'dodex_file'}=$dodex_file; $d->{'dokli_file'}=$dokli_file; $d->{'e'}->echo(__LINE__,"get_dodex: I get it from file $dodex_file"); $d->get_dodex_from_file; $d->{'e'}->echo(__LINE__,"I loaded $dodex_file."); return $dodex_file; } $d->{'e'}->echo(__LINE__,"get_dodex: I update $dodex_file"); open(F,"< $dokli_file") or confess "I can not open the file $dokli_file"; open(I,"> $dodex_file") or confess "I can not open the file $dodex_file"; my $count_bytes=0; my $old_issuedate=''; my $line; ## I qadd a check if the dates are set chunks my $issuedates_done; while($line=) { if($d->is_this_a_lisig_line($line)) { # $count_bytes+=length($line); next; } $line=~m|^(\d{4}-\d{2}-\d{2}) | or confess "I found a bad line '$line' in the dokli $dokli_file"; my $issuedate=$1; if(defined($issuedates_done->{$issuedate})) { confess "I found $issuedate again in the dokli_file $dokli_file\n"; } if(not $old_issuedate) { $old_issuedate=$issuedate; $count_bytes+=length($line); next; } if($old_issuedate eq $issuedate) { $count_bytes+=length($line); next; } print I "$old_issuedate $count_bytes\n"; $issuedates_done->{$old_issuedate}=1; $old_issuedate=$issuedate; ## since it's only ascii, we have no problem using the length $count_bytes+=length($line); } print I "$old_issuedate $count_bytes\n"; close F; close I; $d->{'e'}->echo(__LINE__,"I am calling get_dodex_from_file again, after update."); $d->{'dodex_file'}=$dodex_file; ## ok we could have done the job here ... 
$d->get_dodex_from_file($dokli_file); return $d->{'dodex_file'}; } sub get_dodex_from_file { my $d=shift; my $dodex_file=$d->{'dodex_file'} or confess "I need a dodex_file set"; ## to set a range of dodexes for various dokli files my $dokli_file=$d->{'dokli_file'} or confess "I need a dokli_file set"; my $time; if(-f $dokli_file) { $time=&Ernad::Common::get_time_from_file_name($dokli_file); } else { $time=$d->{'time'} or confess "I need the time set here"; } open(I,"< $dodex_file") or confess "I can not open the file $dodex_file"; delete $d->{'dodex'}; my $old_offset=''; my $line; while($line=) { $line=~m|^(\d{4}-\d{2}-\d{2}) (\d+)| or confess "I found a bad line '$line' in the dodex $dodex_file"; my $issuedate=$1; my $offset=$2; if(not $old_offset) { $d->{'dodex'}->{$time}->{$issuedate}->{'start'}=0; $d->{'dodex'}->{$time}->{$issuedate}->{'from'}=$offset; $old_offset=$offset; next; } $d->{'dodex'}->{$time}->{$issuedate}->{'start'}=$old_offset; $d->{'dodex'}->{$time}->{$issuedate}->{'from'}=$offset-$old_offset; $old_offset=$offset; } } sub is_date_in_dokli { my $d=shift; my $date=shift or confess "I need a date here."; my $dokli_file=shift or confess "I need a dokli_file here."; $d->echo(__LINE__,"I am looking for date $date in the dokli $dokli_file."); if(not -f $dokli_file) { confess "The dokli $dokli_file has gone."; } ## additional check to see if time is the same as on the dokli my $dokli_time=&Ernad::Common::get_time_from_file_name($dokli_file); $d->echo(__LINE__,"File $dokli_file has time $dokli_time."); my $time=$d->{'time'} or confess "I need the time set here."; if($dokli_time ne $time) { $d->echo(__LINE__,"I set d->{time} $time to $dokli_time."); #$d->echo(__LINE__,"I want to set d->{time} $time to $dokli_time."); $d->{'time'}=$d->{'dokli_time'}; #$d->{'time'}=$d->{'dokli_time'}; } if(not defined($d->{'dodex'}->{$dokli_time}->{$date})) { $d->echo(__LINE__,"I get the dodex for $dokli_file."); $d->get_dodex($dokli_file); if(not 
defined($d->{'dodex'}->{$dokli_time})) { if(defined($d->{'dodex'}->{$time}->{$date})) { $d->echo(__LINE__,"I set d->{time} back to $time."); $d->{'time'}=$time; return 1; } ### !!! $d->echo(__LINE__,"I can't get the dodex."); $d->echo(__LINE__,"dokli_time is $dokli_time."); my $dodex=$d->{'dodex'}; foreach my $date (sort keys %$dodex) { $d->echo(__LINE__,"I have a time $time in dodex."); } $d->echo(__LINE__,"I load dodex for $dokli_file."); $d->get_dodex($dokli_file); foreach my $date (sort keys %$dodex) { $d->echo(__LINE__,"I have a time $time in dodex."); } exit; if(not defined($d->{'dodex'}->{$dokli_time})) { confess "I still don't have a dodex for time $time nor time $dokli_time."; } } } if(defined($d->{'dodex'}->{$dokli_time}->{$date})) { return 1; } return 0; } sub get_vemlis_by_date { my $d=shift; my $issuedate=shift or confess "I need an issuedate argument here"; my $dokli_file=shift; my $model_time=shift // ''; if(not defined($dokli_file)) { $dokli_file=$d->{'dokli_file'} or confess "I need a dokli_file set"; } $d->{'e'}->echo(__LINE__,"I get my vemlis from $dokli_file",1); if(not -f $dokli_file) { delete $d->{'dokli_file'}; $d->{'e'}->echo(__LINE__,"I don't have this dokli_file $dokli_file"); $d->build_dokli('add'); } if(not -f $dokli_file) { confess "I can't see $dokli_file"; } ## refresh the dodex, it may not be current after update_dokli $d->get_dodex($dokli_file); my $dodex_dump=Dumper $d->{'dodex'}; $d->{'e'}->echo(__LINE__,"dodex now: $dodex_dump",4); my $lisig=$d->get_lisig_from_dokli($dokli_file); if(not $lisig) { confess "I need a lisig here."; } my $lisig_length=length($lisig) or confess "I need a lisig here"; my $dokli_time=&Ernad::Common::get_time_from_file_name($dokli_file); my $start=$d->{'dodex'}->{$dokli_time}->{$issuedate}->{'start'} // 0; if(not $start and $start != 0) { $d->{'e'}->echo(__LINE__,"No $start start for $issuedate in dodex"); return; } $d->{'e'}->echo(__LINE__,"Start is $start start for $issuedate in dodex"); my 
$from=$d->{'dodex'}->{$dokli_time}->{$issuedate}->{'from'} // ''; if(not $from) { $d->{'e'}->echo(__LINE__,"No \$from for $issuedate in dodex"); return; } else { $d->{'e'}->echo(__LINE__,"From is $from for $issuedate in dodex"); } my $dokli_bana=basename($dokli_file); if(not defined($d->{'fh'}->{$dokli_bana})) { open($d->{'fh'}->{$dokli_bana},"< $dokli_file") or confess "I can not open the file $dokli_file"; } seek($d->{'fh'}->{$dokli_bana},$start+$lisig_length,0); my $text; read($d->{'fh'}->{$dokli_bana},$text,$from); $text=~s|$issuedate\s+||g; if(not $text) { $d->{'e'}->echo(__LINE__,"vemlis found are empty"); } return $text; } ## this should only be used in case the by isusedate fails. sub get_vemli_by_handle { my $d=shift; my $e=$d->{'e'}; my $handle=shift or confess "I need an issuedate argument here"; my $dokli_file=shift; my $issuedate=shift or confess "I need an issuedate argument here"; if(not defined($dokli_file)) { $dokli_file=$d->{'dokli_file'} or confess "I need a dokli_file set."; } $d->{'e'}->echo(__LINE__,"I get vemlis for $handle from $dokli_file",7); if(not -f $dokli_file) { delete $d->{'dokli_file'}; $d->{'e'}->echo(__LINE__,"I don't have this dokli_file $dokli_file"); $d->build_dokli('add'); } if(not -f $dokli_file) { confess "I can't see $dokli_file"; } open(F,"< $dokli_file"); my $done=0; my $line; my $u_h=uri_escape($handle); while(not $done and $line=) { if($line=~m|\s+#\s+\Q$u_h\E\s*$|) { chomp $line; close F; return $line; } } ## We have no data for that paper if($e->{'conf'}->{'separate_doklis'}) { $e->echo(__LINE__,"WARN: I can't find $handle in the $dokli_file I fake it."); } else { $e->echo(__LINE__,"WARN: I can't find $handle in the dokli $dokli_file, I fake it."); } ## make up a fake entry my $uh=uri_escape($handle); return "$issuedate # $uh"; } sub get_dokli_by_time { my $d=shift; my $time=shift or confess "I need a time here."; if(not $time=~m|^\d{10}$|) { confess "Your time $time looks dodgy."; } my 
$glob=$d->{'dokli_dir'}.'/*_'.$time.'.txt'; my @files=glob($glob); ## dokli per report --> redol my $redol_file=$files[0]; $d->set_dokli_for_model($redol_file); if(not $d->{'dokli_file'}) { confess "I still don't have a dokli_file"; } return $d->{'dokli_file'}; } ###sub model_report { ### my $d=shift; ### my $repcode=shift or confess 'I need a repcode for build_model_file'; ### ## do an extra check to prnit terlis separate ### my $type_treli_separate=1; ### if(not $d->{'e'}->can_the_report_be_modelled($repcode)) { ### $d->{'e'}->echo(__LINE__,"I can't model the report $repcode because I have no accepted papers.",1); ### return; ### } ### $d->set_mocla_file_and_dates($repcode); ### my $train_file=$d->{'train_file'}->{$repcode} or confess "I need a train file here"; ### $d->{'e'}->echo(__LINE__,"I have set the train_file to $train_file"); ### my $model_file=$d->{'model_file'}->{$repcode} or confess "I need a model file here"; ### $d->{'e'}->echo(__LINE__,"I have set the model_file to $model_file"); ### if(-f $model_file) { ### $d->{'e'}->echo(__LINE__,"I found the model_file $model_file."); ### ## find the training file that the model is based on ### if(not -f $train_file and -f $model_file) { ### confess "A model file $model_file exists without a training file $train_file."; ### } ### if(-f $train_file) { ### $d->{'e'}->echo(__LINE__,"I found the train_file is $train_file."); ### if(-M $train_file < -M $model_file) { ### confess "The train file $train_file is more recent than the model_file $model_file."; ### } ### my $sent_dir=$d->{'e'}->{'report'}->{$repcode}->{'dir'}->{'sent'} or die; ### if(not &Ernad::Common::does_file_need_renewal($train_file, $sent_dir)) { ### $d->{'e'}->echo(__LINE__,"I found $model_file. 
It is up to date, ending build_model."); ### return 1; ### } ### } ### } ### $d->{'e'}->echo(__LINE__,"I need to update the model_file $model_file."); ### my $dokli_file=$d->{'dokli_file'}; ### if(not -f $dokli_file) { ### $d->{'e'}->echo(__LINE__,"I am building dokli $dokli_file."); ### $d->build_dokli('add'); ### } ### if(not -f $dokli_file) { ### confess "buliding the dokli appears to have failed"; ### } ### my $lock_file=$d->{'lock_file'}->{$repcode} or confess "I need a lock file here"; ### $d->{'e'}->echo(__LINE__,"lock_file is $lock_file"); ### if(-f $lock_file) { ### ## remove the lock after more than one day ### my $lock_days_old=-M $lock_file; ### if($lock_days_old > 1) { ### $d->{'e'}->echo(__LINE__,"I delete an $lock_days_old lock file $lock_file",1); ### unlink $lock_file; ### } ### else { ### $d->{'e'}->echo(__LINE__,"I find $lock_file, I don't build $model_file",1); ### return 0 ; ### } ### } ### my $trained=$d->build_train_file($repcode,$d->{'dokli_file'}); ### if(not $trained) { ### $d->{'e'}->echo(__LINE__,"I could not build the train file.",1); ### return 0; ### } ### if(not &Ernad::Common::does_file_need_renewal($model_file, $train_file)) { ### $d->{'e'}->echo(__LINE__,"I skip the renewal of $model_file, it's newer than the train file $train_file.",1); ### unlink $lock_file; ### return $train_file; ### } ### ## only for reporting ### my $out_file=$d->{'out_file'}->{$repcode} or confess "I need a out file here"; ### my $err_file=$d->{'err_file'}->{$repcode} or confess "I need a err file here"; ### ## -b 1 is required to get probablities ### my $flags="-b 1" . ' '. 
$d->{'svm_train_flags'}; ### my $s="svm-train $flags $train_file $model_file > $out_file 2> $err_file"; ### my $start=time; ### $d->{'e'}->echo(__LINE__,"command is $s",1); ### system("touch $lock_file"); ### system($s); ### my $end=time; ### unlink $lock_file; ### my $run_time=$end-$start; ### $d->{'e'}->echo(__LINE__,"done in $run_time",1); ### delete $d->{'train_file'}->{$repcode}; ### delete $d->{'isink_files'}->{$repcode}; ### return $train_file; ### } ### ### ## only called by build_model, but in an else, so ### ## let's put it in a separate method. ### sub build_train_file { ### my $d=shift; ### my $repcode=shift ### or confess "I need a repcode here."; ### my $train_file=$d->{'train_file'}->{$repcode} or confess "I need a training file here"; ### $d->{'e'}->echo(__LINE__,"I am pondering the renewal of train_file is $train_file"); ### if(not defined($d->{'i'})) { ### $d->{'i'}=Ernad::Isink->new({'impna' => $d->{'impna'}, ### {'f' => $f}, ### {'e' => $d->{'e'}}}); ### } ### my $i=$d->{'i'} or confess "I need an isink object here."; ### ## traing can be long-run, so we always list the isinks rather than ### ## relying on an earlier listing. ### $i->list_isink_files($repcode); ### my $isink_files=$i->{'isink_files'}; ### if(not defined($isink_files)) { ### $d->{'e'}->echo(__LINE__,"I can't find isink_file for $repcode. 
I can't build the train file without them."); ### return 0; ### } ### my $iskin_dir=$d->{'isink_dir'}.'/'.$repcode; ### if(not &Ernad::Common::does_file_need_renewal($train_file,$iskin_dir,$d->{'dokli_file'})) { ### $d->{'e'}->echo(__LINE__,"I skip the renewal of $train_file, it needs no renewal.",1); ### $d->{'train_file'}->{$repcode}=$train_file; ### return 1; ### } ### if(not defined($d->{'train_dates'}->{$repcode})) { ### confess "I need to have train_dates defined here"; ### } ### &Ernad::Files::prepare_for_file($train_file); ### my $train_fh; ### open($train_fh,"> $train_file") or confess "I could not open $train_file."; ### foreach my $issuedate (sort keys %{$i->{'isink_files'}->{$repcode}}) { ### my $is_still_in_vedex=$d->make_trelis($repcode,$issuedate,$train_fh); ### if($is_still_in_vedex) { ### close $train_fh; ### unlink $train_file; ### confess "Some handles are still in the vedex: $is_still_in_vedex."; ### } ### } ### close $train_fh; ### delete $d->{'train_dates'}->{$repcode}; ### #$d->{'train_file'}->{$repcode}=$train_file; ### $d->{'e'}->echo(__LINE__,"I have written $train_file"); ### } ### ### sub make_trelis { ### my $d=shift; ### my $repcode=shift // confess; ### my $issuedate=shift // confess; ### my $train_fh=shift; ### my $i=$d->{'i'}; ### if(not defined($d->{'train_dates'}->{$repcode}->{$issuedate})) { ### $d->{'e'}->echo(__LINE__,"I skip date $issuedate. It does not go into the train_file for $repcode",1); ### next; ### } ### if(not defined($d->{'issuedates'}->{$issuedate})) { ### $d->{'e'}->echo(__LINE__,"I skip date $issuedate. 
###     It does has not valid allport date.",1);
###     next;
###   }
###   $d->{'e'}->echo(__LINE__,"getting vemlis for $repcode issuedate $issuedate",1);
###   my $vemlis=$d->get_vemlis_by_date($issuedate) // '';
###   if(not $vemlis) {
###     # # $d->{'e'}->echo(__LINE__,"WARN: no vemlis for $issuedate for report $repcode",0);
###     confess " no vemlis for $issuedate for report $repcode";
###     # # next;
###   }
###   my $isink_file=$i->{'isink_files'}->{$repcode}->{$issuedate};
###   if(not -f $isink_file) {
###     confess "I can't see the isink_file $isink_file.";
###   }
###   my $isink_text=&File::Slurper::read_text($isink_file);
###   my $vedex=$d->index_vemlis($vemlis);
###   foreach my $isili (split("\n",$isink_text)) {
###     #print "$isili\n";
###     my $treli=$d->make_treli($isili,$vedex);
###     print $train_fh $treli;
###   }
###   ## check we have nothing in the vedex left
###   my $is_still_in_vedex=join(' ', keys %{$vedex});
###   return $is_still_in_vedex;
###   #print Dumper $vedex;
###   #die;
###   #print Dumper $vedex;
###   #print $isink_text;
###   #die;
###   #my $trelis=$d->get_treli_by_date($vemlis,$isink_text,$issuedate);
###   # some debugging code.
###   #if($type_treli_separate) {
###   #  my $treli_file="/tmp/$issuedate.treli";
###   #  open(TEST,"> $treli_file");
###   #  print TEST $trelis or confess "I could not print";
###   #  close TEST;
###   #  $d->{'e'}->echo(__LINE__,"I printed trelis to $treli_file");
###   #}
###   #print F $trelis or confess "I could not print";
###   #$d->{'e'}->echo(__LINE__,"I printed trelis for $issuedate to $train_file");
### }
###
### sub make_treli {
###   my $d=shift;
###   my $isili=shift;
###   my $vedex=shift;
###   $isili=~m|^([-+]1)\s+#\s+(\S+)\s*$| or confess "bad isili $isili";
###   my $indic=$1;
###   my $handle=$2;
###   my $vem=$vedex->{$handle};
###   if(not defined($vem)) {
###     confess "I don't have a vedex entry for $handle.";
###   }
###   my $treli="$indic $vem # " . uri_escape($handle) .
###     "\n";
###   delete $vedex->{$handle};
###   return $treli;
### }
###
###
### sub index_vemlis {
###   my $d=shift;
###   my $vemlis=shift;
###   my $vedex={};
###   foreach my $vemli (split("\n",$vemlis)) {
###     $vemli=~m|^([^#]+)\s+#\s+(\S+)\s*$|
###       or confess "bad vemli $vemli";
###     my $vem=$1;
###     my $handle=uri_unescape($2);
###     $vedex->{$handle}=$vem;
###   }
###   return $vedex;
### }
### sub model_all_reports {
###   my $d=shift;
###   my $o_k=shift;
###   if(($o_k ne 'overwrite') and ($o_k ne 'keep')) {
###     confess 'o_k has to be overwrite or keep';
###   }
###   my $e=$d->{'e'};
###   my @reports=$e->get_cural_repcodes() or confess "I can't get to the reports";
###   $d->find_isinker_file();
###   foreach my $repcode (shuffle @reports) {
###     $d->{'e'}->echo(__LINE__,"I model $repcode, building isisnk",1);
###     $d->build_isink_files_for_report($repcode);
###     $d->{'e'}->echo(__LINE__,"I am done with the isink for $repcode",1);
###     $d->model_report($repcode);
###   }
### }
###

## Print the sema of every vemli line of $file to STDOUT.
##
## Arguments: the file, and a mode that is handed on to sema().
## mode parameter may contain line to print sema as a line. If the mode
## contains 'positive', lines that start with '-1' are left out.
## Lines may be training lines, starting with +1 or -1, or dokli lines,
## starting with a date. The latter get a 0 indicator and the date is
## moved behind the '# '. Lisig lines are skipped.
## Confesses on a missing file and on a line of neither sort.
sub type_vemli_file {
  my $d=shift;
  my $file=shift // confess "I need a file here";
  my $mode=shift // 'full';
  if(not -f $file) {
    confess "No such file $file";
  }
  ## This should not be here. The time should be an argument.
  my $time=&Ernad::Common::get_time_from_file_name($file);
  open(my $vemli_fh,'<',$file)
    or confess "I can not open the file $file";
  $d->retrieve_fitrank($time);
  $d->create_semaf($time);
  while(my $line=<$vemli_fh>) {
    chomp $line;
    ## one test for all sorts of lisig lines, rather than copies of
    ## some of the expressions
    if($d->is_this_a_lisig_line($line)) {
      next;
    }
    ## change for lines in the dokli, move date to the rear
    $line=~s|^(\d{4}-\d{2}-\d{2} )([^#]+# )|0 $2$1|;
    if(not ($line=~m|^([-+])1| or $line=~m|^(0) |)) {
      confess "bad line $line";
    }
    my $indic=$1;
    if($indic eq '-' and $mode=~m|positive|) {
      next;
    }
    print $d->sema($line,$mode);
  }
  close $vemli_fh;
  return;
}

### sub check_train_file {
###   my $d=shift;
###   my $in_file=shift;
###   open(F,"< $in_file") or cluck "I can't open $in_file";
###   my $line;
###   my $t;
###   my $old_date;
###   my $count=0;
###   while($line=<F>) {
###     chomp $line;
###     if(not $line=~m|([-+]1)[^#]+# (\d{4}-\d{2}-\d{2})\s(\S+)|) {
###       print "bad line $line\n";
###       next;
###     }
###     my $stat=$1;
###     my $date=$2;
###     my $escaped_handle=$3;
###     my $handle=uri_unescape($escaped_handle);
###     $t->{$count}->{'stat'}=$stat;
###     $t->{$count}->{'date'}=$date;
###     $t->{$count}->{'escaped_handle'}=$escaped_handle;
###     $t->{$count}->{'handle'}=$handle;
###     if(defined($t->{$handle})) {
###       croak "handle $handle has appeared twice";
###     }
###     $t->{$handle}=1;
###     $t->{$date}++;
###     ## check for reappearance of dates
###     ## first line
###     $count++;
###     if(not defined($old_date)) {
###       $old_date=$date;
###       $t->{'old_dates'}->{$date}=1;
###       next;
###     }
###     if($date eq $old_date) {
###       next;
###     }
###     if(defined($t->{'old_dates'}->{$date})) {
###       croak "date $date has come back";
###     }
###   }
### }
###
### sub type_test_file {
###   my $d=shift;
###   my $repcode=shift;
###   if(not $repcode) {
###     confess "I need a repcode to find its test file.";
###   }
###   my $issuedate=shift // '';
###   if(not $issuedate) {
###     confess "I need an issuedate to find its test file.";
###   }
###
###   my $test_file=$d->build_test_file($repcode,$issuedate);
###   $d->type_vemli_file($test_file);
### }
###
### sub type_train_file
### {
###   my $d=shift;
###   my $repcode=shift;
###   if(not $repcode) {
###     confess "I need a repcode to find its test file.";
###   }
###   ## returns the train file
###   my $train_file=$d->model_report($repcode);
###   $d->type_vemli_file($train_file);
### }

## it's a primitive check whether something is there
## for the timed data file tidaf
## does only do dokli check
##
## Returns the number of dokli files in the dokli_dir that carry the
## 10-digit $time. Returns 0 if there is none. In that case, unless a
## dokli of that time has been moved to .txt.gone, all files with that
## time are deleted from the learn_dir.
sub check_if_tidafs_exists {
  my $d=shift;
  my $time=shift or confess 'I need a time here';
  if(not $time=~m|^\d{10}$|) {
    confess "You don't give me a valid time: $time";
  }
  my $glob=$d->{'dokli_dir'}.'/*'.'_'.$time.'.txt';
  my @files=glob($glob);
  my $count_files=scalar(@files);
  if($count_files) {
    return $count_files;
  }
  $d->{'e'}->echo(__LINE__,"WARN: I don't have the dokli for time $time");
  ### fixme and test
  ## check if we have moved it to .gone, as on 2017-04-23
  $glob=$d->{'dokli_dir'}.'/*'.'_'.$time.'.txt.gone';
  @files=glob($glob);
  $count_files=scalar(@files);
  if($count_files) {
    return 0;
  }
  $d->{'e'}->echo(__LINE__,"I delete all files with time $time");
  $d->delete_tidafs_with_time($time);
  #$dokli_file=$d->{'dokli_file'};
  #$d->{'e'}->echo(__LINE__,"I build a new dokli");
  #$d->build_dokli('add');
  ## the only time where we force report training
  #$d->model_report($repcode);
  return 0;
}

## Delete the files below $d->{'learn_dir'} that carry the 10-digit $time
## in their names, using /usr/bin/find.
## Confesses on a malformed time or a missing learn_dir.
sub delete_tidafs_with_time {
  my $d=shift;
  my $time=shift or confess "I need a time here";
  if(not $time=~m|^\d{10}$|) {
    confess "I can't deal with the time '$time'";
  }
  my $learn_dir=$d->{'learn_dir'} or confess "I need a learn_dir here";
  if(not -d $learn_dir) {
    confess "no such directory: $learn_dir";
  }
  ## The second pattern is for fitar files.
  ## NOTE(review): find -name takes a glob, not a regular expression. The
  ## second pattern only matches names that have '_.' right after the
  ## time. Names like <time>_frex.pl are not matched. Confirm what is
  ## wanted before widening a pattern that deletes files.
  foreach my $pattern ('*'.$time.'.*','*'.$time.'_.*') {
    ## list form, so no shell sees the directory name
    my @command=('/usr/bin/find',$learn_dir,'-name',$pattern,
                 '-exec','rm','{}',';');
    $d->{'e'}->echo(__LINE__,'running '.join(' ',@command));
    if(system(@command) != 0) {
      $d->{'e'}->echo(__LINE__,"WARN: find ended with status $?");
    }
  }
  return;
}

## dokli signature
##
## Return the lisig of a dokli file, that is the run of lisig lines at
## the top of the file, as one string. The string is empty if the file
## does not start with a lisig line.
sub get_lisig_from_dokli {
  my $d=shift;
  my $file=shift // confess "I need a file here";
  if(not -f $file) {
    confess "no such file $file";
  }
  my $lisig='';
  open(my $dokli_fh,'<',$file)
    or confess "I can not open the file $file";
  while(my $line=<$dokli_fh>) {
    my $lisig_line=$d->is_this_a_lisig_line($line);
    if(not $lisig_line) {
      last;
    }
    $lisig.=$lisig_line;
  }
  close $dokli_fh;
  return $lisig;
}

## Return the line $in itself if it is a lisig line, else ''.
## A lisig line is '# ', a date, an underscore and either six characters
## out of 0-9 and a-z, or ten digits, or another date. Whitespace may
## trail.
sub is_this_a_lisig_line {
  my $d=shift;
  my $in=shift;
  ## Xmas 2017
  if($in=~m|^# \d{4}-\d{2}-\d{2}_[0-9a-z]{6}\s*$|) {
    return $in;
  }
  if($in=~m|^# \d{4}-\d{2}-\d{2}_\d{10}\s*$|) {
    return $in;
  }
  ## namf case
  if($in=~m|^# \d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}\s*$|) {
    return $in;
  }
  return '';
}

# sub set_lisig {
#   my $d=shift;
#   my $e=$d->{'e'};
#   my $includes=$d->{'issuedates'}
#     or confess "I need \$d->{issuedates} set";
#   my $ext=$d->{'e'}->{'const'}->{'amf_ext'}
#     or confess "no amf extension";
#   my $lisig='';
#   my $rerc;
#   my $sent_dir;
#   my $separate_doklis=$e->{'conf'}->{'separate_doklis'} // '';
#   if($separate_doklis) {
#     if(not defined($e->{'repcode'})) {
#       confess "I need a repcode here.";
#     }
#     $rerc=$e->{'report'}->{$e->{'repcode'}} // confess "I need a rerc here.";
#     $sent_dir=$rerc->{'dir'}->{'sent'} // confess "I need a sent_dir here.";
#   }
#   foreach my $date (sort keys %{$includes}) {
#     my @dates=sort @{$includes->{$date}};
#     ## for ordinary issues
#     if($separate_doklis and not &Ernad::Common::is_issue_in_dir($sent_dir,$date)) {
#       $e->echo(__LINE__,"$date is not issued yet, it's not going into the dokli.");
#       next;
#     }
#     my $last_date= $dates[$#dates];
#     my $put_on_lisig=$last_date;
#     if(not &Ernad::Common::is_rif_name($last_date)) {
#       $e->echo(__LINE__,"I skip $last_date");
#       next;
#     }
#     $put_on_lisig=~s|\Q$ext.gz\E$||;
#     $put_on_lisig=~s|\Q$ext\E$||;
#     $lisig.='# '.$put_on_lisig."\n";
#   }
#   $d->{'lisig'}=$lisig;
#   $d->{'lisig_length'}=length($lisig);
#   die $lisig;
# }

## Private helper of set_lisig and set_lisig_separating.
## For each of the @issuedates, in the order given, take the last of the
## sorted names in $includes->{$issuedate}. If that is a rif name, put it
## on the lisig as a '# ' line, without the amf extension $ext.
## Sets $d->{'lisig'} and $d->{'lisig_length'}.
sub _compose_lisig {
  my $d=shift;
  my $includes=shift;
  my $ext=shift;
  my @issuedates=@_;
  my $e=$d->{'e'};
  my $lisig='';
  foreach my $date (@issuedates) {
    my @dates=sort @{$includes->{$date}};
    my $last_date= $dates[$#dates];
    my $put_on_lisig=$last_date;
    if(not &Ernad::Common::is_rif_name($last_date)) {
      next;
    }
    else {
      $e->echo(__LINE__,"$last_date is a rif name.");
    }
    $put_on_lisig=~s|\Q$ext.gz\E$||;
    $put_on_lisig=~s|\Q$ext\E$||;
    $lisig.='# '.$put_on_lisig."\n";
  }
  $d->{'lisig'}=$lisig;
  $e->echo(__LINE__,"The lisig is $lisig");
  $d->{'lisig_length'}=length($lisig);
  return;
}

## Set the lisig when every report has its own dokli. Only the learnable
## issuedates of the current report go on the lisig. They are found with
## set_report_past_issuedates if the report does not have them yet.
## Confesses without a repcode, a known report, $d->{'issuedates'}, the
## amf extension or learnable issuedates.
sub set_lisig_separating {
  my $d=shift;
  my $e=$d->{'e'};
  my $repcode=$e->{'repcode'} // confess "I need a repcode here.";
  my $rerc=$e->{'report'}->{$repcode} // confess "I don't know the report $repcode.";
  my $includes=$d->{'issuedates'}
    or confess "I need \$d->{issuedates} set";
  my $ext=$d->{'e'}->{'const'}->{'amf_ext'}
    or confess "no amf extension";
  if(not $rerc->{'learnable_issuedates'}) {
    $rerc->{'learnable_issuedates'}=&set_report_past_issuedates($e->{'repcode'});
  }
  if(not $rerc->{'learnable_issuedates'}) {
    confess "I need to have learnable_issuedates configured here.";
  }
  $d->_compose_lisig($includes,$ext,sort keys %{$rerc->{'learnable_issuedates'}});
  return;
}

## Set the lisig, the signature of the issues that go into the dokli, in
## $d->{'lisig'}, and its length in $d->{'lisig_length'}. With the
## separate_doklis configuration this is left to set_lisig_separating.
## Confesses without $d->{'issuedates'} or the amf extension.
sub set_lisig {
  my $d=shift;
  my $e=$d->{'e'};
  if($e->{'conf'}->{'separate_doklis'}) {
    return $d->set_lisig_separating();
  }
  my $includes=$d->{'issuedates'}
    or confess "I need \$d->{issuedates} set";
  my $ext=$d->{'e'}->{'const'}->{'amf_ext'}
    or confess "no amf extension";
  $d->_compose_lisig($includes,$ext,sort keys %{$includes});
  return;
}

## Look in $d->{'lisig_dir'} for a file <time>.txt that holds the current
## lisig. If there is one, set $d->{'lisig_time'} to its time. If not,
## write the lisig to a file named after $d->{'time'}.
## Confesses without a lisig_dir or a time.
sub find_or_write_lisig {
  my $d=shift;
  if(not defined($d->{'lisig'})) {
    $d->set_lisig();
  }
  my $lisig_dir=$d->{'lisig_dir'} or confess "I need a lisig_dir here";
  if(not -d $lisig_dir) {
    ## Carry on after making the directory. Leaving here meant that the
    ## lisig of the very first run was never written, so the next run
    ## could not find it and took a new time.
    mkpath($lisig_dir);
  }
  my $lisig_set=$d->{'lisig'};
  $d->{'e'}->echo(__LINE__,"I am looking for a lisig $lisig_set in $lisig_dir.",1);
  opendir(my $lisig_dh,$lisig_dir)
    or confess "I can't open the lisig_dir $lisig_dir.";
  while(my $lisig_file=readdir $lisig_dh) {
    if(not $lisig_file=~m|^(\d+)\.txt$|) {
      $d->{'e'}->echo(__LINE__,"I'm skipping file $lisig_file when looking for lisig files.",10);
      next;
    }
    my $lisig_time=$1;
    my $lisig_fufi="$lisig_dir/$lisig_file";
    my $lisig_in_file=&File::Slurper::read_text($lisig_fufi);
    if($lisig_in_file eq $d->{'lisig'}) {
      $d->{'lisig_time'}=$lisig_time;
      $d->{'e'}->echo(__LINE__,"I found the current lisig at $lisig_time.",1);
      closedir $lisig_dh;
      return;
    }
  }
  closedir $lisig_dh;
  my $time=$d->{'time'} // confess "I need a time set here";
  my $lisig_file="$lisig_dir/$time.txt";
  $d->{'e'}->echo(__LINE__,"I write the lisig file $lisig_file.");
  &File::Slurper::write_text($lisig_file,$d->{'lisig'},'utf8');
  return;
}

## this does not build the dokli
##
## Go through the files of $d->{'all_dokli_glob'} to find the dokli whose
## lisig is the current one. If found, set $d->{'dokli_file'} and
## $d->{'dokli_time'}, load its dodex and return the file. Files without
## any lisig are deleted. If none is found and no dokli_file was set
## before, set the dokli_file to the name a new dokli would get, and
## return ''. If a dokli_file was set before, return 1.
sub set_dokli {
  my $d=shift;
  my $dokli_glob=$d->{'all_dokli_glob'} or confess "I need a dokli_glob here";
  if(not $d->{'lisig'}) {
    $d->set_lisig();
  }
  ## read the lisig after the attempt to set it, not before
  my $set_lisig=$d->{'lisig'} // '';
  if(not $set_lisig) {
    confess "I need a lisig here.";
  }
  foreach my $file (glob($dokli_glob)) {
    my $lisig=$d->get_lisig_from_dokli($file);
    if(not $lisig) {
      ## the file is gone, there is nothing more to look at
      unlink $file;
      next;
    }
    my $bana=basename($file) or next;
    if(not $bana=~m|^\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_\d+\.txt$|) {
      $d->{'e'}->echo(__LINE__,"I skip the strange dokli_file name $bana");
      next;
    }
    if($lisig eq $set_lisig) {
      $d->{'dokli_file'}=$file;
      $d->{'e'}->echo(__LINE__,"I have the dokli_file $file");
      $d->{'dokli_time'}=&Ernad::Common::get_time_from_file_name($file);
      $d->get_dodex();
      return $file;
    }
    else {
      $d->{'e'}->echo(__LINE__,"lisig on $bana does not match");
    }
  }
  my $lisig=$d->{'lisig'} or confess "I need a lisig here.";
  if(not defined($d->{'dokli_file'})) {
    $d->{'e'}->echo(__LINE__,"The lisig is $lisig.");
    $d->{'e'}->echo(__LINE__,"I found no dokli that corresponds to our lisig.");
    my $dokli_file=$d->{'dokli_dir'}.'/'.$d->{'file_version'}.'.txt';
    $d->{'dokli_file'}=$dokli_file;
    $d->{'dokli_time'}=$d->{'time'};
    $d->{'e'}->echo(__LINE__,"I set the dokli_file to $dokli_file.");
    if(not -f $dokli_file) {
      $d->{'e'}->echo(__LINE__,"The dokli_file $dokli_file is not there but that's fine.");
    }
    $d->{'dokli_time'}=&Ernad::Common::get_time_from_file_name($dokli_file,
                                                               'allow_file_not_there');
    $d->{'current_dokli'}='';
  }
  else {
    $d->{'e'}->echo(__LINE__,"I found a dokli that corresponds to our lisig.");
    if(-f $d->{'dokli_file'}) {
      $d->{'current_dokli'}=$d->{'dokli_file'};
    }
    return 1;
  }
  if(not defined($d->{'dokli_glob'})) {
    $d->{'dokli_glob'}=$d->{'dokli_dir'}.'/'.$d->{'version'}.'_*.txt';
  }
  $d->{'e'}->echo(__LINE__,"I did not find a dokli");
  return '';
}

## this does not build the dokli
##
## Set $d->{'dokli_file'} and $d->{'dokli_time'} to the dokli that has
## the same time in its name as $model_file. All files of
## $d->{'all_dokli_glob'} are looked at, so the last match wins.
## $d->{'dokli_glob'} is set to undef on a match.
## Confesses on a missing model_file or glob.
sub set_dokli_for_model {
  my $d=shift;
  my $model_file=shift or confess "I need a model_file here";
  if(not -f $model_file) {
    confess "I can't see the model_file $model_file";
  }
  my $model_time=&Ernad::Common::get_time_from_file_name($model_file);
  my $dokli_glob=$d->{'all_dokli_glob'} or confess "I need a dokli_glob here";
  foreach my $file (glob($dokli_glob)) {
    my $bana=basename($file) or next;
    if(not $bana=~m|^\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_(\d+)\.txt$|) {
      $d->{'e'}->echo(__LINE__,"I skip the strange dokli_file name $bana");
      next;
    }
    my $time_on_dokli_file=$1 or confess "I can't find the dokli time on $bana";
    if(not $time_on_dokli_file == $model_time) {
      next;
    }
    my $dokli_file=$d->{'dokli_dir'}.'/'.$bana;
    $d->{'dokli_file'}=$dokli_file;
    $d->{'dokli_time'}=$model_time;
    ## fixme, this is lazy
    undef $d->{'dokli_glob'};
  }
}

##
## deal with sidin data. This is done by Presort.pm
#   ## lisig for the sidin.
#   We include it as long as the file exits
#   if($e->{'seedable'}) {
#     print "lisig is $lisig\n";
#     &Ernad::Seed::setup();
#     &Ernad::Seed::setup_for_report($e->{'repcode'});
#     ## date the sidin at the time we got it
#     my $sidin_xml_file=$rerc->{'file'}->{'sidin_xml'};
#     my $sidin_amf_file=$rerc->{'file'}->{'sidin_amf'};
#     if($sidin_amf_file) {
#       my $sidin_date=&Ernad::Dates::mdate($sidin_xml_file);
#       ## fixme: this does not check for a date clash
#       $e->echo(__LINE__,"I am building an exfit for the seed date $sidin_date");
#       $d->build_exfit('term',$sidin_date,$sidin_amf_file);
#     }
#     ## If the erimp is seedable, add the seeds if they exist.
#     ##
#     if($e->{'seedable'}) {
#       &Ernad::Seed::setup();
#       &Ernad::Seed::setup_for_report($e->{'repcode'});
#       ## date the sidin at the time we got it
#       my $sidin_xml_file=$rerc->{'file'}->{'sidin_xml'};
#       my $sidin_amf_file=$rerc->{'file'}->{'sidin_amf'};
#       if($sidin_amf_file) {
#         my $sidin_date=&Ernad::Dates::mdate($sidin_xml_file);
#         ## fixme: this does not check for a date clash
#         $e->echo(__LINE__,"I am building an exfit for the seed date $sidin_date");
#         $d->build_exfit('term',$sidin_date,$sidin_amf_file);
#       }
#     }
#
#
#   }
#
#

## unclear whether this is all issuedates ever on only those by report.
## take allport_sent interpretation
## not used for isinks
##
## Find the issuedates that the report $repcode can learn from. This is
## a plain function, not a method. It works on the erimp in $main::e.
##
## Argument: the repcode, defaults to $e->{'repcode'}.
## Returns a hash reference, keyed by issuedate, with the values that
## Ernad::Common::find_hash_of_dates_in_subdir gives for the source
## directory of the report. Returns 0 or nothing if there is nothing to
## learn from.
## With the separate_doklis configuration only dates that are found in
## the sent directory are kept, and the result is only returned. Without
## it, the result is also stored in the report as 'learnable_issuedates',
## and 'learn_version' is set to <first date>_<last date>.
sub set_report_past_issuedates {
  my $e=$main::e or confess "I need an Erimp here";
  my $repcode=shift;
  if(not $repcode) {
    $repcode=$e->{'repcode'};
  }
  if(not $repcode) {
    confess "I need a repcode here.";
  }
  ## the second and third argument are completely optional in the call.
  ## they may be set manually at invocation
  my $rerc=$e->{'report'}->{$repcode} // confess "I don't know the report $repcode.";
  my $source_dir=$rerc->{'dir'}->{'source'} // confess "I need a source_dir here";
  my $sent_dir=$rerc->{'dir'}->{'sent'} // confess "I need a sent_dir here";
  if(not $source_dir) {
    $e->echo(__LINE__,"I don't see the $source_dir");
    return 0;
  }
  ### if the source dir is the allport_sent dir
  my $dates=&Ernad::Common::find_hash_of_dates_in_subdir($source_dir);
  if(not defined($dates)) {
    $e->echo(__LINE__,"I seem to have no issuedates to learn from.");
    return;
  }
  my $ext=&Ernad::Common::find_extreme_issuedates($dates);
  ## the two messages used to be the wrong way round
  my $max=$ext->{'max'} or confess "I can't see the maximum";
  my $min=$ext->{'min'} or confess "I can't see the minimum";
  ## NOTE(review): train_limit_by_days has to be configured, but it is
  ## not used below. The limit applied is ernad_limit_by_days. Confirm
  ## which one of the two is meant.
  my $train_limit=$e->{'conf'}->{'train_limit_by_days'}
    // confess "I don't see the train_limit_by_days in ernad.conf.";
  my $elbyd=$e->{'conf'}->{'ernad_limit_by_days'};
  ## implement a system-wide limit, but saves all the issuedate into
  ## all_issuedates. This is may be used for frex parsing
  my $learnable_issuedates;
  foreach my $date (keys %$dates) {
    if($elbyd and &Ernad::Dates::diff_dates($date,$max) > $elbyd) {
      $e->echo(__LINE__,"I exclude $date from learnable_issudates.",2);
      next;
    }
    if($e->{'conf'}->{'separate_doklis'}) {
      if(not &Ernad::Common::is_issue_in_dir($sent_dir,$date)) {
        $e->echo(__LINE__,"$date is not issued yet, it's not going into the learnable issue.");
        next;
      }
    }
    $learnable_issuedates->{$date}=$dates->{$date};
  }
  ## if separate doklis, now done
  if($e->{'conf'}->{'separate_doklis'}) {
    return $learnable_issuedates;
  }
  ##
  if(not scalar(keys %$learnable_issuedates)) {
    $e->echo(__LINE__,"I don't have any learnable issuedates to work with.");
    return 0;
  }
  $rerc->{'learnable_issuedates'}=$learnable_issuedates;
  ## find new start and end
  $ext=&Ernad::Common::find_extreme_issuedates($learnable_issuedates);
  $max=$ext->{'max'} or confess "no max";
  $min=$ext->{'min'} or confess "no min";
  $rerc->{'learn_version'}=$min.'_'.$max;
  ## this should be cleared later
  #$d->{'issuedates'}=$learnable_issuedates;
  return $learnable_issuedates;
}

1;