package Ernad::Presort::Fidek;

## Builds "fidek" files: per-issue JSON files that map each paper id
## (papid) to the fits found in its exfit lines, with their counts.
##
## The object is a hash. Keys used in this file:
##   e      the erimp object (configuration, is_futile)
##   f      Ernad::Presort::Exfit, source of the papers
##   r      Ernad::Presort::Ranfi
##   m      Ernad::Presort::Model
##   t      Ernad::Presort::State
##   a      Ernad::Presort::Fidek::Frex, only when conf 'frast' is set
##   x      Ernad::Presort::Fidek::Frex scanner
##   w      Ernad::Presort::Fidek::Term scanner
##   ranid  the ranid the class fideks are built for
##   rank   the data loaded from the ranfi for that ranid
##   fits   the fits collected while a file is being built
##
## The motto, read through $k->motto(), is one of 'train', 'class' or
## 'seed' as far as this file is concerned.

use strict;
use warnings;

use base ('Ernad::Presorter');

use Carp qw(confess);
use Data::Dumper;
use File::Basename;

use Ernad::Presort::Dates;
use Ernad::Presort::Exfit;
use Ernad::Presort::Fidek::Term;
use Ernad::Presort::Fidek::Frex;
use Ernad::Presort::Model;
use Ernad::Presort::Ranfi;
use Ernad::Presort::State;

## main function
##
## Builds all fideks of a report that need an update.
##
##   $repcode  the report code, required
##
## For the 'train' motto the fidek directory is cleared first. For the
## 'class' motto a ranid common to model and ranfi is needed; when
## there is none, a model is learnt once and the search is repeated.
##
## Returns 0 when the 'class' motto finds no ranid. Otherwise it
## returns whatever the final clear_dir() returns.
sub update_for_report {
  my $k=shift;
  my $repcode=shift // confess "I need a repcode here.";
  ## this creates the helper objects, including the exfit in 'f'
  $k->setup();
  my $motto=$k->motto();
  ## extra to restrict in destination
  my $extra;
  if($motto eq 'train') {
    ## the local version similar to the general clear
    $k->clear_dir();
    $extra='_train';
  }
  if($motto eq 'class') {
    ## here the fidek has to find an agreement between model and ranfi
    my $ranid=$k->concord_ranid();
    ## make a second attempt
    if(not $ranid) {
      ## 'use' runs at compile time, regardless of this branch
      use Ernad::Presort::Learn;
      ## it's a subroutine, not an object
      Ernad::Presort::Learn::model($repcode,$k->{'e'});
      $ranid=$k->concord_ranid();
    }
    if(not $ranid) {
      $k->echo(__LINE__,"I have no model for $repcode.");
      return 0;
    }
    $k->{'ranid'}=$ranid;
    $k->{'rank'}=$k->{'r'}->load($ranid);
    ## set the extra
    $extra=$ranid;
  }
  my $f=$k->{'f'};
  $f->set_report($repcode);
  $k->set_report($repcode);
  my $fidek_dir=$k->set_dir('fidek');
  ## the return value is not needed, the call is kept as it was
  $f->set_dir('exfit');
  my $exfit_files=$f->show_to_update($fidek_dir,$motto,$extra);
  foreach my $exfit_file (sort keys %$exfit_files) {
    ## don't make class fideks when we have learnt the issue
    if($motto eq 'class') {
      my $date=Ernad::Presort::Dates::find_issuedate($exfit_file);
      if($k->{'e'}->is_futile($repcode,$date)) {
        $k->echo(__LINE__,"$date is futile for $repcode.");
        next;
      }
      $k->echo(__LINE__,"$date is not futile for $repcode.");
      ## but the above is not enough, we also need to check the state
      ## of the date, because we may have empty report issues that
      ## don't make a train file
      my $state=$k->{'t'}->find($date);
      if(($state eq 'done') or ($state eq 'prenatal')) {
        $k->echo(__LINE__,"I don't make a class fidek for $state date $date.");
        next;
      }
    }
    $k->build_file($exfit_file);
  }
  return $k->clear_dir();
}

## FixMe: should go somewhere else.
##
## Finds the concordance ranid, i.e. a ranid that is listed by both
## the model and the ranfi. Among the common ranids the one with the
## largest model 'shoti' (compared as a string) wins. Ranids are
## visited in sorted order, so that a tie always gives the same ranid.
##
## Stores the result in $k->{'ranid'} and returns it. Returns 0 when
## there is no model list, no ranfi list or no common ranid.
sub concord_ranid {
  my $k=shift;
  ## concord_ranid is callable externally
  $k->setup();
  my $model_list=$k->{'m'}->list_by('ranid');
  if(not $model_list) {
    $k->echo(__LINE__,'I have no model.');
    return 0;
  }
  my $ranfi_list=$k->{'r'}->list_by('ranid');
  if(not $ranfi_list) {
    $k->echo(__LINE__,'I have no ranfi.');
    return 0;
  }
  my $max_shoti='';
  my $best_ranid='';
  foreach my $ranid (sort keys %$model_list) {
    if(not $ranfi_list->{$ranid}) {
      next;
    }
    ## use the model time to find out how recent the ranid is; a
    ## missing shoti counts as the oldest one
    my $shoti=$model_list->{$ranid}->{'shoti'} // '';
    if(not $best_ranid or $shoti gt $max_shoti) {
      $max_shoti=$shoti;
      $best_ranid=$ranid;
    }
  }
  if(not $best_ranid) {
    $k->echo(__LINE__,"I can not concord on a ranid.");
    return 0;
  }
  $k->echo(__LINE__,"I concord on ranid $best_ranid");
  $k->{'ranid'}=$best_ranid;
  return $best_ranid;
}

## Creates the helper objects that are not there yet. All of them get
## the erimp in 'e'. The two scanners 'x' and 'w' also get the motto
## when they are created here.
sub setup {
  my $k=shift;
  my $e=$k->{'e'};
  if(not $k->{'f'}) {
    $k->{'f'}=Ernad::Presort::Exfit->new({'e'=>$e});
  }
  if(not $k->{'r'}) {
    $k->{'r'}=Ernad::Presort::Ranfi->new({'e'=>$e});
  }
  if(not $k->{'m'}) {
    $k->{'m'}=Ernad::Presort::Model->new({'e'=>$e});
  }
  if(not $k->{'t'}) {
    $k->{'t'}=Ernad::Presort::State->new({'e'=>$e});
  }
  if($k->{'e'}->{'conf'}->{'frast'} and not $k->{'a'}) {
    $k->{'a'}=Ernad::Presort::Fidek::Frex->new({'e'=>$e});
  }
  if(not $k->{'x'}) {
    $k->{'x'}=Ernad::Presort::Fidek::Frex->new({'e'=>$e});
    $k->{'x'}->motto($k->motto());
  }
  if(not $k->{'w'}) {
    $k->{'w'}=Ernad::Presort::Fidek::Term->new({'e'=>$e});
    $k->{'w'}->motto($k->motto());
  }
  return;
}

## Removes outdated files from the fidek directory.
##
## 1. When $k->{'ranid'} is set, files ending in '.<ranid>.json.gz'
##    with another ranid are deleted.
## 2. For every file with 'train' in its name, the files matching
##    '<date>*.*.json.gz' for the date of that train file are deleted.
##
## NOTE(review): build_file in this package names class fideks
## '<date>_<ranid>.json', with an underscore. Neither pattern above
## matches that name when '.gz' is simply appended. Please confirm
## against save_json_with_gz and the files that are really on disk.
sub clear_dir {
  my $k=shift;
  my $fidek_dir=$k->set_dir('fidek');
  my $set_ranid=$k->{'ranid'} // '';
  ## clear for ranids
  foreach my $file (glob("$fidek_dir/*")) {
    ## delete fideks referring to past ranids
    if($set_ranid and $file=~m|\.([a-z0-9]+)\.json\.gz$|) {
      my $ranid=$1;
      if($ranid ne $set_ranid) {
        $k->echo(__LINE__,"The current ranid is $set_ranid. I clear $file.");
        unlink($file)
          or $k->echo(__LINE__,"I could not clear $file: $!");
      }
    }
  }
  ## remove class files for which we have train files, i.e. past ones
  foreach my $train_file (glob("$fidek_dir/*train*")) {
    my $date=Ernad::Common::find_issuedate_from_file($train_file);
    ## without a date the glob below would match the class fideks of
    ## all dates, so such a train file is skipped
    if(not defined($date) or $date eq '') {
      $k->echo(__LINE__,"I find no date in $train_file, I clear nothing for it.");
      next;
    }
    foreach my $class_file (glob("$fidek_dir/$date*.*.json.gz")) {
      $k->echo(__LINE__,"I clear class fidek $class_file because I have $train_file.");
      unlink($class_file)
        or $k->echo(__LINE__,"I could not clear $class_file: $!");
    }
  }
}

## Builds one fidek file from one exfit file.
##
##   $in_file       the exfit file, required
##   $set_out_file  the file to write; required for the 'seed' motto
##                  and refused for any other motto
##   $restrict      optional hashref; when given, only papers whose
##                  papid is a true key in it are used
##
## The out file is '<fidek dir>/<date>_train.json' for 'train' and
## '<fidek dir>/<date>_<ranid>.json' for 'class'. The fits are handed
## to save_json_with_gz, but only when there are any.
##
## Returns 0 when nothing is built: no ranid for 'class', no renewal
## needed, or a train fidek exists under conf 'separate_doklis'.
## Returns the fits for 'seed' and the out file name otherwise. The
## out file name is returned even when no fits were found and nothing
## was saved.
##
## NOTE(review): for 'seed' the fits stay in $k->{'fits'} after the
## call, so a later call adds to them. Confirm that this is wanted.
## NOTE(review): an old comment spoke of passing a hashref of papids
## as $set_out_file. Nothing ever read it, so the dead local is gone.
sub build_file {
  my $k=shift;
  my $in_file=shift // confess "I need a file here.";
  my $set_out_file=shift // '';
  ## optional indicator to only do certain papids
  my $restrict=shift // '';
  my $e=$k->{'e'} // $main::e // confess "I need my erimp here.";
  my $motto=$k->motto();
  if($motto eq 'seed' and not $set_out_file) {
    confess "For the seed motto, I need an out_file set here.";
  }
  if($set_out_file and $motto ne 'seed') {
    confess "With a set_out_file, I need the motto to be 'seed'.";
  }
  if($restrict and not (ref($restrict) eq 'HASH')) {
    confess "If you give me a restriction, it has to be a hashref.";
  }
  ## only a file that was really given can be prepared; for train and
  ## class the name is empty here
  if($set_out_file and not -f $set_out_file) {
    Ernad::Common::prepare_for_file($set_out_file);
  }
  ## exfit
  if(not $k->{'f'}) {
    $k->{'f'}=Ernad::Presort::Exfit->new({'e'=>$k->{'e'}});
    ## FixMe: I should only use one of these lines
    $k->{'f'}->{'report'}=$k->{'report'};
    $k->{'f'}->{'repcode'}=$k->{'repcode'};
  }
  my $f=$k->{'f'} // confess "I need an exfit object here.";
  my $date=Ernad::Presort::Dates::find_issuedate($in_file);
  my $out_dir=$k->set_dir('fidek');
  my $train_out_file="$out_dir/${date}_train.json";
  ## could be empty at the start of the report
  my $ranid=$k->{'ranid'} // '';
  ## when we class we need a ranid
  if($motto eq 'class' and not $ranid) {
    $k->echo(__LINE__,"I have no ranid, I try to concord.");
    $ranid=$k->concord_ranid();
    if(not $ranid) {
      warn "I have no concorded ranid.";
      return 0;
    }
  }
  my $class_out_file="$out_dir/${date}_$ranid.json";
  ## set the motto for both scanners, also when they exist already,
  ## so that neither of them works with an old motto
  if(not $k->{'x'}) {
    ## 2018-12-18, in a haste, try to add it back on
    $k->{'x'}=Ernad::Presort::Fidek::Frex->new({'e'=>$k->{'e'}});
  }
  $k->{'x'}->motto($motto);
  if(not $k->{'w'}) {
    $k->{'w'}=Ernad::Presort::Fidek::Term->new({'e'=>$k->{'e'}});
  }
  $k->{'w'}->motto($motto);
  my $out_file;
  if($motto eq 'train') {
    if(not Ernad::Common::does_file_need_renewal("$train_out_file.gz","$in_file")) {
      $k->echo(__LINE__,"I don't renew $train_out_file.gz from $in_file");
      return 0;
    }
    ## a train fidek replaces the class fidek of the same date
    if($ranid and -f "$class_out_file.gz") {
      $k->echo(__LINE__,"I see $class_out_file.gz.");
      $k->echo(__LINE__,"I delete $class_out_file.gz because I see $train_out_file");
      unlink("$class_out_file.gz");
    }
    $out_file=$train_out_file;
  }
  elsif($motto eq 'class') {
    ## the ranid is known to be set here, see above
    if(not Ernad::Common::does_file_need_renewal("$class_out_file.gz","$in_file")) {
      $k->echo(__LINE__,"I don't renew $class_out_file.gz from $in_file");
      return 0;
    }
    ## don't renew if the $train_out_file exists
    if($e->{'conf'}->{'separate_doklis'}) {
      if(-f "$train_out_file.gz" and not -z "$train_out_file.gz") {
        $k->echo(__LINE__,"I don't renew $class_out_file.gz because I found $train_out_file.gz");
        return 0;
      }
    }
    $out_file=$class_out_file;
  }
  elsif($motto eq 'seed') {
    if(not Ernad::Common::does_file_need_renewal("$set_out_file.gz","$in_file")) {
      $k->echo(__LINE__,"I don't renew $set_out_file.gz from $in_file.");
      return 0;
    }
    $out_file=$set_out_file;
  }
  else {
    confess "I don't know about your motto '$motto'";
  }
  ## the scanners call add_fit on $k, which fills $k->{'fits'}
  while(my $paper=$f->next_paper_from_file($in_file)) {
    my $papid=$paper->{'papid'} // confess "I need a papid here.";
    if($restrict and not $restrict->{$papid}) {
      next;
    }
    foreach my $line (@{$paper->{'lines'}}) {
      ## term
      $k->{'w'}->add_fits($k,$line,$papid);
      ## frex
      $k->{'x'}->add_fits($k,$line,$papid);
    }
  }
  if($k->{'fits'}) {
    $k->save_json_with_gz($k->{'fits'},$out_file);
  }
  if($motto eq 'seed') {
    return $k->{'fits'};
  }
  undef $k->{'fits'};
  return $out_file;
}

# sub add_fit_with_fitcla {
#   my $k=shift;
#   my $fit=shift // confess "I need a fit here.";
#   my $handle=shift // confess "I need a handle here.";
#   my $fitcla=$k->{'fitcla'} // confess "I need a fitcla here.";
#   my $motto=$k->motto();
#   #if(not $k->{'rank'}) {
#   #  $k->echo(__LINE__,"I have no rank, adding $fit for $fitcla at $motto.");
#   #  ## FixMe: I load the data.
#   #  my $ranid=$k->{'ranid'} // '';
#   #  if(not $ranid) {
#   #    confess "I have no ranid.";
#   #  }
#   #  $k->{'rank'}=$k->{'r'}->load($ranid);
#   #  if(not $k->{'rank'}) {
#   #    confess "I have no rank for $ranid";
#   #  }
#   #  $k->echo(__LINE__,"FixMe: I had to reload the rank data for $ranid.");
#   #  }
#   #if(not $k->{'rank'}) {
#   #  confess "I have no fitrank.";
#   #}
#   if($motto eq 'class') {
#     if(not $k->{'rank'}->{$fit}) {
#       return;
#     }
#   }
#   if(not defined($k->{$fitcla}->{$handle})) {
#     $k->{$fitcla}->{$handle}->{$fit}=1;
#     return;
#   }
#   $k->{$fitcla}->{$handle}->{$fit}++;
# }

## called in Fidek::Frex and Fidek::Term
##
## Adds to the count of a fit for a paper in $k->{'fits'}.
##
##   $fit    the fit, required
##   $papid  the paper id, required
##   $adder  how many to add, minus 1; defaults to 0, which adds 1
##
## For the 'class' motto a fit without a true entry in $k->{'rank'}
## is not counted.
sub add_fit {
  my $k=shift;
  my $fit=shift // confess "I need a fit here.";
  my $papid=shift // confess "I need a papid here.";
  ## we are handed one less than the amount we add
  my $adder=(shift // 0)+1;
  if($k->motto() eq 'class' and not $k->{'rank'}->{$fit}) {
    return;
  }
  ## += starts a missing count at 0, no separate first case is needed
  $k->{'fits'}->{$papid}->{$fit}+=$adder;
  return;
}

# sub init_fitcla_functions {
#   my $k=shift;
#   $k->{'treat_fit'}->{'term'} = sub {
#     my $t=shift;
#     if($t=~m|^[[:upper:]][[:lower:]]*$|) {
#       $t=lc($t);
#     }
#     return $t;
#   };
#   $k->{'treat_fit'}->{'frin'} = sub {
#     my $t=shift;
#     $t=~s|^\s||;
#     $t=~s|\s$||;
#     return $t;
#   };
#   $k->{'treat_fit'}->{'frex'} = sub {
#     my $t=shift;
#     my $rc=shift;
#     ## first process like a frin
#     $t=&{$k->{'treat_fit'}->{'frin'}}($t);
#     $t=~s|\s+| |;
#     $t=~s|\s+$||;
#     $t=~s|^\s+||;
#     if(not $t=~m| |) {
#       return '';
#     }
#     ## remove terms in brackets, they are usually abbreviations
#     $t=~s|\(.*\)||g;
#     ## these are separators. Easiest just to delete the lines that
#     ## contain them and assume the components are covered elsewhere
#     my @seps=(':',',','/','&',' - ','"','$');
#     foreach my $sep (@seps) {
#       if($t=~m|\Q$sep\E|) {
#         if($rc eq 'read') {
#           $k->{'e'}->echo(__LINE__,"I skip line '$t' because it contains a separator",2);
#           return '';
#         }
#         elsif($rc eq 'check') {
#           $t=~s|\Q$sep\E||;
#         }
#         else {
#           confess "unknown rc";
#         }
#       }
#     }
#     $t=~s|\n$||;
#     $t=~s|\.$||;
#     $t=~s|^\.\s*||;
#     $t=~s|\x{2019}|'|g;
#     $t=~s|^'||g;
#     $t=~s|'$||g;
#     ## only uppercase
#     #if($t=~m|^[A-Z ]+$|) {
#     $t=lc($t);
#     #}
#     return $t;
#   };
#   $k->{'split_line'}->{'term'} = sub {
#     my $line=shift;
#     my @fits;
#     foreach my $term (split(/[\s\p{punct}]/,$line)) {
#       ## function returns nothing when it is not finding a worthy term
#       my $fit=&{$k->{'treat_fit'}->{'term'}}($term,'read') or next;
#       push(@fits,$fit);
#     }
#     return @fits;
#   };
#   $k->{'split_line'}->{'frin'} = sub {
#     my $line=shift;
#     my @fits;
#     ## It is assumed that the entire line is a feature.
#     my $fit=&{$k->{'treat_fit'}->{'frin'}}($line,'read') or return;
#     push(@fits,$fit);
#     return @fits;
#   };
#   $k->{'split_line'}->{'frex'} = sub {
#     my $line=shift;
#     my @fits;
#     ## It is assumed that the entire line is a feature.
#     my $fit=&{$k->{'treat_fit'}->{'frex'}}($line,'read') or return;
#     push(@fits,$fit);
#     return @fits;
#   };
# }

## lists fideks
##
##   $what  a part of the file name, required; e.g. 'train' gives
##          the training fideks
##
## Returns a reference to the array of the files in the fidek
## directory that match '*$what*'.
sub list {
  my $k=shift;
  my $what=shift // confess "I now need to know 'what' here.";
  my $fidek_dir=$k->set_dir('fidek');
  my @files=glob("$fidek_dir/*$what*");
  return \@files;
}

## not sure if this is needed
##
## Makes sure there is an exfit object in 'f' and sets its report.
##
##   $report  optional, defaults to $k->{'report'}
##
## NOTE(review): a missing report is only printed, and set_report is
## still called with it. The other subs confess in such a case;
## confirm whether that is wanted here as well.
sub setup_exfits {
  my $k=shift;
  if(not $k->{'f'}) {
    $k->{'f'}=Ernad::Presort::Exfit->new({'e'=>$k->{'e'}});
  }
  my $report=shift // $k->{'report'};
  if(not $report) {
    print "I need a report here.";
  }
  $k->{'f'}->set_report($report);
}

1;