package Ernad::Presort::Exfit;

## Maintains the "exfit" text files of a report. The files are produced
## by applying the exfit XSLT sheet to report issue files and to the
## seeds, and are stored gzipped in the exfit directory.

use strict;
use warnings;

use base ('Ernad::Presorter');

use Carp qw(cluck confess);
use Data::Dumper;
use File::Basename;
use File::Compare;
use File::Copy;
use File::Path qw(mkpath);
use File::Temp;
use IO::File;
use List::Util qw(shuffle);

use Ernad::Dates;
use Ernad::Report;
use Ernad::Presort::State;
use Ernad::Presort::Dates;

## Updates the exfit files of all learnports selected by the configuration.
##
## Argument: an optional repcode. Without a configured allport_repcode,
## only the learnport equal to that repcode is updated.
## Without the separate_doklis configuration, only the learnport equal
## to the allport repcode is updated.
## Confesses when no erimp is available or when a learnport is 'nep'.
sub update {
  my $f=shift;
  my $only_repcode=shift // '';
  my $e=$f->{'e'} // $main::e // confess "I don't see my erimp.";
  ## Store the erimp on the object, so that the $main::e fallback is
  ## really used below, rather than an autovivified empty hash.
  $f->{'e'} //= $e;
  #f# my $fitclas=$f->get_fitclas();
  if(not $f->{'t'}) {
    $f->{'t'}=Ernad::Presort::State->new({'e' => $e});
  }
  ## either allport or separate reports
  foreach my $learnport ($f->get_learnports()) {
    if(not $e->{'conf'}->{'allport_repcode'}) {
      if($only_repcode and ($learnport ne $only_repcode)) {
        next;
      }
    }
    if(not $e->{'conf'}->{'separate_doklis'}) {
      if($learnport ne $e->get_allport_repcode) {
        next;
      }
    }
    $f->echo(__LINE__,"My learnport is $learnport.",3);
    if($learnport eq 'nep') {
      confess "it can't be nep";
    }
    ## FixMe. This should not have to be set twice.
    $f->{'repcode'}=$learnport;
    $f->{'e'}->{'repcode'}=$learnport;
    #f# $f->update_for_fitcla_and_learnport($fitcla,$learnport);
    $f->update_for_learnport($learnport);
  }
  return;
}

## Transforms the sidin AMF file of the seeds into the seeds exfit file
## "<seed_date>.seeds.txt" in the exfit directory.
##
## Returns 0 when the seeds can not be set up for the report or when
## the output does not need renewal, else the name of the output file
## (without '.gz').
sub deal_with_seeds {
  my $f=shift;
  my $e=$f->{'e'} // $main::e // confess "I don't have my erimp.";
  if(not $f->{'s'}) {
    ## NOTE(review): Ernad::Presort::Seeds is not loaded by this file,
    ## it has to be loaded by the caller.
    $f->{'s'}=Ernad::Presort::Seeds->new();
    $f->{'s'}->setup();
    $f->{'s'}->setup_for_report($f->{'repcode'}) or return 0;
  }
  my $seed_date=$f->{'s'}->set_seed_date();
  $e->echo(__LINE__,"My seed date is $seed_date");
  my $sidin_amf_file=$f->{'s'}->{'rerc'}->{'file'}->{'sidin_amf'}
    // confess "I need this defined.";
  $e->echo(__LINE__,"My sidin_amf_file is $sidin_amf_file.");
  if(not -f $sidin_amf_file) {
    my $sidin_xml_file=$f->{'s'}->{'rerc'}->{'file'}->{'sidin_xml'};
    if(not -f $sidin_xml_file) {
      ## NOTE(review): the result of get_sidin_xml_file() lands in
      ## $sidin_amf_file and is overwritten below. The check that follows
      ## looks at the configured xml path. Presumably the call is made
      ## for its side effect of creating that file. Confirm.
      $sidin_amf_file=$f->{'s'}->get_sidin_xml_file();
      if(not -f $sidin_xml_file) {
        confess "I can't see the sidin_xml_file $sidin_xml_file.";
      }
    }
    $sidin_amf_file=$f->{'s'}->get_sidin_amf_file();
  }
  #f# my $out_file=$f->{'dir'}->{'exfit'}.'/'.$seed_date."_".$f->{'fitcla'}.'.seeds.txt';
  my $out_file=$f->{'dir'}->{'exfit'}.'/'.$seed_date.'.seeds.txt';
  if(not &Ernad::Common::does_file_need_renewal("$out_file.gz",
                                                $sidin_amf_file)) {
    $e->echo(__LINE__,"I don't renew $sidin_amf_file.");
    return 0;
  }
  my $style_dir=$e->{'dir'}->{'style'};
  $f->{'file'}->{'xslt'}=$style_dir.'/exfit.xslt.xml';
  $f->transform($sidin_amf_file,$out_file);
  return $out_file;
}

## Updates the exfit files of one learnport.
##
## Argument: the learnport, required.
## When the report has no oldest sent date, the exfit directory is
## emptied (or created), only the seeds are dealt with, and 0 is
## returned. Otherwise the seeds are dealt with and each file found in
## the namf directory (or in the sent directory of the allport report)
## is handed, by issuedate, to update_for_learnport_and_issuedate.
sub update_for_learnport {
  my $f=shift;
  my $learnport=shift // confess "I need a learnport here.";
  ## This check has to come before the first use of $f->{'e'},
  ## otherwise autovivification makes it pass always.
  my $e=$f->{'e'} // confess "I don't see the erimp.";
  $f->{'learnport'}=$learnport;
  my $exfit_dir=$f->set_dir('exfit');
  $f->{'dir'}->{'exfit'}=$exfit_dir;
  my $repcode=$f->get_repcode($learnport);
  $f->{'repcode'}=$learnport;
  $f->{'t'}->{'repcode'}=$learnport;
  $f->{'rerc'}=$e->{'report'}->{$repcode};
  ## The xslt file does not depend on the istaf
  $f->{'file'}->{'xslt'}=$e->{'dir'}->{'style'}.'/exfit.xslt.xml';
  ## NOTE(review): the oldest sent date is kept on the object, thus it
  ## is only looked up for the first learnport that update() handles.
  if(not defined($f->{'date'}->{'oldest_sent'})) {
    $f->{'r'}=Ernad::Report->new($f->{'repcode'});
    $f->{'date'}->{'oldest_sent'}=$f->{'r'}->get_oldest_sent_date($repcode);
    ## It remains undefined if we have no issue.
  }
  if(not $f->{'date'}->{'oldest_sent'}) {
    $f->echo(__LINE__,"I have no oldest_sent for $repcode.");
    if(-d $exfit_dir) {
      my $glob="$exfit_dir/*";
      $f->echo(__LINE__,"I unlink all '$glob'.");
      $f->unlink_all("$glob");
    }
    else {
      mkpath($exfit_dir);
    }
    $f->deal_with_seeds();
    return 0;
  }
  ## it is assumed that all ever available issuedates are in the namf_dir
  my @files;
  my $allport_repcode=$e->{'conf'}->{'allport_repcode'};
  if(not $allport_repcode) {
    my $namf_dir=$e->{'dir'}->{'namf'} // confess "I need a namf_dir here.";
    @files=glob("$namf_dir/*");
  }
  else {
    my $sent_dir=$e->{'report'}->{$allport_repcode}->{'dir'}->{'sent'}
      // confess "I need a sent_dir here.";
    @files=glob("$sent_dir/*");
  }
  ###my @files=shuffle glob("$namf_dir/*");
  ## FixMe: make some test if the report is not mature before dealing
  ## with seeds.
  $f->deal_with_seeds();
  foreach my $namf_file (@files) {
    ## FixMe in 2019
    my $issuedate=&Ernad::Presort::Dates::find_issuedate($namf_file);
    if(not $issuedate) {
      confess "I don't see a date on $namf_file.";
    }
    ## Compare the issuedate of the namf with the oldest sent date.
    ## When compare_dates() is positive the exfit files of that date
    ## are removed, except the seeds, and the date is skipped.
    ## NOTE(review): the sign convention of compare_dates() is not
    ## visible here. Confirm that this skips the dates intended.
    if($f->{'date'}->{'oldest_sent'} and
       &Ernad::Dates::compare_dates($issuedate,
                                    $f->{'date'}->{'oldest_sent'})>0) {
      my $glob="$exfit_dir/$issuedate*";
      $f->echo(__LINE__,"I unlink all '$glob'.");
      $f->unlink_all("$glob",'seeds');
      next;
    }
    if(not $f->is_issuedate_in_range($issuedate)) {
      my $glob="$exfit_dir/$issuedate*";
      if($e->{'conf'}->{'separate_doklis'}) {
        $f->echo(__LINE__,"I unlink all '$glob'.");
        $f->unlink_all("$glob",'seeds');
      }
      next;
    }
    #f# $f->update_for_learnport_and_issuedate($fitcla,$learnport,$issuedate);
    $f->update_for_learnport_and_issuedate($learnport,$issuedate);
  }
  return;
}

## Removes the files that match a glob.
##
## Arguments: the glob, required, and an optional exception. The
## exception is used as a regular expression, files with a name that
## matches it are kept.
## Returns the number of files deleted. Files that are kept and files
## that could not be removed are not counted.
sub unlink_all {
  my $f=shift;
  my $glob=shift // confess "I need an argument here.";
  my $exception=shift // '';
  my $count=0;
  foreach my $file (glob($glob)) {
    if($exception and $file=~m|$exception|) {
      $f->echo(__LINE__,"I dont remove $file",1);
      next;
    }
    $f->echo(__LINE__,"I remove $file",1);
    if(unlink $file) {
      $count++;
    }
    else {
      $f->echo(__LINE__,"I could not remove $file: $!",1);
    }
  }
  return $count;
}

## Creates or renews the exfit file of one learnport at one issuedate.
##
## Arguments: the learnport and the issuedate, both required.
## The state that $f->{'t'}->find() reports for the issuedate decides:
##  'untouched', with separate_doklis: the namf file is transformed to
##     "<issuedate>_class.txt" in the parent of the exfit directory.
##  'done': the latest file in the presorted directory, or else in the
##     seeded directory, is transformed to "<issuedate>_train.txt".
##  'prenatal', with separate_doklis: the exfit files of the issuedate
##     are removed, 0 is returned.
##  any other state of a report that is not the allport report, with
##     separate_doklis: same removal, 0 is returned.
## With an allport report, the input is always the latest file in the
## sent directory of the allport report.
## Returns the name of the output file (without '.gz'), or 0 as above.
sub update_for_learnport_and_issuedate {
  my $f=shift;
  my $learnport=shift // confess 'I need a learnport here.';
  my $issuedate=shift // confess 'I need an issuedate here.';
  my $e=$f->{'e'} // confess "I don't see my erimp.";
  my $allport_repcode=$e->{'conf'}->{'allport_repcode'};
  my $namf_dir=$e->{'dir'}->{'namf'};
  if($allport_repcode) {
    $namf_dir=$e->{'report'}->{$allport_repcode}->{'dir'}->{'sent'};
  }
  if(not $namf_dir) {
    confess 'I need a namf dir here.';
  }
  my $exfit_dir=$f->{'dir'}->{'exfit'} // confess 'I need a exfit dir here.';
  my $repcode=$learnport;
  my $rerc=$e->{'report'}->{$learnport};
  my $ps_dir=$rerc->{'dir'}->{'presorted'}
    // confess "I need a presorted dir here.";
  my $sd_dir=$rerc->{'dir'}->{'seeded'} // '';
  if(not -d $ps_dir) {
    confess "I don't see your directory $ps_dir.";
  }
  my $in_file;
  my $out_file;
  if($allport_repcode) {
    ## set the in_file, $namf_dir is the allport sent directory here
    $in_file=$e->{'d'}->latest_rif($namf_dir,$issuedate);
  }
  my $t=$f->{'t'};
  $t->{'repcode'}=$repcode;
  my $state=$t->find($issuedate);
  if($state eq 'untouched' and $e->{'conf'}->{'separate_doklis'}) {
    ## not already set by allport?
    if(not $in_file) {
      $in_file=&Ernad::Presort::Dates::namf_files_by_date($namf_dir,
                                                          $issuedate);
    }
    ## the class file goes to the report-independent directory
    my $up_dir=dirname($exfit_dir);
    #f# $out_file="$up_dir/$issuedate"."_$fitcla".'_class.txt';
    $out_file="$up_dir/$issuedate".'_class.txt';
  }
  elsif($state eq 'done') {
    ## not already set by allport?
    if(not $in_file) {
      $in_file=$e->{'d'}->latest_rif($ps_dir,$issuedate);
    }
    $out_file="$exfit_dir/$issuedate".'_train.txt';
  }
  elsif($state eq 'prenatal' and $e->{'conf'}->{'separate_doklis'}) {
    my $glob="$exfit_dir/$issuedate*";
    $f->unlink_all("$glob");
    return 0;
  }
  elsif($repcode ne $e->get_allport_repcode()) {
    $f->echo(__LINE__,
             "I see a grey report $repcode state $state at $issuedate.");
    my $glob="$exfit_dir/$issuedate*";
    if($e->{'conf'}->{'separate_doklis'}) {
      $f->echo(__LINE__,"I unlink all '$glob'.");
      $f->unlink_all("$glob");
      return 0;
    }
  }
  ## echo does the reporting, its return value must not be printed
  $e->echo(__LINE__,
           "my issuedate $issuedate and state is $state and $repcode");
  if(not $in_file and $state eq 'done' and -d $sd_dir) {
    ## check in the seeds
    $in_file=$e->{'d'}->latest_rif($sd_dir,$issuedate);
  }
  if(not $in_file) {
    confess "I don't have an in_file for $issuedate. The state is '$state', the sd_dir is $sd_dir";
  }
  ## if all else fails
  if(not $out_file) {
    $out_file="$exfit_dir/$issuedate".'_train.txt';
  }
  ## takes care of renewal requirement check
  $f->transform($in_file,$out_file);
  return $out_file;
}

## Private helper, a plain function rather than a method: it writes
## the output of /bin/zcat on a gzipped file to a destination file.
## No shell is involved, so file names are never interpolated into a
## command line. Returns 1 on success and 0 on failure, it never dies.
sub _zcat_to_file {
  my $gz_file=shift // confess "I need a gzipped file here.";
  my $dest_file=shift // confess "I need a destination file here.";
  open(my $zcat_fh,'-|','/bin/zcat',$gz_file) or return 0;
  binmode($zcat_fh);
  my $copied=copy($zcat_fh,$dest_file);
  ## the close of a pipe reports the exit status of zcat
  my $closed=close($zcat_fh);
  return ($copied and $closed) ? 1 : 0;
}

## Applies the XSLT sheet to an input file and stores the result,
## gzipped, as the output file.
##
## Arguments: the input file and the output file, both required. The
## output file is named without '.gz', a trailing '.gz' is removed.
## The sheet is $f->{'file'}->{'xslt'}, or else exfit.xslt.xml in the
## style directory of the erimp.
## Returns 1 when "<out_file>.gz" was written. Returns 0 when no
## renewal is required or when the new contents are the same as the
## old ones. Confesses on empty output and when the file can not be
## written or compressed.
sub transform {
  my $f=shift;
  my $in_file=shift // confess "I need an in_file defined.";
  my $out_file=shift // confess "I need an out_file defined.";
  ## this takes out_files without .gz
  $out_file=~s|\.gz$||;
  my $sheet=$f->{'file'}->{'xslt'};
  ## added in a hurry on 2018-12-16 to fix problems with the
  ## seeding of the bims-unfpre
  if(not $sheet) {
    my $e=$f->{'e'} // $main::e // confess "I don't see my Erimp";
    $sheet=$e->{'dir'}->{'style'}.'/exfit.xslt.xml';
  }
  if(not $sheet) {
    confess "I need this xslt here.";
  }
  ## Check against the sheet in use. $f->{'file'}->{'xslt'} is not
  ## defined when the fallback above was taken.
  if(not &Ernad::Common::does_file_need_renewal("$out_file.gz",
                                                $sheet,$in_file)) {
    return 0;
  }
  $f->echo(__LINE__,"I transform $in_file with $sheet to $out_file.");
  my $out=$f->{'e'}->{'s'}->t($in_file,$sheet,'chars');
  if(not $out) {
    confess "I applied $sheet to $in_file, but the output is empty.";
  }
  ## NOTE(review): $out is requested as 'chars' but is printed without
  ## an encoding layer, while next_paper_from_file() reads with utf8.
  ## Confirm the intended encoding of the exfit files.
  ## if the file is not there, it's simple
  if(not -f "$out_file.gz") {
    my $fh=IO::File->new($out_file,'w')
      // confess "I can't write to $out_file: $!";
    print {$fh} $out;
    $fh->close or confess "I can't close $out_file: $!";
    system('/bin/gzip','-f',$out_file)==0
      or confess "I could not gzip $out_file.";
    return 1;
  }
  ## otherwise use a temporary file for the new version of the file
  my $new_fh=File::Temp->new();
  print {$new_fh} $out;
  $new_fh->close();
  my $new_temp_name=$new_fh->filename();
  ## use a temporary file for the old file
  my $old_fh=File::Temp->new();
  my $old_temp_name=$old_fh->filename();
  ## A failure is deliberately ignored here. An old file that can not
  ## be read compares as different, and is then replaced.
  _zcat_to_file("$out_file.gz",$old_temp_name);
  if(compare($new_temp_name,$old_temp_name)==0) {
    return 0;
  }
  system('/bin/gzip','-f',$new_temp_name)==0
    or confess "I could not gzip $new_temp_name.";
  my $copied=copy("$new_temp_name.gz","$out_file.gz");
  ## gzip has renamed the temporary file, File::Temp does not know
  ## about the new name, so it has to be removed by hand
  unlink "$new_temp_name.gz";
  if(not $copied) {
    confess "I could not copy $new_temp_name.gz to $out_file.gz: $!";
  }
  return 1;
}

## clears exfits files that are out of the training range
##
## Every file in the exfit directory that carries an issuedate for
## which $f->not_in_range() holds is removed, together with all other
## files of that issuedate. Files without an issuedate are skipped.
sub clear {
  my $f=shift;
  # my $x=$f->{'x'} // confess "Run ->prepare_xslt first";
  # $x->set_re_name($rif_2_exfit);
  # $x->set_in_check($range_check);
  # $x->clear_files($f);
  my $exfit_dir=$f->{'dir'}->{'exfit'}
    // confess 'I need my exfit_dir set here.';
  foreach my $file (glob("$exfit_dir/*")) {
    my $issuedate=&Ernad::Presort::Dates::find_issuedate($file) or next;
    if($f->not_in_range($issuedate)) {
      $f->echo(__LINE__,"Issuedate $issuedate is not in range.");
      my $glob="$exfit_dir/$issuedate*";
      $f->echo(__LINE__,"I unlink all '$glob'.");
      $f->unlink_all("$glob");
    }
  }
  return;
}

# ## FixMe: I think this function could go
# sub set_xslt_file {
#   my $f=shift;
#   ## is always term.
#   my $fitcla=shift;
#   my $repcode=shift;
#   my $exfit_xslt_dir=$f->{'e'}->{'dir'}->{'exfit_xslt'};
#   my $xslt_file;
#   ## in the case of no allport
#   if($f->{'e'}->{'conf'}->{'no_allport'}) {
#     my $xslt_bana=$fitcla.$f->{'e'}->{'const'}->{'xsl_ext'};
#     my $xslt_file=$exfit_xslt_dir.'/'.$xslt_bana;
#     $f->{'file'}->{'xslt'}=$xslt_file;
#     return
#   }
#   ## otherwise
#   my $ext='_'.$fitcla.$f->{'e'}->{'const'}->{'xsl_ext'};
#   $xslt_file=$exfit_xslt_dir.'/'.$repcode.$ext;
#   if(not $xslt_file) {
#     confess "I need an XSLT file here.";
#   }
#   ## set it in the object
#   $f->{'file'}->{'xslt'}=$xslt_file;
#   return $xslt_file;
# }

## used by seeds
##
## Argument: an issuedate, required.
## Returns the name of "<issuedate>_class.txt.gz" in the exfit directory
## under the learn directory, or '' when there is no such file.
sub show_class_file {
  my $f=shift;
  #f# my $fitcla=shift // confess "I need a fitcla here.";
  my $issuedate=shift // confess "I need an issuedate here.";
  ## FixMe:: this should be in a call to a function
  my $exfit_dir=$f->{'e'}->{'dir'}->{'learn'}.'/exfit';
  my $glob="$exfit_dir/$issuedate".'_class.txt.gz';
  my @found=glob($glob);
  if(not $found[0]) {
    return '';
  }
  return $found[0];
}

## used by seeds
##
## Argument: an issuedate, required. It has to be a date by the
## judgement of Ernad::Dates::is().
## Returns the first file in the exfit directory that matches
## "<issuedate>.*.txt.gz", or '' when there is none.
sub show_train_file {
  my $f=shift;
  #f# my $fitcla=shift // confess "I need a fitcla here.";
  my $issuedate=shift // confess "I need an issuedate here.";
  if(not &Ernad::Dates::is($issuedate)) {
    confess "Your issuedate $issuedate is not a date.";
  }
  my $exfit_dir=$f->set_dir('exfit');
  ## star in the latter part: seed or train
  #f# my $glob="$exfit_dir/$issuedate".'_'.$fitcla.'.*.txt.gz';
  my $glob="$exfit_dir/$issuedate.*.txt.gz";
  my @found=glob($glob);
  $f->echo(__LINE__,"I look for glob '$glob'.");
  if(not $found[0]) {
    return '';
  }
  return $found[0];
}

## invoqued by Fidek
##
## Arguments: the destination directory, required, it has to exist.
##            what_for, required, 'train' or 'class'.
##            an optional ranid, used by the Exfit::Class.
## Builds the restriction for the update and returns what
## $f->get_sources() gives for the exfit directory, the destination
## directory and that restriction.
sub show_to_update {
  my $f=shift;
  my $dest_dir=shift
    // confess "I need a destination directory argument here.";
  my $what_for=shift // confess "I need to know what for.";
  my $exfit_dir=$f->set_dir('exfit') // confess 'I need an exfits_dir set.';
  my $ranid=shift // '';
  my $e=$f->{'e'} // $main::e // confess "I need an erimp.";
  if(not -d $dest_dir) {
    confess "I don't see the directory $dest_dir";
  }
  ## the restriction placed on the update
  my $restrict;
  if($what_for eq 'train') {
    $restrict->{'source'}='train|seed';
  }
  elsif($what_for eq 'class') {
    $restrict->{'destin'}='class';
    ## this should be the opposite, where the files in exfit are
    ## called 'class' but for now it is 'train'
    if(not $e->{'conf'}->{'separate_doklis'}) {
      delete $restrict->{'destin'};
    }
    ## for classification, the files are in the report-independent
    ## directory
    if($e->{'conf'}->{'separate_doklis'}) {
      $exfit_dir=dirname($exfit_dir);
    }
  }
  else {
    confess "I need what_for to be 'train' or 'class', but you gave me '$what_for'.";
  }
  ## an optional ranid, for classification, used by the Exfit::Class.
  if($ranid) {
    $restrict->{'destin'}=$ranid;
  }
  my $sources=$f->get_sources($exfit_dir,$dest_dir,$restrict);
  return $sources;
}

## Reads the next paper from a gzipped exfit file.
##
## Argument: the file, required, it has to exist.
## A paper is a line that starts with a non-space character, the
## papid, followed by lines that start with a space character. The
## first space character is removed from each of these lines.
## Returns a hash reference with 'papid' and 'lines', or undef when
## the file has been read to its end. The handle is kept on the object
## between calls, in $f->{'current_fh'}.
## NOTE(review): the handle is not tied to the file name. A call with
## another file, made before the current file is done, reads on from
## the current file.
sub next_paper_from_file {
  my $f=shift;
  my $file=shift // confess "I need a file here.";
  if(not -f $file) {
    confess "I can't see your file $file.";
  }
  if($f->{'done'}->{$file}) {
    undef $f->{'current_fh'};
    return undef;
  }
  ## new file
  if(not $f->{'current_fh'}) {
    ## IO::Uncompress::Gunzip can not seek backwards.
    ## open a temporary file and read from there
    my $tmp_fh=File::Temp->new();
    my $tmp_name=$tmp_fh->filename;
    _zcat_to_file($file,$tmp_name);
    if(-z $tmp_name) {
      confess "I seem to have an empty input $tmp_name from $file.";
    }
    my $fh=IO::File->new($tmp_name,"r")
      // confess "I can't open $tmp_name: $!";
    $f->{'current_fh'}=$fh;
    $f->echo(__LINE__,"I open $file");
    $fh->binmode('utf8');
    ## NOTE(review): the File::Temp object goes out of scope here, the
    ## reading goes on through the handle opened above. Presumably the
    ## temporary file is removed at this point, which only works on
    ## systems that keep an open file readable after removal. Confirm.
  }
  my $fh=$f->{'current_fh'};
  my $papid=<$fh>;
  if(not defined($papid)) {
    $f->{'done'}->{$file}=1;
    return undef;
  }
  ## check if this is really a line with a papid
  if(not $papid=~m|^\S|) {
    confess "I should have a papid here.";
  }
  chomp $papid;
  ## the paper
  my $p={'papid' => $papid, 'lines' => []};
  ## Remember where each line starts. The handle decodes utf8, so the
  ## length of a line counts characters, but seek() counts bytes.
  my $line_start=$fh->tell();
  while(defined(my $line=<$fh>)) {
    ## caught the papid of the next paper, go back to its start
    if(not $line=~s|^\s||) {
      $fh->seek($line_start,0);
      return $p;
    }
    push(@{$p->{'lines'}},$line);
    $line_start=$fh->tell();
  }
  ## the end
  $f->{'done'}->{$file}=1;
  return $p;
}

1;