package Ernad::Presort::Seeds;

## Seed-based presorting. This package prepares the seed input files
## (sidin xml / amf), has feature files built for both the seeds and the
## current issue, and asks Ernad::Presort::Tfidf for scores.
## Per-report state is kept in $s->{'rerc'}.

use strict;
use warnings;
use base ('Ernad::Presorter');

use Carp qw(confess);
use Clone qw(clone);
use Date::Format;
use Data::Dumper qw(Dumper);
use File::Basename;
## setup_for_report calls File::Path::mkpath
use File::Path;

use Ernad::Common;
use Ernad::Erimp;
use Ernad::Dates;
#use Ernad::Report;
use Krichel::Shoti;
use Ernad::Presort::Exfit;
use Ernad::Presort::Tfidf;
use Ernad::Presort::Fidek;
use Krichel::File;

binmode(STDOUT,':utf8');

## Check that seeding is possible at all and create the helper objects.
## Requires $s->{'e'} (the erimp). Verifies the seed configuration
## directory, the seed directory and the 'get_input_for_seeds' setting,
## then creates $s->{'f'} (Exfit) and $s->{'k'} (Fidek) unless they are
## already set.
## Returns 1 on success, zero on failure. Confesses if the erimp or the
## seed directory setting is missing, or if a configured namf file is
## not a file.
sub setup {
  my $s=shift;
  my $e=$s->{'e'} // confess "I need an erimp here.";
  my $sidconf_dir=$e->{'dir'}->{'etc'}.'/'.$e->{'const'}->{'seed_dir'};
  if(not (-d $sidconf_dir)) {
    $s->echo(__LINE__,"I see not no $sidconf_dir. Thus no report is seedable.",2);
    return 0;
  }
  $e->{'dir'}->{'sidconf'}=$sidconf_dir;
  my $seed_dir=$e->{'dir'}->{'seed'} // confess "I need a seed_dir here";
  ## could not get this to run on 2015-12-21
  ## the seed directory may also be a symbolic link
  if(not (-d $seed_dir or -l $seed_dir)) {
    $s->echo(__LINE__,"I see no $seed_dir. Thus no report is seedable.",2);
    return 0;
  }
  ## getting to insid
  my $get_input=$e->{'conf'}->{'get_input_for_seeds'} // '';
  if(not $get_input) {
    $s->echo(__LINE__,"I see no \$conf->{'get_input_for_seeds'}. No report is seedable.",2);
    return 0;
  }
  my $namf_file=$e->{'file'}->{'namf'} // '';
  if($namf_file and not -f $namf_file) {
    confess "I can't access the namf_file $namf_file";
  }
  $s->{'today'}=&Ernad::Dates::today();
  ## make sure we have an executable to get access to the seeds.
  ## Only the presence of a first word is checked, not whether it can
  ## actually be executed.
  my @get_input_parts=split(' ',$get_input);
  my $exec_file=$get_input_parts[0];
  if(not $exec_file) {
    $s->echo(__LINE__,"I see no executable in the get_input_for_seeds command.",2);
    return 0;
  }
  $e->{'get_input_for_seeds'}=$get_input;
  if(not $s->{'f'}) {
    $s->{'f'}=Ernad::Presort::Exfit->new({'e'=>$s->{'e'}});
    #$s->{'f'}->{'fitcla'}='term';
  }
  if(not $s->{'k'}) {
    $s->{'k'}=Ernad::Presort::Fidek->new({'e'=>$s->{'e'},
                                          'f'=>$s->{'f'}});
    ## seeding uses terms only
    #$s->{'k'}->{'fitcla'}='term';
  }
  return 1;
}

## Return the seed date, $s->{'rerc'}->{'date'}->{'insid'}, running
## setup_for_report first when no report state exists yet.
## Confesses when no repcode is known, when the repcode differs from
## $main::e->{'repcode'}, or when the date is not set.
sub set_seed_date {
  my $s=shift;
  my $repcode=$s->{'repcode'} // $s->{'e'}->{'repcode'}
    // confess "I need this set.";
  ## default to '' so that the comparison below does not warn
  my $main_repcode=$main::e->{'repcode'} // '';
  if($repcode ne $main_repcode) {
    confess "There is a repcode confusion between $repcode and $main_repcode.";
  }
  if(not $s->{'rerc'}) {
    $s->setup_for_report($repcode);
  }
  my $seed_date=$s->{'rerc'}->{'date'}->{'insid'} // confess "I need this here";
  return $seed_date;
}

## FixMe: this should probably go somewhere else
## Look in $seed_dir for sidin_XXXXXX.xml and sidin_XXXXXX.amf.xml files
## (optionally with a .gz suffix) and return the six-character shoti
## XXXXXX from their names. Returns 0 if no such file is found.
## If several files match, the one glob lists last wins.
## Confesses if the directory is missing or if the amf and the xml
## shoti differ.
sub get_existing_shoti {
  my $s=shift;
  my $seed_dir=shift // confess "I need a seed_dir argument.";
  if(not -d $seed_dir) {
    confess "I don't see your directory $seed_dir";
  }
  my $amf_shoti=0;
  my $xml_shoti=0;
  my @files=(glob("$seed_dir/*.xml"),glob("$seed_dir/*.xml.gz"));
  foreach my $file (@files) {
    my $bana=basename($file);
    if($bana=~m|^sidin_([0-9a-z]{6})\.amf\.xml(\.gz)?$|) {
      $amf_shoti=$1;
      next;
    }
    if($bana=~m|^sidin_([0-9a-z]{6})\.xml(\.gz)?$|) {
      $xml_shoti=$1;
      next;
    }
  }
  if(not $amf_shoti eq $xml_shoti) {
    confess "The amf shoti $amf_shoti does not match xml one $xml_shoti\n";
  }
  ## return either
  return $xml_shoti;
}

## this looks for a class exfit to hand over as a tsoin
## it needs to exist
## Returns the name of the exfit file for $s->{'issuedate'}. If that
## file does not exist yet, it is made from the tsoin amf file with the
## exfit object's transform method.
## Confesses if $s->{'f'} is missing, if no file name can be obtained,
## if the tsoin amf file is not set or not a file, or if the exfit file
## is still absent after the transformation.
sub get_tsoin_exfits {
  my $s=shift;
  if(not $s->{'f'}) {
    confess "I need this defined here.";
  }
  #my $tsoin_exfit_file=$s->{'f'}->show_class_file('term',$s->{'issuedate'});
  my $tsoin_exfit_file=$s->{'f'}->show_class_file($s->{'issuedate'});
  if(not $tsoin_exfit_file) {
    confess "I don't have a tsoin_exfit_file.\n";
  }
  if(not -f $tsoin_exfit_file) {
    my $amf_file=$s->{'rerc'}->{'file'}->{'tsoin_amf'};
    if(not defined($amf_file)) {
      confess "You need to give me \$s->{'rerc'}->{'file'}->{'tsoin_amf'}.";
    }
    ## the setting is there, now make sure the file is there as well
    if(not -f $amf_file) {
      confess "I can't see your file ".$amf_file.'.';
    }
    #$s->{'f'}->set_xslt_file('term',$s->{'repcode'});
    my $return=$s->{'f'}->transform($amf_file,$tsoin_exfit_file);
    $s->echo(__LINE__,"I tried to make $tsoin_exfit_file, return is $return");
    if(not -f $tsoin_exfit_file) {
      confess "I don't see the tsoin_exfit_file $tsoin_exfit_file.";
    }
  }
  return $tsoin_exfit_file;
}

## Prepare the per-report state in $s->{'rerc'} for the report $repcode
## (first argument, else $s->get_repcode()): directories, the insid file,
## the shoti, the insid date and time, and the sidin xml and amf file
## names. Both sidin files are refreshed via get_sidin_xml_file and
## get_sidin_amf_file.
## Returns 1 on success, 0 if the report has no insid file.
sub setup_for_report {
  my $s=shift;
  my $repcode=shift // $s->get_repcode() // confess "I can't get the repcode.";
  $s->{'repcode'}=$repcode;
  my $e=$s->{'e'};
  $s->{'rerc'}->{'dir'}=clone($e->{'report'}->{$repcode}->{'dir'});
  #$s->{'r'}=Ernad::Report->new($repcode);
  ## the file that holds the handles of the seed docs, in etc/IMPNA/seed
  #my $insid_file=$s->{'r'}->seed_file();
  my $insid_file=$e->{'p'}->seed_file($repcode);
  if(not -f $insid_file) {
    $s->echo(__LINE__,"I see no insid_file $insid_file. The report $repcode is unseedable.",2);
    return 0;
  }
  my $seed_dir=$e->{'dir'}->{'seed'}.'/'.$repcode;
  $s->{'rerc'}->{'dir'}->{'seed'}=$seed_dir;
  if(not -d $seed_dir) {
    &File::Path::mkpath($seed_dir);
  }
  $s->{'rerc'}->{'file'}->{'insid'}=$insid_file;
  ## this would be zero if there is none
  my $shoti=$s->get_existing_shoti($seed_dir);
  if(not $shoti) {
    ## no sidin files yet, derive shoti and date from the insid file
    $shoti=&Krichel::Shoti::make(&Ernad::Dates::mtime($insid_file));
    $s->{'rerc'}->{'date'}->{'insid'}=&Ernad::Dates::mdate($insid_file);
    $s->echo(__LINE__,"I set the shoti $shoti from the insid files");
  }
  else {
    ## existing sidin files, derive the date from their shoti
    my $time=&Krichel::Shoti::ekam($shoti);
    $s->echo(__LINE__,"I use the shoti $shoti.");
    $s->{'rerc'}->{'date'}->{'insid'}=time2str('%Y-%m-%d',$time);
  }
  ## NOTE(review): the insid time is always the mtime of the insid file,
  ## even when the insid date above comes from an existing shoti. The
  ## code used to store the shoti time first and then overwrite it here.
  ## Confirm which of the two values the consumers of
  ## $s->{'rerc'}->{'time'}->{'insid'} expect.
  $s->{'rerc'}->{'time'}->{'insid'}=&Ernad::Dates::mtime($insid_file);
  $s->{'rerc'}->{'shoti'}=$shoti;
  $s->{'rerc'}->{'file'}->{'sidin_xml'}="$seed_dir/sidin_$shoti.xml";
  $s->get_sidin_xml_file($repcode);
  $s->{'rerc'}->{'file'}->{'sidin_amf'}="$seed_dir/sidin_$shoti.amf.xml";
  $s->get_sidin_amf_file($repcode);
  #$s->{'rerc'}=clone($s->{'rerc'});
  return 1;
}

## Prepare the per-issue state. The issue date is the first argument,
## else $e->{'issuedate'}. Sets $s->{'date'}, $s->{'issuedate'}, the
## tsoin amf file (the erimp's namf file) and the names of the score and
## weights dump files in the report's seed directory.
## Returns the issue date. Confesses if the erimp, the issue date, the
## namf file setting or the seed directory is missing.
sub setup_for_issue {
  my $s=shift;
  my $e=$s->{'e'} // confess 'Where is my erimp?';
  my $date=shift // $e->{'issuedate'} // confess "I need an issuedate";
  $s->{'date'}=$date;
  $s->{'issuedate'}=$date;
  ## the namf file is required
  #$s->{'rerc'}->{'file'}->{'tsoin_amf'} ="$seed_dir/tsoin_$date.amf.xml";
  my $namf_file=$e->{'file'}->{'namf'} // confess "I need a namf file here.";
  $s->{'rerc'}->{'file'}->{'tsoin_amf'}=$namf_file;
  my $seed_dir=$s->{'rerc'}->{'dir'}->{'seed'} // confess "I need this set up.";
  $s->{'rerc'}->{'file'}->{'score_dump'}="$seed_dir/$date"."_score.json";
  ## the spelling "weigths" is part of the file name on disk, keep it
  $s->{'rerc'}->{'file'}->{'weights_dump'}="$seed_dir/$date"."_weigths.json";
  return $date;
}

## function used by external callers
## Run the whole chain for $s->{'repcode'}: set up report and issue,
## refresh the sidin files, get the exfit files for seeds and issue,
## have the fidek files built where needed and return what
## Ernad::Presort::Tfidf's get_score gives for the issue against the seeds.
## An optional argument (formerly a period) is accepted and ignored.
## Confesses if the repcode, $s->{'k'}, the seed date or the issue date
## is missing.
sub get_scores {
  my $s=shift;
  my $repcode=$s->{'repcode'} // confess "I need this defined.";
  ## NOTE(review): the return value of setup_for_report is not checked.
  $s->setup_for_report($repcode);
  $s->setup_for_issue();
  $s->get_sidin_xml_file($repcode);
  $s->get_sidin_amf_file($repcode);
  my $sidin_exfit_file=$s->get_sidin_exfits();
  $s->echo(__LINE__,"the sidin_exfit_file is $sidin_exfit_file");
  my $tsoin_exfit_file=$s->get_tsoin_exfits();
  $s->echo(__LINE__,"the tsoin_exfit_file is $tsoin_exfit_file");
  if(not $s->{'k'}) {
    confess "I need this \$s->{'k'} defined here.";
  }
  my $fidek_dir=$s->{'k'}->set_dir('fidek');
  my $seed_date=$s->set_seed_date // confess "I need a seed date";
  my $issuedate=$s->{'issuedate'} // confess "I need an issuedate";
  my $sidin_fidek_file="$fidek_dir/$seed_date".'_term_seeds.json';
  my $tsoin_fidek_file="$fidek_dir/$issuedate".'_term_seeds.json';
  $s->{'k'}->motto('seed');
  ## Both fidek files are consumed under their .gz name below, so the
  ## renewal checks look at the .gz name as well.
  ## NOTE(review): this assumes build_file leaves its output at the
  ## given name plus .gz, confirm in Ernad::Presort::Fidek.
  if(&Ernad::Common::does_file_need_renewal("$sidin_fidek_file.gz",
                                            $sidin_exfit_file)) {
    $s->echo(__LINE__,"I build $sidin_fidek_file.gz.");
    ## output file is given in second place here
    $s->{'k'}->build_file($sidin_exfit_file,$sidin_fidek_file);
  }
  else {
    $s->echo(__LINE__,"I don't renew $sidin_fidek_file.gz.");
  }
  if(&Ernad::Common::does_file_need_renewal("$tsoin_fidek_file.gz",
                                            $tsoin_exfit_file)) {
    $s->echo(__LINE__,"I build $tsoin_fidek_file.gz.");
    ## output file is given in second place here
    $s->{'k'}->build_file($tsoin_exfit_file,$tsoin_fidek_file);
  }
  else {
    $s->echo(__LINE__,"I don't renew $tsoin_fidek_file.gz.");
  }
  my $i=Ernad::Presort::Tfidf->new({'e'=>$s->{'e'},
                                    ## a temporary weights file to speed up,
                                    'weights_file'=>$s->{'rerc'}->{'file'}->{'weights_dump'}});
  $i->set_file('sid',"$sidin_fidek_file.gz");
  my $scores=$i->get_score("$tsoin_fidek_file.gz");
  return $scores;
}

## Try to reuse a previously dumped score. The repcode is the first
## argument, else $s->{'repcode'}. A second argument is accepted and
## ignored.
## Returns 0 when the score has to be calculated anew: the dump is
## missing, any other file known in $rerc->{'file'} is missing, or any
## such file was modified after the dump. Otherwise the dump is loaded
## with Krichel::File::load, stored in $rerc->{'score'} and returned.
sub find_scores {
  my $s=shift;
  my $repcode=shift // $s->{'repcode'} // confess "I need a repcode here";
  $s->setup_for_report($repcode);
  my $rerc=$s->{'rerc'} // confess "I think you need to setup first.";
  ## could be the list file or the dump, but since the score dump is
  ## written last, we use the dump
  my $score_dump_file=$rerc->{'file'}->{'score_dump'};
  if(not defined($score_dump_file) or not -f $score_dump_file) {
    $s->echo(__LINE__,"I don't have the score dump. You need to get the score.",2);
    return 0;
  }
  ## Does the score need renewing? Look at all the file types we need
  ## to calculate it.
  my @types=keys %{$rerc->{'file'}};
  my $type_list=Dumper @types;
  $s->echo(__LINE__,"The type list is $type_list.",5);
  ## -M gives the age in days, a smaller value means a newer file
  my $score_age=-M $score_dump_file;
  ## a loop to find whether we have to renew the score file
  foreach my $type (@types) {
    ## an unset file name is treated as a missing file
    my $to_check_file=$rerc->{'file'}->{$type} // '';
    ## don't check the score_dump_file against itself
    if($to_check_file eq $score_dump_file) {
      next;
    }
    if(not -f $to_check_file) {
      $s->echo(__LINE__,"I don't have $type file $to_check_file, I get the score.",2);
      return 0;
    }
    my $file_age = -M $to_check_file;
    if($file_age < $score_age) {
      $s->echo(__LINE__,"The $type file $to_check_file is not up-to-date. I need to renew the score.",2);
      return 0;
    }
  }
  $s->echo(__LINE__,"I get the score from $score_dump_file.",2);
  #my $score=&Ernad::Common::load_from_file($score_dump_file);
  my $score=&Krichel::File::load($score_dump_file);
  $rerc->{'score'}=$score;
  return $score;
}

## Forget the per-report state.
sub clean {
  my $s=shift;
  delete $s->{'rerc'};
}

## Make sure the sidin xml file of the current report is there and
## fresh, and return its name. The file is (re)created by running the
## configured 'get_input_for_seeds' command on the insid file, with the
## output redirected to the sidin xml file, when the file is missing,
## empty, reported as needing renewal against the insid file, or older
## than the 'seeds_refresh_in_days' setting.
## Confesses if the repcode, the report state, the insid file or the
## sidin xml file name is not set.
## NOTE(review): the command is run through the shell with the file
## names interpolated, so they must not contain shell metacharacters.
sub get_sidin_xml_file {
  my $s=shift;
  my $repcode=$s->{'repcode'} // shift // confess "I need a repcode here";
  my $rerc=$s->{'rerc'} // confess "I don't know about the report $repcode";
  my $insid_file=$rerc->{'file'}->{'insid'} // confess "I need this defined.";
  #my $sidin_file=$rerc->{'dir'}->{'seed'}."/sidin.xml";
  ## without a target name the redirection below would be broken
  my $sidin_file=$rerc->{'file'}->{'sidin_xml'}
    // confess "I need the sidin_xml file name set up.";
  my $get_input_for_seeds=$s->{'e'}->{'conf'}->{'get_input_for_seeds'};
  my $seeds_refresh_in_days=$s->{'e'}->{'conf'}->{'seeds_refresh_in_days'};
  my $refresh=0;
  if(not -f $sidin_file) {
    $s->echo(__LINE__,"$sidin_file is not there.",2);
    $refresh=1;
  }
  if(-z $sidin_file) {
    $s->echo(__LINE__,"$sidin_file is empty.",2);
    $refresh=1;
  }
  if(&Ernad::Common::does_file_need_renewal($sidin_file,$insid_file)) {
    $refresh=1;
  }
  ## -M is undefined for a missing file, so only look at the age of a
  ## file that exists. A missing file is refreshed anyway.
  if($seeds_refresh_in_days and -f $sidin_file
     and (-M $sidin_file > $seeds_refresh_in_days)) {
    $s->echo(__LINE__,"$sidin_file is older than $seeds_refresh_in_days days old.",2);
    $refresh=1;
  }
  if($refresh) {
    my $sys="$get_input_for_seeds $insid_file > $sidin_file";
    $s->echo(__LINE__,"running $sys",2);
    my $status=system($sys);
    ## report a failure, but leave the decision to the caller
    if($status != 0) {
      $s->echo(__LINE__,"The command $sys returned the status $status.",2);
    }
  }
  else {
    $s->echo(__LINE__,"I am am not refreshing $sidin_file.",2);
  }
  return $sidin_file;
}

## also used by Ernad::Presort::Chopa
## Make sure the sidin amf file of the current report is there and
## return its name. Runs setup_for_report first if there is no report
## state. The file is made by running the configured 'sidin_xml_to_amf'
## script with the sidin xml file and the amf file as arguments, unless
## does_file_need_renewal says no renewal is needed.
## Confesses if the repcode, the report state, either file name or the
## script is missing, and if the amf file is absent or empty after the
## script has run.
sub get_sidin_amf_file {
  my $s=shift;
  my $repcode=$s->{'repcode'} // shift // confess "I need a repcode here";
  if(not $s->{'rerc'}) {
    $s->setup_for_report($repcode);
  }
  if(not $s->{'rerc'}) {
    print Dumper $s->{'rerc'};
    confess "I don't know about the report $repcode";
  }
  my $sidin_xml_file=$s->{'rerc'}->{'file'}->{'sidin_xml'}
    // confess "I need this defined here.";
  if(not -f $sidin_xml_file) {
    confess "I don't see your sidin_xml_file $sidin_xml_file.";
  }
  my $sidin_amf_file=$s->{'rerc'}->{'file'}->{'sidin_amf'}
    // confess "I need the sidin_amf file name set up.";
  my $sidin_xml_to_amf=$s->{'e'}->{'conf'}->{'sidin_xml_to_amf'};
  if(not $sidin_xml_to_amf) {
    confess "I don't see your configuration setting sidin_xml_to_amf";
  }
  if(not -f $sidin_xml_to_amf) {
    confess "I can't see the script that transforms sidin xml to amf '$sidin_xml_to_amf'.";
  }
  ## the amf file depends on both the script and the xml file
  if(not &Ernad::Common::does_file_need_renewal($sidin_amf_file,
                                                $sidin_xml_to_amf,
                                                $sidin_xml_file)) {
    return $sidin_amf_file;
  }
  my $sys="$sidin_xml_to_amf $sidin_xml_file $sidin_amf_file";
  $s->echo(__LINE__,"running $sys",2);
  my $status=system($sys);
  ## report a failure, the checks below decide whether we can go on
  if($status != 0) {
    $s->echo(__LINE__,"The command $sys returned the status $status.",2);
  }
  if(not -f $sidin_amf_file) {
    confess "I don't see your sidin_amf_file $sidin_amf_file.";
  }
  if(-z $sidin_amf_file) {
    confess "Your sidin_amf_file $sidin_amf_file is empty.";
  }
  return $sidin_amf_file;
}

## Return the name of the exfit file for the seeds, as given by the
## exfit object's show_train_file for the insid date. When
## does_file_need_renewal reports that the gzipped exfit file needs
## renewal, the exfit object is told to deal_with_seeds first and the
## file name is fetched again.
## Confesses if $s->{'f'} or the insid date is missing, if no file name
## is returned after renewal, or if the file does not exist.
sub get_sidin_exfits {
  my $s=shift;
  ## get the exfit module
  if(not $s->{'f'}) {
    confess "I need this defined here.";
  }
  my $insid_date=$s->{'rerc'}->{'date'}->{'insid'};
  ## try this
  #if(not $insid_date) {
  #  $s->set_seed_date();
  #  $insid_date=$s->{'rerc'}->{'date'}->{'insid'};
  #}
  if(not $insid_date) {
    confess "I don't see the insid_date.";
  }
  $s->echo(__LINE__,"The insid date is $insid_date.");
  #my $sidin_exfit_file=$s->{'f'}->show_train_file('term',$insid_date);
  my $sidin_exfit_file=$s->{'f'}->show_train_file($insid_date);
  if(&Ernad::Common::does_file_need_renewal("$sidin_exfit_file.gz",
                                            $s->{'rerc'}->{'file'}->{'sidin_xml'},
                                            $s->{'rerc'}->{'file'}->{'sidin_amf'},
                                            $sidin_exfit_file)) {
    $s->{'f'}->deal_with_seeds();
    $s->echo(__LINE__,"I told the exfits to deal with the seeds.");
    #$sidin_exfit_file=$s->{'f'}->show_train_file('term',$insid_date);
    $sidin_exfit_file=$s->{'f'}->show_train_file($insid_date);
    if(not $sidin_exfit_file) {
      confess "The exfits don't have a sidin_exfit_file.";
    }
  }
  if(not -f "$sidin_exfit_file") {
    confess "I don't see the sidin_exfit_file $sidin_exfit_file.";
  }
  return $sidin_exfit_file;
}

1;