package Ernad::Learn;

use strict;
use warnings;

use Carp qw(cluck longmess shortmess croak confess);
use Data::Dumper qw(Dumper);
use File::Basename;
use List::Util qw(shuffle);

use Ernad::Adverts;
use Ernad::Common;
use Ernad::Chungju;
use Ernad::Dates;
use Ernad::Erimp;
use Ernad::Learn::Mocla;
use Ernad::Sort;

## NOTE(review): this file calls into Ernad::Learn::Common,
## Ernad::Learn::Dokli and Ernad::Generate without loading them itself.
## Presumably one of the modules above pulls them in -- TODO confirm.

## The current report code and issue date live in the object
## ($l->{'repcode'}, $l->{'issuedate'}). Every method reads them from
## there, so two objects never share that state.

binmode(STDOUT, ':utf8');

## Constructor.
##
## Takes a hash reference of parameters, all of which are copied into the
## object. 'impna' is required and must be true. 'e' (an erimp object) and
## 'm' (a Mocla object) are built here when the caller does not supply them.
## Dies when 'impna' is missing or empty.
sub new {
    my $this  = shift;
    my $class = ref($this) || $this;
    my $l     = bless {}, $class;

    my $params = shift;
    if (not defined($params->{'impna'})) {
        die "fatal: no impna parameter\n";
    }
    if (not $params->{'impna'}) {
        die "fatal: empty impna parameter\n";
    }

    ## copy parameters into the object
    foreach my $key (keys %{$params}) {
        $l->{$key} = $params->{$key};
    }

    my $verbose = $l->{'verbose'} // 0;
    if (not defined($l->{'e'})) {
        $l->{'e'} = Ernad::Erimp->new({
            'impna'   => $l->{'impna'},
            # 'repcode' => $l->{'repcode'},
            'verbose' => $verbose,
        });
    }
    else {
        ## a caller-supplied erimp inherits our verbosity
        $l->{'e'}->{'verbose'} = $verbose;
    }

    if (not defined($l->{'m'})) {
        $l->{'m'} = Ernad::Learn::Mocla->new({
            'impna'   => $l->{'impna'},
            'verbose' => $verbose,
            'repcode' => $l->{'repcode'},
            'e'       => $l->{'e'},
        });
    }

    Ernad::Learn::Common::set_basic($l);
    $l->init();
    return $l;
}

## Copy frequently used values from the erimp object into this object
## and set the XPath that selects the papers of an allport issue.
sub init {
    my $l = shift;
    my $e = $l->{'e'};

    ## this does not have to be defined
    $l->{'allport'}   = $e->get_allport_repcode();
    $l->{'xpc'}       = $e->{'xpc'};
    $l->{'ernad_ns'}  = $e->{'const'}->{'ernad_ns'};
    $l->{'amf_ns'}    = $e->{'const'}->{'amf_ns'};
    $l->{'papers_xp'} = '/amf:amf/amf:collection/amf:haspart/amf:text';
}

## Validate the overwrite/keep/no_write switch shared by create_presorted
## and create_unsorted. $where names the calling sub in the error message.
sub _check_o_k_n {
    my ($o_k_n, $where) = @_;
    if (not defined($o_k_n)) {
        confess "The o_k_n parameter is missing on $where";
    }
    if (    $o_k_n ne 'overwrite'
        and $o_k_n ne 'keep'
        and $o_k_n ne 'no_write')
    {
        confess 'o_k_n has to be overwrite or keep or no_write';
    }
    return;
}

## Make sure a Dokli object is stored in $l->{'m'} under the fitport of
## $repcode, creating it when it is missing, and return it.
sub _ensure_dokli {
    my ($l, $repcode) = @_;
    my $fitport = Ernad::Learn::Common::get_fitport($l, $repcode);
    if (not defined($l->{'m'}->{$fitport})) {
        $l->{'m'}->{$fitport} = Ernad::Learn::Dokli->new({
            'impna'   => $l->{'impna'},
            'e'       => $l->{'e'},
            'verbose' => $l->{'verbose'},
        });
    }
    return $l->{'m'}->{$fitport};
}

## Count the lines of a file without going through a shell.
## A final line without a trailing newline is counted as well.
sub _count_lines {
    my $file = shift;
    open(my $fh, '<', $file) or confess "I can't open $file: $!";
    my $lines = 0;
    while (defined(my $line = <$fh>)) {
        $lines++;
    }
    close($fh);
    return $lines;
}

## Find what dates can be learned, and what have to be learned.
##
## Takes an optional repcode; falls back to $l->{'repcode'}. Needs a
## non-empty hash in $l->{'issuedates'}. Stores two hash references
## keyed by issue date:
##   all_sortable_issues_for_report     - within ernad_limit_by_days, not
##                                        skipped by the birthday test,
##                                        and not futile
##   all_to_be_sorted_issues_for_report - the above minus those that
##                                        are already presorted
## Returns the second hash reference.
sub set_dates_to_learn {
    my $l = shift;
    my $e = $l->{'e'} or confess "I need a erimp object here.";
    my $repcode = shift // '';
    if (not $repcode) {
        $repcode = $l->{'repcode'};
    }
    if (not $repcode) {
        confess "I need a repcode here.";
    }
    $e->echo(__LINE__, "I am setting dates to learn for $repcode.");
    my $rerc = $e->{'report'}->{$repcode}
        or confess "I have no \$rerc for $repcode. Does the report $repcode exist?";

    ## check if we have issuedates
    if (not ref($l->{'issuedates'}) eq 'HASH'
        or not scalar(keys %{$l->{'issuedates'}}))
    {
        confess "I need issuedates here.";
    }

    ## all issuedates that a report can have
    my $report_issuedates = $e->get_report_issuedates();
    if (not ref($report_issuedates) eq 'HASH'
        or not scalar(keys %{$report_issuedates}))
    {
        confess "I need report_issuedates here.";
    }

    ## the non-futile issues that can be sorted again
    my $sortable = {};
    my $birthday = $e->get_report_birthday($repcode)
        or confess "I can't get the birthday of report $repcode";
    my $elbyd = $l->{'ernad_limit_by_days'};
    foreach my $issuedate (keys %{$report_issuedates}) {
        $e->echo(__LINE__, "report_issuedate is $issuedate", 4);
        if ($elbyd
            and Ernad::Dates::diff_dates($issuedate, $e->{'today'}) > $elbyd)
        {
            $e->echo(__LINE__, "I skip $issuedate. It's further than $elbyd.", 2);
            next;
        }
        ## earlier than the birthday
        if (Ernad::Dates::diff_dates($issuedate, $birthday) > 0) {
            $e->echo(__LINE__,
                "I skip $issuedate. It's further than the birthday $birthday.");
            next;
        }
        ## not recent: fixme
        #if(&Ernad::Dates::diff_dates($issuedate,$l->{'e'}->{'today'}) > 100) {
        #  $l->{'e'}->echo(__LINE__,"I skip $issuedate. It's further than 100 days old.",2);
        #  next;
        #}
        if ($e->is_futile($repcode, $issuedate)) {
            $e->echo(__LINE__,
                "issuedate $issuedate is futile for report $repcode", 4);
            next;
        }
        $sortable->{$issuedate} = 1;
    }

    ## As before, a missing limit is only fatal once there is at least one
    ## sortable issue to look at.
    if (%{$sortable} and not defined($l->{'ernad_limit_by_days'})) {
        confess 'I need an ernad_limit_by_days defined here.';
    }

    ## the non-futile issues that have not been sorted
    my $to_be_sorted = {};
    foreach my $issuedate (keys %{$sortable}) {
        if ($e->is_presorted($repcode, $issuedate)) {
            $e->echo(__LINE__,
                "issuedate $issuedate is already presorted for report $repcode");
            next;
        }
        $to_be_sorted->{$issuedate} = 1;
    }

    $l->{'all_sortable_issues_for_report'}     = $sortable;
    $l->{'all_to_be_sorted_issues_for_report'} = $to_be_sorted;
    return $to_be_sorted;
}

## Rebuild the dokli for the report's fitport, model the report and
## presort its issues. Returns what presort_ahead returns.
##
## NOTE(review): the original tested $l->{'m'}->{$fitport}, stored the new
## Dokli under $l->{'d'}->{$fitport} and then called build_dokli on
## $l->{'d'}->{'d'}. The 'd' slot is now used throughout -- TODO confirm
## that this is the intended slot.
sub force_model {
    my $l = shift;
    my $repcode = shift or confess "I need a repcode here";

    #if(($o_k_n ne 'overwrite')
    #   and ($o_k_n ne 'keep')) {
    #  confess 'o_k_n has to be overwrite or keep or no_write';
    #}

    my $fitport = Ernad::Learn::Common::get_fitport($l, $repcode);
    my $dokli   = $l->{'d'}->{$fitport};
    if (not defined($dokli)) {
        $dokli = Ernad::Learn::Dokli->new({'impna' => $l->{'impna'}});
        $l->{'d'}->{$fitport} = $dokli;
    }

    ## by default, keeps old if it is up-to-date
    $dokli->build_dokli();

    $l->set_report($repcode);
    $l->{'m'}->model_report($repcode);
    return $l->presort_ahead($repcode);
}

## Load the sent allport issue of $issuedate and make it current.
##
## Looks for files matching <allport_sent dir>/<issuedate>_* and loads the
## one Ernad::Common::get_last_file picks. Sets allport_dom, issuedate and
## papers in the object. A true second argument suppresses the log lines.
## Dies when no such file can be found or loaded.
sub set_issuedate {
    my $l = shift;
    my $issuedate = shift // confess "I need an issuedate here";
    my $silent = shift // '';
    my $e = $l->{'e'};

    if (not $silent) {
        $e->echo(__LINE__, "I try to set the issuedate to $issuedate", 4);
    }

    # Find the file
    my $issue_glob = $e->{'dir'}->{'allport_sent'} . '/' . $issuedate . '_*';
    my $allport_sent_file = Ernad::Common::get_last_file($issue_glob);
    if (not $allport_sent_file) {
        die "I can't see an allport sent file for $issue_glob";
    }
    if (not -f $allport_sent_file) {
        die "I can't see the file $allport_sent_file";
    }
    if (not $silent) {
        $e->echo(__LINE__, "I found $allport_sent_file");
    }

    ## NOTE(review): create_unsorted loads XML through Ernad::Common, this
    ## sub goes through Ernad::Generate -- confirm that both are wanted.
    my $allport_dom = Ernad::Generate::load_and_return_xml($allport_sent_file)
        or die;
    my @papers = $l->{'xpc'}->findnodes($l->{'papers_xp'}, $allport_dom);
    if (not $silent) {
        $e->echo(__LINE__, "I loaded $allport_sent_file");
    }

    $l->{'allport_dom'} = $allport_dom;
    $l->{'issuedate'}   = $issuedate;
    @{$l->{'papers'}}   = @papers;
}

## Forget the current issue: drops allport_dom, issuedate, papers and
## unsorted_doc from the object.
sub unset_issuedate {
    my $l = shift;
    delete @{$l}{qw(allport_dom issuedate papers unsorted_doc)};
    $l->{'e'}->echo(__LINE__, "I unset the issuedate.");
}

## Model every report the erimp object lists, in random order.
sub model_all_reports {
    my $l = shift;
    my $e = $l->{'e'} or confess "I need an erimp here";
    my @repcodes = $e->list_repcodes;
    foreach my $repcode (shuffle @repcodes) {
        _ensure_dokli($l, $repcode);
        $l->{'m'}->model_report($repcode);
    }
}

## Make $repcode the current report. Stores its erimp record as
## $l->{'rerc'}. A true second argument suppresses the log line.
## Confesses when the erimp object has no such report.
sub set_report {
    my $l       = shift;
    my $repcode = shift;
    my $silent  = shift // '';
    my $e       = $l->{'e'};

    my $rerc = $e->{'report'}->{$repcode}
        or confess "I have no rerc, does report $repcode exist?";
    if (not $silent) {
        $e->echo(__LINE__, "I set the report to $repcode.", 4);
    }
    $l->{'rerc'}    = $rerc;
    $l->{'repcode'} = $repcode;
}

## Presort the current issue of the current report.
##
## Returns 0 when the report cannot be modelled; in that case the
## unsorted document is used as the presorted one. Otherwise predicts
## weights with the Mocla object, checks that the class file has one line
## per text plus one extra line, sorts a copy of the unsorted document by
## those weights, stores it as $l->{'presorted_doc'} and returns 1.
sub presort {
    my $l = shift;
    $l->check_dapor();
    my $e         = $l->{'e'};
    my $repcode   = $l->{'repcode'};
    my $issuedate = $l->{'issuedate'};

    ## case where there has never been any issue
    if (not $e->can_the_report_be_modelled($repcode)) {
        $l->create_unsorted('keep');
        $l->{'presorted_doc'} = $l->{'unsorted_doc'};
        return 0;
    }

    if (not defined($l->{'unsorted_doc'})) {
        $e->echo(__LINE__, "I am creating an unsorted_doc");
        $l->create_unsorted('keep');
    }
    if (not defined($l->{'unsorted_doc'})) {
        confess "I need an \$l->{unsorted_doc} here";
    }
    my $total_unsorted_texts
        = Ernad::Common::count_texts_in_rif($l->{'unsorted_doc'});

    _ensure_dokli($l, $repcode);

    my $weights    = $l->{'m'}->predict($repcode, $issuedate);
    my $class_file = $l->{'m'}->{'class_file'}->{$repcode}->{$issuedate} // '';
    if (not $class_file or not -f $class_file) {
        die "I can't see class_file $class_file. It looks like the prediction failed.";
    }

    $e->echo(__LINE__, "issuedate is $issuedate");
    ## one line of the class file is not an item
    my $items_classed = _count_lines($class_file) - 1;
    $e->echo(__LINE__, "class_file $class_file contains $items_classed items");
    my $rif = $l->{'us_rif'} or confess "I can't see the us_rif";
    $e->echo(__LINE__, "rif $rif contains $total_unsorted_texts texts");
    if ($items_classed != $total_unsorted_texts) {
        confess "I detect an item count missmatch between $rif and $class_file.";
    }

    ## the sort criterion is the class file name up to its first dot
    my $crit = basename($class_file);
    $crit =~ s|^([^.]+)\..*|$1|;

    my $in_doc  = $l->{'unsorted_doc'}->cloneNode(1);
    my $doc     = Ernad::Sort::inject_sort($in_doc, $weights, $crit);
    my $out_doc = Ernad::Sort::sort_by_criterion($doc, $crit, 'd');
    $l->{'presorted_doc'} = $out_doc;

    delete $l->{'unsorted_doc'};
    delete $l->{'m'}->{'class_file'}->{$repcode}->{$issuedate};
    return 1;
}

## Forget the current report: drops repcode, rerc and unsorted_doc.
sub unset_report {
    my $l = shift;
    delete @{$l}{qw(repcode rerc unsorted_doc)};
    $l->{'e'}->echo(__LINE__, "I unset the report");
}

## Check that both a report and an issue date are set, then re-apply them
## silently so that rerc, allport_dom and papers are loaded.
## Confesses when either is missing.
sub check_dapor {
    my $l = shift;
    my $repcode   = $l->{'repcode'}   or confess "no repcode";
    my $issuedate = $l->{'issuedate'} or confess "no issuedate";

    ## make sure we have everything set
    $l->set_report($repcode, 'silent');
    $l->set_issuedate($issuedate, 'silent');
    $l->{'e'}->echo(__LINE__,
        "repcode is $repcode, issuedate is $issuedate", 10);
}

## Presort the current issue and write it to the report's presorted
## directory.
##
## $o_k_n is 'overwrite', 'keep' or 'no_write':
##   keep      - do nothing when a presorted file already exists
##   overwrite - write, then remove the other presorted files of the issue
##   no_write  - presort in memory only
## Returns the path of the file written, or nothing when no file is
## written.
sub create_presorted {
    my $l     = shift;
    my $o_k_n = shift;
    _check_o_k_n($o_k_n, 'create_presorted');
    my $e = $l->{'e'};

    if ($o_k_n eq 'keep' and $l->is_already_there('presorted')) {
        my $repcode   = $l->{'repcode'};
        my $issuedate = $l->{'issuedate'};
        $e->echo(__LINE__,
            "There is already a presorted issue for $repcode at $issuedate.");
        return;
    }

    ## presort may not work if there are no selection
    $e->echo(__LINE__, "Starting to presort.");
    my $has_been_presorted = $l->presort();
    if (not $has_been_presorted) {
        $e->echo(__LINE__, "I have done no presorting.");
        return;
    }

    my $amf_doc = $l->{'presorted_doc'}
        or confess "I don't have a presorted document";
    my $ps_dir = $l->{'rerc'}->{'dir'}->{'presorted'} or die;
    my $ps_rif = $ps_dir . '/' . Ernad::Common::name_by_date($l->{'issuedate'});
    if ($o_k_n eq 'no_write') {
        return;
    }

    $e->echo(__LINE__, "creating issue to $ps_rif.", 1);
    $amf_doc->toFile($ps_rif, 1);
    $e->echo(__LINE__, "I wrote $ps_rif.");
    Ernad::Adverts::inject_adverts_into_rif($ps_rif, $e);
    $e->echo(__LINE__, "I placed adverts into $ps_rif.");

    if ($o_k_n eq 'overwrite') {
        $l->delete_already_there('presorted', $ps_rif);
    }
    return $ps_rif;
}

## Build the unsorted report issue for the current report and issue date
## out of the allport issue, and write it to the report's unsorted
## directory.
##
## $o_k_n is 'overwrite', 'keep' or 'no_write':
##   keep      - load and reuse an existing unsorted file when there is one
##   overwrite - write, then remove the other unsorted files of the issue
##   no_write  - build $l->{'unsorted_doc'} in memory only
## Returns 0 when the issue is futile (not checked for no_write), 1 when
## an unsorted file was reused or written, and nothing for no_write.
sub create_unsorted {
    my $l     = shift;
    my $o_k_n = shift;
    _check_o_k_n($o_k_n, 'create_unsorted');
    my $e = $l->{'e'};

    if ($o_k_n ne 'no_write'
        and $e->is_futile($l->{'repcode'}, $l->{'issuedate'}))
    {
        return 0;
    }

    if ($o_k_n eq 'keep') {
        my $rerc   = $l->{'rerc'} or die "no rerc";
        my $us_dir = $rerc->{'dir'}->{'unsorted'} or die;
        my $issuedate = $l->{'issuedate'} // confess "I need an issuedate here.";
        my @files = glob("$us_dir/*$issuedate*");
        if (scalar @files) {
            ## glob returns a sorted list, so this is the last name
            my $us_rif = $files[-1];
            $e->echo(__LINE__,
                "There is an unsorted file $us_rif, I won't make a new one.", 1);
            $l->{'us_rif'}       = $us_rif;
            $l->{'unsorted_doc'} = Ernad::Common::load_and_return_xml($us_rif);
            return 1;
        }
    }

    $l->check_dapor();
    my $issuedate   = $l->{'issuedate'};
    my $allport_dom = $l->{'allport_dom'} or die "I need the allport_dom here.";
    my @papers      = @{$l->{'papers'}};
    if (not scalar(@papers)) {
        die "The issue of $issuedate seems to have no papers";
    }

    ## Create a report issue from the allport issue.
    ## first, set report id
    my $rerc      = $l->{'rerc'}      or die "no rerc";
    my $repdoc    = $rerc->{'repdoc'} or die;
    my $report_id = $rerc->{'id'}     or die;
    my $report_issue_handle = $report_id . ":" . $issuedate;
    #my $collection_element=&Ernad::Common::get_collection_element($repdoc);
    my $collection_element = $e->{'x'}->collection_element($repdoc);
    $collection_element->setAttribute('id', $report_issue_handle);

    ## add the issue date to the data
    my $issuedate_element
        = $allport_dom->createElementNS($l->{'ernad_ns'}, 'issuedate');
    $issuedate_element->appendText($issuedate);
    $collection_element->appendChild($issuedate_element);

    ## issue advert, should be handled by Ernad::Adverts
    ## this assumes it is in the allport issue. This is not done
    ## automatically
    my $advert
        = $e->{'xpc'}->find('//e:advert', $allport_dom)->get_node(1) // '';
    if ($advert) {
        $collection_element->appendChild($advert);
    }

    ## every paper gets its own haspart wrapper; the paper itself is
    ## copied so that the allport document keeps its own node
    foreach my $paper (@papers) {
        my $haspart_element
            = $allport_dom->createElementNS($l->{'amf_ns'}, 'haspart');
        $haspart_element->appendChild($paper->cloneNode(1));
        $collection_element->appendChild($haspart_element);
    }

    my $amf_doc = $e->get_amf_doc();
    $amf_doc->documentElement()->appendChild($collection_element);
    $l->{'unsorted_doc'} = $amf_doc;
    #$amf_doc->toFile('/tmp/o.xml');

    if ($o_k_n eq 'no_write') {
        $e->echo(__LINE__,
            "I give up on creating an unsorted issue because I have no_write set");
        return;
    }

    ## Save report to unsorted
    my $us_dir = $rerc->{'dir'}->{'unsorted'} or die;
    my $us_rif = $us_dir . '/' . Ernad::Common::name_by_date($issuedate);
    $e->echo(__LINE__, "Copying allport issue to $us_rif.", 1);
    $l->{'us_rif'} = $us_rif;

    ## transit: change the unsorted report issue document, mainly to
    ## address futli report pointers
    my $transit_xslt_file
        = $e->{'dir'}->{'style'} . '/transit' . $e->{'const'}->{'xsl_ext'};
    if (-f $transit_xslt_file) {
        $amf_doc = $e->transform($amf_doc, 'transit');
        $l->{'unsorted_doc'} = $amf_doc;
    }

    $amf_doc->toFile($us_rif, 1);
    $e->echo(__LINE__, "I write $us_rif.");

    ## remove existing ones if set to overwrite
    if ($o_k_n eq 'overwrite') {
        $l->delete_already_there('unsorted', $us_rif);
    }

    ## not yet implemented
    Ernad::Adverts::inject_adverts_into_rif($us_rif, $e);
    $e->echo(__LINE__, "I placed adverts into $us_rif.");
    return 1;
}

## Remove the unsorted or presorted files of the current issue.
##
## $us_ps is 'unsorted' or 'presorted'. The optional third argument names
## one file to keep, so that everything but a freshly written file can be
## cleared. A file that cannot be removed is logged, not fatal.
sub delete_already_there {
    my $l     = shift;
    my $us_ps = shift;
    ## except the file, to delete all others after creation
    my $except = shift // '';
    if (($us_ps ne 'unsorted') and ($us_ps ne 'presorted')) {
        die 'us_ps has to be unsorted or presorted.';
    }
    $l->check_dapor();
    my $e         = $l->{'e'};
    my $repcode   = $l->{'repcode'};
    my $issuedate = $l->{'issuedate'};
    $e->echo(__LINE__, "I delete $repcode $issuedate $us_ps except $except", 1);

    my $rerc = $l->{'rerc'} or die "I need a rerc here.";
    my $dir  = $rerc->{'dir'}->{$us_ps} or die;
    foreach my $file (glob("$dir/$issuedate*")) {
        if ($except and $file eq $except) {
            $e->echo(__LINE__, "I keep except $file", 3);
            next;
        }
        $e->echo(__LINE__, "I remove $file", 3);
        unlink($file)
            or $e->echo(__LINE__, "I could not remove $file: $!");
    }
}

## Tell whether the current issue already has unsorted or presorted
## files. $us_ps is 'unsorted' or 'presorted'. Returns 1 or 0.
sub is_already_there {
    my $l     = shift;
    my $us_ps = shift;
    if (($us_ps ne 'unsorted') and ($us_ps ne 'presorted')) {
        die 'us_ps has to be unsorted or presorted.';
    }

    ## do we have report and issuedate set
    $l->check_dapor();
    my $issuedate = $l->{'issuedate'};

    my $rerc = $l->{'rerc'} or die "I need a rerc here.";
    my $dir  = $rerc->{'dir'}->{$us_ps} or die;
    my @already_there_files = glob("$dir/$issuedate*");
    foreach my $file (@already_there_files) {
        $l->{'e'}->echo(__LINE__, "I found $file");
    }
    return scalar(@already_there_files) ? 1 : 0;
}

## Presort every sortable issue of a report.
##
## When no model exists one is built first. When that fails too, only
## the unsorted issues are created (for a report with no sortable issue
## the last available allport issue is used) and 0 is returned.
## Otherwise every sortable issue is presorted with 'overwrite' and the
## return value is what create_presorted returned for the last issue in
## sort order.
##
## NOTE(review): a second argument (o_k_n, default 'keep') was accepted
## by the original but never used; 'overwrite' is always applied. It is
## still ignored here so that existing callers see no change.
sub presort_ahead {
    my $l = shift;
    my $repcode = shift or confess "I need a repcode here.";
    my $e = $l->{'e'};

    _ensure_dokli($l, $repcode);

    my $m = $l->{'m'};
    if (not defined($m->{'issuedates'})) {
        $m->set_issuedates;
        if (not defined($m->{'issuedates'})) {
            confess "I can't find issuedates";
        }
    }

    $l->set_report($repcode);
    $l->set_dates_to_learn();

    my $model_file = $m->find_most_recent_model($repcode);
    if (not $model_file) {
        $e->echo(__LINE__, "I have no model for $repcode. I start to build one.");
        $m->model_report($repcode);
        $model_file = $m->find_most_recent_model($repcode);
    }

    ## still no model file
    if (not $model_file) {
        $e->echo(__LINE__,
            "I can't get a model for $repcode, therefore presort_ahead fails.");
        my @sortable_issues = keys %{$l->{'all_sortable_issues_for_report'}};
        if (not scalar @sortable_issues) {
            $e->echo(__LINE__, "I have no sortable issue for this report.");
            $e->echo(__LINE__, "Looks like a virgin report.");
            $e->echo(__LINE__, "I use the last allport issue as a fallback.");
            my $dates        = $e->get_available_issuedates();
            my @sorted_dates = sort keys %{ $dates // {} };
            my $last_date    = pop @sorted_dates;
            if (not defined($last_date)) {
                $e->echo(__LINE__, "I have no allport issue to fall back on.");
                return 0;
            }
            $e->echo(__LINE__,
                "My last available allport issuedate is $last_date.");
            push(@sortable_issues, $last_date);
        }

        ## still create unsorted ahead
        foreach my $issuedate (sort @sortable_issues) {
            $e->echo(__LINE__, "I create unsorted $repcode for $issuedate.");
            $l->set_issuedate($issuedate);
            $e->echo(__LINE__, "Done setting the issuedate $issuedate.");
            $l->create_unsorted('keep');
            $e->echo(__LINE__, "Done creating the unsorted.\n\n;");
        }
        return 0;
    }

    my $presorted_something = 0;
    my $ps_rif;
    foreach my $issuedate (sort keys %{$l->{'all_sortable_issues_for_report'}}) {
        $e->echo(__LINE__, "I presort $repcode for $issuedate.");
        $l->set_issuedate($issuedate);
        $e->echo(__LINE__, "Done setting the issuedate $issuedate.");
        $ps_rif = $l->create_presorted('overwrite');
        $e->echo(__LINE__, "Done creating the presorted.");
        $presorted_something = 1;
    }
    if (not $presorted_something) {
        $e->echo(__LINE__, "I found no issues to presort.");
    }

    delete $l->{'unsorted_doc'};
    delete $l->{'presorted_doc'};
    return $ps_rif;
}

# sub wipe_report {
#   my $l=shift;
#   my $repcode=shift or confess "I need a repcode here.";
#   my $fitport=&Ernad::Learn::Common::get_fitport($l,$repcode);
#   if(not defined($l->{'m'}->{$fitport})) {
#     $l->{'m'}->{$fitport}=Ernad::Learn::Dokli->new({'impna' => $l->{'impna'}});
#   }
#   my $m=$l->{'m'};
#   if(not defined($f->{'issuedates'})) {
#     $m->set_issuedates();
#     if(not defined($m->{'issuedates'})) {
#       confess "I can't find issuedates";
#     }
#   }
#   $l->set_report($repcode);
#   my @issuedates=keys %{$m->{'issuedates'}};
#   foreach my $issuedate (@issuedates) {
#     $l->set_issuedate($issuedate);
#     print "clearing work for $issuedate\n";
#     $l->clear_work('with_source');
#   }
# }

#sub initialize_report {
#  my $l=shift;
#  my $repcode=shift or confess "I need a repcode here.";
#  # my $f=$l->{'f'};
#  # my $f=$l->{'f'};
#  if(not defined($f->{'issuedates'})) {
#    $f->set_issuedates();
#    if(not defined($f->{'issuedates'})) {
#      confess "I can't find issuedates";
#    }
#  }
#  $l->set_report($repcode);
#  my @issuedates=keys %{$f->{'issuedates'}};
#  foreach my $issuedate (@issuedates) {
#    $l->set_issuedate($issuedate);
#    if($l->is_futile()) {
#      next;
#    }
#    print "setting for $issuedate\n";
#    $l->create_unsorted('overwrite');
#  }
#}

1;