package Ernad::Presort::Predi;

use strict;
use warnings;

use Carp qw(cluck longmess shortmess croak confess);
use Data::Dumper;
use File::Basename;
use File::Slurper;
use URI::Escape;

use Krichel::File;

use base ('Ernad::Presorter');
use Ernad::Presort::Class;
use Ernad::Presort::Ranfi;

## NOTE(review): Ernad::Presort::Model and Ernad::Common are used below but
## are not loaded by this file; presumably a caller or one of the modules
## above loads them -- confirm.

binmode(STDOUT,':utf8');

## Prepare the object for prediction work. Sets
##   $p->{'dir'}->{'class'}  to <learn dir>/class/<repcode>,
##   $p->{'m'}               to a fresh Ernad::Presort::Model,
##   $p->{'ranid'}           from the model's last_ranid(),
##   $p->{'file'}->{'model'} from the model's most_recent().
## The value of the last assignment (the model file) is what the sub returns.
sub setup {
    my $p=shift;
    #my $a=Ernad::Presort::Class->new({'e'=>$p->{'e'}});
    #my $p->{'dir'}->{'class'}=$
    my $learn_dir=$p->{'e'}->{'dir'}->{'learn'};
    ## set_dir can't be used here, as we use the report
    my $repcode=$p->get_repcode();
    $p->{'dir'}->{'class'}=$learn_dir.'/class/'.$repcode;
    my $m=Ernad::Presort::Model->new({'e'=>$p->{'e'}});
    $p->{'m'}=$m;
    $p->{'ranid'}=$p->{'m'}->last_ranid();
    $p->{'file'}->{'model'}=$p->{'m'}->most_recent();
}

## callable externally
##
## Print, for each line of the test file of the current issue, the looked-up
## values of its features, separated by single spaces.
##
## Parameters:
##   $given_papid  optional; when non-empty only the line whose (unescaped)
##                 paper id equals it is printed.
## Reads $main::issuedate, which has to be set by the caller's script.
## Each test line has to look like "0 <n>:<weight> ... # <escaped papid>".
## Dies (confess) on a missing ranid, issuedate or test file, on a line or
## feature part that does not parse, and on a feature without a lookup entry.
sub type {
    my $p=shift;
    ## restrict
    my $given_papid=shift // '';
    if(not $p->{'ranid'}) {
        $p->setup();
    }
    if(not $p->{'ranid'}) {
        confess "I don't see the ranid.";
    }
    my $ranid=$p->{'ranid'};
    ## should be defined externally
    my $issuedate=$main::issuedate // confess "I don't have an issuedate.";
    my $test_file=$p->{'dir'}->{'class'}.'/'.$issuedate.'_'.$ranid.".test";
    if(not -f $test_file) {
        confess "I don't see your test file $test_file.";
    }
    my $r=Ernad::Presort::Ranfi->new({'e'=>$p->{'e'}});
    my $firan=$p->invert_ranfi($r->load($ranid));
    ## three-argument open on a lexical handle: the file name can never be
    ## taken for a mode, and the handle does not leak into the package
    open(my $test_fh,'<',$test_file)
        or confess "I can't open your test file $test_file.";
    while(my $vemli=<$test_fh>) {
        $vemli=~m|0\s+([^#]+)\s+# (\S+)$| or confess "I can't parse $vemli.";
        my $data=$1;
        ## handle
        my $papid=uri_unescape($2);
        if($given_papid and $papid ne $given_papid) {
            next;
        }
        my @found;
        foreach my $fitpart (split(/\s+/,$data)) {
            $fitpart=~m|^(\d+):([.\d]+)$|
                or confess "I don't get your fitpart $fitpart";
            my $fit=$1 or confess "I need a fit here.";
            if(not $firan->{$fit}) {
                confess "I have no fit rank for fit $fit";
            }
            push(@found,$firan->{$fit});
        }
        print join(' ',@found)."\n";
    }
    close($test_fh);
    return;
}

## maybe should go to ranfi
##
## Return a new hash reference that maps every value of the hash referenced
## by $ranfi back to its key. When several keys share a value, only one of
## them survives; which one depends on the hash order.
sub invert_ranfi {
    my $p=shift;
    my $ranfi=shift;
    my $firan={};
    foreach my $key (keys %$ranfi) {
        my $value=$ranfi->{$key};
        $firan->{$value}=$key;
    }
    return $firan;
}

## Run the prediction on every "*<ranid>.test" file in the class directory,
## then delete every file there whose name does not start with
## "<yyyy-mm-dd>_<ranid>".
sub run {
    my $p=shift;
    $p->setup;
    my $glob=$p->{'dir'}->{'class'}.'/*'.$p->{'ranid'}.".test";
    foreach my $test_file (glob($glob)) {
        $p->run_file($test_file);
    }
    ## clear the old files in the directory
    $glob=$p->{'dir'}->{'class'}.'/*';
    ## the ranid is quoted, so that it is matched literally
    my $current=qr|^\d{4}-\d{2}-\d{2}_\Q$p->{'ranid'}\E|;
    foreach my $file (glob($glob)) {
        my $bana=basename($file);
        if($bana!~$current) {
            $p->echo(__LINE__,"I remove the non-current file $bana.");
            unlink $file;
        }
    }
}

## gather results from the side files
##
## Reads every "<yyyy-mm-dd>_<ranid>*.side" file in the class directory and
## returns (and stores in $p->{'data'}) a structure with
##   ->{'files'}->{<issuedate>}             the base name of the side file,
##   ->{'scores'}->{<issuedate>}->{<handle>} the first column of a line, keyed
##                                           by its unescaped second column.
## Returns undef when there is no matching side file.
sub gather {
    my $p=shift;
    $p->setup;
    my $ranid=$p->{'ranid'} // confess "I need a ranid here.";
    my $glob=$p->{'dir'}->{'class'}.'/*';
    $p->{'data'}=undef;
    foreach my $file (glob($glob)) {
        my $bana=basename($file);
        ## the file is only skipped here, the unlink is disabled
        if($ranid and not $bana=~m|^\d{4}-\d{2}-\d{2}_\Q$ranid\E|) {
            $p->echo(__LINE__,"I remove a non-current ranid file $bana.");
            #unlink $file;
            next;
        }
        if(not $bana=~m|\.side$|) {
            next;
        }
        my $e=$p->{'e'} // $main::e // confess 'Where is my erimp?';
        #my $issuedate=&Ernad::Common::find_issuedate_from_file($file) //
        #  confess "I need an issuedate here.";
        my $issuedate=$e->{'f'}->issuedate($file)
            // confess "I need an issuedate here.";
        $p->{'data'}->{'files'}->{$issuedate}=$bana;
        ## core open instead of IO::File, which this file never loads
        open(my $side_fh,'<',$file)
            or confess "I can't open the side file $file: $!";
        while(my $line=<$side_fh>) {
            chomp $line;
            my @pair=split("\t",$line);
            my $value=$pair[0];
            my $handle=uri_unescape($pair[1]);
            $p->{'data'}->{'scores'}->{$issuedate}->{$handle}=$value;
        }
        close($side_fh);
    }
    return $p->{'data'};
}

## code that was in Presort.pm
##
## Currently does nothing; the former check is kept below for reference.
sub check {
    ##
    #my $class_file=$m->{'class_file'}->{$repcode}->{$issuedate} // '';
    #if(not $class_file or not -f $class_file) {
    #  confess "I can't see class_file $class_file. It looks like the prediction failed.";
    #}
    #$e->echo(__LINE__,"I found the class file $class_file");
    #my $count_lines_in_class_file=`grep -c ^ $class_file`;
    #chomp $count_lines_in_class_file;
    #$e->echo(__LINE__,"issuedate is $issuedate");
    #my $items_classed=$count_lines_in_class_file-1;
    #$e->echo(__LINE__,"class_file $class_file contains $items_classed items");
    #my $total_unsorted_texts=&Ernad::Common::count_texts_in_rif($unsorted_doc);
    #$e->echo(__LINE__,"The \$rerc->{amf_doc} contains $total_unsorted_texts texts");
    #if($items_classed != $total_unsorted_texts) {
    #  confess "$items_classed are classed but there are $total_unsorted_texts unsorted texts";
    #}
}

## Run svm-predict on one test file.
##
## Parameters:
##   $test_file  an existing file whose name ends in ".test".
##   $flags      optional extra flags for svm-predict; " -b 1" is appended.
## Writes next to the test file a .class file (the predictions), an .out and
## an .err file (stdout and stderr of svm-predict) and a .side file.
## Returns 0 when Ernad::Common::does_file_need_renewal() says that the class
## file needs no renewal, otherwise what write_side_file() returns.
## Dies (confess) on a missing or misnamed test file, on a missing model
## file, when svm-predict wrote anything to stderr, and when the class file
## does not have exactly one line more than the test file.
sub run_file {
    my $p=shift;
    my $test_file=shift;
    if(not -f $test_file) {
        confess "I don't see your file $test_file";
    }
    my $flags=shift // '';
    ## $p->check_test_file($test_file);
    my $model_file=$p->{'file'}->{'model'}
        or confess "I can't see a model file.";
    ## Without the .test ending all derived names would equal the test file,
    ## and the shell redirection below would truncate it.
    if(not $test_file=~m|\.test$|) {
        confess "I need a test file that ends in .test, not $test_file";
    }
    my $class_file=$test_file;
    $class_file=~s|\.test$|.class|;
    ## anchored, so that only the file ending is replaced
    my $out_file=$class_file;
    $out_file=~s|\.class$|.out|;
    my $err_file=$class_file;
    $err_file=~s|\.class$|.err|;
    ## request probabilities
    $flags.=" -b 1";
    if(not Ernad::Common::does_file_need_renewal($class_file,$model_file,$test_file)) {
        $p->echo(__LINE__,"I skip the renewal of $class_file, it needs no renewal.",2);
        return 0;
    }
    ## the command goes through the shell, because it needs the redirections
    my $s="svm-predict $flags $test_file $model_file $class_file > $out_file 2> $err_file";
    $p->echo(__LINE__,"I run $s");
    system($s);
    if(not -z $err_file) {
        my $error=File::Slurper::read_text($err_file);
        chomp $error;
        confess "smv_predict gives me an error '$error'";
    }
    ## conduct a check: the class file has one header line on top
    my $count_lines_in_test=Krichel::File::count_lines($test_file);
    my $count_lines_in_class=Krichel::File::count_lines($class_file);
    if($count_lines_in_test != ($count_lines_in_class-1)) {
        confess "$test_file has $count_lines_in_test but $class_file has $count_lines_in_class";
    }
    ## used for a check in the caller
    #$p->{'class_file'}->{$repcode}->{$issuedate}="$class_file";
    ## this is a debugging tool
    my $scores=$p->write_side_file($class_file,$test_file);
    return $scores;
}

## Combine a class file and its test file into a side file.
##
## The first line of the class file has to be "labels -1 1" or "labels 1 -1".
## Of the three columns of every further line, the one in the position of
## label 1 is kept (presumably the probability of label 1 -- confirm against
## the svm-predict output format). Of every test line only the part after
## the "#" is kept, that is the escaped handle.
## Writes "<score>\t<escaped handle>" lines to the .side file next to the
## class file. Returns a hash reference from unescaped handle to score, or
## undef when the test file has no lines.
## Dies (confess) on any other first line in the class file.
sub write_side_file {
    my $p=shift;
    my $class_file=shift;
    my $test_file=shift;
    my @class_lines=File::Slurper::read_lines($class_file);
    ## first line is a comment; an empty class file yields ''
    my $first_line=shift(@class_lines) // '';
    chomp $first_line;
    if($first_line eq 'labels -1 1') {
        s|\S+\s+\S+\s+(\S+)|$1|g for @class_lines;
    }
    elsif($first_line eq 'labels 1 -1') {
        s|\S+\s+(\S+)\s+\S+|$1|g for @class_lines;
    }
    else {
        confess "I found '$first_line' in class_file $class_file.";
    }
    my $side_file=$class_file;
    $side_file=~s|\.class$|\.side|;
    my @test_lines=File::Slurper::read_lines($test_file);
    ## drop the features, keep what follows the '#'
    s|[^#]+#\s*||g for @test_lines;
    ## starts as '', so that an empty test file gives an empty side file
    my $out='';
    my $scores;
    foreach my $count (0..$#test_lines) {
        my $line=$test_lines[$count];
        chomp $line;
        my $handle=uri_unescape($line);
        my $score=$class_lines[$count];
        $out.="$score\t$line\n";
        $scores->{$handle}=$score;
    }
    File::Slurper::write_text($side_file,$out);
    return $scores;
}

1;