package Ernad::Presort::Tfidf;

use strict;
use warnings;

use Carp qw(confess);
use Data::Dumper qw(Dumper);

use base ('Ernad::Presorter');

binmode(STDOUT,':utf8');

## Tf-idf presorter.
##
## The object works with two document sets, each a hash of the form
## pid => { feature => count }:
##   'sid' -- the seed papers,
##   'tso' -- the papers to be scored.
## Every tso paper is scored by the sum, over all seed papers, of the
## products of the tf-idf weights of the features they share.
##
## The methods load_json_with_gz, save_json_with_gz and echo are not
## defined in this file; presumably they are inherited from
## Ernad::Presorter -- TODO confirm.

## Score the papers in a tso file against the seed set.
##
##   $tso_file -- path of the tso file (required).
##
## Requires $i->{sid} to be set already, e.g. with $i->set_file('sid',...).
## If $i->{weights_file} is defined and the file "<weights_file>.gz"
## exists, the weights are read from it, otherwise they are calculated
## with gather_data.
##
## Returns the hash reference of scores, tso pid => score.
sub get_score {
    my $i = shift;
    my $tso_file = shift // confess "I need a tso fidek here";
    $i->set_file('tso', $tso_file);
    if (not $i->{'sid'}) {
        confess "I need \$i->{sid}";
    }
    my $weights_file = $i->{'weights_file'};
    if (defined $weights_file and -f $weights_file.'.gz') {
        $i->{'weights'} = $i->load_json_with_gz($weights_file.'.gz');
    }
    else {
        $i->gather_data();
        $i->echo(__LINE__, " done.");
    }
    my $scores = $i->score();
    ## the weights can be large, drop them once the scores are there
    delete $i->{'weights'};
    return $scores;
}

## Calculate the weights from the loaded sid and tso sets.
##
## Needs both $i->{sid} and $i->{tso}. Fills $i->{weights}, saves it if
## $i->{weights_file} is defined, and then deletes the sid, tso and dof
## data from the object.
sub gather_data {
    my $i = shift;
    ## set the document numbers, invoking count_doc twice
    $i->{'count_doc'}->{'total'} = $i->count_doc('sid') + $i->count_doc('tso');
    ## length of each doc, and counting of features
    $i->examine('sid'); ## sid has to be examined first!
    $i->examine('tso');
    ## now do the idf
    $i->echo(__LINE__, "I am calculating dofs ...");
    $i->dof();
    $i->echo(__LINE__, " done.");
    ## calculate weight
    $i->echo(__LINE__, "I am calculating weights ...");
    $i->weights_tfidf();
    $i->echo(__LINE__, " done.");
    if (defined($i->{'weights_file'})) {
        ## NOTE(review): this saves to weights_file, while get_score looks
        ## for weights_file.'.gz'. Presumably save_json_with_gz appends
        ## the '.gz' itself -- confirm.
        $i->save_json_with_gz($i->{'weights'}, $i->{'weights_file'});
    }
    delete $i->{'tso'};
    delete $i->{'sid'};
    delete $i->{'dof'};
}

## this should not need to be changed
##
## Calculate the scores from $i->{weights}. The score of a tso paper is
## the sum, over its features and over all sid papers, of the product of
## the tso weight and the sid weight of the feature.
##
## Confesses if there are no weights. Returns $i->{scores}.
sub score {
    my $i = shift;
    if (not $i->{'weights'}) {
        confess "I need weights to score.";
    }
    my $tso_weights = $i->{'weights'}->{'tso'} // {};
    my $sid_weights = $i->{'weights'}->{'sid'} // {};
    foreach my $tso_pid (keys %{$tso_weights}) {
        ## the score has to be initialized, otherwise Ernad::sort will confess.
        $i->{'scores'}->{$tso_pid} = 0;
        my $tso_doc = $tso_weights->{$tso_pid};
        foreach my $fit (keys %{$tso_doc}) {
            foreach my $sid_pid (keys %{$sid_weights}) {
                my $sid_doc = $sid_weights->{$sid_pid};
                if (not $sid_doc->{$fit}) {
                    next;
                }
                $i->{'scores'}->{$tso_pid} += $sid_doc->{$fit} * $tso_doc->{$fit};
            }
        }
    }
    return $i->{'scores'};
}

## this is where we can experiment, but change the name in $i->gather_data
##
## Calculate tf-idf weights into $i->{weights}->{sid|tso}->{pid}->{feature}.
## Only features that occur in the seed set are weighted. Needs the
## results of count_doc, examine and dof.
sub weights_tfidf {
    my $i = shift;
    my $codo = $i->{'count_doc'}->{'sid'} + $i->{'count_doc'}->{'tso'};
    ## Make sure every tso paper is present in the weights, even when it
    ## shares no feature with the seed papers. Otherwise the paper gets
    ## no score at all. We can't initialize all feature weights to zero,
    ## because we would run out of memory, so we only register an empty
    ## set of weights. Only tso papers are registered here: the sid
    ## papers must not appear among the tso weights, or they would be
    ## scored as well.
    foreach my $pid (keys %{$i->{'tso'}}) {
        $i->{'weights'}->{'tso'}->{$pid} //= {};
    }
    foreach my $fit (keys %{$i->{'fits'}->{'sid'}}) {
        ## the idf is the same for all papers, calculate it only once,
        ## and only when some paper really has the feature
        my $idof;
        foreach my $type ('sid', 'tso') {
            foreach my $pid (keys %{$i->{$type}}) {
                my $count = $i->{$type}->{$pid}->{$fit};
                if (not $count) {
                    next;
                }
                my $tefr = $count / $i->{'pids'}->{$pid}->{'length'};
                $idof //= log($codo / $i->{'dof'}->{'all'}->{$fit});
                $i->{'weights'}->{$type}->{$pid}->{$fit} = $tefr * $idof;
            }
        }
    }
}

## raw document frequency
##
## For every feature seen in the seed set, count the papers that have
## it: in $i->{dof}->{sid}, in $i->{dof}->{tso} and, for both sets
## together, in $i->{dof}->{all}. Confesses if examine('sid') has not
## been run.
sub dof {
    my $i = shift;
    if (not defined($i->{'fits'}->{'sid'})) {
        confess "You need to \$i->examine(sid) first!";
    }
    foreach my $fit (keys %{$i->{'fits'}->{'sid'}}) {
        $i->{'dof'}->{'all'}->{$fit} = 0;
        $i->{'dof'}->{'sid'}->{$fit} = 0;
        $i->{'dof'}->{'tso'}->{$fit} = 0;
        ## count doc frequency in both sid and tso
        foreach my $type ('sid', 'tso') {
            foreach my $pid (keys %{$i->{$type}}) {
                if ($i->{$type}->{$pid}->{$fit}) {
                    $i->{'dof'}->{'all'}->{$fit}++;
                    $i->{'dof'}->{$type}->{$fit}++;
                }
            }
        }
    }
}

## Register and load a document set.
##
##   $type -- 'sid' or 'tso' (required),
##   $file -- path of an existing file (required).
##
## Stores the path in $i->{file}->{$type} and the loaded data in
## $i->{$type}. Confesses on a missing or wrong argument, or if the file
## does not exist. Returns the file path.
sub set_file {
    my $i = shift;
    my $type = shift // confess "I need a type here.";
    if ($type ne 'sid' and $type ne 'tso') {
        confess "I need your file type to be sid or tso.";
    }
    my $file = shift // confess "I need to have the seed file defined.";
    if (not -f $file) {
        confess "I don't see your $type file $file.";
    }
    $i->{'file'}->{$type} = $file;
    $i->{$type} = $i->load_json_with_gz($i->{'file'}->{$type});
    return $file;
}

## Go through a document set to find the length of each paper, in
## $i->{pids}->{pid}->{length}, and the total count of each feature, in
## $i->{fits}->{$type}.
##
##   $type -- 'sid' or 'tso' (required).
##
## For the tso only features known from the sid are counted, therefore
## the sid has to be examined first. The paper length counts all the
## features of the paper.
sub examine {
    my $i = shift;
    my $type = shift // confess "I need a type here.";
    if ($type ne 'sid' and $type ne 'tso') {
        confess "I need your file type to be sid or tso.";
    }
    my $fidek = $i->{$type} // confess "I need this set here.";
    if ($type eq 'tso' and not defined($i->{'fits'}->{'sid'})) {
        confess "You need to \$i->examine(sid) first!";
    }
    foreach my $pid (keys %$fidek) {
        #$i->echo(__LINE__,"pid is $pid ... ");
        $i->{'pids'}->{$pid}->{'length'} = 0;
        my $doc = $fidek->{$pid};
        foreach my $fit (keys %{$doc}) {
            ## increment the length of the paper
            $i->{'pids'}->{$pid}->{'length'} += $doc->{$fit};
            ## count the occurrence of the feature
            if ($type eq 'sid') {
                $i->{'fits'}->{$type}->{$fit} += $doc->{$fit};
            }
            ## type is tso, count only if it's in sid papers
            else {
                if (not $i->{'fits'}->{'sid'}->{$fit}) {
                    next;
                }
                $i->{'fits'}->{'tso'}->{$fit} += $doc->{$fit};
            }
        }
        #$i->echo(__LINE__,"length is ".$i->{'pids'}->{$pid}->{'length'});
        #$i->echo(__LINE__,"\n");
    }
}

## Count the papers in a document set.
##
##   $type -- 'sid' or 'tso' (required).
##
## Stores the count in $i->{count_doc}->{$type} and returns it.
sub count_doc {
    my $i = shift;
    my $type = shift // confess "I need a type here.";
    if ($type ne 'sid' and $type ne 'tso') {
        confess "I need your file type to be sid or tso.";
    }
    my $fidek = $i->{$type} // confess "I need this set here.";
    my $count = scalar keys %$fidek;
    $i->{'count_doc'}->{$type} = $count;
    return $count;
}

1;