package Ernad::Presort::Tfidf;

use strict;
use warnings;

use Carp qw(confess);

use Data::Dumper qw(Dumper);
use base ('Ernad::Presorter');

## Put the :utf8 layer on STDOUT. This runs once, when the module is loaded,
## and affects the STDOUT handle of the whole process.
binmode(STDOUT,':utf8');

## Score the papers of a tso file against the seed set held in $i->{sid}.
##
##   $tso_file  path handed to $i->set_file('tso',...); required.
##
## When $i->{weights_file} is defined and "<weights_file>.gz" is a plain
## file, the weights are read from that file. Otherwise $i->gather_data()
## is run to build them. Returns what $i->score() returns; $i->{weights}
## is removed before returning.
## Confesses when $tso_file is undefined or when $i->{sid} is false.
sub get_score {
  my ($i,$tso_file)=@_;
  defined($tso_file) or confess "I need a tso fidek here";
  $i->set_file('tso',$tso_file);
  $i->{'sid'} or confess "I need \$i->{sid}";
  my $cached=$i->{'weights_file'};
  if(defined($cached) and -f $cached.'.gz') {
    ## reuse the stored weights instead of recomputing them
    $i->{'weights'}=$i->load_json_with_gz($cached.'.gz');
  }
  else {
    $i->gather_data();
    $i->echo(__LINE__," done.");
  }
  my $result=$i->score();
  ## the weights are only needed for scoring, so drop them again
  delete $i->{'weights'};
  return $result;
}

## Build the weights from the sid and tso sets currently held in $i.
##
## Steps, in this order: count the documents of both sets, examine both
## sets (sid first), compute the document frequencies, compute the weights.
## When $i->{weights_file} is defined the weights are handed to
## $i->save_json_with_gz(). Afterwards $i->{tso}, $i->{sid} and $i->{dof}
## are removed from the object.
sub gather_data {
  my ($i)=@_;
  ## set the document numbers, invoking count_doc once per set
  my $sid_docs=$i->count_doc('sid');
  my $tso_docs=$i->count_doc('tso');
  $i->{'count_doc'}->{'total'}=$sid_docs+$tso_docs;
  ## length of each doc, and counting of features;
  ## sid has to be examined first!
  foreach my $type ('sid','tso') {
    $i->examine($type);
  }
  ## now the document frequencies
  $i->echo(__LINE__,"I am calculating dofs ...");
  $i->dof();
  $i->echo(__LINE__," done.");
  ## then the weights
  $i->echo(__LINE__,"I am calculating weights ...");
  $i->weights_tfidf();
  $i->echo(__LINE__," done.");
  my $target=$i->{'weights_file'};
  if(defined($target)) {
    $i->save_json_with_gz($i->{'weights'},$target);
  }
  ## the raw sets and the document frequencies are not needed any more
  delete @{$i}{'tso','sid'};
  delete $i->{'dof'};
}

## this should not need to be changed
##
## Score every tso document against the whole sid set.
##
## The score of a tso document is the sum, over its features and over all
## sid documents, of (sid weight * tso weight) for that feature. Results
## are written to $i->{scores}->{$tso_pid}; entries already present for
## other pids are left alone. Returns $i->{scores}.
## Confesses when $i->{weights} is false.
sub score {
  my $i=shift;
  if(not $i->{'weights'}) {
    confess "I need weights to score."; 
  }
  my $weights=$i->{'weights'};
  ## Add up, per feature, the weight it carries over all sid documents.
  ## Multiplying a tso weight by this total gives the same sum as pairing
  ## the tso document with each sid document in turn (up to the order of
  ## floating-point additions), but needs only one pass over the sid set.
  my %sid_total;
  foreach my $sid_pid (keys %{$weights->{'sid'}}) {
    my $sid_doc=$weights->{'sid'}->{$sid_pid};
    foreach my $fit (keys %{$sid_doc}) {
      if(not $sid_doc->{$fit}) {
        next;
      }
      $sid_total{$fit}+=$sid_doc->{$fit};
    }
  }
  foreach my $tso_pid (keys %{$weights->{'tso'}}) {
    ## the score has to be initialized, otherwise Ernad::sort will confess.
    my $score=0;
    my $tso_doc=$weights->{'tso'}->{$tso_pid};
    foreach my $fit (keys %{$tso_doc}) {
      ## features without any sid weight contribute nothing
      if(not $sid_total{$fit}) {
        next;
      }
      $score+=$sid_total{$fit} * $tso_doc->{$fit};
    }
    $i->{'scores'}->{$tso_pid}=$score;
  }
  return $i->{'scores'};
}

## this is where we can experiment, but then change the name of the call in
## $i->gather_data
##
## Compute the weight of every feature of the sid set in every sid and tso
## document and store it in $i->{weights}->{$type}->{$pid}->{$fit}.
##
## The weight is (count of the feature in the document / length of the
## document) * log(number of documents / number of documents having the
## feature). It reads $i->{count_doc}, $i->{fits}->{sid}, $i->{sid},
## $i->{tso}, $i->{pids}->{$pid}->{length} and $i->{dof}->{all}, so the
## sets have to be counted and examined, and dof() has to be run, first.
sub weights_tfidf {
  my $i=shift;
  my $codo= $i->{'count_doc'}->{'sid'}+$i->{'count_doc'}->{'tso'};
  foreach my $fit (keys %{$i->{'fits'}->{'sid'}}) {
    foreach my $type ('sid','tso') {
      foreach my $pid (keys %{$i->{$type}}) {
        ## Make sure every tso paper has at least one feature weight. We
        ## can't initialize all of them to zero, because we would run out
        ## of memory. Without at least one feature, even with weight 0,
        ## the tso paper has no weights and thus ends up without a score.
        ## This is for tso papers only: a sid paper put in here would be
        ## scored as if it were a tso paper.
        if($type eq 'tso' and not defined($i->{'weights'}->{'tso'}->{$pid})) {
          $i->{'weights'}->{'tso'}->{$pid}->{$fit}=0;
        }
        my $count=$i->{$type}->{$pid}->{$fit};
        if(not $count) {
          next;
        }
        ## term frequency, relative to the length of the document
        my $tefr=$count/$i->{'pids'}->{$pid}->{'length'};
        ## inverse document frequency over both sets
        my $idof=log($codo/$i->{'dof'}->{'all'}->{$fit});
        $i->{'weights'}->{$type}->{$pid}->{$fit}=$tefr*$idof;
      }
    }
  }
}

## Raw document frequency.
##
## For every feature known from the sid set, count in how many documents
## it has a true value: per set in $i->{dof}->{sid} and $i->{dof}->{tso},
## and over both sets in $i->{dof}->{all}. Every such feature gets an
## entry in all three, starting at 0.
## Confesses when $i->{fits}->{sid} is not defined.
sub dof {
  my ($i)=@_;
  defined($i->{'fits'}->{'sid'})
    or confess "You need to \$i->examine(sid) first!";
  for my $fit (keys %{$i->{'fits'}->{'sid'}}) {
    $i->{'dof'}->{$_}->{$fit}=0 for ('all','sid','tso');
    ## walk through the documents of both sets
    for my $type ('sid','tso') {
      for my $pid (keys %{$i->{$type}}) {
        next if not $i->{$type}->{$pid}->{$fit};
        $i->{'dof'}->{$type}->{$fit}++;
        $i->{'dof'}->{'all'}->{$fit}++;
      }
    }
  }
}

## Register the file of one document set and load its contents.
##
##   $type  'sid' or 'tso'; required.
##   $file  path of an existing plain file; required.
##
## The path is kept in $i->{file}->{$type} and the result of
## $i->load_json_with_gz() on it in $i->{$type}. Returns $file.
## Confesses when $type is missing or not one of the two names, when
## $file is missing, or when $file is not a plain file.
sub set_file {
  my $i=shift;
  my $type=shift // confess "I need a type here.";
  if($type ne 'sid' and $type ne 'tso') {
    confess "I need your file type to be sid or tso.";
  }
  ## A missing file name has to stop us right here. Without the confess
  ## the message text itself would be taken as the file name.
  my $file=shift // confess "I need to have the seed file defined.";
  if(not -f $file) {
    confess "I don't see your $type file $file.";
  }
  $i->{'file'}->{$type}=$file;
  $i->{$type}=$i->load_json_with_gz($i->{'file'}->{$type});
  return $file;
}

## Walk through one document set and collect lengths and feature counts.
##
##   $type  'sid' or 'tso'; required. The set is taken from $i->{$type}.
##
## For every document, $i->{pids}->{$pid}->{length} is set to the sum of
## all its feature counts. Feature counts are added up per set in
## $i->{fits}->{$type}; for tso only features with a true count in
## $i->{fits}->{sid} are taken, which is why sid has to be done first.
## Confesses on a missing or unknown type, on a missing set, and when tso
## is examined while $i->{fits}->{sid} is not defined.
sub examine {
  my ($i,$type)=@_;
  defined($type) or confess "I need a type here.";
  unless($type eq 'sid' or $type eq 'tso') {
    confess "I need your file type to be sid or tso.";
  }
  my $papers=$i->{$type};
  defined($papers) or confess "I need this set here.";
  my $is_tso=($type eq 'tso');
  if($is_tso and not defined($i->{'fits'}->{'sid'})) {
    confess "You need to \$i->examine(sid) first!";
  }
  for my $pid (keys %$papers) {
    my $features=$papers->{$pid};
    $i->{'pids'}->{$pid}->{'length'}=0;
    for my $fit (keys %{$features}) {
      ## every feature adds to the length of the paper
      $i->{'pids'}->{$pid}->{'length'}+=$features->{$fit};
      ## in tso papers, count only what is known from the sid papers
      next if $is_tso and not $i->{'fits'}->{'sid'}->{$fit};
      $i->{'fits'}->{$type}->{$fit}+=$features->{$fit};
    }
  }
}

## Count the documents of one set.
##
##   $type  'sid' or 'tso'; required. The set is taken from $i->{$type}.
##
## The number of keys of the set is stored in $i->{count_doc}->{$type}
## and returned.
## Confesses on a missing or unknown type and on a missing set.
sub count_doc {
  my ($i,$type)=@_;
  defined($type) or confess "I need a type here.";
  unless($type eq 'sid' or $type eq 'tso') {
    confess "I need your file type to be sid or tso.";
  }
  my $papers=$i->{$type};
  defined($papers) or confess "I need this set here.";
  my $total=scalar(keys %$papers);
  $i->{'count_doc'}->{$type}=$total;
  return $total;
}


1;