package Ernad::Learn::Fit;

use strict;
use warnings;

use Carp qw(confess);

## fit functions
##
## $f is a two-level dispatch table of code references:
##
##   $f->{'treat_fit'}->{KIND}->($text, $rc)  normalizes one candidate fit
##                                            and returns it, or '' when the
##                                            candidate is rejected.
##   $f->{'split_line'}->{KIND}->($line)      returns the list of fits found
##                                            in one input line (possibly
##                                            empty).
##
## KIND is one of 'term', 'frin' or 'frex'.
our $f;

## Logger object, taken from the main program when this module is loaded.
## Only its echo() method is used here.
our $e = $main::e;

## fixme: this needs importing into Dokli.pm

## Send a message to the logger, if one is available. $e is copied from
## $main::e at load time, so a logger installed later would be missed;
## therefore $main::e is consulted again at call time. When there is no
## logger at all the message is dropped rather than dying on an undefined
## object.
my $echo = sub {
    my ($line_number, $message, $level) = @_;
    my $logger = defined $e ? $e : $main::e;
    return if not defined $logger;
    $logger->echo($line_number, $message, $level);
    return;
};

## Shared body of the 'frin' and 'frex' line splitters: the entire line is
## treated as one fit. Returns a one-element list, or an empty list when
## the treat function rejects the line.
my $whole_line_fits = sub {
    my ($kind, $line) = @_;
    my @fits;
    my $fit = $f->{'treat_fit'}->{$kind}->($line, 'read');
    ## test definedness and length rather than truth, so that the fit
    ## "0" is not thrown away
    if (defined $fit and length $fit) {
        push(@fits, $fit);
    }
    return @fits;
};

## Normalize a single term: a word made of one uppercase letter followed
## only by lowercase letters is lowercased; anything else is returned
## unchanged. Returns '' for undefined input.
$f->{'treat_fit'}->{'term'} = sub {
    my $t = shift;
    return '' if not defined $t;
    if ($t =~ m|^[[:upper:]][[:lower:]]*$|) {
        $t = lc($t);
    }
    return $t;
};

## Normalize a 'frin': strip all leading and trailing whitespace.
## Returns '' for undefined input.
$f->{'treat_fit'}->{'frin'} = sub {
    my $t = shift;
    return '' if not defined $t;
    $t =~ s|^\s+||;
    $t =~ s|\s+$||;
    return $t;
};

## Normalize a 'frex'.
##
##   $t   the candidate text
##   $rc  'read' or 'check'; decides what happens when the text contains
##        a separator. In 'read' mode the candidate is rejected (and the
##        fact is logged), in 'check' mode the separators are deleted.
##
## Returns the normalized, lowercased text, or '' when the candidate is
## rejected. Dies via confess when a separator is found and $rc is neither
## 'read' nor 'check'.
$f->{'treat_fit'}->{'frex'} = sub {
    my $t  = shift;
    my $rc = shift;
    ## first process like a frin
    $t = $f->{'treat_fit'}->{'frin'}->($t);
    ## collapse every run of whitespace, not just the first one
    $t =~ s|\s+| |g;
    ## a candidate without an inner space is rejected
    if ($t !~ m| |) {
        return '';
    }
    ## remove terms in brackets, they are usually abbreviations. Each
    ## bracket group is removed on its own, innermost first, so that the
    ## text between two separate groups survives.
    1 while $t =~ s|\([^()]*\)||g;
    ## these are separators. Easiest just to delete the lines that
    ## contain them and assume the components are covered elsewhere
    my @seps = (':', ',', '/', '&', ' - ', '"', '$');
    foreach my $sep (@seps) {
        next if index($t, $sep) < 0;
        if (not defined $rc) {
            confess "unknown rc";
        }
        if ($rc eq 'read') {
            $echo->(__LINE__, "I skip line '$t' because it contains a separator", 2);
            return '';
        }
        elsif ($rc eq 'check') {
            ## delete every occurrence, not only the first
            $t =~ s|\Q$sep\E||g;
        }
        else {
            confess "unknown rc";
        }
    }
    ## the deletions above can leave doubled or dangling spaces behind
    $t =~ s|\s+| |g;
    $t =~ s|^ ||;
    $t =~ s| $||;
    ## drop a trailing period and a leading period
    $t =~ s|\.$||;
    $t =~ s|^\.\s*||;
    ## map the right single quotation mark to an apostrophe, then strip
    ## one enclosing apostrophe on either side
    $t =~ s|\x{2019}|'|g;
    $t =~ s|^'||;
    $t =~ s|'$||;
    ## only uppercase
    #if($t=~m|^[A-Z ]+$|) {
    $t = lc($t);
    #}
    return $t;
};

## Split a line on whitespace and punctuation and return the treated
## terms. Empty pieces produced by the split are skipped.
$f->{'split_line'}->{'term'} = sub {
    my $line = shift;
    my @fits;
    return @fits if not defined $line;
    foreach my $term (split(/[\s\p{punct}]/, $line)) {
        ## function returns nothing when it is not finding a worthy term
        my $fit = $f->{'treat_fit'}->{'term'}->($term, 'read');
        ## a plain truth test would also discard the term "0"
        next if not defined $fit or not length $fit;
        push(@fits, $fit);
    }
    return @fits;
};

## It is assumed that the entire line is a feature.
$f->{'split_line'}->{'frin'} = sub {
    my $line = shift;
    return $whole_line_fits->('frin', $line);
};

## It is assumed that the entire line is a feature.
$f->{'split_line'}->{'frex'} = sub {
    my $line = shift;
    return $whole_line_fits->('frex', $line);
};

1;