Changeset 294

Show
Ignore:
Timestamp:
11/25/06 17:53:57 (6 years ago)
Author:
lincoln
Message:

multiple enhancements to the reconciler:

  1. choose_title() now keeps its own history of titles seen and uses that in finding preferred titles. far more robust. addresses #5
  2. because of 1., increase $reclogic{title_xlate_table_min_alt_progs} to 1: only store a preferred title if we have at least 1 alternative
  3. reconciler can now deal with XMLTV data both with and without timezones in start/stop times
  4. reconciler now correctly parses all known 'generator-info-name' fields from input XMLTV files - primarily means it's less verbose in logging and in what it stores in its title-match history

due to changes 1 and 2, it is recommended that you manually reset your
title history with '~/.shepherd/tv_grab_au --reset'

Files:
2 modified

Legend:

Unmodified
Added
Removed
  • reconcilers/reconciler_mk2

    r284 r294  
    2121#    0.08  20oct06      improved preference-title rewriter, more debugging to logfile 
    2222#    0.09  24oct06      --preftitle support 
     23#    0.15  26nov06      reworked the title translations a bit 
     24#                        - now only stores preferred titles when it has seen more than once 
     25#                        - keeps last 30 days titles (indexed by time-of-day/day-of-week) 
     26#                          as back history for alternate titles when switching grabbers 
    2327 
    2428# 
     
    9195 
    9296my $progname = "reconciler_mk2"; 
    93 my $version = "0.14"; 
     97my $version = "0.15"; 
    9498 
    9599use LWP::UserAgent; 
     
    140144$reclogic{max_programme_length} = (8 * 60 * 60); # 8 hours 
    141145 
    142 # do store a preference title, even if we don't have any alternatives 
    143 $reclogic{title_xlate_table_min_alt_progs} = 0; 
     146# store a preferred title only if we have at least 1 alternative 
     147$reclogic{title_xlate_table_min_alt_progs} = 1; 
     148 
     149# keep up to 30 days of title history 
     150$reclogic{title_history} = 30; 
     151 
     152# matching of history titles requires programme start within 20 minutes and duration within 10% 
     153$reclogic{title_history_start_fuzzy_match_window} = (20*60);    # seconds 
     154$reclogic{title_history_duration_fuzzy_match_percent} = 10;     # percent 
     155 
     156 
    144157 
    145158# 
     
    189202my $out = { }; 
    190203my $w; 
     204my $gmt_offset; 
    191205 
    192206my $setting_override; 
    193207my %cli_override; 
    194208my $title_xlate_table;  # cached 
     209my $title_history;      # cached 
    195210 
    196211my %amp; 
     
    363378sub write_config_file 
    364379{ 
     380        # age out old titles (default is to keep for 30 days as per $reclogic{title_history}) 
     381        for my $cache_key (keys %{$title_history}) { 
     382                my ($starttime, $duration, $channel, $grabber) = split(/,/,$cache_key); 
     383                if ($starttime < (time-(86400*$reclogic{title_history}))) { 
     384                        delete $title_history->{$cache_key}; 
     385                        $stats{expired_old_titles}++; 
     386                } 
     387        } 
     388 
    365389        open(CONF, ">$opt->{config_file}") || die "cannot write to $opt->{config_file}: $!"; 
    366390        print CONF Data::Dumper->Dump( 
    367                 [$setting_override,  $title_xlate_table  ], 
    368                 ["setting_override", "title_xlate_table" ]); 
     391                [$setting_override,  $title_xlate_table,  $title_history], 
     392                ["setting_override", "title_xlate_table", "title_history" ]); 
    369393        close CONF; 
    370394        &log(1,(sprintf "updated configuration file %s.\n",$opt->{config_file})); 
     
    541565###################################################################################################### 
    542566 
     567# strptime type date parsing - BUT - if no timezone is present, treat time as being in localtime 
     568# rather than the various other perl implementation which treat it as being in UTC/GMT 
    543569sub parse_xmltv_date 
    544570{ 
    545571        my $datestring = shift; 
    546         my @timeattr; # = localtime(time); # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst 
    547  
    548         if ($datestring =~ /^(....)(..)(..)(..)(..)/) { 
    549                 ($timeattr[5],$timeattr[4],$timeattr[3],$timeattr[2],$timeattr[1],$timeattr[0]) = (int($1)-1900,int($2)-1,int($3),int($4),int($5),0); 
    550                 ($timeattr[6],$timeattr[7],$timeattr[8]) = (-1,-1,-1); 
    551                 # NOTE: FIXME: we are ignoring timezone for now... 
    552  
    553                 my $return_epoch = mktime(@timeattr); 
    554                 return $return_epoch if ($return_epoch > 1); 
    555         } 
    556  
     572        my @t; # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst 
     573        my $tz_offset = 0; 
     574 
     575        # work out GMT offset - we only do this once 
     576        if (!$gmt_offset) { 
     577                my $tzstring = strftime("%z", localtime(time)); 
     578 
     579                $gmt_offset = (60*60) * int(substr($tzstring,1,2));     # hr 
     580                $gmt_offset += (60 * int(substr($tzstring,3,2)));       # min 
     581                $gmt_offset *= -1 if (substr($tzstring,0,1) eq "-");    # +/- 
     582        } 
     583 
     584        if ($datestring =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})/) { 
     585                ($t[5],$t[4],$t[3],$t[2],$t[1],$t[0]) = (int($1)-1900,int($2)-1,int($3),int($4),int($5),0); 
     586                ($t[6],$t[7],$t[8]) = (-1,-1,-1); 
     587 
     588                # if input data has a timezone offset, then offset by that 
     589                if ($datestring =~ /\+(\d{2})(\d{2})/) { 
     590                        $tz_offset = $gmt_offset - (($1*(60*60)) + ($2*60)); 
     591                } elsif ($datestring =~ /\-(\d{2})(\d{2})/) { 
     592                        $tz_offset = $gmt_offset + (($1*(60*60)) + ($2*60)); 
     593                } 
     594 
     595                my $e = mktime(@t); 
     596                return ($e+$tz_offset) if ($e > 1); 
     597        } 
    557598        return undef; 
    558599} 
     
    651692sub canonicalizeTitles_match 
    652693{ 
    653         my $word1=canonicalizeTitle(shift); 
    654         my $word2 =canonicalizeTitle(shift); 
     694        my $word1 = canonicalizeTitle(shift); 
     695        my $word2 = canonicalizeTitle(shift); 
    655696        my @longer; 
    656697        my @shorter; 
     
    664705        } 
    665706 
    666 #printf "got shorter: '%s', longer '%s'\n",$shorter,$longer; 
    667707        WORD: for my $word (@shorter) { 
    668 #printf "got word: '%s'\n",$word; 
    669708                for(my $i=0; $i < @longer; ++$i) { 
    670709                        if (forgivingMatch($longer[$i], $word)) { 
     
    799838 
    800839        if ($c->{'generator-info-name'}) { 
    801                 if ($c->{'generator-info-name'} =~ /\s?(\S+)\sv([\d\.]+)/) { 
     840                if ($c->{'generator-info-name'} =~ /\s?(\S+)\s[v]?([\d\.]+)/) { 
    802841                        ($proggy, $version) = ($1,$2); 
    803842                } 
     
    10541093        my @titles, my @titles_from; 
    10551094        my $num_titles = 0; 
     1095        my $channel = $m->[0]->{channel}; 
     1096        my $title_start = $m->[0]->{start_epoch}; 
     1097        my $title_duration = $m->[0]->{stop_epoch} - $m->[0]->{start_epoch}; 
    10561098 
    10571099        # 0. first gather title from "preferred grabber" if we have it for 
     
    11901232        } 
    11911233 
     1234        my $seen_in_primary = time; 
     1235 
     1236        # 6. look back through our history of programmes to see if we can fuzzy match 
     1237        #    this title to a previously-seen title 
     1238        for my $title_cache_key (keys %{$title_history}) { 
     1239                my ($th_starttime, $th_duration, $th_channel, $th_grabber) = split(/,/,$title_cache_key); 
     1240 
     1241                # has to be on same channel 
     1242                next if ($channel ne $th_channel); 
     1243 
     1244                # has to be on the same day of the week 
     1245                next if ((int($th_starttime / 86400) % 7) != (int($title_start / 86400) % 7)); 
     1246 
     1247                # duration has to be within 10% ($reclogic{title_history_duration_fuzzy_match_percent}) 
     1248                next if ($title_duration > ($th_duration * (1 + ($reclogic{title_history_duration_fuzzy_match_percent} / 100)))); 
     1249                next if ($title_duration < ($th_duration * (1 - ($reclogic{title_history_duration_fuzzy_match_percent} / 100)))); 
     1250 
     1251                # start time has to be within 20 minutes ($reclogic{title_history_start_fuzzy_match_window}) 
     1252                my $th_starttime_day = $th_starttime % (7*86400); 
     1253                my $title_start_day = $title_start % (7*86400); 
     1254                next if ($title_start_day > ($th_starttime_day + $reclogic{title_history_start_fuzzy_match_window})); 
     1255                next if ($title_start_day < ($th_starttime_day - $reclogic{title_history_start_fuzzy_match_window})); 
     1256 
     1257                # within window - check title 
     1258                foreach my $i (0..($num_titles-1)) { 
     1259                        # grabber has to be different 
     1260                        next if ($th_grabber eq $titles_from[$i]); 
     1261 
     1262                        # check match 
     1263                        my $match = canonicalizeTitles_match($title_history->{$title_cache_key},$titles[$i]); 
     1264                        &log($reclogic{debug_choose_title},(sprintf "choose_title: title \"%s\" (from %s) %s title_history \"%s\" (from %s)", 
     1265                                $titles[$i], $titles_from[$i], ($match == 1 ? "IS THE SAME AS" : "did not match"), 
     1266                                $title_history->{$title_cache_key}, $th_grabber)); 
     1267 
     1268                        if ($match) { 
     1269                                # match - add it to the front of our titles list! 
     1270                                $num_titles++; 
     1271                                unshift(@titles, $title_history->{$title_cache_key}); 
     1272                                unshift(@titles_from, $th_grabber); 
     1273                                $seen_in_primary = $th_starttime; 
     1274                                last; 
     1275                        } 
     1276                } 
     1277        } 
     1278 
    11921279        my $preferred_title = $titles[0]; 
    11931280 
    1194         # 6. don't create a preferred title for this unless we have met our threshold for title_xlate_table_min_alt_progs 
     1281        # 7. don't create a preferred title for this unless we have met our threshold for title_xlate_table_min_alt_progs 
    11951282        if (($num_titles-1) < $reclogic{title_xlate_table_min_alt_progs}) { 
    11961283                &log($reclogic{debug_choose_title},(sprintf  
     
    12001287        } 
    12011288 
    1202         # 7. wasn't an alternate name 
     1289        # 8. wasn't an alternate name 
    12031290        #    add it as a preferred name 
    12041291 
    12051292        my $key = lc($preferred_title); 
    12061293        $title_xlate_table->{$key}->{seen_in_primary} = 1; 
    1207         $title_xlate_table->{$key}->{last_seen_in_primary} = time; 
     1294        $title_xlate_table->{$key}->{last_seen_in_primary} = $seen_in_primary; 
    12081295        $title_xlate_table->{$key}->{translated} = 0; 
    12091296        $title_xlate_table->{$key}->{translation} = $preferred_title; 
     
    12611348        $newprog->{title}->[0] =        [ $title, $title_lang ]; 
    12621349 
     1350        # add this programme to our title_history 
     1351        my $title_history_key = sprintf "%d,%d,%s,%s",  
     1352                $newprog->{start_epoch},  
     1353                ($newprog->{stop_epoch}-$newprog->{start_epoch}),  
     1354                $newprog->{channel}, $m->[0]->{grabber}; 
     1355        $title_history->{$title_history_key} = $title; 
     1356 
     1357        # fill in programme fields and attributes 
    12631358        $newprog->{datasources} = ""; 
    12641359        for my $i (0..($num_matching-1)) { 
  • status

    r291 r294  
    1111grabber         ninemsn             0.01-r1 
    1212grabber         yahoo7web           0.01-r1 
    13 reconciler      reconciler_mk2      0.14 
     13reconciler      reconciler_mk2      0.15 
    1414postprocessor   imdb_augment_data   0.05 
    1515postprocessor   flag_aus_hdtv       0.10.3-r1