Changeset 294
- Timestamp:
- 11/25/06 17:53:57 (6 years ago)
- Files:
-
- 2 modified
-
reconcilers/reconciler_mk2 (modified) (13 diffs)
-
status (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
reconcilers/reconciler_mk2
r284 r294 21 21 # 0.08 20oct06 improved preference-title rewriter, more debugging to logfile 22 22 # 0.09 24oct06 --preftitle support 23 # 0.15 26nov06 reworked the title translations a bit 24 # - now only stores preferred titles when it has seen more than once 25 # - keeps last 30 days titles (indexed by time-of-day/day-of-week) 26 # as back history for alternate titles when switching grabbers 23 27 24 28 # … … 91 95 92 96 my $progname = "reconciler_mk2"; 93 my $version = "0.1 4";97 my $version = "0.15"; 94 98 95 99 use LWP::UserAgent; … … 140 144 $reclogic{max_programme_length} = (8 * 60 * 60); # 8 hours 141 145 142 # do store a preference title, even if we don't have any alternatives 143 $reclogic{title_xlate_table_min_alt_progs} = 0; 146 # store a preferred title only if we have at least 1 alternative 147 $reclogic{title_xlate_table_min_alt_progs} = 1; 148 149 # keep up to 30 days of title history 150 $reclogic{title_history} = 30; 151 152 # matching of history titles requires programme start within 20 minutes and duration within 10% 153 $reclogic{title_history_start_fuzzy_match_window} = (20*60); # seconds 154 $reclogic{title_history_duration_fuzzy_match_percent} = 10; # percent 155 156 144 157 145 158 # … … 189 202 my $out = { }; 190 203 my $w; 204 my $gmt_offset; 191 205 192 206 my $setting_override; 193 207 my %cli_override; 194 208 my $title_xlate_table; # cached 209 my $title_history; # cached 195 210 196 211 my %amp; … … 363 378 sub write_config_file 364 379 { 380 # age out old titles (default is to keep for 30 days as per $reclogic{title_history}) 381 for my $cache_key (keys %{$title_history}) { 382 my ($starttime, $duration, $channel, $grabber) = split(/,/,$cache_key); 383 if ($starttime < (time-(86400*$reclogic{title_history}))) { 384 delete $title_history->{$cache_key}; 385 $stats{expired_old_titles}++; 386 } 387 } 388 365 389 open(CONF, ">$opt->{config_file}") || die "cannot write to $opt->{config_file}: $!"; 366 390 print CONF Data::Dumper->Dump( 
367 [$setting_override, $title_xlate_table ],368 ["setting_override", "title_xlate_table" ]);391 [$setting_override, $title_xlate_table, $title_history], 392 ["setting_override", "title_xlate_table", "title_history" ]); 369 393 close CONF; 370 394 &log(1,(sprintf "updated configuration file %s.\n",$opt->{config_file})); … … 541 565 ###################################################################################################### 542 566 567 # strptime type date parsing - BUT - if no timezone is present, treat time as being in localtime 568 # rather than the various other perl implementation which treat it as being in UTC/GMT 543 569 sub parse_xmltv_date 544 570 { 545 571 my $datestring = shift; 546 my @timeattr; # = localtime(time); # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst 547 548 if ($datestring =~ /^(....)(..)(..)(..)(..)/) { 549 ($timeattr[5],$timeattr[4],$timeattr[3],$timeattr[2],$timeattr[1],$timeattr[0]) = (int($1)-1900,int($2)-1,int($3),int($4),int($5),0); 550 ($timeattr[6],$timeattr[7],$timeattr[8]) = (-1,-1,-1); 551 # NOTE: FIXME: we are ignoring timezone for now... 
552 553 my $return_epoch = mktime(@timeattr); 554 return $return_epoch if ($return_epoch > 1); 555 } 556 572 my @t; # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst 573 my $tz_offset = 0; 574 575 # work out GMT offset - we only do this once 576 if (!$gmt_offset) { 577 my $tzstring = strftime("%z", localtime(time)); 578 579 $gmt_offset = (60*60) * int(substr($tzstring,1,2)); # hr 580 $gmt_offset += (60 * int(substr($tzstring,3,2))); # min 581 $gmt_offset *= -1 if (substr($tzstring,0,1) eq "-"); # +/- 582 } 583 584 if ($datestring =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})/) { 585 ($t[5],$t[4],$t[3],$t[2],$t[1],$t[0]) = (int($1)-1900,int($2)-1,int($3),int($4),int($5),0); 586 ($t[6],$t[7],$t[8]) = (-1,-1,-1); 587 588 # if input data has a timezone offset, then offset by that 589 if ($datestring =~ /\+(\d{2})(\d{2})/) { 590 $tz_offset = $gmt_offset - (($1*(60*60)) + ($2*60)); 591 } elsif ($datestring =~ /\-(\d{2})(\d{2})/) { 592 $tz_offset = $gmt_offset + (($1*(60*60)) + ($2*60)); 593 } 594 595 my $e = mktime(@t); 596 return ($e+$tz_offset) if ($e > 1); 597 } 557 598 return undef; 558 599 } … … 651 692 sub canonicalizeTitles_match 652 693 { 653 my $word1 =canonicalizeTitle(shift);654 my $word2 = canonicalizeTitle(shift);694 my $word1 = canonicalizeTitle(shift); 695 my $word2 = canonicalizeTitle(shift); 655 696 my @longer; 656 697 my @shorter; … … 664 705 } 665 706 666 #printf "got shorter: '%s', longer '%s'\n",$shorter,$longer;667 707 WORD: for my $word (@shorter) { 668 #printf "got word: '%s'\n",$word;669 708 for(my $i=0; $i < @longer; ++$i) { 670 709 if (forgivingMatch($longer[$i], $word)) { … … 799 838 800 839 if ($c->{'generator-info-name'}) { 801 if ($c->{'generator-info-name'} =~ /\s?(\S+)\s v([\d\.]+)/) {840 if ($c->{'generator-info-name'} =~ /\s?(\S+)\s[v]?([\d\.]+)/) { 802 841 ($proggy, $version) = ($1,$2); 803 842 } … … 1054 1093 my @titles, my @titles_from; 1055 1094 my $num_titles = 0; 1095 my $channel = $m->[0]->{channel}; 1096 my $title_start 
= $m->[0]->{start_epoch}; 1097 my $title_duration = $m->[0]->{stop_epoch} - $m->[0]->{start_epoch}; 1056 1098 1057 1099 # 0. first gather title from "preferred grabber" if we have it for … … 1190 1232 } 1191 1233 1234 my $seen_in_primary = time; 1235 1236 # 6. look back through our history of programmes to see if we can fuzzy match 1237 # this title to a previously-seen title 1238 for my $title_cache_key (keys %{$title_history}) { 1239 my ($th_starttime, $th_duration, $th_channel, $th_grabber) = split(/,/,$title_cache_key); 1240 1241 # has to be on same channel 1242 next if ($channel ne $th_channel); 1243 1244 # has to be on the same day of the week 1245 next if ((int($th_starttime / 86400) % 7) != (int($title_start / 86400) % 7)); 1246 1247 # duration has to be within 10% ($reclogic{title_history_duration_fuzzy_match_percent}) 1248 next if ($title_duration > ($th_duration * (1 + ($reclogic{title_history_duration_fuzzy_match_percent} / 100)))); 1249 next if ($title_duration < ($th_duration * (1 - ($reclogic{title_history_duration_fuzzy_match_percent} / 100)))); 1250 1251 # start time has to be within 20 minutes ($reclogic{title_history_start_fuzzy_match_window}) 1252 my $th_starttime_day = $th_starttime % (7*86400); 1253 my $title_start_day = $title_start % (7*86400); 1254 next if ($title_start_day > ($th_starttime_day + $reclogic{title_history_start_fuzzy_match_window})); 1255 next if ($title_start_day < ($th_starttime_day - $reclogic{title_history_start_fuzzy_match_window})); 1256 1257 # within window - check title 1258 foreach my $i (0..($num_titles-1)) { 1259 # grabber has to be different 1260 next if ($th_grabber eq $titles_from[$i]); 1261 1262 # check match 1263 my $match = canonicalizeTitles_match($title_history->{$title_cache_key},$titles[$i]); 1264 &log($reclogic{debug_choose_title},(sprintf "choose_title: title \"%s\" (from %s) %s title_history \"%s\" (from %s)", 1265 $titles[$i], $titles_from[$i], ($match == 1 ? 
"IS THE SAME AS" : "did not match"), 1266 $title_history->{$title_cache_key}, $th_grabber)); 1267 1268 if ($match) { 1269 # match - add it to the front of our titles list! 1270 $num_titles++; 1271 unshift(@titles, $title_history->{$title_cache_key}); 1272 unshift(@titles_from, $th_grabber); 1273 $seen_in_primary = $th_starttime; 1274 last; 1275 } 1276 } 1277 } 1278 1192 1279 my $preferred_title = $titles[0]; 1193 1280 1194 # 6. don't create a preferred title for this unless we have met our threshold for title_xlate_table_min_alt_progs1281 # 7. don't create a preferred title for this unless we have met our threshold for title_xlate_table_min_alt_progs 1195 1282 if (($num_titles-1) < $reclogic{title_xlate_table_min_alt_progs}) { 1196 1283 &log($reclogic{debug_choose_title},(sprintf … … 1200 1287 } 1201 1288 1202 # 7. wasn't an alternate name1289 # 8. wasn't an alternate name 1203 1290 # add it as a preferred name 1204 1291 1205 1292 my $key = lc($preferred_title); 1206 1293 $title_xlate_table->{$key}->{seen_in_primary} = 1; 1207 $title_xlate_table->{$key}->{last_seen_in_primary} = time;1294 $title_xlate_table->{$key}->{last_seen_in_primary} = $seen_in_primary; 1208 1295 $title_xlate_table->{$key}->{translated} = 0; 1209 1296 $title_xlate_table->{$key}->{translation} = $preferred_title; … … 1261 1348 $newprog->{title}->[0] = [ $title, $title_lang ]; 1262 1349 1350 # add this programme to our title_history 1351 my $title_history_key = sprintf "%d,%d,%s,%s", 1352 $newprog->{start_epoch}, 1353 ($newprog->{stop_epoch}-$newprog->{start_epoch}), 1354 $newprog->{channel}, $m->[0]->{grabber}; 1355 $title_history->{$title_history_key} = $title; 1356 1357 # fill in programme fields and attributes 1263 1358 $newprog->{datasources} = ""; 1264 1359 for my $i (0..($num_matching-1)) { -
status
r291 r294
11 11 grabber ninemsn 0.01-r1
12 12 grabber yahoo7web 0.01-r1
13    reconciler reconciler_mk2 0.14 (removed)
   13 reconciler reconciler_mk2 0.15 (added)
14 14 postprocessor imdb_augment_data 0.05
15 15 postprocessor flag_aus_hdtv 0.10.3-r1
