| 1 | #!/usr/bin/perl |
|---|
| 2 | |
|---|
| 3 | # flag_aus_hdtv |
|---|
| 4 | |
|---|
| 5 | # performs two functions: |
|---|
| 6 | # 1. takes existing programmes flagged as HDTV by grabbers, |
|---|
| 7 | # 2. checks the digital broadcasting australia (www.dba.org.au) |
|---|
| 8 | # .. and populates the '(chan)HD' channels (if they exist) |
|---|
| 9 | # with the HD versions of the programs. |
|---|
| 10 | |
|---|
| 11 | # originally written by Richard Dale <richard@interlink.com.au>, |
|---|
| 12 | # September 2006 |
|---|
| 13 | |
|---|
| 14 | my $progname = "flag_aus_hdtv"; |
|---|
| 15 | my $version = "0.12"; |
|---|
| 16 | |
|---|
| 17 | $| = 1; |
|---|
| 18 | print "$progname v$version\n"; |
|---|
| 19 | |
|---|
| 20 | use strict; |
|---|
| 21 | use Getopt::Long; |
|---|
| 22 | use LWP::UserAgent; |
|---|
| 23 | use HTML::TreeBuilder; |
|---|
| 24 | use IO::File; |
|---|
| 25 | use XMLTV; |
|---|
| 26 | use Data::Dumper; |
|---|
| 27 | |
|---|
| 28 | # |
|---|
| 29 | # customizations |
|---|
| 30 | # |
|---|
| 31 | |
|---|
| 32 | # Add overrides for HD - sometimes DBA doesn't always have the latest HD information about TV shows so set them here in the override |
|---|
| 33 | my $hdoverride; |
|---|
| 34 | $hdoverride->{TEN} = [ |
|---|
| 35 | "The Handler", |
|---|
| 36 | "N.Y.P.D. Blue", |
|---|
| 37 | "The Office", |
|---|
| 38 | "Battlestar Galactica" |
|---|
| 39 | ]; |
|---|
| 40 | |
|---|
| 41 | # Add translations for show names that differ between DBA and our normal tv_grab_au names |
|---|
| 42 | my $translation; |
|---|
| 43 | $translation->{'ACA'} = 'A Current Affair'; |
|---|
| 44 | $translation->{'CSI'} = 'CSI: Crime Scene Investigation'; |
|---|
| 45 | $translation->{'CSI-Miami'} = 'CSI: Miami'; |
|---|
| 46 | |
|---|
| 47 | # translations between DBA website and what we know channels as |
|---|
| 48 | my $chan_name_translations; |
|---|
| 49 | $chan_name_translations->{"Ten"} = "TEN"; |
|---|
| 50 | $chan_name_translations->{"ABC Main"} = "ABC"; |
|---|
| 51 | |
|---|
| 52 | # |
|---|
| 53 | # options |
|---|
| 54 | # |
|---|
| 55 | |
|---|
| 56 | my %stats; |
|---|
| 57 | my $channels, my $opt_channels; |
|---|
| 58 | my $d; |
|---|
| 59 | my %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ); |
|---|
| 60 | my $ua = LWP::UserAgent->new('timeout' => 30, 'keep_alive' => 1, 'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-us)' ); |
|---|
| 61 | $ua->env_proxy; |
|---|
| 62 | |
|---|
| 63 | my $opt; |
|---|
| 64 | $opt->{dbaurl} = "http://www.dba.org.au/dtvprograms"; # This URL seems to work fine Sep 2006 |
|---|
| 65 | $opt->{output_file} = "output.xmltv"; |
|---|
| 66 | $opt->{region} = 94; |
|---|
| 67 | $opt->{debug} = 1; |
|---|
| 68 | $opt->{action} = "copy"; |
|---|
| 69 | |
|---|
| 70 | &parse_command_line; |
|---|
| 71 | |
|---|
| 72 | # check XMLTV version for HDTV compatability |
|---|
| 73 | my @xmltv_version = split(/\./,$XMLTV::VERSION); |
|---|
| 74 | if (($xmltv_version[0] <= 0) && ($xmltv_version[1] <= "5") && ($xmltv_version[2] <= "43")) { |
|---|
| 75 | print " - XMLTV version ".$XMLTV::VERSION." too old to support HDTV flags. Disabling HDTV flags."; |
|---|
| 76 | $opt->{notag} = 1; |
|---|
| 77 | $stats{disabled_hdtv_flag}++; |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | &read_config_file($opt->{channels_file},1); |
|---|
| 81 | |
|---|
| 82 | &decode_region($opt->{region}); |
|---|
| 83 | &lookup_dba unless ((defined $opt->{nodbalookup}) || (!defined $opt->{locality})); |
|---|
| 84 | |
|---|
| 85 | # |
|---|
| 86 | # now eat up the XMLTV inputs we were given! |
|---|
| 87 | # |
|---|
| 88 | |
|---|
| 89 | my %writer_args = ( encoding => 'ISO-8859-1' ); |
|---|
| 90 | my $fh = new IO::File(">".$opt->{output_file}) || die "can't open $opt->{output_file} for writing: $!"; |
|---|
| 91 | $writer_args{OUTPUT} = $fh; |
|---|
| 92 | |
|---|
| 93 | my $writer = new XMLTV::Writer(%writer_args); |
|---|
| 94 | $writer->start( { |
|---|
| 95 | 'source-info-name' => "$progname $version", |
|---|
| 96 | 'generator-info-name' => "$progname $version"} ); |
|---|
| 97 | |
|---|
| 98 | foreach my $file (@ARGV) { |
|---|
| 99 | printf "Parsing: %s\n",($file eq "-" ? "(from-stdin, hit control-D to finiah)" : $file); |
|---|
| 100 | XMLTV::parsefiles_callback(undef, undef, \&channel_cb,\&programme_cb, $file); |
|---|
| 101 | } |
|---|
| 102 | |
|---|
| 103 | $writer->end(); |
|---|
| 104 | printf "Finished parsing, output in $opt->{output_file}\n"; |
|---|
| 105 | |
|---|
| 106 | &print_stats; |
|---|
| 107 | exit(0); |
|---|
| 108 | |
|---|
| 109 | ###################################################################################################### |
|---|
| 110 | # read settings |
|---|
| 111 | |
|---|
| 112 | sub read_config_file |
|---|
| 113 | { |
|---|
| 114 | my($file,$die_on_failure) = @_; |
|---|
| 115 | if (!(-r $file)) { |
|---|
| 116 | die "file $file could not be read. aborting.\n" if $die_on_failure; |
|---|
| 117 | return; |
|---|
| 118 | } |
|---|
| 119 | local (@ARGV, $/) = ($file); |
|---|
| 120 | no warnings 'all'; eval <>; die "$@" if $@; |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | ###################################################################################################### |
|---|
| 124 | |
|---|
| 125 | sub parse_command_line |
|---|
| 126 | { |
|---|
| 127 | GetOptions( |
|---|
| 128 | 'channels_file=s' => \$opt->{channels_file}, |
|---|
| 129 | 'config=s' => \$opt->{config_file}, |
|---|
| 130 | 'action=s' => \$opt->{action}, |
|---|
| 131 | 'region=i' => \$opt->{region}, |
|---|
| 132 | 'nodbalookup' => \$opt->{nodbalookup}, |
|---|
| 133 | 'dbaurl=s' => \$opt->{dbaurl}, |
|---|
| 134 | 'notag' => \$opt->{notag}, |
|---|
| 135 | 'days=i' => \$opt->{days}, # ignored |
|---|
| 136 | 'offset=i' => \$opt->{offset}, # ignored |
|---|
| 137 | 'output=s' => \$opt->{output_file}, |
|---|
| 138 | 'debug+' => \$opt->{debug}, |
|---|
| 139 | 'help' => \$opt->{help}, |
|---|
| 140 | 'h' => \$opt->{help}, |
|---|
| 141 | 'v' => \$opt->{version}, |
|---|
| 142 | 'version' => \$opt->{version}); |
|---|
| 143 | |
|---|
| 144 | if ($opt->{help}) { |
|---|
| 145 | print<<EOF |
|---|
| 146 | |
|---|
| 147 | options: |
|---|
| 148 | --channels_file=(f) shepherd channels file (mandatory) |
|---|
| 149 | --region=(region) region where we are (mandatory) (default: $opt->{region}) |
|---|
| 150 | (VIC) 90,93,94,95,98 |
|---|
| 151 | (NSW/ACT) 63,66,67,69,71,73,106,184 |
|---|
| 152 | (QLD) 75,78,79,114 |
|---|
| 153 | (SA) 81,82,83,85,86,107 |
|---|
| 154 | (WA) 101,102 |
|---|
| 155 | (no HDTV in TAS or NT .. apparently) |
|---|
| 156 | |
|---|
| 157 | --nodbalookup don't look up programmes on DBA website (default: do) |
|---|
| 158 | --dbaurl=(url) URL where dba lists HDTV programming (default: $opt->{dbaurl}) |
|---|
| 159 | |
|---|
| 160 | --action=(policy) policy can be one of: |
|---|
| 161 | move - move prog to HD channel (remove it from SD channel) |
|---|
| 162 | copy - copy prog to HD channel (keep it in SD channel also) (default) |
|---|
| 163 | |
|---|
| 164 | --notag don't tag with HD flags (default: do) |
|---|
| 165 | |
|---|
| 166 | --output=(f) output XMLTV filename (default: $opt->{output}) |
|---|
| 167 | |
|---|
| 168 | --debug increase debug level |
|---|
| 169 | |
|---|
| 170 | EOF |
|---|
| 171 | ; |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | exit 0 if (($opt->{version}) || ($opt->{help})); |
|---|
| 175 | |
|---|
| 176 | |
|---|
| 177 | die "'--channels_file {file}' must be specified. see --help for details.\n" if (!$opt->{channels_file}); |
|---|
| 178 | die "unknown --action policy, see --help for details.\n" if ($opt->{action} !~ /^(copy|move)$/); |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | ###################################################################################################### |
|---|
| 182 | |
|---|
| 183 | sub decode_region |
|---|
| 184 | { |
|---|
| 185 | my $r = shift; |
|---|
| 186 | |
|---|
| 187 | if ($r =~ /^(90|93|94|95|98)$/) { |
|---|
| 188 | $opt->{locality} = "M"; |
|---|
| 189 | } elsif ($r =~ /^(101|102)$/) { |
|---|
| 190 | $opt->{locality} = "W"; |
|---|
| 191 | } elsif ($r =~ /^(63|66|67|69|71|73|76|106|184)$/) { |
|---|
| 192 | $opt->{locality} = "S"; |
|---|
| 193 | } elsif ($r =~ /^(75|78|79|114)$/) { |
|---|
| 194 | $opt->{locality} = "B"; |
|---|
| 195 | } elsif ($r =~ /^(81|82|83|85|86|107)$/) { |
|---|
| 196 | $opt->{locality} = "A"; |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | if (!defined $opt->{locality}) { |
|---|
| 200 | print " - region $r doesn't receive HDTV, not moving channels.\n"; |
|---|
| 201 | $stats{no_hdtv_region}++; |
|---|
| 202 | } |
|---|
| 203 | } |
|---|
| 204 | |
|---|
| 205 | ###################################################################################################### |
|---|
| 206 | |
|---|
| 207 | sub lookup_dba |
|---|
| 208 | { |
|---|
| 209 | my $data = &get_url($opt->{dbaurl},"Obtaining DBA data",0); |
|---|
| 210 | return if (!$data); |
|---|
| 211 | |
|---|
| 212 | my $tree = HTML::TreeBuilder->new_from_content($data); |
|---|
| 213 | return if (!$tree); |
|---|
| 214 | |
|---|
| 215 | # each daily section starts with: |
|---|
| 216 | # <TABLE class=tv... |
|---|
| 217 | # so we'll search for that and populate an array |
|---|
| 218 | my @chan_map; |
|---|
| 219 | for ($tree->look_down('_tag' => 'table', 'class' => 'tv')) { |
|---|
| 220 | my $firstrowdone=0; |
|---|
| 221 | my $dayname = ""; |
|---|
| 222 | |
|---|
| 223 | foreach my $row ($_->look_down('_tag' => 'tr')) { |
|---|
| 224 | if ($firstrowdone == 0) { # Channel list is the first row |
|---|
| 225 | $firstrowdone = 1; |
|---|
| 226 | my $colnum = 0; |
|---|
| 227 | for my $channel ($row->look_down('_tag' => 'td')) { |
|---|
| 228 | my $this_chan = $channel->as_text; |
|---|
| 229 | $colnum++; |
|---|
| 230 | $this_chan =~ s/[^\x20-\x7f]/ /g; |
|---|
| 231 | $this_chan =~ s/(^\s+|\s+$)//g; |
|---|
| 232 | next if ($this_chan eq ""); |
|---|
| 233 | |
|---|
| 234 | $this_chan = $chan_name_translations->{$this_chan} |
|---|
| 235 | if (defined $chan_name_translations->{$this_chan}); |
|---|
| 236 | |
|---|
| 237 | if (!defined $channels->{$this_chan}) { |
|---|
| 238 | printf "DBA listed unkown channel '$this_chan'! Ignored.\n"; |
|---|
| 239 | $stats{unknown_channels}++; |
|---|
| 240 | next; |
|---|
| 241 | } |
|---|
| 242 | |
|---|
| 243 | $chan_map[$colnum] = $this_chan; |
|---|
| 244 | #printf "got channel '$this_chan' in column $colnum\n"; |
|---|
| 245 | } |
|---|
| 246 | next; |
|---|
| 247 | } |
|---|
| 248 | |
|---|
| 249 | # These must be the program details |
|---|
| 250 | my $colnum = 0; |
|---|
| 251 | foreach my $session ($row->look_down('_tag' => 'td')) { |
|---|
| 252 | $colnum++; |
|---|
| 253 | if ($colnum == 1) { |
|---|
| 254 | # This session has the day name in it |
|---|
| 255 | my $newdayname = $session->as_text(); |
|---|
| 256 | $newdayname =~ s/Night//; |
|---|
| 257 | $newdayname =~ s/Day//; |
|---|
| 258 | $newdayname =~ s/Sat/Saturday/; |
|---|
| 259 | $newdayname = "" if ($dayname =~ m/Subject to avail/i); |
|---|
| 260 | |
|---|
| 261 | $dayname = $newdayname if ($newdayname =~ m/day$/); |
|---|
| 262 | next; |
|---|
| 263 | } |
|---|
| 264 | |
|---|
| 265 | my $this_chan = $chan_map[$colnum]; |
|---|
| 266 | |
|---|
| 267 | my $sessionshows = $session->as_HTML(); |
|---|
| 268 | $sessionshows =~ s/<p>/<br \/>/g; # Convert new paragraphs to newlines because they're not always consistent |
|---|
| 269 | $sessionshows =~ s/<\/p>//g; |
|---|
| 270 | $sessionshows =~ s/<u>//g; |
|---|
| 271 | $sessionshows =~ s/<\/u>//g; |
|---|
| 272 | $sessionshows =~ s/<\/td>//g; |
|---|
| 273 | $sessionshows =~ s/<\/font>//g; |
|---|
| 274 | $sessionshows =~ s/<font.*?>//g; |
|---|
| 275 | $sessionshows =~ s/<td.*?>//g; |
|---|
| 276 | #print "\nFound " . $session->as_HTML() . "\n\n"; |
|---|
| 277 | my @shows = split(/<br[\s\/]*>/,$sessionshows); |
|---|
| 278 | foreach my $show (@shows) { |
|---|
| 279 | # Bodgy stuff to fix html markup |
|---|
| 280 | $show =~ s/\&/\&/g; $show =~ s/\'/\'/g; $show =~ s/\"/\"/g; |
|---|
| 281 | $show =~ s/\¸/\,/g; |
|---|
| 282 | $show =~ s/\'/\'/g; |
|---|
| 283 | $show =~ s/\ /\ /g; |
|---|
| 284 | |
|---|
| 285 | # Remove trailing asterisk - just means could be widescreen if content is in widescreen - duh! |
|---|
| 286 | $show =~ s/\*$//; |
|---|
| 287 | |
|---|
| 288 | # High Definition and locality |
|---|
| 289 | my $hd = 0; |
|---|
| 290 | if ($show =~ m/\(HD\)\s*? \(([SMAPB])\)/) { |
|---|
| 291 | $hd = 1 if ($1 eq $opt->{locality}); |
|---|
| 292 | $show =~ s/\(HD\)\s*? \([SMAPB]\)//; |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | |
|---|
| 296 | # High Definition |
|---|
| 297 | if ($show =~ m/\(HD\)/ || $show =~ m/\(JHD\)/ ) { |
|---|
| 298 | $hd = 1; |
|---|
| 299 | $show =~ s/\(HD\)//; |
|---|
| 300 | $show =~ s/\(JHD\)//; # JHD was probably a typo but we've included it here because they'll probably have fat fingers again |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | # Dolby Digital 5.1 |
|---|
| 304 | my $dd51 = 0; |
|---|
| 305 | if ($show =~ m/DD 5.1/ || $show =~ m/DD5.1/) { |
|---|
| 306 | $dd51 = 1; |
|---|
| 307 | $show =~ s/\[\+ DD 5.1 sound\]//; |
|---|
| 308 | $show =~ s/\[\+ DD 5.1 Audio\]//; |
|---|
| 309 | $show =~ s/\[\+ DD 5.1 audio\]//; |
|---|
| 310 | $show =~ s/\[\+DD 5.1 audio\]//; |
|---|
| 311 | $show =~ s/\[\+DD 5.1 Audio\]//; |
|---|
| 312 | $show =~ s/\[\+DD 5.1 sound\]//; |
|---|
| 313 | $show =~ s/\[\+DD 5.1 sound\]//; |
|---|
| 314 | $show =~ s/\[\+DD5.1 sound\]//; |
|---|
| 315 | $show =~ s/\[\+DD5.1\]//; |
|---|
| 316 | $show =~ s/\[\+ DD 5.1\]//; |
|---|
| 317 | $show =~ s/\[\DD5.1\]//; |
|---|
| 318 | $show =~ s/\[\DD 5.1\]//; |
|---|
| 319 | } |
|---|
| 320 | $show =~ s/\s+$//; |
|---|
| 321 | $show =~ s/^\s+//; |
|---|
| 322 | |
|---|
| 323 | if (defined($translation->{$show})) { |
|---|
| 324 | $show = $translation->{$show}; |
|---|
| 325 | $stats{manual_translations}++; |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | next if (length($show) == 0); |
|---|
| 329 | |
|---|
| 330 | # Widescreen handling |
|---|
| 331 | # Now we're here, this show must be in widescreen - we don't (yet) do anything with widescreen in mythtv |
|---|
| 332 | |
|---|
| 333 | # DD5.1 |
|---|
| 334 | if ($dd51 == 1) { |
|---|
| 335 | # This show is is in DD5.1 - we don't (yet) do anything wiht DD5.1 in mythtv |
|---|
| 336 | ; |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | # High Definition Handling |
|---|
| 340 | if ($hd == 1) { |
|---|
| 341 | #print "HD Channel $this_chan $dayname \"$show\"\n" if $opt->{debug}; |
|---|
| 342 | my $xmlid = $channels->{$this_chan}; |
|---|
| 343 | $d->{prog}->{$xmlid}->{$show}->{chan} = $this_chan; |
|---|
| 344 | $d->{prog}->{$xmlid}->{$show}->{days}->{$dayname} = 1; |
|---|
| 345 | $stats{parsed_hd_progs}++; |
|---|
| 346 | } else { |
|---|
| 347 | #print "SD Widescreen Channel $channels[$channelnumber]: $dayname $show \n" if $opt->{debug}; |
|---|
| 348 | $stats{parsed_sd_widescreen_progs}++; |
|---|
| 349 | } |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | } |
|---|
| 353 | } |
|---|
| 354 | } |
|---|
| 355 | |
|---|
| 356 | ###################################################################################################### |
|---|
| 357 | # logic to fetch a page via http |
|---|
| 358 | # retries up to 3 times to get a page with 5 second pauses inbetween |
|---|
| 359 | |
|---|
| 360 | sub get_url |
|---|
| 361 | { |
|---|
| 362 | my($url,$status,$dontretry) = (@_); |
|---|
| 363 | my $response; |
|---|
| 364 | my $attempts = 0; |
|---|
| 365 | my ($raw, $page, $base); |
|---|
| 366 | |
|---|
| 367 | my $request = HTTP::Request->new(GET => $url); |
|---|
| 368 | $request->header('Accept-Encoding' => 'gzip'); |
|---|
| 369 | |
|---|
| 370 | print "$status $url\n"; |
|---|
| 371 | |
|---|
| 372 | my $max_retries = 1 if ($dontretry); |
|---|
| 373 | |
|---|
| 374 | for (1..3) { |
|---|
| 375 | $response = $ua->request($request); |
|---|
| 376 | last if ($response->is_success || $dontretry); |
|---|
| 377 | |
|---|
| 378 | $stats{http_failed_requests}++; |
|---|
| 379 | $attempts++; |
|---|
| 380 | &sleepy("attempt $attempts failed (url $url), sleeping for 10 seconds",10); |
|---|
| 381 | } |
|---|
| 382 | if (!($response->is_success)) { |
|---|
| 383 | if ($dontretry == 0) { |
|---|
| 384 | printf "ERROR: could not open url %s in %d attempts\n",$url,$attempts; |
|---|
| 385 | } |
|---|
| 386 | return undef; |
|---|
| 387 | } |
|---|
| 388 | |
|---|
| 389 | $stats{bytes_fetched} += do {use bytes; length($response->content)}; |
|---|
| 390 | $stats{http_successful_requests}++; |
|---|
| 391 | |
|---|
| 392 | if ($response->header('Content-Encoding') && |
|---|
| 393 | $response->header('Content-Encoding') eq 'gzip') { |
|---|
| 394 | $stats{compressed_pages} += do {use bytes; length($response->content)}; |
|---|
| 395 | $response->content(Compress::Zlib::memGunzip($response->content)); |
|---|
| 396 | } |
|---|
| 397 | |
|---|
| 398 | if ($response->header('Content-type') eq 'xapplication/ywe-octet-stream') { |
|---|
| 399 | $stats{transformed_pages}++; |
|---|
| 400 | $base = &transform_output(length($response->content), $response->content); |
|---|
| 401 | } else { |
|---|
| 402 | $base = $response->content; |
|---|
| 403 | } |
|---|
| 404 | return $base; |
|---|
| 405 | } |
|---|
| 406 | |
|---|
| 407 | ###################################################################################################### |
|---|
| 408 | |
|---|
| 409 | sub print_stats |
|---|
| 410 | { |
|---|
| 411 | printf "%s v%s completed", $progname, $version; |
|---|
| 412 | foreach my $key (sort keys %stats) { |
|---|
| 413 | printf ", %d %s",$stats{$key},$key; |
|---|
| 414 | } |
|---|
| 415 | printf "\n"; |
|---|
| 416 | } |
|---|
| 417 | |
|---|
| 418 | ###################################################################################################### |
|---|
| 419 | |
|---|
| 420 | sub channel_cb( $ ) |
|---|
| 421 | { |
|---|
| 422 | my $c = shift; |
|---|
| 423 | # printf "got channel ".Dumper($c); |
|---|
| 424 | $writer->write_channel($c); |
|---|
| 425 | } |
|---|
| 426 | |
|---|
| 427 | ###################################################################################################### |
|---|
| 428 | |
|---|
| 429 | sub programme_cb( $ ) |
|---|
| 430 | { |
|---|
| 431 | my $prog=shift; |
|---|
| 432 | my $this_chan = $prog->{channel}; |
|---|
| 433 | my $this_title; |
|---|
| 434 | my $hd_chan_base; |
|---|
| 435 | |
|---|
| 436 | $this_title = $prog->{title}->[0]->[0] |
|---|
| 437 | if (($prog->{title}) && ($prog->{title}->[0]) && ($prog->{title}->[0]->[0])); |
|---|
| 438 | # print "got programme ".Dumper($prog); |
|---|
| 439 | |
|---|
| 440 | goto NOMATCH if (!defined $this_chan); |
|---|
| 441 | goto NOMATCH if (!defined $this_title); |
|---|
| 442 | goto NOMATCH if (!defined $d->{prog}->{$this_chan}); |
|---|
| 443 | |
|---|
| 444 | # if programme is already marked as HDTV, just skip all of this |
|---|
| 445 | if ((defined $prog->{video}) && (defined $prog->{video}->{quality}) && |
|---|
| 446 | ($prog->{video}->{quality} =~ /hdtv/i)) { |
|---|
| 447 | printf " matched '$this_title' (pre-marked)\n" if (!defined $d->{seenprog}->{$this_title}); |
|---|
| 448 | $stats{prog_already_marked}++; |
|---|
| 449 | goto MATCH; |
|---|
| 450 | } |
|---|
| 451 | |
|---|
| 452 | # see if title matches everything we have seen on this channel |
|---|
| 453 | foreach my $t (keys %{($d->{prog}->{$this_chan})}) { |
|---|
| 454 | if (canonicalizeTitles_match($t,$this_title)) { |
|---|
| 455 | # match! |
|---|
| 456 | printf " matched '$t' to '$this_title' (canonical match)\n" if (!defined $d->{seenprog}->{$this_title}); |
|---|
| 457 | $stats{matched_prog}++; |
|---|
| 458 | $hd_chan_base = $d->{prog}->{$this_chan}->{$t}->{chan}."HD"; |
|---|
| 459 | goto MATCH; |
|---|
| 460 | } |
|---|
| 461 | } |
|---|
| 462 | |
|---|
| 463 | # see if title matches something in our override table |
|---|
| 464 | foreach my $ch (keys %{($hdoverride)}) { |
|---|
| 465 | next if ($channels->{$ch} ne $this_chan); |
|---|
| 466 | foreach my $t (@{($hdoverride->{$ch})}) { |
|---|
| 467 | if (canonicalizeTitles_match($t,$this_title)) { |
|---|
| 468 | # match! |
|---|
| 469 | printf " matched '$t' to '$this_title' (hd override)\n" if (!defined $d->{seenprog}->{$this_title}); |
|---|
| 470 | $stats{override_matched_prog}++; |
|---|
| 471 | $hd_chan_base = $ch."HD"; |
|---|
| 472 | goto MATCH; |
|---|
| 473 | } |
|---|
| 474 | } |
|---|
| 475 | } |
|---|
| 476 | |
|---|
| 477 | NOMATCH: |
|---|
| 478 | $writer->write_programme($prog); |
|---|
| 479 | return; |
|---|
| 480 | |
|---|
| 481 | MATCH: |
|---|
| 482 | $d->{seenprog}->{$this_title}++; |
|---|
| 483 | $prog->{video}->{aspect} = "16:9"; # widescreen |
|---|
| 484 | |
|---|
| 485 | if (!defined $opt_channels->{$hd_chan_base}) { |
|---|
| 486 | # no HD variant |
|---|
| 487 | $prog->{video}->{quality} = "HDTV" unless (defined $opt->{notag}); |
|---|
| 488 | |
|---|
| 489 | $writer->write_programme($prog); |
|---|
| 490 | $stats{rewrote_prog_as_hdtv_inplace}++; |
|---|
| 491 | } else { |
|---|
| 492 | if ($opt->{action} =~ /copy/) { |
|---|
| 493 | $writer->write_programme($prog); |
|---|
| 494 | $stats{rewrote_prog_as_hdtv_inplace}++; |
|---|
| 495 | } |
|---|
| 496 | |
|---|
| 497 | # populate this programme in HD channel |
|---|
| 498 | $prog->{channel} = $opt_channels->{$hd_chan_base}; |
|---|
| 499 | $prog->{video}->{quality} = "HDTV" unless (defined $opt->{notag}); |
|---|
| 500 | $writer->write_programme($prog); |
|---|
| 501 | $stats{wrote_prog_into_hdtv_channel}++ |
|---|
| 502 | } |
|---|
| 503 | return; |
|---|
| 504 | } |
|---|
| 505 | |
|---|
| 506 | ###################################################################################################### |
|---|
| 507 | |
|---|
| 508 | sub canonicalizeTitle |
|---|
| 509 | { |
|---|
| 510 | my $title=shift; |
|---|
| 511 | $title =~ s/^\s+//; |
|---|
| 512 | $title =~ s/\s+$//; |
|---|
| 513 | $title =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; |
|---|
| 514 | $title =~ s/ *\& */ and /g; |
|---|
| 515 | $title =~ s/[^ a-zA-Z0-9]//g; |
|---|
| 516 | $title =~ s/\s+/ /; |
|---|
| 517 | return(lc($title)); |
|---|
| 518 | } |
|---|
| 519 | |
|---|
| 520 | my %alternatives = ( one => "1", |
|---|
| 521 | two => "2", |
|---|
| 522 | to => "2", |
|---|
| 523 | too => "2", |
|---|
| 524 | three => "3", |
|---|
| 525 | four => "4", |
|---|
| 526 | for => "4", |
|---|
| 527 | five => "5", |
|---|
| 528 | six => "6", |
|---|
| 529 | seven => "7", |
|---|
| 530 | eight => "8", |
|---|
| 531 | nine => "9" |
|---|
| 532 | ); |
|---|
| 533 | |
|---|
| 534 | sub forgivingMatch |
|---|
| 535 | { |
|---|
| 536 | my $word1 = shift; |
|---|
| 537 | my $word2 = shift; |
|---|
| 538 | |
|---|
| 539 | # exact match |
|---|
| 540 | return 1 if $word1 eq $word2; |
|---|
| 541 | # they match according to our alternatives lookup table |
|---|
| 542 | return 1 if $alternatives{$word1} && $alternatives{$word1} eq $word2 || |
|---|
| 543 | $alternatives{$word2} && $alternatives{$word2} eq $word1; |
|---|
| 544 | # irreconcilable differences |
|---|
| 545 | return 0 if abs(length($word1) - length($word2)) > 1 || |
|---|
| 546 | length($word1) < 3; |
|---|
| 547 | |
|---|
| 548 | my @list1 = split(//,$word1); |
|---|
| 549 | my @list2 = split(//,$word2); |
|---|
| 550 | my $i = 0; |
|---|
| 551 | my $j = 0; |
|---|
| 552 | |
|---|
| 553 | # find first difference |
|---|
| 554 | while ($i < @list1 && $j < @list2 && $list1[$i] eq $list2[$j]) { |
|---|
| 555 | ++$i; ++$j; |
|---|
| 556 | } |
|---|
| 557 | if ($i+1 < @list1 && $j+1 < @list2) { |
|---|
| 558 | # at least 2 chars to go in both words |
|---|
| 559 | if ($list1[$i+1] eq $list2[$j] && $list1[$i] eq $list2[$j+1]) { |
|---|
| 560 | # characters transposed |
|---|
| 561 | $i += 2; |
|---|
| 562 | $j += 2; |
|---|
| 563 | } elsif ($list1[$i+1] eq $list2[$j]) { |
|---|
| 564 | # extra character inserted into @list1 |
|---|
| 565 | $i += 2; |
|---|
| 566 | ++$j; |
|---|
| 567 | } elsif ($list1[$i] eq $list2[$j+1]) { |
|---|
| 568 | # extra character inserted into @list2 |
|---|
| 569 | ++$i; |
|---|
| 570 | $j += 2; |
|---|
| 571 | } else { |
|---|
| 572 | # single character difference |
|---|
| 573 | ++$i; |
|---|
| 574 | ++$j; |
|---|
| 575 | } |
|---|
| 576 | # we forgave one difference; now do rest of strings match exactly? |
|---|
| 577 | while ($i < @list1 && $j < @list2 && $list1[$i] eq $list2[$j]) { |
|---|
| 578 | ++$i; ++$j; |
|---|
| 579 | } |
|---|
| 580 | return($i == @list1 && $j == @list2); |
|---|
| 581 | } elsif ($i == @list1 || $j == @list2) { |
|---|
| 582 | # only difference is one word has one extra letter, or last char |
|---|
| 583 | # of each word differ. That's still only one one-char difference |
|---|
| 584 | return(1); |
|---|
| 585 | } |
|---|
| 586 | } |
|---|
| 587 | |
|---|
| 588 | sub canonicalizeTitles_match |
|---|
| 589 | { |
|---|
| 590 | my $word1=canonicalizeTitle(shift); |
|---|
| 591 | my $word2 =canonicalizeTitle(shift); |
|---|
| 592 | my @longer; |
|---|
| 593 | my @shorter; |
|---|
| 594 | |
|---|
| 595 | if (length($word1) > length($word2)) { |
|---|
| 596 | @longer = split(/\s+/, $word1); |
|---|
| 597 | @shorter = split(/\s+/, $word2); |
|---|
| 598 | } else { |
|---|
| 599 | @shorter = split(/\s+/, $word1); |
|---|
| 600 | @longer = split(/\s+/, $word2); |
|---|
| 601 | } |
|---|
| 602 | |
|---|
| 603 | WORD: for my $word (@shorter) { |
|---|
| 604 | for(my $i=0; $i < @longer; ++$i) { |
|---|
| 605 | if (forgivingMatch($longer[$i], $word)) { |
|---|
| 606 | splice(@longer,$i,1); |
|---|
| 607 | next WORD; |
|---|
| 608 | } elsif ($i+1 < @longer && |
|---|
| 609 | $word eq "$longer[$i]$longer[$i+1]") { |
|---|
| 610 | splice(@longer,$i,2); |
|---|
| 611 | next WORD; |
|---|
| 612 | } |
|---|
| 613 | } |
|---|
| 614 | return(0); |
|---|
| 615 | } |
|---|
| 616 | return(1); |
|---|
| 617 | } |
|---|
| 618 | |
|---|
| 619 | ############################################################################## |
|---|