Changeset 1294
- Timestamp:
- 08/31/10 17:55:30 (17 months ago)
- Location:
- trunk
- Files:
-
- 2 modified
-
grabbers/abc_website (modified) (17 diffs)
-
status (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
trunk/grabbers/abc_website
r1248 r1294 1 1 #!/usr/bin/perl -w 2 2 3 # ABC1/ABC2/ABC3 au_tv guide grabber - runs from "Shepherd" master grabber3 # ABC1/ABC2/ABC3/ABC News 24 au_tv guide grabber - runs from "Shepherd" master grabber 4 4 # * written by ltd 5 # * uses ABC website for ABC1 ABC2 ABC3 data5 # * uses ABC website for ABC1 ABC2 ABC3 ABC News 24 data 6 6 # * when used in conjunction with Shepherd, shepherd can collect other channels 7 7 # using other grabbers … … 20 20 # 3.04 12jun07 abc website format change 21 21 # 3.23 01dec09 added abc3 22 # 4.00 14aug10 new website format, much redesign, added abc4/abchd/"ABC News 24" 22 23 23 24 use strict; 24 25 25 26 my $progname = "abc_website"; 26 my $version = " 3.23";27 my $version = "4.00"; 27 28 28 29 use XMLTV; … … 36 37 # 37 38 my $urls; 38 $urls->{station_close}->{ABC1} = "http://www.abc.net.au/tv/guide/abctvweekguide.htm"; 39 $urls->{station_close}->{ABC2} = "http://www.abc.net.au/tv/guide/abc2weekguide.htm"; 40 $urls->{station_close}->{ABC3} = "http://www.abc.net.au/tv/guide/abc3weekguide.htm"; 41 $urls->{guide}->{ABC1} = "http://www.abc.net.au/tv/guide/"; 42 $urls->{guide}->{ABC2} = "http://www.abc.net.au/tv/guide/abc2"; 43 $urls->{guide}->{ABC3} = "http://www.abc.net.au/tv/guide/abc3"; 39 $urls->{station_close}->{"ABC1"} = "http://www.abc.net.au/tv/guide/abc1-7day-guide.htm"; 40 $urls->{station_close}->{"ABC2"} = "http://www.abc.net.au/tv/guide/abc2-7day-guide.htm"; 41 $urls->{station_close}->{"ABC3"} = "http://www.abc.net.au/tv/guide/abc3-7day-guide.htm"; 42 $urls->{station_close}->{"ABC News 24"} = "http://www.abc.net.au/tv/guide/abcnews24-7day-guide.htm"; 43 $urls->{guide}->{"ABC"} = "http://www.abc.net.au/tv/guide/all/"; 44 44 45 45 # … … 55 55 my $writer; 56 56 $| = 1; 57 58 my %webcache; 59 my %chanID_to_divID = ( 60 'ABC1' => 'epgAbc1', 61 'ABC2' => 'epgAbc2', 62 'ABC3' => 'epgAbc3', 63 'ABC News 24' => 'epgAbc4', 64 ); 57 65 58 66 # … … 114 122 if ($opt_version || $opt_desc) { 115 123 printf "%s %s\n",$progname,$version; 116 printf "%s is a details-aware grabber that collects decent quality data using the ABC website for ABC1/ABC2/ABC3 .",$progname if $opt_desc;124 printf "%s is a details-aware grabber that collects decent quality data using the ABC website for ABC1/ABC2/ABC3/ABC News 24.",$progname if $opt_desc; 117 125 exit(0); 118 126 } … … 134 142 # figure out region, "netw" is national 135 143 my $abc_region = lc(Shepherd::Common::which_state($region)); 136 $urls->{guide}->{ABC 1} .= $abc_region;144 $urls->{guide}->{ABC} .= $abc_region; 137 145 138 146 # read channels file … … 141 149 no warnings 'all'; eval <>; die "$@" if $@; 142 150 143 die "nothing to do; neither ABC1 nor ABC2 nor ABC3 in channels lineup!\n" 144 if ((!defined $channels->{ABC1}) && (!defined $channels->{ABC2}) && (!defined $channels->{ABC3})); 151 die "nothing to do; none of ABC1, ABC2, ABC3 or ABC News 24 in channels lineup!\n" 152 if ( (!defined $channels->{"ABC1"}) && 153 (!defined $channels->{"ABC2"}) && 154 (!defined $channels->{"ABC3"}) && 155 (!defined $channels->{"ABC News 24"}) && 156 1); 145 157 146 158 # check XMLTV version for HDTV compatability … … 152 164 } 153 165 154 &log(sprintf "Going to %s%s %s%d%s days%s of data for ABC1(%s), ABC2(%s), ABC3(%s) into %s (%s)",166 &log(sprintf "Going to %s%s %s%d%s days%s of data for ABC1(%s), ABC2(%s), ABC3(%s) or ABC News 24(%s) into %s (%s)", 155 167 ($opt_gaps_file ne "" ? "micro-gap " : ""), 156 168 ($opt_cheap ? "verify (cache-validate)" : "grab"), … … 159 171 ($opt_do_extra_days ? " to 28" : ""), 160 172 ($opt_offset ? " (skipping first $opt_offset days)" : ""), 161 (defined $channels->{ABC1} ? "yes:$abc_region" : "no"), 162 (defined $channels->{ABC2} ? "yes" : "no"), 163 (defined $channels->{ABC3} ? "yes" : "no"), 173 (defined $channels->{"ABC1"} ? "yes:$abc_region" : "no"), 174 (defined $channels->{"ABC2"} ? "yes" : "no"), 175 (defined $channels->{"ABC3"} ? "yes" : "no"), 176 (defined $channels->{"ABC News 24"} ? "yes" : "no"), 164 177 $opt_outputfile, 165 178 ($opt_no_cache ? "without caching" : "with caching")); … … 179 192 $writer->start( { 'source-info-name' => "$progname $version", 'generator-info-name' => "$progname $version"} ); 180 193 181 foreach my $ch ("ABC1","ABC2","ABC3" ) {194 foreach my $ch ("ABC1","ABC2","ABC3","ABC News 24") { 182 195 $writer->write_channel( { 'display-name' => [[ $ch, $lang ]], 'id' => $channels->{$ch} } ) if (defined $channels->{$ch}); 183 196 } 184 197 185 foreach my $ch ("ABC1","ABC2", "ABC3") {198 foreach my $ch ("ABC1","ABC2", "ABC3","ABC News 24") { 186 199 &get_station_close($ch, $urls->{station_close}->{$ch}); 187 &get_abc_data($channels->{$ch}, $urls->{guide}->{ $ch},$ch);200 &get_abc_data($channels->{$ch}, $urls->{guide}->{ABC},$ch); 188 201 } 189 202 … … 341 354 } 342 355 356 # fetch this URL once, and cache it for later 357 # TODO: maybe we should cache the tree, to save processing 358 # at the expense of memory 343 359 my $url = sprintf "%s/%s.htm",$urlbase, POSIX::strftime("%Y%m/%Y%m%d",localtime($currtime)); 344 360 … … 346 362 &log((sprintf "Fetching %s summary data: day %d of %d", 347 363 $xmlid, $daynum, $opt_days )); 348 my $data = Shepherd::Common::get_url(url => $url, retries => ($tries-1)); 364 my $data=$webcache{$url}; 365 if (defined $data) { 366 &log("reusing previously fetched data for $url"); 367 } else { 368 $data = Shepherd::Common::get_url(url => $url, retries => ($tries-1)); 369 $webcache{$url}=$data; 370 } 349 371 my $tree; 350 372 $tree = HTML::TreeBuilder->new_from_content($data) if ($data); … … 366 388 my $seen_pm = 0; 367 389 368 for ($tree->look_down('_tag' => 'div', 'id' => 'epgWrap')) { 369 foreach my $tree_tr ($_->look_down('_tag' => 'tr')) { 370 my $tree_tr_class = $tree_tr->attr('class'); 371 next if ((!defined $tree_tr_class) || ($tree_tr_class !~ /alt/i)); 372 373 if (my $tree_row = $tree_tr->look_down('_tag' => 'th')) { 374 if ($tree_row->as_text() =~ /^(\d+):(\d+)(.)m/) { 375 $timeattr[2] = $1; # hour 376 $timeattr[1] = $2; # min 377 378 if ($3 eq "p") { 379 # pm 380 $timeattr[2] += 12 if ($timeattr[2] != 12); 381 $seen_pm = 1; 390 # goes like this, for each channel: 391 # 392 # <div id="epgAbc1" class="epgRowWrap"> 393 # <li style="width: 955px; "> 394 # <span class="itemTitle"> 395 # <a href=...>Lateline</a> 396 # </span> 397 # <span class="itemTime">12:20pm</span> 398 # <span class="itemDesc">Current affairs program bla bla bla</span> 399 # </li> 400 # <li style="..."> 401 # ... 402 # </li> 403 # </div> 404 # 405 # each <div id="epgAbcN" class="epgRowWrap"> is a channel 406 # each <li> is a programme 407 # each <span> is an attribute for the programme 408 # 409 # we need to extract $programme, $found_time, $progurl 410 411 my $divID=$chanID_to_divID{$chan_id}; 412 if (!$divID) { 413 &log("failed to convert channel ID $chan_id to a <div> identifier"); 414 die("failed to convert channel ID $chan_id to a <div> identifier"); 415 } 416 417 for ($tree->look_down('_tag' => 'div', 'id' => $divID, 'class' => 'epgRowWrap')) { 418 # we have a channel 419 foreach my $tree_li ($_->look_down('_tag' => 'li')) { 420 # we have a programme 421 my %proghash; 422 my $programme; 423 my $progurl; 424 # scan through all the <span> entries looking for our data 425 foreach my $tree_span ($tree_li->look_down('_tag' => 'span')) { 426 # we have a programme attribute 427 my $tree_span_class = $tree_span->attr('class'); 428 next if !defined $tree_span_class; 429 my $tree_span_value=$tree_span->as_text; 430 if ($tree_span_class eq "itemTitle") { 431 # have to get the link within the tag 432 # if it doesn't exist, must be "... Programs start at" 433 my $prog_a=$tree_span->look_down('_tag' => 'a'); 434 next unless defined($prog_a); 435 $programme=$tree_span_value; 436 $progurl = $prog_a->attr('href'); 437 } 438 $proghash{$tree_span_class}=$tree_span_value; 439 } 440 next unless $programme; 441 442 # we should now have all values we need 443 if ($proghash{itemTime} =~ /^(\d+):(\d+)(.)m/) { 444 $timeattr[2] = $1; # hour 445 $timeattr[1] = $2; # min 446 447 if ($3 eq "p") { 448 # pm 449 $timeattr[2] += 12 if ($timeattr[2] != 12); 450 $seen_pm = 1; 451 } 452 my $found_time = mktime(@timeattr); 453 454 # handle programmes that are after midnight 455 if (($seen_pm) && ($3 eq "a")) { 456 if ($timeattr[2] == 12) { 457 $found_time += (12*60*60); # 12:xx am 458 } else { 459 $found_time += (24*60*60); 382 460 } 383 my $found_time = mktime(@timeattr); 384 385 # handle programmes that are after midnight 386 if (($seen_pm) && ($3 eq "a")) { 387 if ($timeattr[2] == 12) { 388 $found_time += (12*60*60); # 12:xx am 389 } else { 390 $found_time += (24*60*60); 391 } 392 } 461 } 393 462 394 if ($tree_tr->look_down('_tag' => 'td', 'class' => 'prg')) { 395 foreach my $prog ($tree_tr->look_down('_tag' => 'a')) { 396 my $programme = $prog->as_text(); 397 my $progurl = $prog->attr('href'); 398 399 if ($progurl =~ /^\/tv\/guide\//) { 400 printf "day %d time '%s' (%s) prog: %s url: %s\n", 401 $daynum,$tree_row->as_text(),POSIX::strftime("%Y%m%d%H%M", localtime($found_time)), 402 $programme,$progurl if ($debug && $debug > 1); 403 404 $unprocessed_progname[$unprocessed_programmes] = $programme; 405 $unprocessed_starttime[$unprocessed_programmes] = $found_time; 406 $unprocessed_day[$unprocessed_programmes] = $daynum; 407 $unprocessed_url[$unprocessed_programmes] = "http://www.abc.net.au".$progurl; 408 $unprocessed_programmes++; 409 $seen_programmes++; 410 } else { 411 printf "ignoring prog %s because url %s is not a detail page\n", 412 $programme,$progurl if $debug; 413 } 414 } 415 } 463 if ($progurl =~ /^\/tv\/guide\//) { 464 printf "day %d time '%s' (%s) prog: %s url: %s\n", 465 #$daynum,$tree_row->as_text(),POSIX::strftime("%Y%m%d%H%M", localtime($found_time)), 466 $programme,$progurl if ($debug && $debug > 1); 467 468 $unprocessed_progname[$unprocessed_programmes] = $programme; 469 $unprocessed_starttime[$unprocessed_programmes] = $found_time; 470 $unprocessed_day[$unprocessed_programmes] = $daynum; 471 $unprocessed_url[$unprocessed_programmes] = "http://www.abc.net.au".$progurl; 472 $unprocessed_programmes++; 473 $seen_programmes++; 474 } else { 475 printf "ignoring prog %s because url %s is not a detail page\n", 476 $programme,$progurl if $debug; 416 477 } 417 478 } … … 552 613 } else { 553 614 if ((!$opt_cheap) && ($unprocessed_url[$i] ne "")) { 554 &get_one_abc_event($cache_key, $unprocessed_url[$i] );615 &get_one_abc_event($cache_key, $unprocessed_url[$i], $unprocessed_progname[$i]); 555 616 556 617 if (($stats{portal_detail_pages} % 25) == 1) { … … 596 657 sub get_one_abc_event 597 658 { 598 my ($cache_key, $url ) = @_;659 my ($cache_key, $url, $orig_title) = @_; 599 660 600 661 if ($stats{failed_to_fetch_portal_detail_page} >= 3 or $stats{failed_to_parse_portal_detail_page} >= 9) … … 614 675 Shepherd::Common::log("get_one_abc_event ".$url) if ($debug); 615 676 616 if (my $inner_tree = $tree->look_down('_tag' => 'div', 'id' => 'prgTitle')) { 677 # Parse the page to get $full_title, $prog_genre_text, $prog_desc 678 # and: 679 # $data_cache->{$cache_key}->{repeat} = 1; 680 # $data_cache->{$cache_key}->{cc} = 1; 681 # $data_cache->{$cache_key}->{hdtv} = 1; 682 # $data_cache->{$cache_key}->{rating} = $1; 683 # Alas, nothing seems to be listing HDTV. 684 # 685 # page is of this form: 686 # 687 # <div id="maincontent"> 688 # <div id="prgTop"> 689 # <h1>Name of Programme - Subtitle</h1> 690 # <div id="prgSubDetails"> 691 # <em id="prgGenre">Children's</em> 692 # <em id="prgRepeat">Repeat</em> 693 # <em id="prgCc">CC</em> 694 # <em id="prgRating">PG</em> 695 # <em id="prgDuration">15 mins</em> 696 # </div> 697 # </div> 698 # <div id="prgContent"> 699 # <h3 id="aboutPrg>About the Program</h3> 700 # <p>This program is about bla bla, bla bla.</p> 701 # </div> 702 # <div id="sideInfo"> 703 # <h4>Last Broadcast</h4> 704 # <p>8:00am Thu, August 7 on ABC3</p> 705 # </div> 706 # </div> 707 708 if (my $inner_tree = $tree->look_down('_tag' => 'div', 'id' => 'prgTop')) { 617 709 my $event_title = undef, my $event_subtitle = undef, my $event_description = undef, my $event_genre = undef; 710 my $event_repeat = undef, my $event_cc = undef, my $event_rating = undef; 618 711 619 712 if (my $prog_h1 = $inner_tree->look_down('_tag' => 'h1')) { 620 713 my $full_title = $prog_h1->as_HTML(); 621 714 $full_title =~ s/(^<h1>|<\/h1>$)//g; 622 ($event_title,$event_subtitle) = split(/ - /,$full_title); 623 624 $event_title =~ s/(<[a-zA-Z0-9]+\>)//g; # remove html tags 625 $event_title =~ s/(^\n|\n$)//g; # strip trailing/leading blank lines 626 Shepherd::Common::log(" - decoded title '".$event_title."'") if ($debug); 627 628 if ($event_subtitle) { 629 $event_subtitle =~ s/(<[\/a-zA-Z0-9]+\>)//g; # remove html tags 630 $event_subtitle =~ s/(^\n+|\n+$)//g; # strip trailing/leading blank lines 631 $event_subtitle =~ s/(^\s+|\s+$)//g; # strip trailing/leading blanks 715 chomp($full_title); 716 $full_title =~ s/\n$//s; # chomp doesn't seem to work 717 # TODO: check title shown here against the one from 718 # the overview, so as to avoid splitting on prognames 719 # with hiphens in them 720 ($event_title,$event_subtitle) = split(/ - /,$full_title); 721 722 # If the title on this page matches the title 723 # on the parent page, we assume that the subtitle 724 # heuristics are correct (not a hiphenated title). 725 # Alas the event title sometimes contains url-escaped 726 # characters (such as ' for single quote) and 727 # so it will sometimes differ. We need to unescape 728 # before comparing. 729 if (1 or $event_title eq $orig_title) { 730 731 $event_title =~ s/(<[a-zA-Z0-9]+\>)//g; # remove html tags 732 $event_title =~ s/(^\n|\n$)//g; # strip trailing/leading blank lines 733 Shepherd::Common::log(" - decoded title '".$event_title."' (cf '".$orig_title."')") if ($debug); 734 632 735 if ($event_subtitle) { 633 $data_cache->{$cache_key}->{subtitle} = $event_subtitle; 634 Shepherd::Common::log(" - decoded subtitle '".$event_subtitle."'") if ($debug); 736 $event_subtitle =~ s/(<[\/a-zA-Z0-9]+\>)//g; # remove html tags 737 $event_subtitle =~ s/(^\n+|\n+$)//g; # strip trailing/leading blank lines 738 $event_subtitle =~ s/(^\s+|\s+$)//g; # strip trailing/leading blanks 739 if ($event_subtitle) { 740 $data_cache->{$cache_key}->{subtitle} = $event_subtitle; 741 Shepherd::Common::log(" - decoded subtitle '".$event_subtitle."'") if ($debug); 742 } 635 743 } 636 } 637 } 638 639 if (my $prog_g = $inner_tree->look_down('_tag' => 'span', 'class' => 'smlTxt')) { 640 if (my $prog_genre_tag = $prog_g->look_down('_tag' => 'a')) { 641 my $prog_genre_text = $prog_genre_tag->as_text(); 744 } elsif (length $event_subtitle) { 745 &log("confused by hiphen in title: '$full_title'; event title '$event_title' is not the same as original title '$orig_title'"); 746 } 747 } 748 749 if (my $prog_sd = $inner_tree->look_down('_tag' => 'div', 'id' => 'prgSubDetails')) { 750 my %subdetails; 751 foreach my $em ($prog_sd->look_down('_tag' => 'em')) { 752 $subdetails{$em->attr('id')}=$em->as_text(); 753 } 754 if (my $prog_genre_text = $subdetails{'prgGenre'}) { 642 755 $data_cache->{$cache_key}->{genre} = Shepherd::Common::translate_category($prog_genre_text); 643 756 Shepherd::Common::log(" - decoded genre '$prog_genre_text'") if ($debug); 644 757 } 645 758 646 my $other_text = $prog_g->as_text(); 647 $other_text =~ s/(^\n|\n$)//g; # strip trailing/leading blank lines 648 $other_text =~ s/(^\s+|\s+$)//g; # strip trailing/leading spaces 649 650 if ($other_text =~ /^(.*)CC(.*)$/) { 759 if ($subdetails{'prgCc'}) { 651 760 $data_cache->{$cache_key}->{cc} = 1; 652 Shepherd::Common::log(" - decoded CC from '$other_text'") if ($debug); 653 $other_text = $1.$2; # strip CC 654 } 655 656 if ($other_text =~ /^(.*)High(.*)$/) { 761 Shepherd::Common::log(" - decoded CC") if ($debug); 762 } 763 764 if ($subdetails{'prgHdtv'}) { 657 765 $data_cache->{$cache_key}->{hdtv} = 1; 658 Shepherd::Common::log(" - decoded HDTV from '$other_text'") if ($debug); 659 $other_text = $1.$2; 660 } 661 662 if ($other_text =~ /^(.*)Repeat(.*)$/) { 766 Shepherd::Common::log(" - decoded HDTV") if ($debug); 767 } 768 769 if ($subdetails{'prgRepeat'}) { 663 770 $data_cache->{$cache_key}->{repeat} = 1; 664 Shepherd::Common::log(" - decoded Repeat from '$other_text'") if ($debug); 665 $other_text = $1.$2; # strip Repeat 771 Shepherd::Common::log(" - decoded Repeat") if ($debug); 666 772 } 667 773 668 774 # any remaining text should be rating 669 if ($other_text =~ /\s+(.*)$/) { 670 my $rating_text = $1; 775 if (my $rating_text = $subdetails{'prgRating'}) { 671 776 $rating_text =~ s/[^\x20-\x7f]/ /g; 672 777 if ($rating_text =~ /^\s*(\w+)\s*$/) { 673 778 $data_cache->{$cache_key}->{rating} = $1; 674 Shepherd::Common::log(" - decoded Rating ' ".$1."' from '$other_text'") if ($debug);779 Shepherd::Common::log(" - decoded Rating '$1'") if ($debug); 675 780 } 676 781 } 677 782 } 678 783 679 if (my $prog_desc = $tree->look_down('_tag' => 'div', 'class' => 'p anelItemStory')) {784 if (my $prog_desc = $tree->look_down('_tag' => 'div', 'class' => 'prgContent')) { 680 785 # gather description 681 786 foreach my $para ($prog_desc->look_down('_tag' => 'p')) { … … 735 840 return 1 if (lc($_[0]->tag) eq 'h2' and $_[0]->as_text() !~ /Day Guide/i); 736 841 737 # <td>2:00am</td><td>... programs start at 6.00am</td>842 # <td>2:00am</td><td>... Programs start at 6.00am</td> 738 843 foreach my $item ($_[0]->content_list) { 739 844 next if ref $item; 740 return 1 if $item =~ /\.\.\. programs start at /i;845 return 1 if $item =~ /\.\.\.\s*programs start at /i; 741 846 } 742 847 -
trunk/status
r1293 r1294 8 8 grabber oztivo 2.40 9 9 grabber yahoo7widget 2.14 10 grabber abc_website 4.00 10 11 grabber foxtel_swf 2.03 11 12 grabber ten_website 2.10
