Changeset 340
- Timestamp:
- 12/04/06 18:53:28 (7 years ago)
- Files:
-
- 2 modified
-
grabbers/rex (modified) (17 diffs)
-
status (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
grabbers/rex
r272 r340 3 3 # "Rex" 4 4 5 my $version = '3.3. 7';5 my $version = '3.3.8'; 6 6 7 7 # An Australian TV Guide Grabber (a.k.a. tv_grab_au) … … 69 69 # 3.3.6 : Bugfix : neater options parsing 70 70 # 3.3.7 : Bugfix : now runs from current dir, not ~/.rex/ 71 # 3.3.8 : Bugfix : better handling of failed downloads & parses 71 72 72 73 use strict; … … 92 93 my $lang = "en"; 93 94 94 my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes, $count_kb ) = (0) x 6;95 my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes, $count_kb, $count_bad_parse) = (0) x 7; 95 96 96 97 my $DATASOURCE = "http://www.yourtv.com.au"; … … 213 214 214 215 $guidedata = get_page($DATASOURCE_GUIDE_TODAY); 215 parse_guide($guidedata, $date) ;216 parse_guide($guidedata, $date) if ($guidedata); 216 217 } 217 218 else … … 228 229 'submit' => 'submit' 229 230 ]); 230 parse_guide($guidedata, $date, $_) ;231 parse_guide($guidedata, $date, $_) if ($guidedata); 231 232 } 232 233 } … … 346 347 unless ($detailsdata and $result) 347 348 { 348 print "Download failed.\n" if ($debug);349 349 sleep(5); 350 $count_bad++;351 350 refresh_ua(); 352 351 return download_show($pid, $recurse_count+1); … … 428 427 429 428 # Set initial cookie 430 get_page($DATASOURCE , 1);429 get_page($DATASOURCE); 431 430 432 431 # Set region/service cookie 433 432 post_page($DATASOURCE_SETUP, 434 [ 'fta_region_id' => $opt->{'region'} ] ,435 1 ); 433 [ 'fta_region_id' => $opt->{'region'} ]); 434 436 435 $ua->cookie_jar()->scan(\&refresh_sid); 437 436 … … 521 520 " %d shows grabbed\n" . 522 521 " %d downloads, including %d detail pages (%d KB)\n" . 523 " %d cache hits, %d changes from cache, %d failed downloads\n", 522 " %d cache hits, %d changes from cache\n" . 
523 " %d failed downloads, %d failed parses\n", 524 524 scalar(keys %shows), 525 525 $count_dl, $count_detail, $count_kb, 526 $count_cache, $count_changes, $count_bad); 526 $count_cache, $count_changes, 527 $count_bad, $count_bad_parse); 527 528 $ret .= " Time elapsed: " . timestats($t) . "\n"; 528 529 unless ($finished or !$count_detail) … … 553 554 sub get_page 554 555 { 555 my ($url , $ignore_failure) = @_;556 my ($url) = @_; 556 557 my $request = GET $url; 557 return fetch_page($request , $ignore_failure);558 return fetch_page($request); 558 559 } 559 560 560 561 sub post_page 561 562 { 562 my ($url, $headers , $ignore_failure) = @_;563 my ($url, $headers) = @_; 563 564 my $request = POST $url, $headers; 564 return fetch_page($request , $ignore_failure);565 return fetch_page($request); 565 566 } 566 567 567 568 sub fetch_page 568 569 { 569 my ($request , $ignore_failure) = @_;570 my ($request) = @_; 570 571 571 572 $request->uri() =~ s/^http:\/\//$WW/ if $opt->{warper}; … … 578 579 print "Attempt #$c.\n" if ($debug); 579 580 $response = $ua->request($request); 580 last if ($response->is_success() or $ignore_failure); 581 last unless ($response->is_error()); 582 $count_bad++; 583 print stats() if ($debug); 581 584 sleep 5; 582 585 } 583 unless ($response->is_success() or $ignore_failure)586 if ($response->is_error()) 584 587 { 585 588 print "ERROR! Failed to retrieve page: " . $request->uri() . ".\n"; 586 } 587 if ($debug and (my $r = $response)->previous) 588 { 589 print "GET_CONTENT_BASE redirection backtrace:\n"; 590 while ($r) { print " ", $r->base, "\n"; $r = $r->previous } 589 if ($debug and (my $r = $response)->previous) 590 { 591 print "GET_CONTENT_BASE redirection backtrace:\n"; 592 while ($r) { print " ", $r->base, "\n"; $r = $r->previous } 593 } 594 # Network down 595 if ($count_bad > 10 and $count_dl == 0) 596 { 597 print "ERROR! Unable to download anything useful. Smells like a " . 598 "network problem. 
Exiting.\n"; 599 print stats(1); 600 exit 1; 601 } 602 return undef; 591 603 } 592 604 $count_dl++; … … 616 628 my $curchan = ''; 617 629 my ($pid, $block, $line, $link, $title); 630 my $c = 0; 618 631 foreach my $tag ($tree->look_down('_tag' => 'td', 'class' => 'venue')) 619 632 { 633 $c++; 620 634 next if ($curchan eq $tag->as_text()); # Ignore repeated Station name 621 635 $curchan = $tag->as_text(); … … 671 685 else 672 686 { 673 print "Parsing error: No pid found in block.\n"; 674 } 675 } 676 } 687 bad_parse("No pid found in guide data block"); 688 } 689 } 690 } 691 bad_parse("Missing data") unless ($c); 677 692 } 678 693 … … 712 727 713 728 $block = $tree->find('h1'); 714 return undefunless ($block); # site is probably sending that block page729 return bad_parse("Mising title") unless ($block); # site is probably sending that block page 715 730 716 731 $show->{'title'} = [[ strip_whitespace($block->as_text()), $lang ]]; … … 721 736 } 722 737 723 $block = $tree->find('h3') or return undef;738 $block = $tree->find('h3') or return bad_parse("Missing section"); 724 739 @rows = $block->look_down('_tag' => 'div'); 725 740 $date = $rows[1]->as_text(); … … 732 747 else 733 748 { 734 return undef; # Failed to parse749 return bad_parse("Missing times"); # Failed to parse 735 750 } 736 751 if (Date_Cmp($show->{'start'}, $show->{'stop'}) == 1) … … 741 756 $show->{'channel'} = $chanid{lc($rows[0]->find('span')->as_text())}; 742 757 743 $block = $tree->find('_tag' => 'hr', 'noshade') or return undef;758 $block = $tree->find('_tag' => 'hr', 'noshade') or return bad_parse("Missing subsection"); 744 759 $block = $block->right(); 745 760 if ($block->as_text()) … … 858 873 } 859 874 875 sub bad_parse 876 { 877 my $msg = shift; 878 print "Parsing error: $msg.\n"; 879 $count_bad_parse++; 880 if ($count_bad_parse > 4 and !scalar(keys %shows)) 881 { 882 print "ERROR! Unable to parse any shows! Looks like a major problem. " . 
883 "Exiting.\n"; 884 print stats(1); 885 exit 1; 886 } 887 return 0; 888 } 889 860 890 sub translate_category 861 891 { -
status
r332 r340 1 1 application shepherd 0.4.9 2 2 grabber yahoo7widget 1.60 3 grabber rex 3.3.7 3 grabber rex 3.3.8 4 4 grabber abc_website 2.03 5 5 grabber abc2_website 2.03
