Changeset 340

Show
Ignore:
Timestamp:
12/04/06 18:53:28 (7 years ago)
Author:
max
Message:

Better handling of failed downloads and parses.

Files:
2 modified

Legend:

Unmodified
Added
Removed
  • grabbers/rex

    r272 r340  
    33# "Rex" 
    44 
    5 my $version  = '3.3.7'; 
     5my $version  = '3.3.8'; 
    66 
    77# An Australian TV Guide Grabber (a.k.a. tv_grab_au) 
     
    6969# 3.3.6   : Bugfix : neater options parsing 
    7070# 3.3.7   : Bugfix : now runs from current dir, not ~/.rex/ 
     71# 3.3.8   : Bugfix : better handling of failed downloads & parses 
    7172 
    7273use strict; 
     
    9293my $lang = "en"; 
    9394 
    94 my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes, $count_kb) = (0) x 6; 
     95my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes, $count_kb, $count_bad_parse) = (0) x 7; 
    9596 
    9697my $DATASOURCE             = "http://www.yourtv.com.au"; 
     
    213214 
    214215      $guidedata = get_page($DATASOURCE_GUIDE_TODAY); 
    215       parse_guide($guidedata, $date); 
     216      parse_guide($guidedata, $date) if ($guidedata); 
    216217    } 
    217218    else 
     
    228229                          'submit' => 'submit' 
    229230                        ]); 
    230         parse_guide($guidedata, $date, $_); 
     231        parse_guide($guidedata, $date, $_) if ($guidedata); 
    231232      } 
    232233    } 
     
    346347  unless ($detailsdata and $result) 
    347348  { 
    348     print "Download failed.\n" if ($debug); 
    349349    sleep(5); 
    350     $count_bad++; 
    351350    refresh_ua(); 
    352351    return download_show($pid, $recurse_count+1); 
     
    428427 
    429428  # Set initial cookie 
    430   get_page($DATASOURCE, 1); 
     429  get_page($DATASOURCE); 
    431430 
    432431  # Set region/service cookie 
    433432  post_page($DATASOURCE_SETUP, 
    434         [ 'fta_region_id' => $opt->{'region'} ], 
    435         1 ); 
     433        [ 'fta_region_id' => $opt->{'region'} ]); 
     434 
    436435  $ua->cookie_jar()->scan(\&refresh_sid); 
    437436 
     
    521520        " %d shows grabbed\n" . 
    522521        " %d downloads, including %d detail pages (%d KB)\n" . 
    523         " %d cache hits, %d changes from cache, %d failed downloads\n", 
     522        " %d cache hits, %d changes from cache\n" . 
     523        " %d failed downloads, %d failed parses\n", 
    524524        scalar(keys %shows), 
    525525        $count_dl, $count_detail, $count_kb,  
    526         $count_cache, $count_changes, $count_bad); 
     526        $count_cache, $count_changes,  
     527        $count_bad, $count_bad_parse); 
    527528  $ret .= " Time elapsed: " . timestats($t) . "\n"; 
    528529  unless ($finished or !$count_detail) 
     
    553554sub get_page 
    554555{ 
    555   my ($url, $ignore_failure) = @_; 
     556  my ($url) = @_; 
    556557  my $request = GET $url; 
    557   return fetch_page($request, $ignore_failure); 
     558  return fetch_page($request); 
    558559} 
    559560 
    560561sub post_page 
    561562{ 
    562   my ($url, $headers, $ignore_failure) = @_; 
     563  my ($url, $headers) = @_; 
    563564  my $request = POST $url, $headers; 
    564   return fetch_page($request, $ignore_failure); 
     565  return fetch_page($request); 
    565566} 
    566567 
    567568sub fetch_page 
    568569{ 
    569   my ($request, $ignore_failure) = @_; 
     570  my ($request) = @_; 
    570571   
    571572  $request->uri() =~ s/^http:\/\//$WW/ if $opt->{warper}; 
     
    578579    print "Attempt #$c.\n" if ($debug); 
    579580    $response = $ua->request($request); 
    580     last if ($response->is_success() or $ignore_failure); 
     581    last unless ($response->is_error()); 
     582    $count_bad++; 
     583    print stats() if ($debug); 
    581584    sleep 5; 
    582585  } 
    583   unless ($response->is_success() or $ignore_failure) 
     586  if ($response->is_error()) 
    584587  { 
    585588    print "ERROR! Failed to retrieve page: " . $request->uri() . ".\n"; 
    586   } 
    587   if ($debug and (my $r = $response)->previous)  
    588   { 
    589     print "GET_CONTENT_BASE redirection backtrace:\n"; 
    590     while ($r) { print "    ", $r->base, "\n"; $r = $r->previous } 
     589    if ($debug and (my $r = $response)->previous)  
     590    { 
     591        print "GET_CONTENT_BASE redirection backtrace:\n"; 
     592        while ($r) { print "    ", $r->base, "\n"; $r = $r->previous } 
     593    } 
     594    # Network down 
     595    if ($count_bad > 10 and $count_dl == 0) 
     596    { 
     597        print "ERROR! Unable to download anything useful. Smells like a " . 
     598              "network problem. Exiting.\n"; 
     599        print stats(1); 
     600        exit 1; 
     601    } 
     602    return undef; 
    591603  } 
    592604  $count_dl++; 
     
    616628  my $curchan = ''; 
    617629  my ($pid, $block, $line, $link, $title); 
     630  my $c = 0; 
    618631  foreach my $tag ($tree->look_down('_tag' => 'td', 'class' => 'venue')) 
    619632  { 
     633    $c++; 
    620634    next if ($curchan eq $tag->as_text()); # Ignore repeated Station name 
    621635    $curchan = $tag->as_text(); 
     
    671685      else 
    672686      { 
    673         print "Parsing error: No pid found in block.\n"; 
    674       } 
    675     } 
    676   } 
     687        bad_parse("No pid found in guide data block"); 
     688      } 
     689    } 
     690  } 
     691  bad_parse("Missing data") unless ($c); 
    677692} 
    678693 
     
    712727 
    713728  $block = $tree->find('h1'); 
    714   return undef unless ($block);  # site is probably sending that block page 
     729  return bad_parse("Mising title") unless ($block);  # site is probably sending that block page 
    715730 
    716731  $show->{'title'} = [[ strip_whitespace($block->as_text()), $lang ]]; 
     
    721736  } 
    722737 
    723   $block = $tree->find('h3') or return undef; 
     738  $block = $tree->find('h3') or return bad_parse("Missing section"); 
    724739  @rows = $block->look_down('_tag' => 'div'); 
    725740  $date = $rows[1]->as_text(); 
     
    732747  else 
    733748  { 
    734     return undef;  # Failed to parse 
     749    return bad_parse("Missing times");  # Failed to parse 
    735750  } 
    736751  if (Date_Cmp($show->{'start'}, $show->{'stop'}) == 1) 
     
    741756  $show->{'channel'} = $chanid{lc($rows[0]->find('span')->as_text())}; 
    742757 
    743   $block = $tree->find('_tag' => 'hr', 'noshade') or return undef; 
     758  $block = $tree->find('_tag' => 'hr', 'noshade') or return bad_parse("Missing subsection"); 
    744759  $block = $block->right(); 
    745760  if ($block->as_text()) 
     
    858873} 
    859874 
     875sub bad_parse 
     876{ 
     877  my $msg = shift; 
     878  print "Parsing error: $msg.\n"; 
     879  $count_bad_parse++; 
     880  if ($count_bad_parse > 4 and !scalar(keys %shows)) 
     881  { 
     882      print "ERROR! Unable to parse any shows! Looks like a major problem. " . 
     883            "Exiting.\n"; 
     884      print stats(1); 
     885      exit 1; 
     886  } 
     887  return 0; 
     888} 
     889 
    860890sub translate_category 
    861891{ 
  • status

    r332 r340  
    11application     shepherd            0.4.9 
    22grabber         yahoo7widget        1.60 
    3 grabber         rex                 3.3.7 
     3grabber         rex                 3.3.8 
    44grabber         abc_website         2.03 
    55grabber         abc2_website        2.03