Changeset 50

Show
Ignore:
Timestamp:
10/08/06 22:29:27 (7 years ago)
Author:
max
Message:

Improvements to Rex, incl. gzip compression support.

Files:
2 modified

Legend:

Unmodified
Added
Removed
  • grabbers/rex

    r1 r50  
    33# "Rex" 
    44 
    5 my $version  = '3.0.1'; 
     5my $version  = '3.2.1'; 
    66 
    77# An Australian TV Guide Grabber (a.k.a. tv_grab_au) 
     
    5555#                    --rebuild-cache options; exit on unknown option 
    5656# 3.0.0   : Shepherd compatibility 
    57 # 3.0.1   : Added --ready option 
     57# 3.1.0   : Feature: --ready option 
     58# 3.2.0   : Feature: gzip compression, report KB downloaded 
     59# 3.2.1   : Bugfix : handle failed downloads better 
    5860 
    5961use strict; 
     
    8385my $cache_file = "$output_dir/cache.dat"; 
    8486 
    85 my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes) = (0) x 5; 
     87my ($count_dl, $count_detail, $count_bad, $count_cache, $count_changes, $count_kb) = (0) x 6; 
    8688 
    8789my $DATASOURCE             = "http://www.yourtv.com.au"; 
     
    106108my %shows; 
    107109my $numshows; 
     110my $dcount; 
    108111 
    109112# --------------------------------------------------------------------------- 
     
    231234  $firstfetch = time(); 
    232235  my $show; 
    233   my $dcount = 0; 
    234236  foreach my $pid (keys %$precache) 
    235237  { 
     
    251253    unless ($shows{$pid}) 
    252254    { 
    253       $dcount++; 
    254       refresh_ua() if ($dcount % 20 == 0); # don't wait for error page 
    255255      $show = download_show($pid); 
    256256      if ($show) 
     
    261261      else 
    262262      { 
    263         $count_bad++; 
    264263        print "Failed to parse show $pid.\n"; 
    265264      } 
    266       sleep int(rand(10)); 
     265      sleep int(2 + rand(5)); 
    267266    } 
    268267    if ($opt->{stats} and time() - $laststats >= $opt->{stats}) 
     
    323322  $recurse_count ||= 0; 
    324323  return undef if ($recurse_count > 2); 
    325    
     324  
     325  $dcount++; 
     326  refresh_ua() if ($dcount % 20 == 0); # don't wait for error page 
     327 
     328  my $result; 
     329 
    326330  print "Downloading # $pid.\n" if ($debug); 
    327331  my $detailsdata = get_page($DATASOURCE_DETAIL . 
    328332                      '?action=session_info&event_id=' . $pid . 
    329333                      '&sid=' . $sid . '&loc=grid'); 
    330   unless ($detailsdata) 
     334  $result = parse_details($detailsdata) if ($detailsdata); 
     335  unless ($detailsdata and $result) 
    331336  { 
    332337    print "Download failed.\n" if ($debug); 
     338    sleep(5); 
    333339    $count_bad++; 
    334340    refresh_ua(); 
     
    336342  } 
    337343 
    338   return parse_details($detailsdata); 
     344  return $result; 
    339345} 
    340346 
     
    418424        1 ); 
    419425  $ua->cookie_jar()->scan(\&refresh_sid); 
     426 
     427  $dcount = 0; 
    420428} 
    421429 
     
    483491  $ret .= sprintf( 
    484492        " %d shows grabbed\n" . 
    485         " %d downloads, including %d detail pages\n" . 
     493        " %d downloads, including %d detail pages (%d KB)\n" . 
    486494        " %d cache hits, %d changes from cache, %d failed downloads\n", 
    487495        scalar(keys %shows), 
    488         $count_dl, $count_detail, $count_cache, $count_changes, $count_bad); 
     496        $count_dl, $count_detail, $count_kb,  
     497        $count_cache, $count_changes, $count_bad); 
    489498  $ret .= " Time elapsed: " . timestats($t) . "\n"; 
    490499  unless ($finished or !$count_detail) 
     
    532541   
    533542  $request->uri() =~ s/^http:\/\//$WW/ if $opt->{warper}; 
     543 
     544  $request->header('Accept-Encoding' => 'gzip'); 
    534545 
    535546  print "Fetching: " . $request->as_string() . "\n" if ($debug); 
     
    552563  $count_dl++; 
    553564  my $page = $response->content(); 
     565  $count_kb += (do {use bytes; length($page)}) / 1024; 
     566 
     567  if ($response->header('Content-Encoding') 
     568      and 
     569      $response->header('Content-Encoding') eq 'gzip')  
     570  { 
     571      $page = Compress::Zlib::memGunzip($response->content()); 
     572  } 
     573 
    554574  $page =~ s/&nbsp;/ /g; 
    555575  return $page; 
  • status

    r48 r50  
    11shepherd:0.2.8:shepherd 
    2 rex:3.0.1:grabber 
     2rex:3.2.1:grabber 
    33oztivo:0.4:grabber 
    44abc_website:1.55:grabber