Changeset 713
- Timestamp:
- 05/29/07 17:38:54 (6 years ago)
- Files:
-
- 3 modified
-
postprocessors/imdb_augment_data (modified) (28 diffs)
-
references/Shepherd/Common.pm (modified) (6 diffs)
-
status (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
postprocessors/imdb_augment_data
r691 r713 27 27 28 28 my $progname = "imdb_augment_data"; 29 my $version = "0.15"; 30 31 use LWP::UserAgent; 29 my $version = "0.16"; 30 32 31 use XMLTV; 33 32 use POSIX qw(strftime mktime); … … 35 34 use HTML::TokeParser; 36 35 use Data::Dumper; 37 use Compress::Zlib;38 use DateTime::Format::Strptime;39 36 use Storable; 37 use Shepherd::Common; 40 38 41 39 # … … 47 45 my $data_cache; 48 46 my $settings_override = { }; 49 my $xmltv_strptime = new DateTime::Format::Strptime(pattern => "%Y%m%d%H%M"); 50 my %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ); 51 52 my $ua; 53 $ua = LWP::UserAgent->new( 54 'timeout' => 30, 55 'keep_alive' => 1, 56 'agent' => "Shepherd / $progname $version" 57 ); 58 $ua->env_proxy; 59 $ua->cookie_jar({}); 47 60 48 $| = 1; 61 49 … … 156 144 &set_settings if (defined $opt->{set}); 157 145 146 # set defaults 147 Shepherd::Common::set_default("debug", ((defined $opt->{debug} && $opt->{debug} > 0) ? 2 : 0)); 148 Shepherd::Common::set_default("retry_delay", 10); 149 Shepherd::Common::set_default("delay", int(rand(4) + 3)) unless (defined $opt->{fast}); 150 Shepherd::Common::setup_ua('cookie_jar' => 1, 'fake' => 1); 151 158 152 # go go go! 159 153 160 &log(sprintf "started: cache %s, %s%soutput %s",154 Shepherd::Common::log(sprintf "started: cache %s, %s%soutput %s", 161 155 ($opt->{no_cache} ? "disabled" : "enabled"), 162 156 ($opt->{fast} ? "fast-override, " : ""), … … 178 172 179 173 foreach my $file (@ARGV) { 180 &log((sprintf "Parsing: %s",174 Shepherd::Common::log((sprintf "Parsing: %s", 181 175 ($file eq "-" ? "(from-stdin, hit control-D to finiah)" : $file))); 182 176 XMLTV::parsefiles_callback(\&encoding_cb, \&credits_cb, … … 185 179 186 180 $writer->end(); 187 &log("Finished parsing, output in $opt->{output_file}");181 Shepherd::Common::log("Finished parsing, output in $opt->{output_file}"); 188 182 189 183 &write_cache unless ($opt->{no_cache}); 190 184 191 &print_stats; 185 Shepherd::Common::print_stats($progname, $version, $script_start_time, %stats); 186 192 187 exit(0); 193 188 … … 272 267 273 268 ############################################################################## 274 # logic to fetch a page via http275 # retries up to 3 times to get a page with 5 second pauses inbetween276 277 sub get_url278 {279 my ($url,$urltype,$status,$dontretry,$postvars) = @_;280 my $response;281 my $attempts = 0;282 my ($raw, $page, $base);283 284 my $request;285 286 if ($urltype eq "GET") {287 $request = HTTP::Request->new(GET => $url);288 } elsif ($urltype eq "POST") {289 $request = HTTP::Request->new(POST => $url);290 $request->add_content($postvars);291 }292 293 $request->header('Accept-Encoding' => 'gzip');294 295 &log($status);296 for (1..3) {297 $response = $ua->request($request);298 last if ($response->is_success || $dontretry);299 300 $stats{http_failed_requests}++;301 $stats{slept_for} += 10;302 $attempts++;303 sleep 10;304 }305 if (!($response->is_success)) {306 if ($dontretry == 0) {307 &log("aborting after $attempts attempts to fetch url $url");308 }309 return undef;310 }311 312 $stats{bytes_fetched} += do {use bytes; length($response->content)};313 $stats{http_successful_requests}++;314 315 unless ((!$opt->{fast}) || (($stats{http_successful_requests} % 15) != 0)) {316 my $sleeptimer = int(rand(12)) + 3; # sleep 4 to 15 seconds317 $stats{slept_for} += $sleeptimer;318 sleep $sleeptimer;319 }320 321 if ($response->header('Content-Encoding') &&322 $response->header('Content-Encoding') eq 'gzip') {323 $stats{compressed_pages} += do {use bytes; length($response->content)};324 $response->content(Compress::Zlib::memGunzip($response->content));325 }326 return $response->content;327 }328 329 ##############################################################################330 331 sub log332 {333 my ($entry) = @_;334 printf "%s\n",$entry;335 }336 337 ##############################################################################338 339 sub print_stats340 {341 my $now = time;342 printf "STATS: %s v%s completed in %d seconds",343 $progname, $version, ($now-$script_start_time);344 foreach my $key (sort keys %stats) {345 printf ", %d %s",$stats{$key},$key;346 }347 printf "\n";348 }349 350 ##############################################################################351 # descend a structure and clean up various things, including stripping352 # leading/trailing spaces in strings, translations of html stuff etc353 # -- taken & modified from Michael 'Immir' Smith's excellent tv_grab_au354 355 sub cleanup {356 my $x = shift;357 if (ref $x eq "REF") { cleanup($_) }358 elsif (ref $x eq "HASH") { cleanup(\$_) for values %$x }359 elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x }360 elsif (defined $$x) {361 $$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg;362 # $$x =~ s/[^\x20-\x7f]/ /g;363 $$x =~ s/(^\s+|\s+$)//g;364 }365 }366 367 ##############################################################################368 # turn a string into something that can be used on a URL line369 370 sub urlify371 {372 my $str = shift;373 $str =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;374 return $str;375 }376 377 ##############################################################################378 269 # use the online IMDb "power search" at http://www.imdb/List to try to find _1_ match 379 270 … … 383 274 $data_cache->{movie_id_lookup}->{$post_fields}->{last_fetched} = time; 384 275 385 my $html_data = get_url("http://www.imdb.com/List","POST", 386 " online IMDb search for '$title' with $post_fields",0,$post_fields); 387 if (!$html_data) { 276 Shepherd::Common::log(" online IMDb search for '$title' with $post_fields"); 277 my ($html_data, $success, $status_msg, $bytes_fetched, $seconds_slept, $failed_attempts) = 278 Shepherd::Common::get_url(url => "http://www.imdb.com/List", method => "POST", postvars => $post_fields); 279 280 $stats{failed_requests} += $failed_attempts; 281 $stats{slept_for} += $seconds_slept; 282 $stats{bytes_fetched} += $bytes_fetched; 283 284 if ((!$html_data) || (!$success)) { 388 285 $stats{failed_online_imdb_lookup}++; 389 &log("failed to search imdb movie data from http://www.imdb.com/List");286 Shepherd::Common::log("failed to search imdb movie data from http://www.imdb.com/List"); 390 287 return; 391 288 } … … 413 310 $stats{imdb_lookup_added_positive_cache_entry}++; 414 311 } else { 415 &log(" online search failed: wanted 1 match, got $urls_found matches.");312 Shepherd::Common::log(" online search failed: wanted 1 match, got $urls_found matches."); 416 313 # negatively cache our failed lookup 417 314 $data_cache->{movie_id_lookup}->{$post_fields}->{url} = "-"; … … 438 335 } 439 336 if (!$found) { 440 &log(" - no ".$target." found") if ($opt->{debug});337 Shepherd::Common::log(" - no ".$target." found") if ($opt->{debug}); 441 338 return; 442 339 } … … 449 346 } 450 347 451 &log(" - parsed ".$target." ".$$v) if ($opt->{debug});348 Shepherd::Common::log(" - parsed ".$target." ".$$v) if ($opt->{debug}); 452 349 } 453 350 … … 479 376 foreach my $item (@list) { 480 377 $$v->[$found_items] = $item; 481 &log(" - parsed ".$target." ".$item) if ($opt->{debug});378 Shepherd::Common::log(" - parsed ".$target." ".$item) if ($opt->{debug}); 482 379 $found_items++; 483 380 } … … 491 388 { 492 389 my ($movie_title, $movie_url) = @_; 493 my $html_data = get_url($movie_url,"GET", 494 " downloading online IMDb movie data for '$movie_title'",0); 495 496 if (!$html_data) { 390 391 Shepherd::Common::log(" downloading online IMDb movie data for '$movie_title'"); 392 my ($html_data, $success, $status_msg, $bytes_fetched, $seconds_slept, $failed_attempts) = 393 Shepherd::Common::get_url(url => $movie_url); 394 395 $stats{failed_requests} += $failed_attempts; 396 $stats{slept_for} += $seconds_slept; 397 $stats{bytes_fetched} += $bytes_fetched; 398 399 if ((!$html_data) || (!$success)) { 497 400 $stats{failed_online_imdb_lookup}++; 498 &log("failed to fetch imdb movie data from $movie_url");401 Shepherd::Common::log("failed to fetch imdb movie data from $movie_url"); 499 402 return; 500 403 } … … 517 420 if (!defined $data_cache->{movie_lookup}->{$movie_url}->{title}) { 518 421 $stats{failed_online_imdb_title_parsing}++; 519 &log("failed to parse title within imdb movie data from $movie_url");422 Shepherd::Common::log("failed to parse title within imdb movie data from $movie_url"); 520 423 return; 521 424 } … … 531 434 if ($img_tag->[1]{alt} =~ /^$title$/i) { 532 435 $data_cache->{movie_lookup}->{$movie_url}->{cover} = $img_tag->[1]{src}; 533 &log(" - parsed cover_url ".$img_tag->[1]{src}) if ($opt->{debug});436 Shepherd::Common::log(" - parsed cover_url ".$img_tag->[1]{src}) if ($opt->{debug}); 534 437 last; 535 438 } … … 550 453 next if ($id !~ /^\/name\/nm/); 551 454 $data_cache->{movie_lookup}->{$movie_url}->{directors}->{$text} = $id; 552 &log(" - parsed director ".$text." (".$id.")") if ($opt->{debug});455 Shepherd::Common::log(" - parsed director ".$text." (".$id.")") if ($opt->{debug}); 553 456 } 554 457 } … … 568 471 next if ($id !~ /^\/name\/nm/); 569 472 $data_cache->{movie_lookup}->{$movie_url}->{writers}->{$text} = $id; 570 &log(" - parsed writer ".$text." (".$id.")") if ($opt->{debug});473 Shepherd::Common::log(" - parsed writer ".$text." (".$id.")") if ($opt->{debug}); 571 474 } 572 475 } … … 592 495 if ((defined $person) && (defined $role)) { 593 496 $data_cache->{movie_lookup}->{$movie_url}->{cast}->{$person} = $role; 594 &log(" - parsed cast (".$role.") ".$person) if ($opt->{debug});497 Shepherd::Common::log(" - parsed cast (".$role.") ".$person) if ($opt->{debug}); 595 498 } 596 499 … … 644 547 my($country, $range) = split(/:/, $tp->get_text); 645 548 $data_cache->{movie_lookup}->{$movie_url}->{certifications}->{$country} = $range; 646 &log(" - parsed certification ".$range." (".$country.")") if ($opt->{debug});549 Shepherd::Common::log(" - parsed certification ".$range." (".$country.")") if ($opt->{debug}); 647 550 } 648 551 last if ($tag->[0] =~ /\/td/i); … … 652 555 # official_sites 653 556 # full plot 557 558 559 # write out the cache every 15 progs or so 560 &write_cache if ((($stats{imdb_movie_added_cache_entry} % 15) == 0) && (!defined $opt->{no_cache})); 654 561 } 655 562 … … 684 591 { 685 592 my $prog=shift; 593 $stats{programmes}++; 686 594 687 595 my $movie_title = $prog->{title}->[0]->[0] … … 689 597 ($prog->{title}->[0]->[0])); 690 598 my $movie_year = $prog->{date} if ($prog->{date}); 599 600 if (($movie_title =~ /^close$/i) || ($movie_title =~ /^station close$/i)) { 601 $stats{skipped_due_to_category}++; 602 goto END; 603 } 691 604 692 605 if (defined $prog->{category}) { … … 708 621 # only lookup if min_duration < prog_duration > min_duration 709 622 # 710 my $t1 = $xmltv_strptime->parse_datetime($prog->{start});711 my $t2 = $xmltv_strptime->parse_datetime($prog->{stop});623 my $t1 = Shepherd::Common::parse_xmltv_date($prog->{start}); 624 my $t2 = Shepherd::Common::parse_xmltv_date($prog->{stop}); 712 625 if ((!$t1) || (!$t2)) { 713 626 $stats{excluded_couldnt_parse_time}++; 714 627 goto END; 715 628 } 716 my $prog_duration = (($t2 ->epoch - $t1->epoch) / 60);629 my $prog_duration = (($t2 - $t1) / 60); 717 630 if ($prog_duration < $opt->{min_duration}) { 718 631 $stats{excluded_prog_too_short}++; … … 731 644 # 732 645 my @search_fields; 733 push(@search_fields, "words=". urlify($movie_title));734 push(@search_fields, "&countries=". urlify($prog->{country}->[0][0]))646 push(@search_fields, "words=".Shepherd::Common::urlify($movie_title)); 647 push(@search_fields, "&countries=".Shepherd::Common::urlify($prog->{country}->[0][0])) 735 648 if ((defined $prog->{country}) && (defined $prog->{country}->[0][0])); 736 push(@search_fields, "&year=". urlify($prog->{date}))649 push(@search_fields, "&year=".Shepherd::Common::urlify($prog->{date})) 737 650 if ((defined $prog->{date}) && ($prog->{date} > 0)); 738 push(@search_fields, "&language=". urlify($prog->{language}->[0] =~ /(^[^,]*)/))651 push(@search_fields, "&language=".Shepherd::Common::urlify($prog->{language}->[0] =~ /(^[^,]*)/)) 739 652 if ((defined $prog->{language}) && (defined $prog->{language}->[0])); 740 653 # &exact=y 741 654 # cast/crew 742 655 743 &log("programme: \"$movie_title\" ($prog_duration minutes)");656 Shepherd::Common::log("programme ".$stats{programmes}.": \"$movie_title\" ($prog_duration minutes)"); 744 657 745 658 # first search using everything we can... … … 763 676 $found = 1; 764 677 } 765 &log(" used (".($found ? "positive" : "negative")." cache) search: $post_fields");678 Shepherd::Common::log(" used (".($found ? "positive" : "negative")." cache) search: $post_fields"); 766 679 } else { 767 680 # … … 809 722 if (defined $data_cache->{movie_lookup}->{$movie_url}) { 810 723 $stats{imdb_movie_used_cache_entry}++; 811 &log(" used existing (cached) movie details: $movie_url");724 Shepherd::Common::log(" used existing (cached) movie details: $movie_url"); 812 725 } else { 813 726 $stats{imdb_movie_added_cache_entry}++; … … 1014 927 } 1015 928 1016 &cleanup($prog);929 Shepherd::Common::cleanup($prog); 1017 930 #print "prog now ".Dumper($prog); 1018 931 END: … … 1026 939 $opt->{debug} = 1; 1027 940 $opt->{fast} = 1; 1028 &log("running test for: ".$opt->{test});1029 1030 my $post_fields = "words=". urlify($opt->{test});941 Shepherd::Common::log("running test for: ".$opt->{test}); 942 943 my $post_fields = "words=".Shepherd::Common::urlify($opt->{test}); 1031 944 &search_imdb_online($opt->{test},$post_fields); 1032 945 -
references/Shepherd/Common.pm
r703 r713 18 18 use POSIX qw(strftime mktime); 19 19 20 my $version = '0.1 2';20 my $version = '0.13'; 21 21 22 22 my $gmt_offset; … … 54 54 # headers : ref to array of any additional headers to send (default: <none>) 55 55 # debug : set debug level; 0 = silent, 5 = noisy (default: 1) 56 # delay : before returning, wait 'n' seconds (default: 0) 56 57 # 57 58 # If called in list context, returns an array: … … 91 92 $cnf{retry_delay} = 10 unless (defined $cnf{retry_delay}); 92 93 $cnf{debug} = 1 unless (defined $cnf{debug}); 94 $cnf{delay} = 0 unless (defined $cnf{delay}); 93 95 94 96 $this_url = $cnf{url}; … … 215 217 $prev_referer = $this_url; 216 218 219 # delay if successful 220 if ($success and $cnf{delay}) 221 { 222 sleep $cnf{delay}; 223 } 224 217 225 # If called in list context, return all our goodies 218 226 if (wantarray) … … 222 230 $response->status_line, 223 231 $bytes, 224 $failures * $cnf{retry_delay},232 ($failures * $cnf{retry_delay}) + $cnf{delay}, 225 233 $failures, 226 234 ($response->header('Content-type') ? $response->header('Content-type') : undef)); … … 454 462 ########################################################################## 455 463 464 sub log 465 { 466 my ($entry) = @_; 467 printf "%s\n",$entry; 468 } 469 470 ########################################################################## 471 472 sub print_stats 473 { 474 my ($progname, $version, $script_start_time, %stats) = @_; 475 my $now = time; 476 printf "STATS: %s v%s completed in %d seconds", 477 $progname, $version, ($now-$script_start_time); 478 foreach my $key (sort keys %stats) { 479 printf ", %d %s",$stats{$key},$key; 480 } 481 printf "\n"; 482 } 483 484 ########################################################################## 485 456 486 1; -
status
r712 r713 1 1 application shepherd 0.4.100 2 2 reference channel_list 2 3 reference Shepherd/Common.pm 0.1 23 reference Shepherd/Common.pm 0.13 4 4 reference Shepherd/MythTV.pm 0.3 5 5 grabber yahoo7widget 1.85 … … 15 15 grabber ten_website 1.00 16 16 reconciler reconciler_mk2 0.26 17 postprocessor imdb_augment_data 0.1 517 postprocessor imdb_augment_data 0.16 18 18 postprocessor flag_aus_hdtv 0.16 19 19 postprocessor augment_timezone 0.13
