| | 327 | # simple search |
| | 328 | # |
| | 329 | # The 'power search' seems to have a couple of annoying flaws, like |
| | 330 | # an inability to find AKA titles. So if our power search fails, we |
| | 331 | # fall back to the simple search. This seems to work very well, finding |
| | 332 | # many shows that the power search misses. |
| | 333 | |
# simple_search_imdb($title, $year)
#
# Queries the IMDb "find" page for $title and scans the result paragraphs
# headed 'Titles (Exact Matches)' or 'Popular Titles' for title links.
#
#   $title - programme title; it is URL-encoded before being sent.
#   $year  - release year used to filter hits; a false value disables
#            the year filter.
#
# Returns 1 when exactly one acceptable hit was found, 0 otherwise.
#
# Side effects:
#   * $stats{failed_online_imdb_lookup} is bumped when the page cannot be
#     fetched (nothing is cached in that case, so it will be retried).
#   * $data_cache->{movie_id_lookup}{"simple-<urlified title>-<year>"} gets
#     a fresh last_fetched stamp and a url of either the IMDb link or "-"
#     (negative cache entry) for no hit / ambiguous hits.
#   * $stats{imdb_lookup_added_positive_cache_entry} is bumped on success.
sub simple_search_imdb
{
    my ($title, $year) = @_;

    &Shepherd::Common::log(" trying simple search for '$title' ($year)");
    my $url = sprintf "http://us.imdb.com/find?q=%s;s=tt", &Shepherd::Common::urlify($title);
    my $html = &Shepherd::Common::get_url($url);

    unless ($html)
    {
        $stats{failed_online_imdb_lookup}++;
        &Shepherd::Common::log("simple search failed");
        return 0;
    }

    my $link;
    my $tree = HTML::TreeBuilder->new_from_content($html);
    HTMLPARSE: foreach my $block ($tree->look_down('_tag' => 'p'))
    {
        my $tag = $block->look_down('_tag' => 'b');
        next unless ($tag);

        my $heading = $tag->as_text;
        next unless ($heading eq 'Titles (Exact Matches)' or $heading eq 'Popular Titles');

        # One search result per line break. Accept both "<br>" and
        # "<br />" because the serialised form of an empty element
        # differs between HTML::Tree releases.
        RESULT: foreach my $line (split(/<br\s*\/?>/i, $block->as_HTML))
        {
            # Copy the captures into lexicals immediately so that later
            # pattern matches cannot disturb them.
            my @hit = $line =~ /^<a href="(\/title\/tt[0-9]+\/?)">(.*?)<\/a> \((\d{4})\)(.*)/;
            next RESULT unless (@hit);
            my ($path, $hit_year, $trailer) = @hit[0, 2, 3];

            # Wrong year?
            next RESULT if ($year and $year != $hit_year);
            # Text after the year containing 'VG' disqualifies the hit
            # (presumably IMDb's video-game marker -- TODO confirm).
            next RESULT if ($trailer and $trailer =~ /VG/);

            if ($link)
            {
                # found multiple hits. That's bad.
                $link = undef;
                &Shepherd::Common::log(" found multiple hits, wanted 0.") if ($opt->{debug});
                last HTMLPARSE;
            }
            # Bingo! Found a link
            &Shepherd::Common::log(" found link: $path") if ($opt->{debug});
            $link = "http://www.imdb.com" . $path;
        }
    }

    # HTML::TreeBuilder trees hold parent<->child reference cycles; they
    # must be deleted explicitly or every parsed page stays in memory.
    $tree->delete;

    my $cache_name = 'simple-'.&Shepherd::Common::urlify($title).'-'.$year;

    $data_cache->{movie_id_lookup}->{$cache_name}->{last_fetched} = time;

    unless ($link)
    {
        &Shepherd::Common::log(" simple search failed.");
        # negatively cache our failed lookup
        $data_cache->{movie_id_lookup}->{$cache_name}->{url} = "-";
        return 0;
    }

    $data_cache->{movie_id_lookup}->{$cache_name}->{url} = $link;
    $stats{imdb_lookup_added_positive_cache_entry}++;
    return 1;
}
| | 394 | |
| | 395 | ############################################################################## |
| | 797 | } |
| | 798 | |
| | 799 | # Try the simple search if we have date info |
| | 800 | if (!$found and $prog->{date}) { |
| | 801 | # cached? |
| | 802 | $post_fields = 'simple-'. &Shepherd::Common::urlify($movie_title) . '-'.$prog->{date}; |
| | 803 | my $simple_search = $data_cache->{movie_id_lookup}->{$post_fields}; |
| | 804 | if ($simple_search and $simple_search->{url}) { |
| | 805 | if ($simple_search->{url} eq '-') { |
| | 806 | # negatively cached |
| | 807 | $stats{imdb_lookup_used_negative_cache_entry}++; |
| | 808 | } else { |
| | 809 | # positively cached |
| | 810 | $stats{imdb_lookup_used_cache_entry}++; |
| | 811 | $found = 1; |
| | 812 | } |
| | 813 | &Shepherd::Common::log(sprintf " used (%s cache) search: $post_fields", ($found ? 'positive' : 'negative')); |
| | 814 | } else { |
| | 815 | # not cached; look it up |
| | 816 | $found = &simple_search_imdb($movie_title, $prog->{date}); |
| | 817 | } |
| | 818 | } else { |
| | 819 | &Shepherd::Common::log(" no date info, not trying simple search") if ($opt->{debug}); |