| | 578 | # use the online IMDb "power search" at http://www.imdb.com/List to try to find _1_ match |
| | 579 | |
# Run the online IMDb "power search" for one movie and cache the outcome.
#
# Parameters:
#   $title       - movie title; used only in the progress text handed to get_url
#   $post_fields - POST payload sent to http://www.imdb.com/List; it doubles as
#                  the key into $data_cache->{movie_id_lookup}
#
# The cache entry receives:
#   last_fetched - timestamp of this attempt (recorded before the request, so
#                  it is present even when the fetch fails)
#   url          - absolute URL of the movie page when exactly one title link
#                  was found, "-" otherwise (negative cache entry)
#   num_choices  - number of title links seen (negative entries only)
#   choices      - the title links seen (negative entries only)
#
# Returns nothing; the outcome is only recorded in $data_cache and %stats.
sub search_imdb_online
{
    my ($title, $post_fields) = @_;

    $stats{imdb_lookup_added_cache_entry}++;
    $data_cache->{movie_id_lookup}->{$post_fields}->{last_fetched} = time;

    my $html_data = get_url("http://www.imdb.com/List","POST"," online IMDb search for '$title' with $post_fields",0,$post_fields);
    if (!$html_data) {
        $stats{failed_online_imdb_lookup}++;
        # the leading & is required: a bare log(...) would call Perl's built-in logarithm
        &log("failed to search imdb movie data from http://www.imdb.com/List");
        return;
    }

    # collect every <a href="/title/tt[0-9]+/">{name}</a> link on the result page
    my $tp = HTML::TokeParser->new(\$html_data);
    my @urls;
    while (my $token = $tp->get_tag("a")) {
        my $url = $token->[1]{href};

        # anchors such as <a name="..."> carry no href at all; skip them
        # instead of matching a regex against undef
        next if (!defined $url);

        push(@urls, $url) if ($url =~ m{/title/tt[0-9]+/});
    }

    my $entry = $data_cache->{movie_id_lookup}->{$post_fields};

    # only insert a positive entry if we match exactly _1_ movie
    if (@urls == 1) {
        # relative links are made absolute against the IMDb host
        $entry->{url} = ($urls[0] =~ /^http:/) ? $urls[0] : "http://www.imdb.com".$urls[0];
        $stats{imdb_lookup_added_positive_cache_entry}++;
    } else {
        # negatively cache our failed lookup, remembering the candidates
        $entry->{url} = "-";
        $entry->{num_choices} = scalar(@urls);

        # stored element by element so that an empty result does not create
        # the choices list
        $entry->{choices}->[$_] = $urls[$_] foreach (0 .. $#urls);
    }

    return;
}
| | 626 | |
| | 627 | ###################################################################################################### |
| | 628 | # simple parser for imdb returned data: covers most data |
| | 629 | |
# Extract one piece of text that follows a bold heading on an IMDb page.
#
# Parameters:
#   $html_data - page HTML
#   $target    - pattern matched case-insensitively, anchored at the start,
#                against the text that follows each <b> tag; it is
#                interpolated as a regex, so alternations such as
#                "(aka|also known as)" are allowed
#   $texttype  - "trimmed" selects get_trimmed_text, anything else get_text
#   $targetb,
#   $targetc   - tag names handed to the text getter as its arguments
#   $targeta   - optional; when defined, the parser is first advanced to the
#                next tag of this name before the text is read
#
# Returns the extracted text, or undef when no heading matched.
sub imdb_scalar_parser
{
    my ($html_data, $target, $texttype, $targetb, $targetc, $targeta) = @_;

    my $tp = HTML::TokeParser->new(\$html_data);

    # walk the <b> tags until one is followed by the wanted heading
    my $found = 0;
    while ($tp->get_tag('b')) {
        if ($tp->get_text =~ /^$target/i) {
            $found = 1;
            last;
        }
    }

    # deliberately "return undef" rather than a bare return, so a caller in
    # list context keeps receiving exactly one element
    return undef if (!$found);

    # a plain conditional call: declaring "my $tag" under a statement
    # modifier has undefined behaviour, and the tag itself was never used
    $tp->get_tag($targeta) if (defined $targeta);

    return ($tp->get_trimmed_text($targetb,$targetc)) if ($texttype eq "trimmed");
    return ($tp->get_text($targetb,$targetc));
}
| | 650 | |
| | 651 | ###################################################################################################### |
| | 652 | |
# Collect the link texts that follow a bold heading on an IMDb page.
#
# Parameters:
#   $html_data - page HTML
#   $target    - pattern matched case-insensitively, anchored at the start,
#                against the text that follows each <b> tag
#   $target2   - pattern; only <a> tags whose href matches it
#                (case-insensitively) contribute their text
#   $v         - reference to a scalar that holds (or will hold) an array
#                reference; the collected texts are written into that array
#                starting at index 0
#
# Scanning stops at the first <br> tag after the heading, or at the end of
# the document.
#
# Returns undef when the heading is not found, nothing otherwise.
sub imdb_list_parser
{
    my ($html_data, $target, $target2, $v) = @_;

    my $tp = HTML::TokeParser->new(\$html_data);

    # walk the <b> tags until one is followed by the wanted heading
    my $found = 0;
    while ($tp->get_tag('b')) {
        if ($tp->get_text =~ /^$target/i) {
            $found = 1;
            last;
        }
    }
    return undef if (!$found);

    my @list;
    while (my $tag = $tp->get_tag()) {
        last if ($tag->[0] eq 'br');
        next if ($tag->[0] ne 'a');

        # anchors without an href cannot match; skip them instead of
        # running the regex against undef
        my $href = $tag->[1]{href};
        next if (!defined $href);

        push(@list, $tp->get_text()) if ($href =~ /$target2/i);
    }

    # stored element by element so that an empty result leaves the caller's
    # slot untouched
    $$v->[$_] = $list[$_] foreach (0 .. $#list);

    return;
}
| | 680 | |
| | 681 | ###################################################################################################### |
| | 682 | # perform a detailed movie lookup given a movie url |
| | 683 | # store what we find in our data cache |
| | 684 | |
# Download an IMDb movie page and store what can be parsed from it in
# $data_cache->{movie_lookup}->{$movie_url}.
#
# Parameters:
#   $movie_title - title used only in the progress text handed to get_url
#   $movie_url   - URL of the movie page; also the cache key
#
# Cache keys written (when the corresponding data is found):
#   last_fetched, title, year, cover,
#   directors / writers   (hash: link text => href),
#   cast                  (hash: person => role),
#   countries / languages / genres (lists, via imdb_list_parser),
#   tagline, plot, rating, runtime, aka, trivia, goofs, awards, summary
#                         (scalars via imdb_scalar_parser, undef if absent),
#   certifications        (hash: country => rating)
#
# Failures are counted in %stats and logged. Returns nothing in every case.
sub get_imdb_movie_online
{
    my ($movie_title, $movie_url) = @_;

    my $html_data = get_url($movie_url,"GET"," downloading online IMDb movie data for '$movie_title'",0);
    if (!$html_data) {
        $stats{failed_online_imdb_lookup}++;
        # the leading & is required: a bare log(...) would call Perl's built-in logarithm
        &log("failed to fetch imdb movie data from $movie_url");
        return;
    }

    $stats{imdb_movie_added_cache_entry}++;
    $data_cache->{movie_lookup}->{$movie_url}->{last_fetched} = time;

    # every result below lands in this one cache entry
    my $movie = $data_cache->{movie_lookup}->{$movie_url};
    my $tp;

    #
    # parse title and year from "<title>Name (YYYY...)</title>"
    #
    $tp = HTML::TokeParser->new(\$html_data);
    $tp->get_tag('title');
    my $title_text = $tp->get_text();
    if ($title_text =~ /(.*?)\s+\((\d{4}).*?\)/) {
        $movie->{title} = $1;
        $movie->{year} = $2;
    }

    if (!defined $movie->{title}) {
        $stats{failed_online_imdb_title_parsing}++;
        &log("failed to parse title within imdb movie data from $movie_url");
        return;
    }

    #
    # parse cover url: the poster is the <img> whose alt text is the title
    #
    my $title = $movie->{title};
    $tp = HTML::TokeParser->new(\$html_data);
    while (my $img_tag = $tp->get_tag('img')) {
        my $alt = $img_tag->[1]{alt};
        next if (!$alt);
        last if ($alt =~ /^poster not submitted/i);

        # \Q...\E: the title is literal text, not a pattern. Unquoted, a
        # title such as "(500) Days of Summer" would never match itself and
        # one such as "*batteries not included" would abort with a regex
        # compile error.
        if ($alt =~ /^\Q$title\E$/i) {
            $movie->{cover} = $img_tag->[1]{src};
            last;
        }
    }

    #
    # parse directors
    #
    $tp = HTML::TokeParser->new(\$html_data);
    while ($tp->get_tag('b')) {
        last if ($tp->get_text =~ /^directed/i);
    }
    while (my $tag = $tp->get_tag) {
        my $text = $tp->get_text();
        last if (($text =~ /writing/i) || ($tag->[0] =~ /\/td/i));
        next if ($tag->[0] ne 'a');

        # only links to person pages count; anchors without href are skipped
        my $id = $tag->[1]{href};
        next if ((!defined $id) || ($id !~ /^\/name\/nm/));
        $movie->{directors}->{$text} = $id;
    }

    #
    # parse writers
    #
    $tp = HTML::TokeParser->new(\$html_data);
    while ($tp->get_tag('b')) {
        last if ($tp->get_text =~ /^writing/i);
    }
    while (my $tag = $tp->get_tag) {
        my $text = $tp->get_text();
        last if ($tag->[0] =~ /\/table/i);

        # links whose text contains "more" are not names
        next if (($tag->[0] ne 'a') || ($text =~ /more/i));

        my $id = $tag->[1]{href};
        next if ((!defined $id) || ($id !~ /^\/name\/nm/));
        $movie->{writers}->{$text} = $id;
    }

    #
    # parse cast
    #
    $tp = HTML::TokeParser->new(\$html_data);
    while (my $tag = $tp->get_tag('b')) {
        next unless ((exists $tag->[1]{class}) && ($tag->[1]{class} eq 'blackcatheader'));
        last if ($tp->get_text =~ /^(cast overview|credited cast|(?:series )?complete credited cast)/i);
    }
    while (my $tag = $tp->get_tag('a')) {
        # check that the href is set before matching anything against it
        my $href = $tag->[1]{href};
        next if (!$href);
        last if ($href =~ /fullcredits/i);

        if ($href =~ /(?<!tinyhead)\/name\/nm(\d+?)\//) {
            my $person = $tp->get_text;
            # ignore id: my $id = $1;
            my $text = $tp->get_trimmed_text('a', '/tr');

            # the role is everything after the first whitespace-delimited word
            my $role = "";
            $role = $1 if ($text =~ /.*?\s+(.*)$/);
            $movie->{cast}->{$person} = $role;
        }
    }

    #
    # parse countries, languages, genres using generic list parser
    #
    imdb_list_parser($html_data,"country","countries",\$movie->{countries});
    imdb_list_parser($html_data,"language","language",\$movie->{languages});
    imdb_list_parser($html_data,"genre","genre",\$movie->{genres});

    #
    # parse tagline, plot, rating, runtime, aka, trivia, goofs, awards, summary
    # using generic scalar handler; each row is
    # [ cache key, heading pattern, text type, targetb, targetc, (targeta) ]
    #
    my @scalar_fields = (
        [ "tagline", "tagline",             "trimmed", "b", "a"      ],
        [ "plot",    "plot",                "trimmed", "b", "a"      ],
        [ "rating",  "user rating",         "trimmed", "b", "a", "b" ],
        [ "runtime", "runtime",             "trimmed", "b", "br"     ],
        [ "aka",     "(aka|also known as)", "trimmed", "b", "b"      ],
        [ "trivia",  "trivia",              "trimmed", "b", "a"      ],
        [ "goofs",   "goofs",               "trimmed", "b", "a"      ],
        [ "awards",  "awards",              "trimmed", "b", "a"      ],
        [ "summary", "plot summary",        "",        "b", "a"      ],
    );
    foreach my $field (@scalar_fields) {
        my ($key, @parser_args) = @$field;
        $movie->{$key} = imdb_scalar_parser($html_data, @parser_args);
    }

    #
    # certifications: link texts of the form "Country:Rating"
    #
    $tp = HTML::TokeParser->new(\$html_data);
    while ($tp->get_tag('b')) {
        last if ($tp->get_text =~ /^certification/i);
    }
    while (my $tag = $tp->get_tag()) {
        last if ($tag->[0] =~ /\/td/i);
        next if ($tag->[0] ne "a");

        my $href = $tag->[1]{href};
        next if ((!defined $href) || ($href !~ /certificates/i));

        my ($country, $range) = split(/:/, $tp->get_text);

        # an empty link text or one starting with ":" yields no country;
        # do not store an entry under an undef/empty key
        next if ((!defined $country) || ($country eq ""));
        $movie->{certifications}->{$country} = $range;
    }

    # don't yet pick the following up: do we need to?
    # official_sites
    # full plot

    return;
}
| | 824 | |
| | 825 | |
| | 826 | |
| | 827 | ###################################################################################################### |