| 614 | | $s = $data_cache->{$id}->{ocr_text}; |
| 615 | | } |
| 616 | | |
| 617 | | &log("parse_one_summary_prog: d".$day_num."p".$progs_in_day." ".$prog_details.": ".$s) if (defined $opt->{debug}); |
| 618 | | |
| | 660 | $s = $data_cache->{id_cache}->{$id}->{ocr_text}; |
| | 661 | $stats{used_cached_items}++; |
| | 662 | } |
| | 663 | $data_cache->{id_cache}->{$id}->{last_used} = time; |
| | 664 | |
| | 665 | if ($s =~ /\s*(\d+):(\d+)\s*(A|P)M\s+(.*)$/) { |
| | 666 | my $prog_hr = $1; |
| | 667 | $prog_hr = 0 if ($prog_hr == 12); |
| | 668 | |
| | 669 | $prog->{starttime} = (($prog_hr * 60) + $2) * 60; # seconds |
| | 670 | $prog->{starttime} += (60*60*12) if (lc($3) eq "p"); |
| | 671 | |
| | 672 | $prog->{title} = [[ $4, $opt->{lang} ]]; |
| | 673 | } elsif ($s =~ /\s*(\d+)\.(\d+)\.(\d+)\s+(\d+):(\d+)\s*(A|P)M\s+(.*)$/) { |
| | 674 | my $prog_hr = $4; |
| | 675 | $prog_hr = 0 if ($prog_hr == 12); |
| | 676 | |
| | 677 | $prog->{starttime} = (($prog_hr * 60) + $5) * 60; # seconds |
| | 678 | $prog->{starttime} += (60*60*12) if (lc($6) eq "p"); |
| | 679 | |
| | 680 | $prog->{title} = [[ $7, $opt->{lang} ]]; |
| | 681 | } |
| | 682 | |
| | 683 | if (!defined $prog->{title}) { |
| | 684 | &log("could not parse progname from OCR string '$s'. Format changed?"); |
| | 685 | $stats{unparsable_name}++; |
| | 686 | return; |
| | 687 | } |
| | 688 | if (!defined $prog->{starttime}) { |
| | 689 | &log("could not parse start time from OCR string '$s'. Format changed?"); |
| | 690 | $stats{unparsable_time}++; |
| | 691 | return; |
| | 692 | } |
| | 693 | |
| | 694 | if ($prog->{starttime} < (12*60*60)) { |
| | 695 | $prog->{starttime} += (24*60*60) if (defined $d->{seen_pm}->[$day_num]); |
| | 696 | } else { |
| | 697 | $d->{seen_pm}->[$day_num] = 1 if (!defined $d->{seen_pm}->[$day_num]); |
| | 698 | } |
| | 699 | $prog->{starttime} += $day_start; |
| | 700 | |
| | 701 | $prog->{id} = $id; |
| | 702 | $prog->{details} = $prog_details; |
| | 703 | $prog->{channel} = $channels->{TEN}; |
| | 704 | |
| | 705 | if ((defined $prog_rating) && ($prog_rating ne "")) { |
| | 706 | my @ratings; |
| | 707 | push(@ratings, [$prog_rating, 'ABA', undef]); |
| | 708 | $prog->{rating} = [ @ratings ]; |
| | 709 | } |
| | 710 | |
| | 711 | $prog->{subtitles} = [ { 'type' => 'teletext' } ] if ($prog_cc); |
| | 712 | if ($prog_hd) { |
| | 713 | $prog->{video}->{aspect} = "16:9"; # widescreen |
| | 714 | $prog->{video}->{quality} = "HDTV" unless (defined $opt->{no_hdtv_flags}); |
| | 715 | } |
| | 716 | |
| | 717 | push(@{($d->{progs})},$prog); |
| | 718 | |
| | 719 | if ($stats{programmes} > 0) { |
| | 720 | # set previous stoptime based on this starttime |
| | 721 | $d->{progs}->[($stats{programmes}-1)]->{stoptime} = $prog->{starttime}; |
| | 722 | } |
| | 723 | $stats{programmes}++; |
| | 724 | |
| | 725 | &log("parse_one_summary_prog: d".$day_num."p".$progs_in_day." ".$prog_details.": start:".$prog->{starttime}." name:".$prog->{title}->[0]->[0]) if (defined $opt->{debug}); |
| | 726 | |
| | 727 | } |
| | 728 | |
| | 729 | ############################################################################## |
| | 730 | |
| | 731 | sub get_detail_pages |
| | 732 | { |
| | 733 | my $prog_count; |
| | 734 | $stats{used_detailed_cache} = 0; |
| | 735 | &log("fetching up to ".$stats{programmes}." detail pages..."); |
| | 736 | |
| | 737 | foreach my $prog (@{($d->{progs})}) { |
| | 738 | my $was_in_cache = 0; |
| | 739 | |
| | 740 | $prog_count++; |
| | 741 | my $details = $prog->{details}; |
| | 742 | |
| | 743 | if (($prog_count % 10) == 1) { |
| | 744 | &log(" .. at programme ".$prog_count." of ".$stats{programmes}." (".$stats{used_detailed_cache}." from cache)"); |
| | 745 | } |
| | 746 | |
| | 747 | if (!defined $data_cache->{detail_cache}->{$details}) { |
| | 748 | my $url = "http://www.ten.com.au/".$details; |
| | 749 | my $data = &get_url($url,5); |
| | 750 | my $tree = HTML::TreeBuilder->new_from_content($data) if (defined $data); |
| | 751 | if ((!$data) || (!$tree)) { |
| | 752 | &log("url '$url' doesn't seem to contain any valid details. Has the format changed?"); |
| | 753 | $stats{bad_detail_response}++; |
| | 754 | return; |
| | 755 | } |
| | 756 | |
| | 757 | # parse description from first 'class=info' table cell |
| | 758 | my $prog_desc = $tree->look_down('_tag' => 'td', 'class' => 'info', 'style' => 'text-align:justify;'); |
| | 759 | if (defined $prog_desc) { |
| | 760 | $data_cache->{detail_cache}->{$details}->{desc} = [[ $prog_desc->as_text(), $opt->{lang} ]]; |
| | 761 | &log("got desc '".$prog_desc->as_text()."'") if (defined $opt->{debug} && $opt->{debug} > 1); |
| | 762 | } |
| | 763 | |
| | 764 | my $genre_group = &get_url("http://www.ten.com.au/pgutil/epfront.ashx?cd=2&id=".$prog->{id},3,undef,"HEAD"); |
| | 765 | if ((defined $genre_group) && ($genre_group ne "")) { |
| | 766 | if (defined $d->{gset}->{$genre_group}) { |
| | 767 | $data_cache->{detail_cache}->{$details}->{category} = [[ translate_category($d->{gset}->{$genre_group}), $opt->{lang} ]]; |
| | 768 | } else { |
| | 769 | $data_cache->{detail_cache}->{$details}->{category} = [[ $genre_group, $opt->{lang} ]]; |
| | 770 | &log("unknown genre group '$genre_group' for prog '".$prog->{title}->[0]->[0]."'"); |
| | 771 | $stats{unknown_genre}++; |
| | 772 | } |
| | 773 | } |
| | 774 | |
| | 775 | if ($data =~ /EPISODE:/) { |
| | 776 | my $s = &ocr_image($prog->{id}, "http://www.ten.com.au/pgutil/epfront.ashx?cd=1&id=".$prog->{id}, "http://www.ten.com.au/pgutil/epback.ashx?cd=1&id=".$prog->{id}, 3, 0, 1); |
| | 777 | if ((defined $s) && ($s ne "")) { |
| | 778 | $data_cache->{detail_cache}->{$details}->{'sub-title'} = [[ $s, $opt->{lang} ]]; |
| | 779 | } |
| | 780 | } |
| | 781 | |
| | 782 | $stats{used_detailed_cache}++; |
| | 783 | $was_in_cache = 1; |
| | 784 | } |
| | 785 | |
| | 786 | $data_cache->{detail_cache}->{$details}->{last_used} = time; |
| | 787 | |
| | 788 | $prog->{desc} = $data_cache->{detail_cache}->{$details}->{desc} |
| | 789 | if (defined $data_cache->{detail_cache}->{$details}->{desc}); |
| | 790 | |
| | 791 | $prog->{category} = $data_cache->{detail_cache}->{$details}->{category} |
| | 792 | if (defined $data_cache->{detail_cache}->{$details}->{category}); |
| | 793 | |
| | 794 | $prog->{'sub-title'} = $data_cache->{detail_cache}->{$details}->{'sub-title'} |
| | 795 | if (defined $data_cache->{detail_cache}->{$details}->{'sub-title'}); |
| | 796 | |
| | 797 | unless ((defined $opt->{fast}) || ($was_in_cache)) { |
| | 798 | my $sleep_for = 3 + int(rand(2)); |
| | 799 | sleep $sleep_for; |
| | 800 | $stats{slept_for} += $sleep_for; |
| | 801 | } |
| | 802 | } |
| 710 | | $charset{"87b7de3dbbeda572e883253803f73a78"}="e"; |
| 711 | | $charset{"caa8c600dd0aecf49f445753963e97b5"}=":"; |
| 712 | | $charset{"46854a6efed48426f1018828cca41ac2"}="o"; |
| 713 | | $charset{"f64db65ec25ba73bdced42fcf01be00a"}="r"; |
| 714 | | $charset{"d0ec43eddec59827259b46c460386ae0"}="l"; |
| 715 | | $charset{"d4fb05e2702c4199a73794b5def96ec2"}="G"; |
| 716 | | $charset{"ca1cbc6861523c4608f19365cac6dde0"}="0"; |
| 717 | | $charset{"a49667c09e9d4be0b595578d51eeb60d"}="M"; |
| 718 | | $charset{"1a9de7fb6f1c93f3ffaa15816549e43a"}="6"; |
| 719 | | $charset{"fbec6375cab7ff5b9d4b4783c7aab13b"}="s"; |
| 720 | | $charset{"b1465cc2781264fff5a55a9e9b3d8064"}="A"; |
| 721 | | $charset{"b61b8c026407890a23276d41125d7e98"}="h"; |
| 722 | | $charset{"8e2a682942360201f924e694dc70fa43"}="T"; |
| 723 | | $charset{"c1e402fec1d35694b1898b1f1dbb16bb"}="w"; |
| 724 | | $charset{"bed2eaba5e16b7246bb1f5b94d44b61c"}="h"; |
| 725 | | $charset{"c0107f886a27e42ce8fd2eca63a5ebcc"}="D"; |
| 726 | | $charset{"8300a291d7dae2e876126878c98af6d1"}="S"; |
| 727 | | $charset{"7edd9f81d7da6577d57da07f93f95b87"}="E"; |
| 728 | | $charset{"ec5d1b2140213fdbbf3c837400b2d3c5"}="e"; |
| 729 | | $charset{"14e3c224bd590504ffa95a1987ac3fbc"}="n"; |
| 730 | | $charset{"55ced8bf6a2a2482f578bc988b60b5ed"}="a"; |
| 731 | | $charset{"51bc70bfed877b2bf7300a5023a88634"}="o"; |
| 732 | | $charset{"5ae0c7cc64eb457ed198ee008fcd52f9"}="d"; |
| 733 | | $charset{"8769704a7c47684c74d841673664f942"}="V"; |
| 734 | | $charset{"763602fa61fe36273a3492f3fbae0ff8"}="t"; |
| 735 | | $charset{"0efb2ccf6c4e8b3084e56da89ad6629b"}="7"; |
| 736 | | $charset{"70dddd8427594526c8fd308b6151d673"}="i"; |
| 737 | | $charset{"75419c36d52e0f29143d4ecf3c5fd2fd"}="W"; |
| 738 | | $charset{"592559bccc3f515e5d2a93622320a1a2"}="m"; |
| 739 | | $charset{"4e4073891b344c07deceee07cd6ba348"}="g"; |
| 740 | | $charset{"9e2f28787475e105da5221e20eb7a137"}="r"; |
| 741 | | $charset{"f7e8dade3df2070be62dd206ef0cc8f1"}="5"; |
| 742 | | $charset{"62982338ad7a6b499056bac67f840d83"}="N"; |
| 743 | | $charset{"97fd0fdcedad187e8bf877adc5d580c5"}="2"; |
| 744 | | $charset{"35f9e067a546f3ae0057065223fe4c33"}="3"; |
| 745 | | $charset{"97f15e1d2ad1cb232147d0b6f01c8022"}="O"; |
| 746 | | $charset{"4c52033ce6a724d184d9c8d23a960d6e"}="P"; |
| 747 | | $charset{"9fbd3153eb8e55a0a1f453ee33e6bafd"}="i"; |
| 748 | | $charset{"e7a5cbb21f17f35f2f141e63f37a45fb"}="c"; |
| 749 | | $charset{"f7e8dade3df2070be62dd206ef0cc8f1"}="5"; |
| 750 | | $charset{"646db1d6726727e809ed1eb7ea11f545"}="8"; |
| 751 | | $charset{"c5ed676c18b62bc6885e34bf527e66af"}="Y"; |
| 752 | | $charset{"99bea8c75f15219ca16a7229b3938665"}="u"; |
| 753 | | $charset{"a9ea989899145834e84daf0abc5964f0"}="!"; |
| 754 | | $charset{"8113592ffa186852672d458f5bd86135"}="k"; |
| 755 | | $charset{"fec8880342772dd7e83ca9ffeed0e216"}="l"; |
| 756 | | $charset{"185a57d42d98c6cbd85135d9e8295501"}="D"; |
| 757 | | $charset{"606bf5428471cfd5de3434374c281334"}="y"; |
| 758 | | $charset{"50692dc12cde0fae151d9a0c2563c81d"}="J"; |
| 759 | | $charset{"f3045893d14fbb5f20e215a38617aee4"}="0"; |
| 760 | | $charset{"cf78a362c08ef3b9284ade8113e670d7"}="R"; |
| 761 | | $charset{"0de173cf09ded97fff935aa24f7f8bfe"}="z"; |
| 762 | | $charset{"cf63706b1f8eaa1c9120e1f9794918c3"}="T"; |
| 763 | | $charset{"f9f2e0d23af08cb6fbeacb686992f633"}="v"; |
| 764 | | $charset{"c1777c45a7d53a5d557c5da145bea080"}="'"; |
| 765 | | $charset{"88f4902f74cf89846318c96003466835"}="p"; |
| 766 | | $charset{"2b821839a93b75e470d04a5e2c1971b3"}="J"; |
| 767 | | $charset{"120cfb2dcf74d7900dc22d44bea9db09"}="H"; |
| 768 | | $charset{"298b488eb21a879c4cf9007c05283a15"}="s"; |
| 769 | | $charset{"08021ebe5ef72c0ed41b438fd794e71e"}="tt"; |
| 770 | | $charset{"b24415f6bccb3a9ad482156a524dbf1e"}="y"; |
| 771 | | $charset{"6c27fb8ed1d2d451785d957138ca0902"}="u"; |
| 772 | | $charset{"5a6e6307a1b18b409618616556a327e5"}="E"; |
| 773 | | $charset{"8d4430c7857a01d4805b4666c54fe114"}="b"; |
| 774 | | $charset{"cf9c23550ff1fde3b19b593966fdd391"}="S"; |
| 775 | | $charset{"97986e54d74ef7047eebc1169134564f"}="B"; |
| 776 | | $charset{"561fda757040c25038687752394d39a8"}="M"; |
| 777 | | $charset{"8015f8d4c3d6574c9ec73b412ece2013"}="L"; |
| 778 | | $charset{"7fdc4d50db244ad00f11d7c362f10b8f"}="9"; |
| 779 | | $charset{"73f0455d71b4156ab2bbebb7fac004ca"}="4"; |
| 780 | | $charset{"401ccf9844fe6399f13597cb458abedb"}="a"; |
| 781 | | $charset{"28de7104f0f94e161104c407071a5e91"}="m"; |
| 782 | | $charset{"cc282e429660787afc4a292a6e35cb2a"}="F"; |
| 783 | | $charset{"449bb458f502dbb10cf71673d1bd7ac4"}="5"; |
| 784 | | $charset{"eb6c72d1cb3b32bfcf646e2c5dafc4d2"}="N"; |
| 785 | | $charset{"aa364cab095bc5f46f855c9772619f5e"}="1"; |
| 786 | | $charset{"6aadacaa0e0b622fe755be8615f67f87"}="2"; |
| 787 | | $charset{"0f87f473885da54c2a7c886ae92f0ddd"}="R"; |
| 788 | | $charset{"17d10978ffc796cc024c68afa3fb463c"}="I"; |
| 789 | | $charset{"70b21817f2611845e464f8b551c73b71"}="1"; |
| 790 | | $charset{"f5a215139fdc4921b4fad687e0899fdc"}="H"; |
| 791 | | $charset{"8b9e1cc11d23773ca68afaea3064902a"}="A"; |
| 792 | | $charset{"6adf28b9140e9b236394bd6956638630"}="9"; |
| 793 | | $charset{"0d2eeea7b20edb640d5556ea8528ba67"}="K"; |
| 794 | | $charset{"3503cdc59df22be3b6242db35cfe3482"}="f"; |
| 795 | | $charset{"b5856240a388696d55ea99fad53166ce"}="W"; |
| 796 | | $charset{"4b8e08032dde00ced51e8435820be5e1"}="n"; |
| 797 | | $charset{"38b1c7da79cbbac219c590129f40cca1"}="k"; |
| 798 | | $charset{"1697c04376dac187f028f240cb0ccc9a"}="C"; |
| 799 | | $charset{"2e03a06a91a1993a5c6e15b43784e5c3"}="3"; |
| 800 | | $charset{"840e43645d65217fd0d57914321db2bf"}=":"; |
| 801 | | $charset{"5e871ec322ade9e74d44285c3ddad972"}="L"; |
| 802 | | $charset{"e3bcb0065109e004bc6b18b1403fb810"}="rt"; |
| 803 | | $charset{"9f32b9cd5083733eead4380bb6551ac5"}="B"; |
| 804 | | $charset{"148cafcb02f1a203866f583dbdb253af"}="&"; |
| 805 | | $charset{"519cc9d317d1a6db113c0da6e5560e71"}="d"; |
| 806 | | $charset{"40ec9716cfe72fe54201dae866e70ec5"}="V"; |
| 807 | | $charset{"ace17452c10518e97caba9493898c910"}="U"; |
| 808 | | $charset{"d244b3a33602a55c1ee8cf9c570dced9"}="-"; |
| 809 | | $charset{"25ee9123a9fdb7c164b29dfaa50d10b7"}="6"; |
| 810 | | $charset{"a87bc5bc8b3e5df44df2e2405561dd83"}="."; |
| 811 | | $charset{"f6e64e873007d53c7bf7873d639f4678"}="."; |
| 812 | | $charset{"de8b17aa3cf358a1e8b9496dd99e20f1"}="7"; |
| 813 | | $charset{"21e73997781a1af8c506eded30c6143f"}="4"; |
| 814 | | $charset{"4518bf9cb085588761164be21442aa5d"}="F"; |
| 815 | | $charset{"c38b4e845130be00f1a27a023241a500"}="!"; |
| 816 | | $charset{"f5a9cba4badf510bbde66e1012647c8c"}="O"; |
| 817 | | $charset{"c67d0abf9dd1bf2352613c243de4649b"}="P"; |
| 818 | | $charset{"453b59cf0cb2813958d5518fc668639c"}="Z"; |
| 819 | | $charset{"3dad6dcdedabfbb99ef2067f38d6bd67"}="B"; |
| 820 | | $charset{"b2da7f7ca8c9be23ca445a7df954a4f2"}="8"; |
| 821 | | $charset{"8a3bf2c9eb10c811e50c91759e6e57cc"}="G"; |
| 822 | | $charset{"f9e0333c0725c22b198bc0c3a7aa4a51"}="x"; |
| 823 | | $charset{"61ea6df7256f910d1cb031979d7d1eda"}="C"; |
| 824 | | $charset{"588b076556aa1b58810fe1f97fa77371"}="Y"; |
| 825 | | $charset{"8a3bf2c9eb10c811e50c91759e6e57cc"}="G"; |
| 826 | | $charset{"61ea6df7256f910d1cb031979d7d1eda"}="C"; |
| 827 | | } |
| 828 | | |
| 829 | | ############################################################################## |
| 830 | | |
| 831 | | sub setup_socks |
| 832 | | { |
| 833 | | use LWP::Protocol::http; |
| 834 | | my $orig_new_socket = \&LWP::Protocol::http::_new_socket; |
| 835 | | |
| 836 | | # override LWP::Protocol::http's _new_socket method with our own |
| 837 | | local($^W) = 0; |
| 838 | | *LWP::Protocol::http::_new_socket = \&socks_new_socket; |
| 839 | | |
| 840 | | # test that it works |
| 841 | | &log("configured to use Tor, testing that it works by connecting to www.google.com ..."); |
| 842 | | my $data = &get_url("http://www.google.com/",10); |
| 843 | | if (($data) && ($data =~ /Google/i)) { |
| 844 | | &log("success. Tor appears to be working!"); |
| 845 | | return; |
| 846 | | } |
| 847 | | |
| 848 | | &log("ERROR: Could not connect to www.google.com via Tor, disabling Tor."); |
| 849 | | &log(" DATA FETCHING WILL BE VERY SLOW."); |
| 850 | | &log(" DISABLING DETAILS-FETCHING BECAUSE OF THIS - SIGNIFICANTLY LOWER DATA QUALITY!!"); |
| 851 | | |
| 852 | | $opt->{no_details} = 1; |
| 853 | | delete $opt->{anon_socks}; |
| 854 | | $stats{fallback_to_non_tor}++; |
| 855 | | |
| 856 | | *LWP::Protocol::http::_new_socket = $orig_new_socket; |
| 857 | | } |
| 858 | | |
| 859 | | ############################################################################## |
| 860 | | # our own SOCKS4Aified version of LWP::Protocol::http::_new_socket |
| 861 | | |
| 862 | | sub socks_new_socket |
| 863 | | { |
| 864 | | my($self, $host, $port, $timeout) = @_; |
| 865 | | |
| 866 | | my ($socks_ip,$socks_port) = split(/:/,$opt->{anon_socks}); |
| 867 | | $socks_ip = "127.0.0.1" if (!defined $socks_ip); |
| 868 | | $socks_port = "9050" if (!defined $socks_port); |
| 869 | | |
| 870 | | local($^W) = 0; # IO::Socket::INET can be noisy |
| 871 | | my $sock = $self->socket_class->new( |
| 872 | | PeerAddr => $socks_ip, |
| 873 | | PeerPort => $socks_port, |
| 874 | | Proto => 'tcp'); |
| 875 | | |
| 876 | | unless ($sock) { |
| 877 | | # IO::Socket::INET leaves additional error messages in $@ |
| 878 | | $@ =~ s/^.*?: //; |
| 879 | | &log("Can't connect to $host:$port ($@)"); |
| 880 | | return undef; |
| 881 | | } |
| 882 | | |
| 883 | | # perl 5.005's IO::Socket does not have the blocking method. |
| 884 | | eval { $sock->blocking(0); }; |
| 885 | | |
| 886 | | # establish connectivity with socks server - SOCKS4A protocol |
| 887 | | print { $sock } pack("CCnN", 0x04, 0x01, $port, 1) . |
| 888 | | (pack 'x') . |
| 889 | | $host . (pack 'x'); |
| 890 | | |
| 891 | | my $received = ""; |
| 892 | | my $timeout_time = time + $timeout; |
| 893 | | while ($sock->sysread($received, 8) && (length($received) < 8) ) { |
| 894 | | select(undef, undef, undef, 0.25); |
| 895 | | last if ($timeout_time < time); |
| 896 | | } |
| 897 | | |
| 898 | | if ($timeout_time < time) { |
| 899 | | &log("Timeout ($timeout) while connecting via SOCKS server"); |
| 900 | | return $sock; |
| 901 | | } |
| 902 | | |
| 903 | | my ($null_byte, $req_status, $port_num, $ip_addr) = unpack('CCnN',$received); |
| 904 | | &log("Connection via SOCKS4A server rejected or failed") if ($req_status == 0x5b); |
| 905 | | &log("Connection via SOCKS4A server because client is not running identd") if ($req_status == 0x5c); |
| 906 | | &log("Connection via SOCKS4A server because client's identd could not confirm the user") if ($req_status == 0x5d); |
| 907 | | |
| 908 | | $sock; |
| 909 | | } |
| 910 | | |
| 911 | | ############################################################################## |
| | 893 | my %charset = qw{ |
| | 894 | 87b7de3dbbeda572e883253803f73a78 e caa8c600dd0aecf49f445753963e97b5 : 46854a6efed48426f1018828cca41ac2 o |
| | 895 | f64db65ec25ba73bdced42fcf01be00a r d0ec43eddec59827259b46c460386ae0 l d4fb05e2702c4199a73794b5def96ec2 G |
| | 896 | ca1cbc6861523c4608f19365cac6dde0 0 a49667c09e9d4be0b595578d51eeb60d M 1a9de7fb6f1c93f3ffaa15816549e43a 6 |
| | 897 | fbec6375cab7ff5b9d4b4783c7aab13b s b1465cc2781264fff5a55a9e9b3d8064 A b61b8c026407890a23276d41125d7e98 h |
| | 898 | 8e2a682942360201f924e694dc70fa43 T c1e402fec1d35694b1898b1f1dbb16bb w bed2eaba5e16b7246bb1f5b94d44b61c h |
| | 899 | c0107f886a27e42ce8fd2eca63a5ebcc D 8300a291d7dae2e876126878c98af6d1 S 7edd9f81d7da6577d57da07f93f95b87 E |
| | 900 | ec5d1b2140213fdbbf3c837400b2d3c5 e 14e3c224bd590504ffa95a1987ac3fbc n 55ced8bf6a2a2482f578bc988b60b5ed a |
| | 901 | 51bc70bfed877b2bf7300a5023a88634 o 5ae0c7cc64eb457ed198ee008fcd52f9 d 8769704a7c47684c74d841673664f942 V |
| | 902 | 763602fa61fe36273a3492f3fbae0ff8 t 0efb2ccf6c4e8b3084e56da89ad6629b 7 70dddd8427594526c8fd308b6151d673 i |
| | 903 | 75419c36d52e0f29143d4ecf3c5fd2fd W 592559bccc3f515e5d2a93622320a1a2 m 4e4073891b344c07deceee07cd6ba348 g |
| | 904 | 9e2f28787475e105da5221e20eb7a137 r f7e8dade3df2070be62dd206ef0cc8f1 5 62982338ad7a6b499056bac67f840d83 N |
| | 905 | 97fd0fdcedad187e8bf877adc5d580c5 2 35f9e067a546f3ae0057065223fe4c33 3 97f15e1d2ad1cb232147d0b6f01c8022 O |
| | 906 | 4c52033ce6a724d184d9c8d23a960d6e P 9fbd3153eb8e55a0a1f453ee33e6bafd i e7a5cbb21f17f35f2f141e63f37a45fb c |
| | 907 | f7e8dade3df2070be62dd206ef0cc8f1 5 646db1d6726727e809ed1eb7ea11f545 8 c5ed676c18b62bc6885e34bf527e66af Y |
| | 908 | 99bea8c75f15219ca16a7229b3938665 u a9ea989899145834e84daf0abc5964f0 ! 8113592ffa186852672d458f5bd86135 k |
| | 909 | fec8880342772dd7e83ca9ffeed0e216 l 185a57d42d98c6cbd85135d9e8295501 D 606bf5428471cfd5de3434374c281334 y |
| | 910 | 50692dc12cde0fae151d9a0c2563c81d J f3045893d14fbb5f20e215a38617aee4 0 cf78a362c08ef3b9284ade8113e670d7 R |
| | 911 | 0de173cf09ded97fff935aa24f7f8bfe z cf63706b1f8eaa1c9120e1f9794918c3 T f9f2e0d23af08cb6fbeacb686992f633 v |
| | 912 | c1777c45a7d53a5d557c5da145bea080 ' 88f4902f74cf89846318c96003466835 p 2b821839a93b75e470d04a5e2c1971b3 J |
| | 913 | 120cfb2dcf74d7900dc22d44bea9db09 H 298b488eb21a879c4cf9007c05283a15 s 08021ebe5ef72c0ed41b438fd794e71e tt |
| | 914 | b24415f6bccb3a9ad482156a524dbf1e y 6c27fb8ed1d2d451785d957138ca0902 u 5a6e6307a1b18b409618616556a327e5 E |
| | 915 | 8d4430c7857a01d4805b4666c54fe114 b cf9c23550ff1fde3b19b593966fdd391 S 97986e54d74ef7047eebc1169134564f B |
| | 916 | 561fda757040c25038687752394d39a8 M 8015f8d4c3d6574c9ec73b412ece2013 L 7fdc4d50db244ad00f11d7c362f10b8f 9 |
| | 917 | 73f0455d71b4156ab2bbebb7fac004ca 4 401ccf9844fe6399f13597cb458abedb a 28de7104f0f94e161104c407071a5e91 m |
| | 918 | cc282e429660787afc4a292a6e35cb2a F 449bb458f502dbb10cf71673d1bd7ac4 5 eb6c72d1cb3b32bfcf646e2c5dafc4d2 N |
| | 919 | aa364cab095bc5f46f855c9772619f5e 1 6aadacaa0e0b622fe755be8615f67f87 2 0f87f473885da54c2a7c886ae92f0ddd R |
| | 920 | 17d10978ffc796cc024c68afa3fb463c I 70b21817f2611845e464f8b551c73b71 1 f5a215139fdc4921b4fad687e0899fdc H |
| | 921 | 8b9e1cc11d23773ca68afaea3064902a A 6adf28b9140e9b236394bd6956638630 9 0d2eeea7b20edb640d5556ea8528ba67 K |
| | 922 | 3503cdc59df22be3b6242db35cfe3482 f b5856240a388696d55ea99fad53166ce W 4b8e08032dde00ced51e8435820be5e1 n |
| | 923 | 38b1c7da79cbbac219c590129f40cca1 k 1697c04376dac187f028f240cb0ccc9a C 2e03a06a91a1993a5c6e15b43784e5c3 3 |
| | 924 | 840e43645d65217fd0d57914321db2bf : 5e871ec322ade9e74d44285c3ddad972 L e3bcb0065109e004bc6b18b1403fb810 rt |
| | 925 | 9f32b9cd5083733eead4380bb6551ac5 B 148cafcb02f1a203866f583dbdb253af & 519cc9d317d1a6db113c0da6e5560e71 d |
| | 926 | 40ec9716cfe72fe54201dae866e70ec5 V ace17452c10518e97caba9493898c910 U d244b3a33602a55c1ee8cf9c570dced9 - |
| | 927 | 25ee9123a9fdb7c164b29dfaa50d10b7 6 a87bc5bc8b3e5df44df2e2405561dd83 . f6e64e873007d53c7bf7873d639f4678 . |
| | 928 | de8b17aa3cf358a1e8b9496dd99e20f1 7 21e73997781a1af8c506eded30c6143f 4 4518bf9cb085588761164be21442aa5d F |
| | 929 | c38b4e845130be00f1a27a023241a500 ! f5a9cba4badf510bbde66e1012647c8c O c67d0abf9dd1bf2352613c243de4649b P |
| | 930 | 453b59cf0cb2813958d5518fc668639c Z 3dad6dcdedabfbb99ef2067f38d6bd67 B b2da7f7ca8c9be23ca445a7df954a4f2 8 |
| | 931 | 8a3bf2c9eb10c811e50c91759e6e57cc G f9e0333c0725c22b198bc0c3a7aa4a51 x 61ea6df7256f910d1cb031979d7d1eda C |
| | 932 | 588b076556aa1b58810fe1f97fa77371 Y 8a3bf2c9eb10c811e50c91759e6e57cc G 61ea6df7256f910d1cb031979d7d1eda C |
| | 933 | |
| | 934 | 5892305501d6d7b3c944edcdfac487b0 W cb28d04e3bbe3bfd0bf0086b5b50b50e a d9f38cfa215b61b0baf8d3232ab71e5a c |
| | 935 | bcfbf5865682d0d691b0ba7ad34b4e5f k 0ff718ec0df83d26df8ef58f27af3e1d y df27299772b1c1fa25bc74e3e0b28519 M |
| | 936 | e74795b60c312f1fa48d956433cffd67 d e1bcd7c44b8fd705281926db43eae7f2 n c677cf0e0d2124629e224628a01a96fe e |
| | 937 | df74545eacbaf90dc1206ef81be97bbb s 118dfa4e0e53dbde0a74554c16f4b6e8 A 6ac1a637edb8d167b9b0263b72d30d50 B |
| | 938 | ab0135e45bdc858357c40d35e2a6d662 l 202c9276948bf52699ef2521988c2ed0 z f65810bb9e22c25d31a442b3ff1ec3e8 i |
| | 939 | f079edc2a2167e5c3b5a0250130ad3cc g 6ac1a637edb8d167b9b0263b72d30d50 B 238d4f228563b5efcd46fdb0ee0fa367 ttl |
| | 940 | df74545eacbaf90dc1206ef81be97bbb s acf02f7463a907c98ccfdaf1364e506a ( 761086404df3dd6a879c15722e6b5c72 P |
| | 941 | f932ec8ddd3f2edd739a715090614687 1 ff8c0771c4dc7c6a1867ada5d47c1446 ) 26e1d6a4efa3a6e7d107a7003924ad9f rt |
| | 942 | 27e56f6930a29f7ccb1f2ed98c2c99be G 168527e83abcaee41f74514b627b651a ra 9ab22051e33a6755e407cc69ea9d02b4 a |
| | 943 | 9ab22051e33a6755e407cc69ea9d02b4 p 0e6e0a842f847b0997de866dcb69fd7d th 98dee45f3aa315b8d6d1c2a83208e158 u |
| | 944 | 90b5c188102f105c0cab2556d27b0788 rd 154690fb8d4578148e1513ab0f921076 P 34065c67fbb12cce0561001cd462d573 a |
| | 945 | 1d58b69f2b50b50daacfb7645a0fdd18 rt 13f9bf707f893bc39e10ce0475e151a7 I 3af52f596fd1c33743a59d7fa816aaa3 ts |
| | 946 | 2e1e8bd83e52ee09bb58297aeb1da158 Th 1e128beba3aff04a49fba2b291603579 Re aa296120499cf1ee8868ec6759895f9b m |
| | 947 | f74eae3e6c5426b5da01fb1ad236e1a2 Tw faa3822c5ea6489c829cafc96ba86271 o a507f381a52898da1b4c63a3252559ef N |
| | 948 | 0e07a84d610ae1d5f823c02573825438 h 49b35e005120197a73685301f17ddd92 b f66ffe4a80deebe8ffca678d33e33f7b rs |
| | 949 | 2e1e8bd83e52ee09bb58297aeb1da158 Th 28a61bb021be4f7b4d43c3a995207169 re 33f3092e1d836e03bbbe45cf77f46183 S |
| | 950 | 517c4ccbb8292617db5d758e868023a0 M ac8de377a8f7d07007d10ad37eeaa88b r e42460162dfa7d0d9ad67efe32f9505c . |
| | 951 | 7f84b8c690c3b0412a0514e117a04c69 S fd84447f45a91a443e1863fa7a2c830e p 1e92bddfb0b4813630d147a38863543d ri |
| | 952 | 84db1131cd6f3ed6f630e58b879f781f tz 0fd741130b71b082f1eeebda6e2e2811 G a26fbebcec2437f07bad0ad6f6dc2313 o |
| | 953 | 29f53067840a08d6ca5c34834ad14e77 e 673324edd255d182fad9267db821f230 s d3f8a87a788b91db4886c6a4c0e5a82d To |
| | 954 | 11a9bc26a268f7cd5787ccae1a3a7fd6 to a68667571be8a5b2aaf5fd4f4f429d41 D 8d4c375b6b8db04ccee5077e5ba33863 Re |
| | 955 | ed445642499ca8148938c51518771540 e a1d72e973b08017846fcd70a732b3143 i 70a7a183ec29e18634005ddde569f65d a |
| | 956 | 29dc936fcdb2723b69c638a022135ff2 tch 3961534a0448ed072632dce5dba32d2a e e27d84de85414214f105583f45d406d7 d |
| | 957 | 0dc0ef29925f3ddffb70ce1107ca1b4d ri b46e207278c9048939ff4eb56d1aa847 t 53f78e0dc0417e0f6a455299e15dca0c V |
| | 958 | ef32aff5c88702eb5ed51c3a6836a583 7 732b43290b91d76547d1e4dd5e85ab8f - ade03db1bcb287d34d4ca9c9bd82c227 r |
| | 959 | 2f49cdc45bf918107fd3001a57d334cc U 96ab55702d9094de2f158ec3a5f1dd00 n a1a6c673257c30fe6b02ed3a5de7acec to |
| | 960 | 222c34badb06b16ff61a3bfdbd2087c5 l b6e528d8cb510fceabfcb1d280e539d9 W bd4a858bb84721b3c83498f9e4e33b20 a |
| | 961 | 222c34badb06b16ff61a3bfdbd2087c5 l a49e3b56b645aa6dc1de7a81898c92ba th b2c89ec08fe126b2e147bc3fceb5b72e S |
| | 962 | 05dd472da0bb30cf7eb463c5eea42aca u ce6488a8ce8ae8a8e81bdc631880780d c 000312319671d8f7f93eb9461828c238 s |
| | 963 | 49ba6d6bfe0d856eb6808ab901bf0ec3 F 207d6b243ade809ae1cad6507711d528 ro 37138974a7027ed973547cce5fba5db7 m |
| | 964 | c458ef3d193bfddaecd9970d9a57f844 P bd4a858bb84721b3c83498f9e4e33b20 a af722d233b9e8ae897b72d15fd8b5bc4 ti |
| | 965 | 7f76b9fb361c686de8ec1c828c71da4b v 4dcdd7bc37f7b3dae2943ddb8618bbc1 9 8fc445dd8da1ee8f8542ca18a4816109 V |
| | 966 | 5eee84d45d3263e5db81dfcc62d101fa 2 |
| | 967 | }; |
| | 968 | $d->{charset} = \%charset; |
| | 969 | |
| | 970 | |
| | 971 | my %gset = qw{491 News 508 Children 531 Entertainment 496 Drama 533 Infotainment 507 Religion}; |
| | 972 | $d->{gset} = \%gset; |
| | 973 | } |
| | 974 | |
| | 975 | ############################################################################## |
| | 976 | |
| | 977 | sub ocr_image |
| | 978 | { |
| | 979 | my ($id, $fg_url, $bg_url, $tries, $space_width, $multiline) = @_; |
| | 980 | $multiline = 0 if (!defined $multiline); |
| | 981 | |
| | 982 | my $fg_gif_image = &get_url($fg_url, $tries); |
| | 983 | my $bg_png_image = &get_url($bg_url, $tries); |
| | 984 | |
| | 985 | my $fg_image = GD::Image->newFromGifData($fg_gif_image); |
| | 986 | my $bg_image = GD::Image->newFromPngData($bg_png_image); |
| | 987 | |
| | 988 | $bg_image->copyMerge($fg_image, 0, 0, 0, 0, $fg_image->width, $fg_image->height, 100); |
| | 989 | |
| | 990 | if (!$multiline) { |
| | 991 | # remove underline |
| | 992 | my $white = $bg_image->colorExact(255,255,255); |
| | 993 | $bg_image->filledRectangle(0, 14, $fg_image->width, $fg_image->height, $white); |
| | 994 | |
| | 995 | return parse_characters($id, $bg_image, $space_width); |
| | 996 | } |
| | 997 | |
| | 998 | return parse_multiline_characters($id, $bg_image); |
| | 999 | } |
| | 1000 | |
| | 1001 | ############################################################################## |
| | 1002 | |
| | 1003 | sub parse_multiline_characters |
| | 1004 | { |
| | 1005 | my ($imgname,$i) = @_; |
| | 1006 | |
| | 1007 | my ($width, $height) = $i->getBounds; |
| | 1008 | my $bg = $i->getPixel(0,0); |
| | 1009 | &log("image bounds: x=$width, y=$height") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1010 | |
| | 1011 | # flatten image colours |
| | 1012 | my @bgcolour; |
| | 1013 | foreach my $index (0..$i->colorsTotal) { |
| | 1014 | my @r = $i->rgb($index); |
| | 1015 | my $total = $r[0]+$r[1]+$r[2]; |
| | 1016 | |
| | 1017 | if ($total > (240*3)) { |
| | 1018 | $bgcolour[$index] = 1; |
| | 1019 | } else { |
| | 1020 | $bgcolour[$index] = 0; |
| | 1021 | } |
| | 1022 | } |
| | 1023 | |
| | 1024 | my $last_char_x; |
| | 1025 | my $charnum = 0; |
| | 1026 | my $s = ""; |
| | 1027 | |
| | 1028 | my $done = 0; |
| | 1029 | my $char_y1 = 0; |
| | 1030 | my $char_y2 = 0; |
| | 1031 | |
| | 1032 | while (!$done) { |
| | 1033 | # 1. find first non-blank horizontal |
| | 1034 | $char_y1 = $char_y2; |
| | 1035 | my $blank_y_line = 1; # until proven otherwise |
| | 1036 | while (($blank_y_line) && ($char_y1 < ($height-1))) { |
| | 1037 | my $char_x = 0; |
| | 1038 | while ($char_x < ($width-1)) { |
| | 1039 | my $index = $i->getPixel($char_x, $char_y1); |
| | 1040 | $blank_y_line = 0 if ($bgcolour[($i->getPixel($char_x,$char_y1))] == 0); |
| | 1041 | $char_x++; |
| | 1042 | } |
| | 1043 | if ($blank_y_line) { |
| | 1044 | &log("[1] whole-of-line y $char_y1 was blank!") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1045 | $char_y1++; |
| | 1046 | } |
| | 1047 | } |
| | 1048 | if ($blank_y_line) { |
| | 1049 | &log("[1] reached end of image without finding anymore non-blank y lines. end of image!") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1050 | $done = 1; |
| | 1051 | next; |
| | 1052 | } |
| | 1053 | &log("[1] non-blank horizontal line found: y1=$char_y1") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1054 | |
| | 1055 | # 2. find first blank horizontal |
| | 1056 | $char_y2 = $char_y1; |
| | 1057 | my $nonblank_y_line = 0; |
| | 1058 | while (($nonblank_y_line == 0) && ($char_y2 < ($height-1))) { |
| | 1059 | my $char_x = 0; |
| | 1060 | $nonblank_y_line = 1; |
| | 1061 | while ($char_x < ($width-1)) { |
| | 1062 | my $index = $i->getPixel($char_x, $char_y2); |
| | 1063 | $nonblank_y_line = 0 if ($bgcolour[($i->getPixel($char_x,$char_y2))] == 0); |
| | 1064 | $char_x++; |
| | 1065 | } |
| | 1066 | if ($nonblank_y_line == 0) { |
| | 1067 | &log("[2] whole-of-line y $char_y2 was nonblank!") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1068 | $char_y2++; |
| | 1069 | } |
| | 1070 | } |
| | 1071 | &log("[2] blank horizontal line found: y2=$char_y2") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1072 | |
| | 1073 | my $done_line = 0; |
| | 1074 | my $char_x1 = 0; |
| | 1075 | my $char_x2 = 0; |
| | 1076 | while (!$done_line) { |
| | 1077 | # 3. find first non-blank vertical between char_y1 and char_y2 |
| | 1078 | $char_x1 = $char_x2; |
| | 1079 | my $blank_x_line = 1; # until proven otherwise |
| | 1080 | while (($blank_x_line) && ($char_x1 < ($width-1))) { |
| | 1081 | my $char_y = $char_y1; |
| | 1082 | while ($char_y < $char_y2) { |
| | 1083 | my $index = $i->getPixel($char_x1,$char_y); |
| | 1084 | $blank_x_line = 0 if ($bgcolour[($i->getPixel($char_x1,$char_y))] == 0); |
| | 1085 | $char_y++; |
| | 1086 | } |
| | 1087 | if ($blank_x_line) { |
| | 1088 | &log("[3] whole-of-line x $char_x1 was blank!") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1089 | $char_x1++; |
| | 1090 | } |
| | 1091 | } |
| | 1092 | if ($blank_x_line) { |
| | 1093 | &log("[3] end of this line (x1 is $char_x1), looking for next line...") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1094 | $done_line = 1; |
| | 1095 | $s .= " " if ($s ne ""); |
| | 1096 | next; |
| | 1097 | } |
| | 1098 | &log("[3] non-blank vertical line found: x1=$char_x1") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1099 | |
| | 1100 | # 4. find first blank vertical between char_y1 and char_y2 |
| | 1101 | $char_x2 = $char_x1; |
| | 1102 | my $nonblank_x_line = 0; |
| | 1103 | while (($nonblank_x_line == 0) && ($char_x2 < ($width-1))) { |
| | 1104 | my $char_y = $char_y1; |
| | 1105 | $nonblank_x_line = 1; |
| | 1106 | while ($char_y < $char_y2) { |
| | 1107 | my $index = $i->getPixel($char_x2,$char_y); |
| | 1108 | $nonblank_x_line = 0 if ($bgcolour[($i->getPixel($char_x2,$char_y))] == 0); |
| | 1109 | $char_y++; |
| | 1110 | } |
| | 1111 | if ($nonblank_x_line == 0) { |
| | 1112 | &log("[4] whole-of-line x $char_x2 wasn't blank!") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1113 | $char_x2++; |
| | 1114 | } |
| | 1115 | } |
| | 1116 | &log("blank vertical line found: x2=$char_x2") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1117 | |
| | 1118 | &log("looking at character between: x1,y1 $char_x1,$char_y1 and x2,y2 $char_x2,$char_y2 ........") if ((defined $opt->{debug}) && ($opt->{debug} > 3)); |
| | 1119 | # 5. insert spaces |
| | 1120 | if ((defined $last_char_x) && (($char_x1 - $last_char_x) >= 3)) { |
| | 1121 | $s .= " "; |
| | 1122 | $charnum++; |
| | 1123 | } |
| | 1124 | $last_char_x = $char_x2; |
| | 1125 | |
| | 1126 | # 6. md5 char |
| | 1127 | my $str = pack('NN',($char_x2-$char_x1),($char_y2-$char_y1)); |
| | 1128 | foreach my $x ($char_x1..($char_x2-1)) { |
| | 1129 | foreach my $y ($char_y1..($char_y2-1)) { |
| | 1130 | $str .= pack('N', $bgcolour[($i->getPixel($x, $y))]); |
| | 1131 | } |
| | 1132 | } |
| | 1133 | my $md5 = Digest::MD5::md5_hex($str); |
| | 1134 | $charnum++; |
| | 1135 | |
| | 1136 | # 7. insert char |
| | 1137 | if ((!defined $d->{charset}->{$md5}) || ($d->{charset}->{$md5} eq "?")) { |
| | 1138 | if (defined $opt->{ocr_learn_mode}) { |
| | 1139 | $d->{charset}->{$md5} = "[".$md5."]"; |
| | 1140 | $s .= "[".$md5."]"; |
| | 1141 | } else { |
| | 1142 | $s .= "?"; |
| | 1143 | } |
| | 1144 | } else { |
| | 1145 | $s .= $d->{charset}->{$md5}; |
| | 1146 | } |
| | 1147 | } |
| | 1148 | } |
| | 1149 | |
| | 1150 | &log("multiline ocr got '$s'") if (defined $opt->{debug} && $opt->{debug} > 1); |
| | 1151 | return $s; |
| | 1152 | } |