| 1 | #!/usr/bin/perl -w |
|---|
| 2 | |
|---|
| 3 | # IMDb XMLTV data augmenter <ltd@interlink.com.au> |
|---|
| 4 | # |
|---|
| 5 | # * to be used as a postprocessor for XMLTV data |
|---|
| 6 | # * uses The Internet Movie Database (www.imdb.com) to augment TV guide data; |
|---|
| 7 | # contacts www.imdb.com to collect actual movie details |
|---|
| 8 | # * this should only be used for non-commercial use. |
|---|
| 9 | # please follow the IMDb terms and conditions. |
|---|
| 10 | # * can be used in conjunction with 'shepherd' XMLTV reconciler or standalone |
|---|
| 11 | # (pipe-through) |
|---|
| 12 | # * no configuration necessary |
|---|
| 13 | # |
|---|
| 14 | # based roughly on a few existing IMDB XMLTV modules and IMDB CPAN modules |
|---|
| 15 | # but doesn't actually use them due to the large number of interdependencies |
|---|
| 16 | # they drag in. |
|---|
| 17 | # much credit goes to Michael Stepanov for his excellent IMDB::Film module |
|---|
| 18 | # and the regex's used to match data from IMDb pages |
|---|
| 19 | # |
|---|
| 20 | # changelog: |
|---|
| 21 | # 0.01 09aug06 ltd initial version |
|---|
| 22 | # 0.03 19aug06 ltd actually do imdb lookups & augment data |
|---|
| 23 | # 0.09 09feb07 paulx variety of fixes from paulx@andreassen.com.au |
|---|
| 24 | # as per trac ticket #31 |
|---|
| 25 | |
|---|
| 26 | use strict; |
|---|
| 27 | |
|---|
| 28 | my $progname = "imdb_augment_data"; |
|---|
| 29 | my $version = "0.32"; |
|---|
| 30 | |
|---|
| 31 | use XMLTV; |
|---|
| 32 | use POSIX qw(strftime mktime); |
|---|
| 33 | use Getopt::Long; |
|---|
| 34 | use HTML::TokeParser; |
|---|
| 35 | use Data::Dumper; |
|---|
| 36 | use Storable; |
|---|
| 37 | use HTML::TreeBuilder; |
|---|
| 38 | use Shepherd::Common; |
|---|
| 39 | |
|---|
| 40 | # |
|---|
| 41 | # some initial cruft |
|---|
| 42 | # |
|---|
| 43 | |
|---|
| 44 | my $script_start_time = time; |
|---|
| 45 | my %stats; |
|---|
| 46 | my $data_cache; |
|---|
| 47 | my $settings_override = { }; |
|---|
| 48 | |
|---|
| 49 | $| = 1; |
|---|
| 50 | |
|---|
| 51 | # |
|---|
| 52 | # parse command line |
|---|
| 53 | # |
|---|
| 54 | |
|---|
| 55 | my $opt = { }; |
|---|
| 56 | $opt->{output_file} = "output.xmltv"; |
|---|
| 57 | $opt->{cache_file} = "imdb_augment_data.storable.cache"; |
|---|
| 58 | $opt->{lang} = "en"; |
|---|
| 59 | $opt->{debug} = 0; |
|---|
| 60 | $opt->{min_duration} = 65; # 65 mins |
|---|
| 61 | $opt->{max_duration} = 240; # 4 hours |
|---|
| 62 | $opt->{skip_categories} = "Infotainment,Shopping,Business and Finance,Game Show,News,Parliament,Current Affairs,sports,Sport,Weather,Reality,live"; |
|---|
| 63 | $opt->{cache_details_for} = 120; # cache movie details for 4 months |
|---|
| 64 | $opt->{cache_title_for} = 120; # cache title lookups for 4 months |
|---|
| 65 | |
|---|
| 66 | GetOptions( |
|---|
| 67 | 'region=i' => \$opt->{region}, # ignored |
|---|
| 68 | 'days=i' => \$opt->{days}, # ignored |
|---|
| 69 | 'offset=i' => \$opt->{offset}, # ignored |
|---|
| 70 | 'timezone=s' => \$opt->{timezone}, # ignored |
|---|
| 71 | 'channels_file=s' => \$opt->{channels_file}, # ignored |
|---|
| 72 | 'config-file=s' => \$opt->{configfile}, # ignored |
|---|
| 73 | |
|---|
| 74 | 'min_duration=i' => \$opt->{min_duration}, |
|---|
| 75 | 'max_duration=i' => \$opt->{max_duration}, |
|---|
| 76 | 'skip_categories=s' => \$opt->{skip_categories}, |
|---|
| 77 | 'cache_details_for=i' => \$opt->{cache_details_for}, |
|---|
| 78 | 'cache_title_for=i' => \$opt->{cache_title_for}, |
|---|
| 79 | 'long-info' => \$opt->{long_info}, |
|---|
| 80 | 'dont-augment-desc' => \$opt->{dont_augment_desc}, |
|---|
| 81 | |
|---|
| 82 | 'output=s' => \$opt->{output_file}, |
|---|
| 83 | 'cache-file=s' => \$opt->{cache_file}, |
|---|
| 84 | 'fast' => \$opt->{fast}, |
|---|
| 85 | 'no-cache' => \$opt->{no_cache}, |
|---|
| 86 | 'debug+' => \$opt->{debug}, |
|---|
| 87 | 'lang=s' => \$opt->{lang}, |
|---|
| 88 | 'no-retry' => \$opt->{dont_retry}, |
|---|
| 89 | 'help' => \$opt->{help}, |
|---|
| 90 | 'test=s' => \$opt->{test}, |
|---|
| 91 | 'simpletest=s' => \$opt->{simpletest}, |
|---|
| 92 | 'set=s' => \$opt->{set}, |
|---|
| 93 | 'verbose' => \$opt->{help}, |
|---|
| 94 | 'version' => \$opt->{version}, |
|---|
| 95 | 'ready' => \$opt->{ready}, |
|---|
| 96 | 'desc' => \$opt->{desc}, |
|---|
| 97 | 'v' => \$opt->{version}); |
|---|
| 98 | |
|---|
| 99 | printf "%s v%s\n",$progname,$version; |
|---|
| 100 | |
|---|
| 101 | if ($opt->{version} || $opt->{desc} || $opt->{help} || $opt->{ready} || |
|---|
| 102 | $opt->{output_file} eq "") { |
|---|
| 103 | printf "Augments XMLTV data with programme information from ". |
|---|
| 104 | "The Internet Movie Database (www.imdb.com)\n" if $opt->{desc}; |
|---|
| 105 | |
|---|
| 106 | printf "$progname is ready for operation.\n" if ($opt->{ready}); |
|---|
| 107 | |
|---|
| 108 | printf "No --output file specified.\n" if ($opt->{output_file} eq ""); |
|---|
| 109 | |
|---|
| 110 | if ($opt->{help} || $opt->{output_file} eq "") { |
|---|
| 111 | print<<EOF |
|---|
| 112 | |
|---|
| 113 | usage: $0 [options] {FILE(s)} |
|---|
| 114 | |
|---|
| 115 | Supported options include: |
|---|
| 116 | --min_duration={min} ignore programs under {min} duration (default: $opt->{min_duration} min) |
|---|
| 117 | --max_duration={min} ignore programs over {min} duration (default: $opt->{max_duration} min) |
|---|
| 118 | --skip_categories={list} don't try to look up programmes in these categories (default: $opt->{skip_categories}) |
|---|
| 119 | |
|---|
| 120 | --dont-augment-desc don't add any IMDb data to programme description, |
|---|
| 121 | only update the data fields (default: do) |
|---|
| 122 | --long-info add lots of IMDb data to description |
|---|
| 123 | |
|---|
| 124 | --cache_details_for={days} cache programme details for {days} (def: $opt->{cache_details_for} days) |
|---|
| 125 | --cache_title_for={days} cache IMDb URLs for {days} (def: $opt->{cache_title_for} days) |
|---|
| 126 | |
|---|
| 127 | --lang={lang} set language to {lang} (default: $opt->{lang}) |
|---|
| 128 | --output={file} send final XMLTV output to {file} (default: $opt->{output_file}) |
|---|
| 129 | --debug enable debugging |
|---|
| 130 | --fast don't pause between requests to www.imdb.com |
|---|
| 131 | |
|---|
| 132 | --cache-file={file} local file to use as our data cache (default: $opt->{cache_file}) |
|---|
| 133 | --no-cache don't use local cache to reduce network load on www.imdb.com |
|---|
| 134 | --no-retry don't retry failed HTTP requests |
|---|
| 135 | |
|---|
| 136 | --test=(string) operate in 'test mode', look up prog named (string) |
|---|
| 137 | |
|---|
| 138 | --set=(setting):(value) save setting override: (value) 1=enable, 0=disable |
|---|
| 139 | dont-augment-desc:1/0 (don't / do) |
|---|
| 140 | long-info:1/0 (yes / no) |
|---|
| 141 | |
|---|
| 142 | EOF |
|---|
| 143 | ; |
|---|
| 144 | } |
|---|
| 145 | exit(0); |
|---|
| 146 | } |
|---|
| 147 | |
|---|
| 148 | &run_test if (defined $opt->{test}); |
|---|
| 149 | &simple_search_imdb(split(/,/, $opt->{simpletest})) if ($opt->{simpletest}); |
|---|
| 150 | &set_settings if (defined $opt->{set}); |
|---|
| 151 | |
|---|
| 152 | # set defaults |
|---|
| 153 | Shepherd::Common::set_default("debug", ((defined $opt->{debug} && $opt->{debug} > 0) ? 2 : 0)); |
|---|
| 154 | Shepherd::Common::set_default("stats", \%stats); |
|---|
| 155 | Shepherd::Common::set_default("retry_delay", 10); |
|---|
| 156 | Shepherd::Common::set_default("delay", int(rand(4) + 3)) unless (defined $opt->{fast}); |
|---|
| 157 | Shepherd::Common::set_default('fake' => 0); |
|---|
| 158 | |
|---|
| 159 | # go go go! |
|---|
| 160 | |
|---|
| 161 | Shepherd::Common::log(sprintf "started: cache %s, %s%soutput %s", |
|---|
| 162 | ($opt->{no_cache} ? "disabled" : "enabled"), |
|---|
| 163 | ($opt->{fast} ? "fast-override, " : ""), |
|---|
| 164 | ($opt->{debug} ? "debug enabled, " : ""), |
|---|
| 165 | ($opt->{output_file})); |
|---|
| 166 | |
|---|
| 167 | &read_cache unless ($opt->{no_cache}); |
|---|
| 168 | |
|---|
| 169 | my %writer_args = ( encoding => 'ISO-8859-1' ); |
|---|
| 170 | my $fh = new IO::File(">".$opt->{output_file}) || |
|---|
| 171 | die "can't open $opt->{output_file} for writing: $!"; |
|---|
| 172 | $writer_args{OUTPUT} = $fh; |
|---|
| 173 | |
|---|
| 174 | my $writer = new XMLTV::Writer(%writer_args); |
|---|
| 175 | $writer->start( { |
|---|
| 176 | 'source-info-url' => "http://www.imdb.com", |
|---|
| 177 | 'source-info-name' => "$progname $version", |
|---|
| 178 | 'generator-info-name' => "$progname $version"} ); |
|---|
| 179 | |
|---|
| 180 | foreach my $file (@ARGV) { |
|---|
| 181 | Shepherd::Common::log((sprintf "Parsing: %s", |
|---|
| 182 | ($file eq "-" ? "(from-stdin, hit control-D to finiah)" : $file))); |
|---|
| 183 | XMLTV::parsefiles_callback(\&encoding_cb, \&credits_cb, |
|---|
| 184 | \&channel_cb,\&programme_cb, $file); |
|---|
| 185 | } |
|---|
| 186 | |
|---|
| 187 | $writer->end(); |
|---|
| 188 | Shepherd::Common::log("Finished parsing, output in $opt->{output_file}"); |
|---|
| 189 | |
|---|
| 190 | &write_cache unless ($opt->{no_cache}); |
|---|
| 191 | |
|---|
| 192 | Shepherd::Common::print_stats($progname, $version, $script_start_time, %stats); |
|---|
| 193 | |
|---|
| 194 | exit(0); |
|---|
| 195 | |
|---|
| 196 | ############################################################################## |
|---|
| 197 | |
|---|
| 198 | sub set_settings |
|---|
| 199 | { |
|---|
| 200 | &read_cache; |
|---|
| 201 | my ($setting, $val) = split(/:/,$opt->{set}); |
|---|
| 202 | |
|---|
| 203 | die "--set format is (setting):(value) where value is 0 for disable, 1 for enable.\n" |
|---|
| 204 | if ((!defined $val) || (($val ne "0") && ($val ne "1"))); |
|---|
| 205 | |
|---|
| 206 | die "unknown '--set' parameter '$setting', see --help for details.\n" |
|---|
| 207 | if ($setting ne "dont-augment-desc" and $setting ne "long-info"); |
|---|
| 208 | |
|---|
| 209 | printf "%s: override parameter %s: %s\n", $progname, $setting, ($val eq "0" ? "disabled" : "enabled"); |
|---|
| 210 | |
|---|
| 211 | $setting =~ s/-/_/g; |
|---|
| 212 | $settings_override->{$setting} = $val; |
|---|
| 213 | &write_cache; |
|---|
| 214 | exit(0); |
|---|
| 215 | } |
|---|
| 216 | |
|---|
| 217 | ############################################################################## |
|---|
| 218 | # populate cache |
|---|
| 219 | |
|---|
| 220 | sub read_cache |
|---|
| 221 | { |
|---|
| 222 | # |
|---|
| 223 | # new-style Storable cache |
|---|
| 224 | # |
|---|
| 225 | if (-r $opt->{cache_file}) { |
|---|
| 226 | my $store = Storable::retrieve($opt->{cache_file}); |
|---|
| 227 | $data_cache = $store->{data_cache}; |
|---|
| 228 | $settings_override = $store->{settings_override}; |
|---|
| 229 | |
|---|
| 230 | foreach my $setting (keys %$settings_override) { |
|---|
| 231 | $opt->{$setting} = 1 if ($settings_override->{$setting} != 0); |
|---|
| 232 | } |
|---|
| 233 | } else { |
|---|
| 234 | printf "WARNING: no cache $opt->{cache_file} - ". |
|---|
| 235 | "have to fetch all details.\n"; |
|---|
| 236 | &write_cache; # try to write to it - failure will cause an error & barf |
|---|
| 237 | } |
|---|
| 238 | |
|---|
| 239 | # |
|---|
| 240 | # age our caches on startup |
|---|
| 241 | # |
|---|
| 242 | my $max_age; |
|---|
| 243 | |
|---|
| 244 | # age our programme_id cache on startup |
|---|
| 245 | my $prog_id = $data_cache->{movie_id_lookup}; |
|---|
| 246 | $max_age = time - ($opt->{cache_title_for} * 86400); |
|---|
| 247 | foreach my $key (keys %{$prog_id}) { |
|---|
| 248 | if ($data_cache->{movie_id_lookup}->{$key}->{last_fetched} < $max_age) { |
|---|
| 249 | delete $data_cache->{movie_id_lookup}->{$key}; |
|---|
| 250 | $stats{removed_programme_id_from_cache}++ |
|---|
| 251 | } |
|---|
| 252 | } |
|---|
| 253 | |
|---|
| 254 | # age our programme cache on startup |
|---|
| 255 | my $prog = $data_cache->{movie_lookup}; |
|---|
| 256 | $max_age = time - ($opt->{cache_details_for} * 86400); |
|---|
| 257 | foreach my $key (keys %{$prog}) { |
|---|
| 258 | if ($data_cache->{movie_lookup}->{$key}->{last_fetched} < $max_age) { |
|---|
| 259 | delete $data_cache->{movie_lookup}->{$key}; |
|---|
| 260 | $stats{removed_programme_from_cache}++ |
|---|
| 261 | } |
|---|
| 262 | } |
|---|
| 263 | } |
|---|
| 264 | |
|---|
| 265 | ############################################################################## |
|---|
| 266 | # write out updated cache |
|---|
| 267 | |
|---|
| 268 | sub write_cache |
|---|
| 269 | { |
|---|
| 270 | my $store; |
|---|
| 271 | $store->{data_cache} = $data_cache; |
|---|
| 272 | $store->{settings_override} = $settings_override; |
|---|
| 273 | Storable::store($store, $opt->{cache_file}); |
|---|
| 274 | } |
|---|
| 275 | |
|---|
| 276 | ############################################################################## |
|---|
| 277 | # use the online IMDb "power search" at http://www.imdb/List to try to find _1_ match |
|---|
| 278 | |
|---|
| 279 | sub search_imdb_online |
|---|
| 280 | { |
|---|
| 281 | my ($title, $post_fields) = @_; |
|---|
| 282 | |
|---|
| 283 | if ($stats{failed_online_imdb_lookup} and $stats{failed_online_imdb_lookup} >= 3) { |
|---|
| 284 | Shepherd::Common::log("too many failed lookups to online IMDb search for '$title' with $post_fields"); |
|---|
| 285 | return; |
|---|
| 286 | } |
|---|
| 287 | |
|---|
| 288 | Shepherd::Common::log(" online IMDb search for '$title' with $post_fields"); |
|---|
| 289 | my $html_data = Shepherd::Common::get_url(url => "http://www.imdb.com/List", |
|---|
| 290 | method => "POST", postvars => $post_fields); |
|---|
| 291 | |
|---|
| 292 | if (!$html_data) { |
|---|
| 293 | $stats{failed_online_imdb_lookup}++; |
|---|
| 294 | Shepherd::Common::log("failed to search imdb movie data from http://www.imdb.com/List"); |
|---|
| 295 | return; |
|---|
| 296 | } |
|---|
| 297 | $data_cache->{movie_id_lookup}->{$post_fields}->{last_fetched} = time; |
|---|
| 298 | |
|---|
| 299 | my $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 300 | |
|---|
| 301 | # see if we can find any <a href="/title/tt[0-9]+/">{name}</a> tags |
|---|
| 302 | my %urls; |
|---|
| 303 | my $urls_found = 0; |
|---|
| 304 | while (my $token = $tp->get_tag("a")) { |
|---|
| 305 | my $url = $token->[1]{href}; # undefined when 200 results returned |
|---|
| 306 | if ($url && $url =~ /\/title\/tt[0-9]+\//) { |
|---|
| 307 | $urls{$url}++; # store as a hash since imdb sometimes gives dups |
|---|
| 308 | $urls_found++ if ($urls{$url} == 1); |
|---|
| 309 | } |
|---|
| 310 | } |
|---|
| 311 | |
|---|
| 312 | # only insert into cache if we match exactly _1_ movie |
|---|
| 313 | if ($urls_found == 1) { |
|---|
| 314 | my $this_url = (keys %urls)[0]; |
|---|
| 315 | if ($this_url =~ /^http:/) { |
|---|
| 316 | $data_cache->{movie_id_lookup}->{$post_fields}->{url} = $this_url; |
|---|
| 317 | } else { |
|---|
| 318 | $data_cache->{movie_id_lookup}->{$post_fields}->{url} = "http://www.imdb.com".$this_url; |
|---|
| 319 | } |
|---|
| 320 | $stats{imdb_lookup_added_positive_cache_entry}++; |
|---|
| 321 | } else { |
|---|
| 322 | Shepherd::Common::log(" online search failed: wanted 1 match, got $urls_found matches."); |
|---|
| 323 | # negatively cache our failed lookup |
|---|
| 324 | $data_cache->{movie_id_lookup}->{$post_fields}->{url} = "-"; |
|---|
| 325 | $data_cache->{movie_id_lookup}->{$post_fields}->{num_choices} = |
|---|
| 326 | $urls_found; |
|---|
| 327 | } |
|---|
| 328 | } |
|---|
| 329 | |
|---|
| 330 | ############################################################################## |
|---|
| 331 | # simple search |
|---|
| 332 | # |
|---|
| 333 | # The 'power search' seems to have a couple of annoying flaws, like |
|---|
| 334 | # an inability to find AKA titles. So if our power search fails, we |
|---|
| 335 | # fall back to the simple search. This seems to work very well, finding |
|---|
| 336 | # many shows that the power search misses. |
|---|
| 337 | |
|---|
| 338 | sub simple_search_imdb |
|---|
| 339 | { |
|---|
| 340 | my ($title, $year) = @_; |
|---|
| 341 | |
|---|
| 342 | if ($stats{failed_online_imdb_lookup} and $stats{failed_online_imdb_lookup} >= 3) { |
|---|
| 343 | Shepherd::Common::log("too many failed lookups to online IMDb search for '$title' with $year"); |
|---|
| 344 | return; |
|---|
| 345 | } |
|---|
| 346 | |
|---|
| 347 | &Shepherd::Common::log(" trying simple search for '$title' ($year)"); |
|---|
| 348 | my $url = sprintf "http://us.imdb.com/find?q=%s;s=tt", &Shepherd::Common::urlify($title); |
|---|
| 349 | my @response = &Shepherd::Common::get_url($url); |
|---|
| 350 | my $html = $response[0]; |
|---|
| 351 | |
|---|
| 352 | unless ($html) |
|---|
| 353 | { |
|---|
| 354 | $stats{failed_online_imdb_lookup}++; |
|---|
| 355 | &Shepherd::Common::log("simple search failed"); |
|---|
| 356 | return 0; |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | my $link; |
|---|
| 360 | |
|---|
| 361 | # Check if we were sent directly to the show's main page. |
|---|
| 362 | my $r = $response[6]; |
|---|
| 363 | if ($r and $r->base and $r->base =~ m"/title/tt[0-9]+/") |
|---|
| 364 | { |
|---|
| 365 | $link = $r->base; |
|---|
| 366 | } |
|---|
| 367 | else |
|---|
| 368 | { |
|---|
| 369 | my $tree = HTML::TreeBuilder->new_from_content($html); |
|---|
| 370 | |
|---|
| 371 | HTMLPARSE: foreach my $block ($tree->look_down('_tag' => 'p')) |
|---|
| 372 | { |
|---|
| 373 | my $tag = $block->look_down('_tag' => 'b'); |
|---|
| 374 | next unless ($tag); |
|---|
| 375 | if ($tag->as_text eq 'Titles (Exact Matches)' or $tag->as_text eq 'Popular Titles') |
|---|
| 376 | { |
|---|
| 377 | foreach my $line (split(/<br>/, $block->as_HTML)) |
|---|
| 378 | { |
|---|
| 379 | if ($line =~ /^<a href="(\/title\/tt[0-9]+\/?)">(.*?)<\/a> \((\d{4})\)(.*)/ |
|---|
| 380 | and |
|---|
| 381 | (!$year or $year == $3) |
|---|
| 382 | and |
|---|
| 383 | (!$4 or $4 !~ /VG/)) |
|---|
| 384 | { |
|---|
| 385 | if ($link) |
|---|
| 386 | { |
|---|
| 387 | # found multiple hits. That's bad. |
|---|
| 388 | $link = undef; |
|---|
| 389 | &Shepherd::Common::log(" found multiple hits, wanted 0.") if ($opt->{debug}); |
|---|
| 390 | last HTMLPARSE; |
|---|
| 391 | } |
|---|
| 392 | # Bingo! Found a link |
|---|
| 393 | &Shepherd::Common::log(" found link: $1") if ($opt->{debug}); |
|---|
| 394 | $link = "http://www.imdb.com" . $1; |
|---|
| 395 | } |
|---|
| 396 | } |
|---|
| 397 | } |
|---|
| 398 | } |
|---|
| 399 | } |
|---|
| 400 | my $cache_name = 'simple-'.&Shepherd::Common::urlify($title).'-'.$year; |
|---|
| 401 | |
|---|
| 402 | $data_cache->{movie_id_lookup}->{$cache_name}->{last_fetched} = time; |
|---|
| 403 | |
|---|
| 404 | unless ($link) |
|---|
| 405 | { |
|---|
| 406 | &Shepherd::Common::log(" simple search failed."); |
|---|
| 407 | # negatively cache our failed lookup |
|---|
| 408 | $data_cache->{movie_id_lookup}->{$cache_name}->{url} = "-"; |
|---|
| 409 | return 0; |
|---|
| 410 | } |
|---|
| 411 | $data_cache->{movie_id_lookup}->{$cache_name}->{url} = $link; |
|---|
| 412 | $stats{imdb_lookup_added_positive_cache_entry}++; |
|---|
| 413 | return 1; |
|---|
| 414 | } |
|---|
| 415 | |
|---|
| 416 | ############################################################################## |
|---|
| 417 | # simple parser for imdb returned data: covers most data |
|---|
| 418 | |
|---|
| 419 | sub imdb_scalar_parser |
|---|
| 420 | { |
|---|
| 421 | my ($html_data, $start_tag, $target, $texttype, $targetb, $targetc, $targeta, $v) = @_; |
|---|
| 422 | |
|---|
| 423 | my $found = 0; |
|---|
| 424 | my $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 425 | |
|---|
| 426 | while (my $tag = $tp->get_tag($start_tag)) { |
|---|
| 427 | if ($tp->get_text =~ /^$target/i) { |
|---|
| 428 | $found = 1; |
|---|
| 429 | last; |
|---|
| 430 | } |
|---|
| 431 | } |
|---|
| 432 | if (!$found) { |
|---|
| 433 | Shepherd::Common::log(" - no ".$target." found") if ($opt->{debug}); |
|---|
| 434 | return; |
|---|
| 435 | } |
|---|
| 436 | |
|---|
| 437 | my $tag = $tp->get_tag($targeta) if (defined $targeta); |
|---|
| 438 | if ($texttype eq "trimmed") { |
|---|
| 439 | $$v = $tp->get_trimmed_text($targetb,$targetc); |
|---|
| 440 | } else { |
|---|
| 441 | $$v = $tp->get_text($targetb,$targetc); |
|---|
| 442 | $$v =~ s/^\s*(.*?)\s*$/$1/; |
|---|
| 443 | } |
|---|
| 444 | |
|---|
| 445 | Shepherd::Common::log(" - parsed ".$target." ".$$v) if ($opt->{debug}); |
|---|
| 446 | } |
|---|
| 447 | |
|---|
| 448 | ############################################################################## |
|---|
| 449 | |
|---|
| 450 | sub imdb_list_parser |
|---|
| 451 | { |
|---|
| 452 | my ($html_data, $target, $target2, $v) = @_; |
|---|
| 453 | |
|---|
| 454 | my $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 455 | my @list; |
|---|
| 456 | my $found = 0; |
|---|
| 457 | |
|---|
| 458 | while (my $tag = $tp->get_tag('h5')) { |
|---|
| 459 | if ($tp->get_text =~ /^$target/i) { |
|---|
| 460 | $found = 1; |
|---|
| 461 | last; |
|---|
| 462 | } |
|---|
| 463 | } |
|---|
| 464 | return undef if (!$found); |
|---|
| 465 | |
|---|
| 466 | while (my $tag = $tp->get_tag()) { |
|---|
| 467 | push (@list, $tp->get_text()) |
|---|
| 468 | if (($tag->[0] eq 'a') && (defined $tag->[1]{href}) && ($tag->[1]{href} =~ /$target2/i)); |
|---|
| 469 | last if ($tag->[0] eq 'br'); |
|---|
| 470 | } |
|---|
| 471 | |
|---|
| 472 | my $found_items = 0; |
|---|
| 473 | foreach my $item (@list) { |
|---|
| 474 | $$v->[$found_items] = $item; |
|---|
| 475 | Shepherd::Common::log(" - parsed ".$target." ".$item) if ($opt->{debug}); |
|---|
| 476 | $found_items++; |
|---|
| 477 | } |
|---|
| 478 | } |
|---|
| 479 | |
|---|
| 480 | ############################################################################## |
|---|
| 481 | # perform a detailed movie lookup given a movie url |
|---|
| 482 | # store what we find in our data cache |
|---|
| 483 | |
|---|
| 484 | sub get_imdb_movie_online |
|---|
| 485 | { |
|---|
| 486 | my ($movie_title, $movie_url) = @_; |
|---|
| 487 | |
|---|
| 488 | if ($stats{failed_online_imdb_lookup} and $stats{failed_online_imdb_lookup} >= 3) { |
|---|
| 489 | Shepherd::Common::log("too many failed lookups to downloading online IMDb movie data for '$movie_title'"); |
|---|
| 490 | return; |
|---|
| 491 | } |
|---|
| 492 | |
|---|
| 493 | Shepherd::Common::log(" downloading online IMDb movie data for '$movie_title'"); |
|---|
| 494 | my $html_data = Shepherd::Common::get_url($movie_url); |
|---|
| 495 | |
|---|
| 496 | if (!$html_data) { |
|---|
| 497 | $stats{failed_online_imdb_lookup}++; |
|---|
| 498 | Shepherd::Common::log("failed to fetch imdb movie data from $movie_url"); |
|---|
| 499 | return; |
|---|
| 500 | } |
|---|
| 501 | |
|---|
| 502 | $stats{imdb_movie_added_cache_entry}++; |
|---|
| 503 | $data_cache->{movie_lookup}->{$movie_url}->{last_fetched} = time; |
|---|
| 504 | my $tp; |
|---|
| 505 | |
|---|
| 506 | # |
|---|
| 507 | # parse title and year |
|---|
| 508 | # |
|---|
| 509 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 510 | my $title_token = $tp->get_tag('title'); |
|---|
| 511 | my $title_text = $tp->get_text(); |
|---|
| 512 | if ($title_text =~ /(.*?)\s+\((\d{4})?.*?\)/) { |
|---|
| 513 | $data_cache->{movie_lookup}->{$movie_url}->{title} = $1; |
|---|
| 514 | $data_cache->{movie_lookup}->{$movie_url}->{year} = $2; |
|---|
| 515 | } |
|---|
| 516 | |
|---|
| 517 | if (!defined $data_cache->{movie_lookup}->{$movie_url}->{title}) { |
|---|
| 518 | $stats{failed_online_imdb_title_parsing}++; |
|---|
| 519 | Shepherd::Common::log("failed to parse title within imdb movie data from $movie_url"); |
|---|
| 520 | return; |
|---|
| 521 | } |
|---|
| 522 | |
|---|
| 523 | # |
|---|
| 524 | # parse cover url |
|---|
| 525 | # |
|---|
| 526 | my $title = $data_cache->{movie_lookup}->{$movie_url}->{title}; |
|---|
| 527 | $title =~ s/([\{\}\[\]\(\)\^\$\.\|\*\+\?\\])/\\$1/g; |
|---|
| 528 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 529 | while (my $img_tag = $tp->get_tag('img')) { |
|---|
| 530 | next if (!$img_tag->[1]{alt}); |
|---|
| 531 | last if ($img_tag->[1]{alt} =~ /^poster not submitted/i); |
|---|
| 532 | if ($img_tag->[1]{alt} =~ /^$title$/i) { |
|---|
| 533 | $data_cache->{movie_lookup}->{$movie_url}->{cover} = $img_tag->[1]{src}; |
|---|
| 534 | Shepherd::Common::log(" - parsed cover_url ".$img_tag->[1]{src}) if ($opt->{debug}); |
|---|
| 535 | last; |
|---|
| 536 | } |
|---|
| 537 | } |
|---|
| 538 | |
|---|
| 539 | # |
|---|
| 540 | # parse directors |
|---|
| 541 | # |
|---|
| 542 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 543 | while (my $tag = $tp->get_tag('h5')) { |
|---|
| 544 | last if ($tp->get_text =~ /^(directed|director)/i); |
|---|
| 545 | } |
|---|
| 546 | while (my $tag = $tp->get_tag) { |
|---|
| 547 | my $text = $tp->get_text(); |
|---|
| 548 | last if (($text =~ /^(writing|writer)/i) || ($tag->[0] =~ /\/td/i)); |
|---|
| 549 | if ($tag->[0] eq 'a') { |
|---|
| 550 | my $id = $tag->[1]{href}; |
|---|
| 551 | next if (!$id or $id !~ /^\/name\/nm/); |
|---|
| 552 | push(@{$data_cache->{movie_lookup}->{$movie_url}->{directors}}, [$text, $id]); |
|---|
| 553 | Shepherd::Common::log(" - parsed director ".$text." (".$id.")") if ($opt->{debug}); |
|---|
| 554 | } |
|---|
| 555 | } |
|---|
| 556 | |
|---|
| 557 | # |
|---|
| 558 | # parse writers |
|---|
| 559 | # |
|---|
| 560 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 561 | while (my $tag = $tp->get_tag('h5')) { |
|---|
| 562 | last if ($tp->get_text =~ /^(writing|writer)/i); |
|---|
| 563 | } |
|---|
| 564 | while (my $tag = $tp->get_tag) { |
|---|
| 565 | my $text = $tp->get_text(); |
|---|
| 566 | last if ($tag->[0] =~ /div/i); |
|---|
| 567 | if (($tag->[0] eq 'a') && ($text !~ /\bmore\b/i)) { |
|---|
| 568 | my $id = $tag->[1]{href}; |
|---|
| 569 | next if (!$id or $id !~ /^\/name\/nm/); |
|---|
| 570 | push(@{$data_cache->{movie_lookup}->{$movie_url}->{writers}}, [$text, $id]); |
|---|
| 571 | Shepherd::Common::log(" - parsed writer ".$text." (".$id.")") if ($opt->{debug}); |
|---|
| 572 | } |
|---|
| 573 | } |
|---|
| 574 | |
|---|
| 575 | # |
|---|
| 576 | # parse cast |
|---|
| 577 | # |
|---|
| 578 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 579 | while (my $tag = $tp->get_tag('table')) { |
|---|
| 580 | next unless ((exists $tag->[1]{class}) && ($tag->[1]{class} =~ /cast/i)); |
|---|
| 581 | |
|---|
| 582 | my $person, my $role; |
|---|
| 583 | while ($tag = $tp->get_tag) { |
|---|
| 584 | last if ($tag->[0] =~ /^\/table/i); |
|---|
| 585 | if (($tag->[0] =~ /^td/i) && (defined $tag->[1]{class}) && ($tag->[1]{class} =~ /nm/)) { |
|---|
| 586 | $tag = $tp->get_tag; |
|---|
| 587 | $person = $tp->get_text(); |
|---|
| 588 | next; |
|---|
| 589 | } |
|---|
| 590 | if (($tag->[0] =~ /^td/i) && (defined $tag->[1]{class}) && ($tag->[1]{class} =~ /char/)) { |
|---|
| 591 | $role = $tp->get_text(); |
|---|
| 592 | |
|---|
| 593 | if ((defined $person) && (defined $role)) { |
|---|
| 594 | push(@{$data_cache->{movie_lookup}->{$movie_url}->{cast}}, [$person, $role]); |
|---|
| 595 | Shepherd::Common::log(" - parsed cast (".$role.") ".$person) if ($opt->{debug}); |
|---|
| 596 | } |
|---|
| 597 | |
|---|
| 598 | $person = undef, $role = undef; |
|---|
| 599 | } |
|---|
| 600 | } |
|---|
| 601 | |
|---|
| 602 | last; |
|---|
| 603 | } |
|---|
| 604 | |
|---|
| 605 | # |
|---|
| 606 | # parse countries, languages, genres using generic list parser |
|---|
| 607 | # |
|---|
| 608 | &imdb_list_parser($html_data,"country","countries", |
|---|
| 609 | \$data_cache->{movie_lookup}->{$movie_url}->{countries}); |
|---|
| 610 | &imdb_list_parser($html_data,"language","language", |
|---|
| 611 | \$data_cache->{movie_lookup}->{$movie_url}->{languages}); |
|---|
| 612 | &imdb_list_parser($html_data,"genre","genre", |
|---|
| 613 | \$data_cache->{movie_lookup}->{$movie_url}->{genres}); |
|---|
| 614 | |
|---|
| 615 | # |
|---|
| 616 | # parse tagline, plot, rating, runtime, aka, trivia, goofs, |
|---|
| 617 | # awards and summary using generic scalar handler |
|---|
| 618 | # |
|---|
| 619 | &imdb_scalar_parser($html_data, "h5", "tagline","trimmed","h5", "a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{tagline}); |
|---|
| 620 | &imdb_scalar_parser($html_data, "h5", "plot","trimmed","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{plot}); |
|---|
| 621 | &imdb_scalar_parser($html_data, "b", "user rating","trimmed","b","/b","b", \$data_cache->{movie_lookup}->{$movie_url}->{rating}); |
|---|
| 622 | &imdb_scalar_parser($html_data, "h5", "runtime","trimmed","h5","br", undef, \$data_cache->{movie_lookup}->{$movie_url}->{runtime}); |
|---|
| 623 | &imdb_scalar_parser($html_data, "h5", "(aka|also known as)","trimmed","b","/div", undef, \$data_cache->{movie_lookup}->{$movie_url}->{aka}); |
|---|
| 624 | &imdb_scalar_parser($html_data, "h5", "trivia","trimmed","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{trivia}); |
|---|
| 625 | &imdb_scalar_parser($html_data, "h5", "goofs","trimmed","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{goofs}); |
|---|
| 626 | &imdb_scalar_parser($html_data, "h5", "awards","trimmed","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{awards}); |
|---|
| 627 | &imdb_scalar_parser($html_data, "h5", "original air date","trimmed","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{airdate}); |
|---|
| 628 | &imdb_scalar_parser($html_data, "h5", "plot summary","","h5","a", undef, \$data_cache->{movie_lookup}->{$movie_url}->{summary}); |
|---|
| 629 | |
|---|
| 630 | if (defined $data_cache->{movie_lookup}->{$movie_url}->{rating}) { |
|---|
| 631 | # don't use a rating unless its numeric |
|---|
| 632 | if ($data_cache->{movie_lookup}->{$movie_url}->{rating} !~ /^\d+/) { |
|---|
| 633 | delete $data_cache->{movie_lookup}->{$movie_url}->{rating}; |
|---|
| 634 | } |
|---|
| 635 | } |
|---|
| 636 | |
|---|
| 637 | # |
|---|
| 638 | # certifications |
|---|
| 639 | # |
|---|
| 640 | $tp = HTML::TokeParser->new(\$html_data); |
|---|
| 641 | while (my $tag = $tp->get_tag('h5')) { |
|---|
| 642 | last if ($tp->get_text =~ /^certification/i); |
|---|
| 643 | } |
|---|
| 644 | while (my $tag = $tp->get_tag()) { |
|---|
| 645 | if (($tag->[0] eq "a") && (defined $tag->[1]{href}) && ($tag->[1]{href} =~ /certificates/i)) { |
|---|
| 646 | my($country, $range) = split(/:/, $tp->get_text); |
|---|
| 647 | $data_cache->{movie_lookup}->{$movie_url}->{certifications}->{$country} = $range; |
|---|
| 648 | Shepherd::Common::log(" - parsed certification ".$range." (".$country.")") if ($opt->{debug}); |
|---|
| 649 | } |
|---|
| 650 | last if ($tag->[0] =~ /\/td/i); |
|---|
| 651 | } |
|---|
| 652 | |
|---|
| 653 | # don't yet pick the following up: do we need to? |
|---|
| 654 | # official_sites |
|---|
| 655 | # full plot |
|---|
| 656 | |
|---|
| 657 | |
|---|
| 658 | # write out the cache every 15 progs or so |
|---|
| 659 | &write_cache if ((($stats{imdb_movie_added_cache_entry} % 15) == 0) && (!defined $opt->{no_cache})); |
|---|
| 660 | } |
|---|
| 661 | |
|---|
| 662 | ############################################################################## |
|---|
| 663 | |
|---|
| 664 | sub encoding_cb( $ ) |
|---|
| 665 | { |
|---|
| 666 | my $e = shift; |
|---|
| 667 | # printf "got encoding ".Dumper($e); |
|---|
| 668 | } |
|---|
| 669 | |
|---|
| 670 | ############################################################################## |
|---|
| 671 | |
|---|
| 672 | sub credits_cb( $ ) |
|---|
| 673 | { |
|---|
| 674 | my $c = shift; |
|---|
| 675 | # printf "got credits ".Dumper($c); |
|---|
| 676 | } |
|---|
| 677 | |
|---|
| 678 | ############################################################################## |
|---|
| 679 | |
|---|
| 680 | sub channel_cb( $ ) |
|---|
| 681 | { |
|---|
| 682 | my $c = shift; |
|---|
| 683 | # printf "got channel ".Dumper($c); |
|---|
| 684 | $writer->write_channel($c); |
|---|
| 685 | } |
|---|
| 686 | |
|---|
| 687 | ############################################################################## |
|---|
| 688 | |
|---|
| 689 | sub programme_cb( $ ) |
|---|
| 690 | { |
|---|
| 691 | my $prog=shift; |
|---|
| 692 | $stats{programmes}++; |
|---|
| 693 | |
|---|
| 694 | my $movie_title = $prog->{title}->[0]->[0] |
|---|
| 695 | if (($prog->{title}) && ($prog->{title}->[0]) && |
|---|
| 696 | ($prog->{title}->[0]->[0])); |
|---|
| 697 | my $movie_subtitle = $prog->{'sub-title'}->[0]->[0] |
|---|
| 698 | if (($prog->{'sub-title'}) && ($prog->{'sub-title'}->[0]) && |
|---|
| 699 | ($prog->{'sub-title'}->[0]->[0])); |
|---|
| 700 | my $movie_year = $prog->{date} if ($prog->{date}); |
|---|
| 701 | |
|---|
| 702 | if (($movie_title =~ /^close$/i) || ($movie_title =~ /^station close$/i)) { |
|---|
| 703 | $stats{skipped_due_to_category}++; |
|---|
| 704 | goto END; |
|---|
| 705 | } |
|---|
| 706 | |
|---|
| 707 | if (defined $prog->{category}) { |
|---|
| 708 | foreach my $prog_category (@{($prog->{category})}) { |
|---|
| 709 | foreach my $prog_cat2 (@$prog_category) { |
|---|
| 710 | foreach my $skip_category (split(/,/,$opt->{skip_categories})) { |
|---|
| 711 | if (lc($prog_cat2) eq lc($skip_category)) { |
|---|
| 712 | $stats{skipped_due_to_category}++; |
|---|
| 713 | goto END; |
|---|
| 714 | } |
|---|
| 715 | } |
|---|
| 716 | } |
|---|
| 717 | } |
|---|
| 718 | } |
|---|
| 719 | |
|---|
| 720 | # print "got programme ".Dumper($prog); |
|---|
| 721 | |
|---|
| 722 | # |
|---|
| 723 | # only lookup if min_duration < prog_duration > min_duration |
|---|
| 724 | # |
|---|
| 725 | my $t1 = Shepherd::Common::parse_xmltv_date($prog->{start}); |
|---|
| 726 | my $t2 = Shepherd::Common::parse_xmltv_date($prog->{stop}); |
|---|
| 727 | if ((!$t1) || (!$t2)) { |
|---|
| 728 | $stats{excluded_couldnt_parse_time}++; |
|---|
| 729 | goto END; |
|---|
| 730 | } |
|---|
| 731 | my $prog_duration = (($t2 - $t1) / 60); |
|---|
| 732 | if ($prog_duration < $opt->{min_duration}) { |
|---|
| 733 | $stats{excluded_prog_too_short}++; |
|---|
| 734 | goto END; |
|---|
| 735 | } |
|---|
| 736 | if ($prog_duration > $opt->{max_duration}) { |
|---|
| 737 | $stats{excluded_prog_too_long}++; |
|---|
| 738 | goto END; |
|---|
| 739 | } |
|---|
| 740 | |
|---|
| 741 | $stats{included_for_imdb_lookup}++; |
|---|
| 742 | |
|---|
| 743 | # |
|---|
| 744 | # find movie url |
|---|
| 745 | # (either via a cached previous search or via IMDb "power search") |
|---|
| 746 | # |
|---|
| 747 | my @search_fields; |
|---|
| 748 | push(@search_fields, "words=".Shepherd::Common::urlify($movie_title)); |
|---|
| 749 | push(@search_fields, "&countries=".Shepherd::Common::urlify($prog->{country}->[0][0])) |
|---|
| 750 | if ((defined $prog->{country}) && (defined $prog->{country}->[0][0])); |
|---|
| 751 | push(@search_fields, "&year=".Shepherd::Common::urlify($prog->{date})) |
|---|
| 752 | if ((defined $prog->{date}) && ($prog->{date} > 0)); |
|---|
| 753 | push(@search_fields, "&language=".Shepherd::Common::urlify($prog->{language}->[0] =~ /(^[^,]*)/)) |
|---|
| 754 | if ((defined $prog->{language}) && (defined $prog->{language}->[0])); |
|---|
| 755 | # &exact=y |
|---|
| 756 | # cast/crew |
|---|
| 757 | |
|---|
| 758 | Shepherd::Common::log("programme ".$stats{programmes}.": \"$movie_title\" ($prog_duration minutes)"); |
|---|
| 759 | |
|---|
| 760 | # first search using everything we can... |
|---|
| 761 | my $orig_post_fields = join("",@search_fields); |
|---|
| 762 | my $post_fields; |
|---|
| 763 | my $found = 0; |
|---|
| 764 | while ((scalar(@search_fields) > 0) && (!$found)) { |
|---|
| 765 | if ($movie_subtitle) { |
|---|
| 766 | $post_fields = "words=".Shepherd::Common::urlify($movie_title." ".$movie_subtitle); |
|---|
| 767 | $orig_post_fields = $post_fields; |
|---|
| 768 | push(@search_fields, $post_fields); |
|---|
| 769 | } else { |
|---|
| 770 | $post_fields = join("",@search_fields); |
|---|
| 771 | } |
|---|
| 772 | |
|---|
| 773 | # |
|---|
| 774 | # first check if we have a previos cache entry for this |
|---|
| 775 | # |
|---|
| 776 | if (defined $data_cache->{movie_id_lookup}->{$post_fields} and |
|---|
| 777 | defined $data_cache->{movie_id_lookup}->{$post_fields}->{url}) { |
|---|
| 778 | if ($data_cache->{movie_id_lookup}->{$post_fields}->{url} eq "-") { |
|---|
| 779 | # negatively cached |
|---|
| 780 | $stats{imdb_lookup_used_negative_cache_entry}++; |
|---|
| 781 | undef $movie_subtitle; |
|---|
| 782 | pop(@search_fields); |
|---|
| 783 | } else { |
|---|
| 784 | # positive cache |
|---|
| 785 | $stats{imdb_lookup_used_cache_entry}++; |
|---|
| 786 | $found = 1; |
|---|
| 787 | } |
|---|
| 788 | Shepherd::Common::log(" used (".($found ? "positive" : "negative")." cache) search: $post_fields"); |
|---|
| 789 | } else { |
|---|
| 790 | # |
|---|
| 791 | # no cache, go look it up |
|---|
| 792 | # |
|---|
| 793 | &search_imdb_online($movie_title,$post_fields); |
|---|
| 794 | |
|---|
| 795 | # IMDb web site failed/returned unexpected result, just skip rest |
|---|
| 796 | goto END |
|---|
| 797 | if (!defined $data_cache->{movie_id_lookup}->{$post_fields} or |
|---|
| 798 | !defined $data_cache->{movie_id_lookup}->{$post_fields}->{url}); |
|---|
| 799 | |
|---|
| 800 | if ($data_cache->{movie_id_lookup}->{$post_fields}->{url} eq "-") { |
|---|
| 801 | # lookup failed. |
|---|
| 802 | |
|---|
| 803 | # if it returned no hits, perhaps our search was too specific. |
|---|
| 804 | # make it less specific if we can |
|---|
| 805 | if ($data_cache->{movie_id_lookup}->{$post_fields}->{num_choices} == 0) { |
|---|
| 806 | undef $movie_subtitle; |
|---|
| 807 | pop(@search_fields); |
|---|
| 808 | } else { |
|---|
| 809 | # search wasn't specific enough. (more than 1 hit) |
|---|
| 810 | |
|---|
| 811 | if ($movie_subtitle) { |
|---|
| 812 | # try again if subtitle in search |
|---|
| 813 | undef $movie_subtitle; |
|---|
| 814 | pop(@search_fields); |
|---|
| 815 | } else { |
|---|
| 816 | # skip ahead to simple search |
|---|
| 817 | # fill in less detailed searches or we fetch them next time |
|---|
| 818 | pop(@search_fields); |
|---|
| 819 | while (scalar(@search_fields) > 0) { |
|---|
| 820 | $post_fields = join("",@search_fields); |
|---|
| 821 | $data_cache->{movie_id_lookup}->{$post_fields}->{last_fetched} = time; |
|---|
| 822 | $data_cache->{movie_id_lookup}->{$post_fields}->{url} = '-'; |
|---|
| 823 | $data_cache->{movie_id_lookup}->{$post_fields}->{num_choices} = 1000; |
|---|
| 824 | pop(@search_fields); |
|---|
| 825 | } |
|---|
| 826 | } |
|---|
| 827 | } |
|---|
| 828 | } else { |
|---|
| 829 | # lookup succeeded |
|---|
| 830 | $found = 1; |
|---|
| 831 | } |
|---|
| 832 | } |
|---|
| 833 | } |
|---|
| 834 | |
|---|
| 835 | # Try the simple search if we have date info |
|---|
| 836 | if (!$found and $prog->{date}) { |
|---|
| 837 | # cached? |
|---|
| 838 | $post_fields = 'simple-'. &Shepherd::Common::urlify($movie_title) . '-'.$prog->{date}; |
|---|
| 839 | my $simple_search = $data_cache->{movie_id_lookup}->{$post_fields}; |
|---|
| 840 | if ($simple_search and $simple_search->{url}) { |
|---|
| 841 | if ($simple_search->{url} eq '-') { |
|---|
| 842 | # negatively cached |
|---|
| 843 | $stats{imdb_lookup_used_negative_cache_entry}++; |
|---|
| 844 | } else { |
|---|
| 845 | # positively cached |
|---|
| 846 | $stats{imdb_lookup_used_cache_entry}++; |
|---|
| 847 | $found = 1; |
|---|
| 848 | } |
|---|
| 849 | &Shepherd::Common::log(sprintf(" used (%s cache) search: ", ($found ? 'positive' : 'negative')) . $post_fields); |
|---|
| 850 | } else { |
|---|
| 851 | # not cached; look it up |
|---|
| 852 | $found = &simple_search_imdb($movie_title, $prog->{date}); |
|---|
| 853 | } |
|---|
| 854 | } else { |
|---|
| 855 | &Shepherd::Common::log(" no date info, not trying simple search") if ($opt->{debug}); |
|---|
| 856 | } |
|---|
| 857 | |
|---|
| 858 | goto END if (!$found); |
|---|
| 859 | |
|---|
| 860 | $data_cache->{movie_id_lookup}->{$orig_post_fields}->{url} = $data_cache->{movie_id_lookup}->{$post_fields}->{url} |
|---|
| 861 | if ($post_fields ne $orig_post_fields); |
|---|
| 862 | |
|---|
| 863 | my $movie_url = $data_cache->{movie_id_lookup}->{$post_fields}->{url}; |
|---|
| 864 | |
|---|
| 865 | # no match or negative cache match - bail out |
|---|
| 866 | goto END if ((!defined $movie_url) || ($movie_url eq "-")); |
|---|
| 867 | |
|---|
| 868 | # |
|---|
| 869 | # lookup movie details |
|---|
| 870 | # (either via previously cached entry or via an online IMDb lookup) |
|---|
| 871 | # |
|---|
| 872 | |
|---|
| 873 | if (defined $data_cache->{movie_lookup}->{$movie_url}) { |
|---|
| 874 | $stats{imdb_movie_used_cache_entry}++; |
|---|
| 875 | Shepherd::Common::log(" used existing (cached) movie details: $movie_url"); |
|---|
| 876 | } else { |
|---|
| 877 | &get_imdb_movie_online($movie_title,$movie_url); |
|---|
| 878 | goto END if (!defined $data_cache->{movie_lookup}->{$movie_url}); |
|---|
| 879 | $stats{imdb_movie_added_cache_entry}++; |
|---|
| 880 | } |
|---|
| 881 | |
|---|
| 882 | |
|---|
| 883 | # |
|---|
| 884 | # augment data |
|---|
| 885 | # |
|---|
| 886 | |
|---|
| 887 | $data_cache->{movie_lookup}->{$movie_url}->{last_lookup} = time; |
|---|
| 888 | $data_cache->{movie_lookup}->{$movie_url}->{num_lookups}++; |
|---|
| 889 | goto END if (!defined $data_cache->{movie_lookup}->{$movie_url}->{title}); |
|---|
| 890 | my $imdb = $data_cache->{movie_lookup}->{$movie_url}; |
|---|
| 891 | $stats{added_imdb_data}++; |
|---|
| 892 | |
|---|
| 893 | if ($opt->{dont_augment_desc}) { |
|---|
| 894 | |
|---|
| 895 | # if not a reasonable description, add anything we've got |
|---|
| 896 | if (!defined $prog->{desc} || !defined $prog->{desc}->[0] || !defined $prog->{desc}->[0]->[0] || |
|---|
| 897 | (length ($prog->{desc}->[0]->[0]) <= 20)) { |
|---|
| 898 | my $imdb_desc = ''; |
|---|
| 899 | |
|---|
| 900 | $imdb_desc .= $imdb->{tagline}."\n" if ($imdb->{tagline}); |
|---|
| 901 | $imdb_desc .= $imdb->{summary}."\n" if ($imdb->{summary}); |
|---|
| 902 | $imdb_desc .= $imdb->{plot} if ($imdb->{plot}); |
|---|
| 903 | |
|---|
| 904 | if ($imdb_desc ne '') { |
|---|
| 905 | chomp $imdb_desc; |
|---|
| 906 | $prog->{desc}->[0]->[0] = "" if (!defined $prog->{desc}->[0]->[0]); |
|---|
| 907 | $prog->{desc}->[0]->[0] .= "\n" if ($prog->{desc}->[0]->[0] ne ""); |
|---|
| 908 | $prog->{desc}->[0]->[0] .= $imdb_desc; |
|---|
| 909 | } |
|---|
| 910 | } |
|---|
| 911 | } else { |
|---|
| 912 | my $imdb_desc = ''; |
|---|
| 913 | |
|---|
| 914 | # Unless we request --long-info, only add |
|---|
| 915 | # Tagline, Awards, and Trivia to the description. |
|---|
| 916 | # (Other data goes in appropriate fields.) |
|---|
| 917 | |
|---|
| 918 | if ($opt->{long_info}) { |
|---|
| 919 | $imdb_desc .= sprintf " Title: %s",$imdb->{title}; |
|---|
| 920 | $imdb_desc .= sprintf " (%s)",$imdb->{year} |
|---|
| 921 | if (($imdb->{year}) && ($imdb->{year} > 0)); |
|---|
| 922 | $imdb_desc .= "\n"; |
|---|
| 923 | |
|---|
| 924 | $imdb_desc .= sprintf " aka: %s\n",$imdb->{aka} |
|---|
| 925 | if (defined $imdb->{aka} && $imdb->{aka} ne ""); |
|---|
| 926 | } |
|---|
| 927 | |
|---|
| 928 | $imdb_desc .= sprintf " Tagline: %s\n",$imdb->{tagline} |
|---|
| 929 | if (defined $imdb->{tagline} && $imdb->{tagline} ne ""); |
|---|
| 930 | |
|---|
| 931 | if ($opt->{long_info} || !defined $prog->{desc} || !defined $prog->{desc}->[0] || |
|---|
| 932 | !defined $prog->{desc}->[0]->[0] || (length ($prog->{desc}->[0]->[0]) <= 20)) { |
|---|
| 933 | $imdb_desc .= sprintf " Summary: %s\n",$imdb->{summary} |
|---|
| 934 | if (defined $imdb->{summary} && $imdb->{summary} ne ""); |
|---|
| 935 | |
|---|
| 936 | $imdb_desc .= sprintf " Plot: %s\n",$imdb->{plot} |
|---|
| 937 | if (defined $imdb->{plot} && $imdb->{plot} ne ""); |
|---|
| 938 | } |
|---|
| 939 | |
|---|
| 940 | if ($opt->{long_info}) { |
|---|
| 941 | $imdb_desc .= sprintf " Rating: %s\n",$imdb->{rating} |
|---|
| 942 | if (defined $imdb->{rating} && $imdb->{rating} =~ /^\d+/); |
|---|
| 943 | } |
|---|
| 944 | |
|---|
| 945 | $imdb_desc .= sprintf " Original Air Date: %s\n",$imdb->{airdate} |
|---|
| 946 | if (defined $imdb->{airdate} && $imdb->{airdate} ne ""); |
|---|
| 947 | |
|---|
| 948 | $imdb_desc .= sprintf " Awards: %s\n",$imdb->{awards} |
|---|
| 949 | if (defined $imdb->{awards} && $imdb->{awards} ne ""); |
|---|
| 950 | |
|---|
| 951 | $imdb_desc .= sprintf " Trivia: %s\n",$imdb->{trivia} |
|---|
| 952 | if (defined $imdb->{trivia} && $imdb->{trivia} ne ""); |
|---|
| 953 | |
|---|
| 954 | if ($opt->{long_info}) { |
|---|
| 955 | $imdb_desc .= sprintf " Certifications: %s\n", join(', ', |
|---|
| 956 | map { sprintf "%s (%s)", $imdb->{certifications}->{$_}, $_ } sort keys %{$imdb->{certifications}}) |
|---|
| 957 | if (defined $imdb->{certifications}); |
|---|
| 958 | |
|---|
| 959 | $imdb_desc .= sprintf " Cast: %s\n", join(', ', |
|---|
| 960 | (ref $imdb->{cast} eq "ARRAY" ? |
|---|
| 961 | map { sprintf "%s%s", @$_[0], (@$_[1] ? " as @$_[1]" : '') } @{$imdb->{cast}} : |
|---|
| 962 | map { sprintf "%s%s", $_, ($imdb->{cast}->{$_} ? " as " . $imdb->{cast}->{$_} : '') } keys %{$imdb->{cast}})) |
|---|
| 963 | if (defined $imdb->{cast}); |
|---|
| 964 | |
|---|
| 965 | $imdb_desc .= sprintf " Directors: %s\n", |
|---|
| 966 | join(", ", ref($imdb->{directors}) eq "ARRAY" ? |
|---|
| 967 | map(@$_[0], @{$imdb->{directors}}) : sort keys %{$imdb->{directors}}) |
|---|
| 968 | if ($imdb->{directors}); |
|---|
| 969 | $imdb_desc .= sprintf " Writers: %s\n", |
|---|
| 970 | join(", ", ref($imdb->{writers}) eq "ARRAY" ? |
|---|
| 971 | map(@$_[0], @{$imdb->{writers}}) : sort keys %{$imdb->{writers}}) |
|---|
| 972 | if ($imdb->{writers}); |
|---|
| 973 | |
|---|
| 974 | $imdb_desc .= sprintf " Runtime: %s\n",$imdb->{runtime} |
|---|
| 975 | if (defined $imdb->{runtime} && $imdb->{runtime} ne ""); |
|---|
| 976 | $imdb_desc .= sprintf " Countries: %s\n", |
|---|
| 977 | join(", ",@{$imdb->{countries}}) |
|---|
| 978 | if (defined $imdb->{countries}); |
|---|
| 979 | $imdb_desc .= sprintf " Languages: %s\n", |
|---|
| 980 | join(", ",@{$imdb->{languages}}) |
|---|
| 981 | if (defined $imdb->{languages}); |
|---|
| 982 | $imdb_desc .= sprintf " Genres: %s\n", |
|---|
| 983 | join(", ",@{$imdb->{genres}}) |
|---|
| 984 | if (defined $imdb->{genres}); |
|---|
| 985 | |
|---|
| 986 | $imdb_desc .= sprintf " Goofs: %s\n",$imdb->{goofs} |
|---|
| 987 | if (defined $imdb->{goofs} && $imdb->{goofs} ne ""); |
|---|
| 988 | $imdb_desc .= sprintf " Cover: %s\n",$imdb->{cover} |
|---|
| 989 | if (defined $imdb->{cover} && $imdb->{cover} ne ""); |
|---|
| 990 | } |
|---|
| 991 | |
|---|
| 992 | if ($imdb_desc ne '') { |
|---|
| 993 | chomp $imdb_desc; |
|---|
| 994 | if ($opt->{long_info}) { |
|---|
| 995 | $imdb_desc = "IMDb augmented data:\n$imdb_desc"; |
|---|
| 996 | } else { |
|---|
| 997 | $imdb_desc = sprintf "(%s)", substr($imdb_desc, 1); |
|---|
| 998 | } |
|---|
| 999 | $prog->{desc}->[0]->[0] = "" if (!defined $prog->{desc}->[0]->[0]); |
|---|
| 1000 | $prog->{desc}->[0]->[0] .= "\n\n" if ($prog->{desc}->[0]->[0] ne ""); |
|---|
| 1001 | $prog->{desc}->[0]->[0] .= $imdb_desc; |
|---|
| 1002 | } |
|---|
| 1003 | } |
|---|
| 1004 | |
|---|
| 1005 | $prog->{date} = $imdb->{year} if ($imdb->{year} && !$prog->{date}); |
|---|
| 1006 | # $prog->{length} = $imdb->{runtime} if ($imdb->{runtime}); |
|---|
| 1007 | |
|---|
| 1008 | my $found_url = 0, my $found_cover = 0; |
|---|
| 1009 | if (defined $prog->{url}) { |
|---|
| 1010 | foreach my $url (@{($prog->{url})}) { |
|---|
| 1011 | $found_url++ if (lc($url) eq lc($movie_url)); |
|---|
| 1012 | $found_cover++ if (($imdb->{cover}) && (lc($url) eq lc($imdb->{cover}))); |
|---|
| 1013 | } |
|---|
| 1014 | } |
|---|
| 1015 | push (@{($prog->{url})},$movie_url) if (!$found_url); |
|---|
| 1016 | push (@{($prog->{url})},$imdb->{cover}) if (($imdb->{cover}) && (!$found_cover)); |
|---|
| 1017 | |
|---|
| 1018 | if ($imdb->{rating}) { |
|---|
| 1019 | my ($rating,$votes) = split(/ /,$imdb->{rating}); |
|---|
| 1020 | |
|---|
| 1021 | # If there's already a star-rating, average the scores |
|---|
| 1022 | if (ref $prog->{'star-rating'}) { |
|---|
| 1023 | $rating = sprintf "%.1f/10", (eval($rating) + eval($prog->{'star-rating'}[0])) * 5; |
|---|
| 1024 | } |
|---|
| 1025 | $prog->{'star-rating'} = [ $rating, [] ]; |
|---|
| 1026 | } |
|---|
| 1027 | |
|---|
| 1028 | if ($imdb->{languages}) { |
|---|
| 1029 | foreach my $lang (@{($imdb->{languages})}) { |
|---|
| 1030 | if (defined $prog->{language}) { |
|---|
| 1031 | $prog->{language}->[0] .= ", " . $lang |
|---|
| 1032 | if ($prog->{language}->[0] !~ /$lang/i); |
|---|
| 1033 | } else { |
|---|
| 1034 | $prog->{language}->[0] = $lang; |
|---|
| 1035 | } |
|---|
| 1036 | } |
|---|
| 1037 | } |
|---|
| 1038 | # don't fill in XMLTV orig-language - mythtv ignores it |
|---|
| 1039 | |
|---|
| 1040 | if (defined $imdb->{genres}) { |
|---|
| 1041 | foreach my $genre (@{($imdb->{genres})}) { |
|---|
| 1042 | my $found_genre = 0; |
|---|
| 1043 | foreach my $category (@{($prog->{category})}) { |
|---|
| 1044 | $found_genre++ if (lc($genre) eq lc($category->[0])); |
|---|
| 1045 | } |
|---|
| 1046 | push(@{($prog->{category})},[$genre]) if (!$found_genre); |
|---|
| 1047 | } |
|---|
| 1048 | } |
|---|
| 1049 | |
|---|
| 1050 | if (defined $imdb->{countries}) { |
|---|
| 1051 | foreach my $country (@{($imdb->{countries})}) { |
|---|
| 1052 | my $found_country = 0; |
|---|
| 1053 | foreach my $c (@{($prog->{country})}) { |
|---|
| 1054 | $found_country++ if (lc($country) eq lc($c->[0])); |
|---|
| 1055 | } |
|---|
| 1056 | push(@{($prog->{country})},[$country]) if (!$found_country); |
|---|
| 1057 | } |
|---|
| 1058 | } |
|---|
| 1059 | |
|---|
| 1060 | if (defined $imdb->{airdate}) |
|---|
| 1061 | { |
|---|
| 1062 | my ($season) = ($imdb->{airdate} =~ /Season\s+(\d+)/i); |
|---|
| 1063 | my ($episode) = ($imdb->{airdate} =~ /Episode\s+(\d+)/i); |
|---|
| 1064 | if ($season || $episode) { |
|---|
| 1065 | my $xmltv_ns = sprintf "%d . %d . %d", |
|---|
| 1066 | ($season ? ($season - 1) : 0), ($episode ? ($episode - 1) : 0), 0; |
|---|
| 1067 | $prog->{'episode-num'} = [ [ $xmltv_ns, 'xmltv_ns' ] ]; |
|---|
| 1068 | } |
|---|
| 1069 | } |
|---|
| 1070 | |
|---|
| 1071 | if (defined $imdb->{certifications}) { |
|---|
| 1072 | # put a useful rating first |
|---|
| 1073 | my $useful_rating; |
|---|
| 1074 | foreach my $country ('Australia', 'New Zealand', 'USA', 'UK', 'Canada', 'Ireland', |
|---|
| 1075 | 'Singapore', 'Philippines', 'Taiwan') { |
|---|
| 1076 | if ($imdb->{certifications}->{$country} && |
|---|
| 1077 | $imdb->{certifications}->{$country} ne 'Unrated' && |
|---|
| 1078 | $imdb->{certifications}->{$country} ne 'Not Rated' && |
|---|
| 1079 | $imdb->{certifications}->{$country} ne 'Approved') { |
|---|
| 1080 | push(@{($prog->{rating})},[$imdb->{certifications}->{$country},$country,undef]); |
|---|
| 1081 | $useful_rating = 1; |
|---|
| 1082 | last; |
|---|
| 1083 | } |
|---|
| 1084 | } |
|---|
| 1085 | |
|---|
| 1086 | # only return everything if long_info or no useful rating found |
|---|
| 1087 | if ($opt->{long_info} || !$useful_rating) { |
|---|
| 1088 | foreach my $cert (sort keys %{($imdb->{certifications})}) { |
|---|
| 1089 | my $found_cert = 0; |
|---|
| 1090 | foreach my $c (@{($prog->{rating})}) { |
|---|
| 1091 | $found_cert++ if (lc($cert) eq lc($c->[1])); |
|---|
| 1092 | } |
|---|
| 1093 | push(@{($prog->{rating})},[$imdb->{certifications}->{$cert},$cert,undef]) |
|---|
| 1094 | if (!$found_cert); |
|---|
| 1095 | } |
|---|
| 1096 | } |
|---|
| 1097 | } |
|---|
| 1098 | |
|---|
| 1099 | if (defined $imdb->{cast}) { |
|---|
| 1100 | foreach my $cast (ref($imdb->{cast}) eq "ARRAY" ? |
|---|
| 1101 | map(@$_[0], @{$imdb->{cast}}) : sort keys %{$imdb->{cast}}) { |
|---|
| 1102 | my $found_cast = 0; |
|---|
| 1103 | if ((defined $prog->{credits}) && (defined $prog->{credits}->{actor})) { |
|---|
| 1104 | foreach my $a (@{($prog->{credits}->{actor})}) { |
|---|
| 1105 | $found_cast++ if (lc($cast) eq lc($a)); |
|---|
| 1106 | } |
|---|
| 1107 | } |
|---|
| 1108 | push(@{($prog->{credits}->{actor})},$cast) if (!$found_cast); |
|---|
| 1109 | } |
|---|
| 1110 | } |
|---|
| 1111 | |
|---|
| 1112 | if (defined $imdb->{writers}) { |
|---|
| 1113 | foreach my $cast (ref($imdb->{writers}) eq "ARRAY" ? |
|---|
| 1114 | map(@$_[0], @{$imdb->{writers}}) : sort keys %{$imdb->{writers}}) { |
|---|
| 1115 | my $found_cast = 0; |
|---|
| 1116 | if ((defined $prog->{credits}) && (defined $prog->{credits}->{writer})) { |
|---|
| 1117 | foreach my $w (@{($prog->{credits}->{writer})}) { |
|---|
| 1118 | $found_cast++ if (lc($cast) eq lc($w)); |
|---|
| 1119 | } |
|---|
| 1120 | } |
|---|
| 1121 | push(@{($prog->{credits}->{writer})},$cast) if (!$found_cast); |
|---|
| 1122 | } |
|---|
| 1123 | } |
|---|
| 1124 | |
|---|
| 1125 | if (defined $imdb->{directors}) { |
|---|
| 1126 | foreach my $cast (ref($imdb->{directors}) eq "ARRAY" ? |
|---|
| 1127 | map(@$_[0], @{$imdb->{directors}}) : sort keys %{$imdb->{directors}}) { |
|---|
| 1128 | my $found_cast = 0; |
|---|
| 1129 | if ((defined $prog->{credits}) && (defined $prog->{credits}->{director})) { |
|---|
| 1130 | foreach my $d (@{($prog->{credits}->{director})}) { |
|---|
| 1131 | $found_cast++ if (lc($cast) eq lc($d)); |
|---|
| 1132 | } |
|---|
| 1133 | } |
|---|
| 1134 | push(@{($prog->{credits}->{director})},$cast) if (!$found_cast); |
|---|
| 1135 | } |
|---|
| 1136 | } |
|---|
| 1137 | |
|---|
| 1138 | if (defined $imdb->{cover}) { |
|---|
| 1139 | $found_cover = 0; |
|---|
| 1140 | if (defined $prog->{icon}) { |
|---|
| 1141 | foreach my $cover (@{($prog->{icon})}) { |
|---|
| 1142 | $found_cover++ if (lc($cover->{src}) eq lc($imdb->{cover})); |
|---|
| 1143 | } |
|---|
| 1144 | } |
|---|
| 1145 | $prog->{icon}->[0]->{src} = $imdb->{cover} if (!$found_cover); |
|---|
| 1146 | } |
|---|
| 1147 | |
|---|
| 1148 | Shepherd::Common::cleanup($prog); |
|---|
| 1149 | #print "prog now ".Dumper($prog); |
|---|
| 1150 | END: |
|---|
| 1151 | $writer->write_programme($prog); |
|---|
| 1152 | } |
|---|
| 1153 | |
|---|
| 1154 | ############################################################################## |
|---|
| 1155 | |
|---|
| 1156 | sub run_test |
|---|
| 1157 | { |
|---|
| 1158 | $opt->{debug} = 1; |
|---|
| 1159 | $opt->{fast} = 1; |
|---|
| 1160 | Shepherd::Common::log("running test for: ".$opt->{test}); |
|---|
| 1161 | |
|---|
| 1162 | my $post_fields = "words=".Shepherd::Common::urlify($opt->{test}); |
|---|
| 1163 | &search_imdb_online($opt->{test},$post_fields); |
|---|
| 1164 | |
|---|
| 1165 | die "lookup unsuccessful\n" |
|---|
| 1166 | if ((!defined $data_cache->{movie_id_lookup}->{$post_fields}) || |
|---|
| 1167 | ($data_cache->{movie_id_lookup}->{$post_fields}->{url} eq "-")); |
|---|
| 1168 | |
|---|
| 1169 | my $movie_url = $data_cache->{movie_id_lookup}->{$post_fields}->{url}; |
|---|
| 1170 | &get_imdb_movie_online($opt->{test},$movie_url); |
|---|
| 1171 | print "Movie details returned: ".Dumper($data_cache->{movie_lookup}->{$movie_url}); |
|---|
| 1172 | |
|---|
| 1173 | exit(0); |
|---|
| 1174 | } |
|---|
| 1175 | |
|---|
| 1176 | ############################################################################## |
|---|