root/trunk/grabbers/southerncross_website

Revision 1283, 23.9 kB (checked in by max, 23 months ago)

southerncross_website: Format change. Committed half-working version 0.99, disabled centrally.

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
3# Southern Cross Broadcasting grabber
4# grabs data from http://www.southerncrossbroadcasting.com.au/TVGuide
5# http://www.southerncrossbroadcasting.com.au/TVGuide/eguide.asp?ch=MAC:Southern%20Cross%20TEN%20-%20Mackay&dd=20070522
6# has more than 7 days (up to 18 days) but missing all quote marks (')
7
8# alternative site http://www.mytalk.com.au/NewDesign/Pages/TVGuide.aspx
9# has more than 7 days (upto 12 days) + part day, page a bit more complex,
10# alot more regions but MISSING SUBTITLES
11
12# weekly guide on website doesn't work
13# http://www.southerncrossbroadcasting.com.au/TVGuide/Weekly.asp?ch=MAC:Southern%20Cross%20TEN%20-%20Mackay&gd=20070520
14# <script language="javascript">
15# function OpenGuide(GuideName, GuideDate) {
16#   sFeatures="location=no,menubar=no,resizable=no,status=no,toolbar=no,width=640,height=500,scrollbars=yes";
17#   window.open("Weekly.asp?ch="+GuideName+"&gd="+GuideDate,"weekly",sFeatures);
18# }
19# </script>
20# <!--
21#   <tr><td align="center" colspan=7 style="font-size:10px;font-weight:normal; color:#FF0000">View the full weeks guide [<a href="javascript:OpenGuide('MAC:Southern Cross TEN - Mackay','20070520');">This Week</a>] [<a href="javascript:OpenGuide('MAC:Southern Cross TEN - Mackay','20070527');">Next Week</a>]</td></tr>
22# -->
23
24# test with nice ./southerncross_website -channels_file=channels.conf -do-extra-days -debug 2>&1 | tee log.txt
25
26# todo
27#  micro grab
28#  add setable do-extra-days
29#      ~/.shepherd/tv_grab_au --component-set southerncross_website:do_extra_days:1
30#  add other supported regions (done except for "Southern Cross" in two regions without url and
31#      "Central GTS/BKN" in two regions with url, these also require parser changes)
32
33use strict;
34
35my $progname = "southerncross_website";
36my $version = "0.99";
37
38#use LWP::UserAgent;
39#use LWP::ConnCache;
40use XMLTV;
41use POSIX qw(strftime mktime);
42use Getopt::Long;
43use HTML::TreeBuilder;
44#use Data::Dumper;
45#use Compress::Zlib;
46#use Digest::MD5;
47#use Storable;
48use Shepherd::Common;
49
50#
51# global variables and settings
52#
53
54$| = 1;
55my $script_start_time = time;
56my %stats;
57my $channels, my $opt_channels, my $gaps;
58# my $data_cache;
59my $ua;
60my $conn_cache;
61my $prev_url;
62my $d;
63my $opt;
64
65
66#
67# parse command line
68#
69
70$opt->{days} =          7;      # default
71$opt->{offset} =        0;      # default
72$opt->{outputfile} =    "output.xmltv";    # default
73$opt->{lang} =          "en";
74$opt->{region} =        253;
75
76GetOptions(
77#  'log-http'  => \$opt->{log_http},
78  'region=i'  => \$opt->{region},
79  'days=i'  => \$opt->{days},
80  'offset=i'  => \$opt->{offset},
81  'do-extra-days' => \$opt->{do_extra_days},
82#  'timezone=s'  => \$opt->{timezone},
83  'channels_file=s' => \$opt->{channels_file},
84  'gaps_file=s'  => \$opt->{gaps_file},
85  'output=s'  => \$opt->{outputfile},
86#  'cache-file=s'  => \$opt->{cache_file},
87  'fast'    => \$opt->{fast},
88#  'no-cache'  => \$opt->{no_cache},
89#  'no-details'  => \$opt->{no_details},
90  'debug+'  => \$opt->{debug},
91  'warper'  => \$opt->{warper},
92  'lang=s'  => \$opt->{lang},
93#  'no-hdtv-flags'  => \$opt->{no_hdtv_flags},
94  'obfuscate'  => \$opt->{obfuscate},
95
96  'help'    => \$opt->{help},
97  'verbose'  => \$opt->{help},
98  'version'  => \$opt->{version},
99  'ready'    => \$opt->{version},
100  'v'    => \$opt->{help});
101
102&help if ($opt->{help});
103
104if ($opt->{version}) {
105  printf "%s %s\n",$progname,$version;
106  exit(0);
107}
108
109die "No channel file specified, see --help for instructions\n", if (!$opt->{channels_file});
110
111# set defaults
112&Shepherd::Common::set_default("debug", (defined $opt->{debug} ? ($opt->{debug} * 2) : 0));
113&Shepherd::Common::set_default("webwarper", 1) if (defined $opt->{warper});
114&Shepherd::Common::set_default("squid", 1) if (defined $opt->{obfuscate});
115&Shepherd::Common::set_defaults(stats => \%stats, referer => "last");
116
117#
118# go go go!
119#
120
121my $start_message = sprintf "Going to %sgrab %s%d%s days%s into %s (%s%s) for region %d",
122  (defined $opt->{gaps_file} ? "micro-gap " : ""),
123  ($opt->{do_extra_days} ? "somewhere between " : ""),
124  $opt->{days},
125  ($opt->{do_extra_days} ? " to 28" : ""),
126  ($opt->{offset} ? " (skipping first $opt->{offset} days)" : ""),
127  $opt->{outputfile},
128  (defined $opt->{fast} ? "with haste" : "slowly"),
129  (defined $opt->{warper} ? ", anonymously" : ""),
130  $opt->{region};
131&log($start_message);
132
133$opt->{days} = 28 if ($opt->{do_extra_days} and $opt->{days} < 28);
134
135my ($channel, $url) = &set_region($opt->{region});
136
137# read channels file
138if (-r $opt->{channels_file}) {
139  local (@ARGV, $/) = ($opt->{channels_file});
140  no warnings 'all'; eval <>; die "$@" if $@;
141} else {
142  die "WARNING: channels file $opt->{channels_file} could not be read\n";
143}
144
145die "$channel not listed as a channel in ".$opt->{channels_file}.", nothing to do!\n"
146    if (!defined $channels->{$channel});
147
148if (defined $opt->{gaps_file}) {
149  if (-r $opt->{gaps_file}) {
150    local (@ARGV, $/) = ($opt->{gaps_file});
151    no warnings 'all'; eval <>; die "$@" if $@;
152  } else {
153    die "WARNING: gaps file $opt->{gaps_file} could not be read: $!\n";
154  }
155
156  die "no $channel channel in gaps lineup, nothing to do!\n"
157    unless ((defined $gaps) && (defined $gaps->{$channel}));
158}
159
160my $progs = &get_pages($url);
161
162&write_xmltv($channel, $progs);
163
164&Shepherd::Common::print_stats($progname, $version, $script_start_time, %stats);
165exit(0);
166
167##############################################################################
168# help
169
170sub help
171{
172  print<<EOF
173$progname $version
174
175options are as follows:
176  --help      show these help options
177  --days=N    fetch 'n' days of data (default: $opt->{days})
178  --output=file    send xml output to file (default: "$opt->{outputfile}")
179  --fast      don't run slow - get data as quick as you can - not recommended
180  --anonsocks=(ip:port)  use SOCKS4A server at (ip):(port) (for Tor: recommended)
181
182  --debug      increase debug level
183  --warper    fetch data using WebWarper web anonymizer service
184  --obfuscate    pretend to be a proxy servicing multiple clients
185  --lang=[s]    set language of xmltv output data (default $opt->{lang})
186
187  --region=N    set region for where to collect data from (default: $opt->{region})
188  --channels_file=file  where to get channel data from
189  --gaps_file=file        micro-fetch gaps only
190
191EOF
192;
193
194  exit(0);
195}
196
197##############################################################################
198
199sub log
200{
201  &Shepherd::Common::log(@_);
202}
203
204##############################################################################
205
206sub write_xmltv
207{
208  my ($channel, $progs) = @_;
209
210  my %writer_args = ( encoding => 'ISO-8859-1' );
211  if ($opt->{outputfile}) {
212    my $fh = new IO::File(">$opt->{outputfile}") || die "can't open $opt->{outputfile}: $!";
213    $writer_args{OUTPUT} = $fh;
214  }
215
216  my $writer = new XMLTV::Writer(%writer_args);
217
218  $writer->start
219    ( { 'source-info-name'   => "$progname $version",
220        'generator-info-name' => "$progname $version"} );
221
222  $writer->write_channel( {
223    'display-name' => [[ "$channel", $opt->{lang} ]], 'id' => $channels->{$channel} } );
224
225  # only return programmes for requested days
226  my @timeattr = localtime($script_start_time); # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst
227  $timeattr[0] = 0; # zero seconds
228  $timeattr[1] = 0; # zero mintues
229  $timeattr[2] = 0; # zero hours
230  $timeattr[3] += $opt->{offset}; # day
231  my $start_day = mktime(@timeattr);
232  $timeattr[3] += $opt->{days} - $opt->{offset}; # day
233  my $stop_day = mktime(@timeattr);
234
235  foreach my $key (sort keys %{$progs}) {
236    my $prog = $progs->{$key};
237
238    # if we are micro-gap fetching, only include programmes which match our micro gaps
239    if ($opt->{gaps_file} and defined $gaps and defined $gaps->{$channel}) {
240      my $found_gap_matches = 0;
241      foreach my $gap (@{($gaps->{$channel})}) {
242        my ($s, $e) = split(/-/,$gap);
243        if ((($s >= $prog->{start}) && ($s <= $prog->{stop})) ||
244            (($e >= $prog->{start}) && ($e <= $prog->{stop})) ||
245            (($s <= $prog->{start}) && ($e >= $prog->{stop}))) {
246          printf "gap-fetching: including prog '%s' at %d to %d for gap %d to %d\n",
247              $prog->{title}->[0]->[0], $prog->{start}, $prog->{stop}, $s, $e if $opt->{debug};
248          $found_gap_matches++
249        }
250      }
251      next if (not $found_gap_matches);
252      $stats{gaps_matched} += $found_gap_matches;
253    } elsif ($prog->{stop} <= $start_day or $prog->{start} >= $stop_day) {
254      printf "skipping program because before or after requested days, '%s' at %d to %d\n",
255          $prog->{title}->[0]->[0], $prog->{start}, $prog->{stop} if $opt->{debug};
256      next;
257    }
258
259    $prog->{channel} = $channels->{$channel};
260    $prog->{start} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{start}));
261    $prog->{stop} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{stop}));
262    &Shepherd::Common::cleanup($prog);
263#    printf "DEBUG: programme xmltv: ".Dumper($prog) if (defined $opt->{debug});
264    $writer->write_programme($prog);
265    $stats{programs_output}++;
266  }
267
268  $writer->end();
269
270  die "No programs output. ($start_message)" if (not $stats{programs_output} and not $opt->{gaps_file});
271}
272
273##############################################################################
274
275sub set_region
276{
277  my $region = shift;
278
279  # From http://www.southerncrossbroadcasting.com.au/TVGuide
280  my $channel;
281#  my $url = "http://www.southerncrossbroadcasting.com.au/TVGuide/eguide.asp?ch=";
282  my $url = "http://www.scmedia.com.au/content/eguide.aspx?ch=";
283 
284
285  # non "Sthn Cross TEN" has different page layout (all Capitals, (year))
286#  if ($region == 74) # NT - Darwin
287#  {
288#    $channel = "Southern Cross";
289#    $url .= "";
290#  }
291#  elsif ($region == 88) # TAS - Tasmania
292#  {
293#    $channel = "Southern Cross";
294#    $url .= "";
295#    # maybe TDT maybe 10
296#  }
297  if ($region == 90) # VIC - Ballarat
298  {
299    $url .= "BAL:Southern%20Cross%20TEN%20-%20Ballarat";
300  }
301  elsif ($region == 98) # VIC - Gippsland
302  {
303    $url .= "GIP:Southern%20Cross%20TEN%20-%20Gippsland";
304  }
305  elsif ($region == 266) # VIC - Bendigo
306  {
307    $url .= "BEN:Southern%20Cross%20TEN%20-%20Bendigo";
308  }
309  elsif ($region == 267) # VIC - Shepparton
310  {
311    $url .= "SHE:Southern%20Cross%20TEN%20-%20Shepparton";
312  }
313  elsif ($region == 268) # VIC - Albury/Wodonga
314  {
315    $url .= "ALB:Southern%20Cross%20TEN%20-%20Albury";
316  }
317  elsif ($region == 66) # NSW - Central Coast
318  { # http://www.atn.com.au/nsw/syd/cen-map.htm
319    $url .= "WOL:Southern%20Cross%20TEN%20-%20Wollongong";
320  }
321  elsif ($region == 69) # NSW - Tamworth
322  {
323    $url .= "TAM:Southern%20Cross%20TEN%20-%20Tamworth";
324  }
325  elsif ($region == 71 || $region == 259) # NSW - Wollongong & NSW - Far South Coast
326  {
327    $url .= "WOL:Southern%20Cross%20TEN%20-%20Wollongong";
328  }
329  elsif ($region == 261) # NSW - Lismore/Coffs Harbour
330  {
331    $url .= "LIS:Southern%20Cross%20TEN%20-%20Lismore";
332  }
333  elsif ($region == 262) # NSW - Orange/Dubbo
334  {
335    $url .= "ORA:Southern%20Cross%20TEN%20-%20Orange/Dubbo";
336  }
337  elsif ($region == 263) # NSW - Taree/Port Macquarie
338  {
339    $url .= "TAR:Southern%20Cross%20TEN%20-%20Taree";
340  }
341  elsif ($region == 264) # NSW - Wagga Wagga
342  {
343    $url .= "WAG:Southern%20Cross%20TEN%20-%20Wagga";
344  }
345  elsif ($region == 184) # NSW - Newcastle
346  {
347    $url .= "NEW:Southern%20Cross%20TEN%20-%20Newcastle";
348  }
349  elsif ($region == 63) # NSW - Broken Hill
350  {
351    $url .= "BRO%2010:Southern%20Cross%20TEN%20-%20Broken%20Hill";
352  }
353  elsif ($region == 86) # SA - Spencer Gulf (Port Pirie or Port Lincoln)
354  { # http://www.centralonline.com.au/contact.asp
355    $url .= "PIR%2010:Southern%20Cross%20TEN%20-%20Port%20Pirie";
356  }
357  elsif ($region == 78) # QLD - Gold Coast
358  {
359    $url .= "GOL:Southern%20Cross%20TEN%20-%20Gold%20Coast";
360  }
361  elsif ($region == 255) # QLD - Sunshine Coast
362  {
363    $url .= "SUN:Southern%20Cross%20TEN%20-%20Sunshine%20Coast";
364  }
365  elsif ($region == 256) # QLD - Toowoomba
366  {
367    $url .= "TOO:Southern%20Cross%20TEN%20-%20Toowoomba";
368  }
369  elsif ($region == 258) # QLD - Wide Bay (Bundaberg)
370  {
371    $url .= "BUN:Southern%20Cross%20TEN%20-%20Bundaberg";
372  }
373  elsif ($region == 254) # QLD - Rockhampton
374  {
375    $url .= "ROC:Southern%20Cross%20TEN%20-%20Rockhampton";
376  }
377  elsif ($region == 253) # QLD - Mackay
378  {
379    $url .= "MAC:Southern%20Cross%20TEN%20-%20Mackay";
380  }
381  elsif ($region == 257) # QLD - Townsville
382  {
383    $url .= "TOW:Southern%20Cross%20TEN%20-%20Townsville";
384  }
385  elsif ($region == 79) # QLD - Cairns
386  {
387    $url .= "CAI:Southern%20Cross%20TEN%20-%20Cairns";
388  }
389  elsif ($region == 126) # ACT - ACT
390  {
391    $url .= "CAN:Southern%20Cross%20TEN%20-%20Canberra";
392  }
393  else
394  {
395    &log("ERROR: No suitable region guide found using default");
396    $url .= "MAC:Southern%20Cross%20TEN%20-%20Mackay";
397  }
398
399  $url =~ /-%20(.*)$/;
400  &log("Setting region $region to $1");
401
402  return ('SC10', $url);
403
404#                       <p class="Heading1">Southern Cross Television</p>
405#                       <select size="1" name="SCTV" networkname="Southern Cross Television" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px">
406#                       <option>Select...</option>
407#                       <option value="BRO 7">Broken Hill</option>   NSW - Broken Hill
408#                       <option value="CEN">Central/Satellite</option>
409#                       <option value="PIR 7">Port Pirie</option>    SA - Spencer Gulf
410#                       <option value="LIN 7">Port Lincoln</option>  SA - Spencer Gulf
411#                       </select><br>
412#                       <p class="Heading1">Southern Cross TEN</p>
413#                       <select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px">
414#                       <option>Queensland...</option>
415#                       <option value="BUN">Bundaberg</option>     Wide Bay
416#                       <option value="CAI">Cairns</option>        Cairns
417#                       <option value="GOL">Gold Coast</option>    Gold Coast
418#                       <option value="MAC">Mackay</option>        Mackay
419#                       <option value="ROC">Rockhampton</option>   Rockhampton
420#                       <option value="SUN">Sunshine Coast</option>Sunshine Coast
421#                       <option value="TOO">Toowoomba</option>     Toowoomba
422#                       <option value="TOW">Townsville</option>    Townsville
423#                       </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px">
424#                       <option>New South Wales/ACT...</option>
425#                       <option value="CAN">Canberra</option>      ACT
426#                       <option value="COF">Coffs Harbour</option> 261 NSW: Lismore/Coffs Harbour
427#                       <option value="LIS">Lismore</option>       261 NSW: Lismore/Coffs Harbour
428#                       <option value="NEW">Newcastle</option>     Newcastle
429#                       <option value="ORA">Orange/Dubbo</option>  262 NSW: Orange/Dubbo
430#                       <option value="TAR">Taree</option>         263 NSW: Taree/Port Macquarie
431#                       <option value="TAM">Tamworth</option>      69 NSW: Tamworth
432#                       <option value="WAG">Wagga</option>         264 NSW: Wagga Wagga
433#                       <option value="WOL">Wollongong</option>    71 NSW: Wollongong & 259 NSW: Far South Coast
434#                       </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px">
435#                       <option>Victoria...</option>
436#                       <option value="ALB">Albury</option>        Eastern Victoria
437#                       <option value="BAL">Ballarat</option>      Western Victoria
438#                       <option value="BEN">Bendigo</option>       Western Victoria
439#                       <option value="GIP">Gippsland</option>     Eastern Victoria
440#                       <option value="SHE">Shepparton</option>    Western Victoria
441#                       </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px">
442#                       <option>Spencer Gulf...</option>
443#                       <option value="BRO 10">Broken Hill</option>  NSW - Broken Hill
444#                       <option value="LIN 10">Port Lincoln</option> SA - Spencer Gulf
445#                       <option value="PIR 10">Port Pirie</option>   SA - Spencer Gulf
446#                       </select>
447#
448}
449
450##############################################################################
451
452sub get_pages
453{
454  my $url = shift;
455
456  my @timeattr = localtime($script_start_time);
457  # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst
458  $timeattr[0] = 0; # zero sec
459  $timeattr[1] = 0; # zero min
460  $timeattr[2] = 6; # 6am
461  # guide pages start at 6am so grab day before
462  $timeattr[3] += $opt->{offset} - 1; # day
463
464  my $progs;
465  foreach my $day (($opt->{offset}-1) .. ($opt->{days}-1)) {
466    my $date = mktime(@timeattr);
467    $timeattr[3]++; # day
468
469    # if we are micro-gap fetching, only grab days which match our micro gaps
470    if ($opt->{gaps_file} and defined $gaps and defined $gaps->{$channel}) {
471      my $found_gap_matches = 0;
472      foreach my $gap (@{($gaps->{$channel})}) {
473        my ($s, $e) = split(/-/,$gap);
474        if ((($s >= $date) && ($s <= ($date + 24*60*60))) ||
475            (($e >= $date) && ($e <= ($date + 24*60*60))) ||
476            (($s <= $date) && ($e >= ($date + 24*60*60)))) {
477          $found_gap_matches++;
478          printf "gap-fetching: including day %d at %d for gap %d to %d\n",
479              $day, $date, $s, $e if $opt->{debug};
480        }
481      }
482      next if (not $found_gap_matches);
483    }
484
485    &log("Fetching day $day");
486    ($progs, my $progs_in_day) = &parse_page($progs, $url, $date);
487    if ($progs_in_day == 0) {
488      $timeattr[3]--; # day
489      last;
490    }
491
492    if ($opt->{fast} or $opt->{debug} or $day == ($opt->{days}-1)) {
493      &log("  found $progs_in_day programmes.");
494    } else {
495      my $sleep_for = 10 + int(rand(5));
496      &log("  found $progs_in_day programmes, " .
497          "sleeping for $sleep_for seconds.");
498      sleep $sleep_for;
499      $stats{slept_for} += $sleep_for;
500    }
501  }
502
503  # reverse-sort thru list to set prog stop times
504  $timeattr[2] = 6; # 6am
505  my $last_start = mktime(@timeattr);  # 6am on last day + 1
506  foreach my $key (sort {$b <=> $a} keys %$progs) {
507    $progs->{$key}->{stop} = $last_start;
508    $last_start = $progs->{$key}->{start};
509  }
510
511  return $progs;
512}
513
514##############################################################################
515
516sub parse_page
517{
518  my ($progs, $url, $date) = @_;
519
520  my $datestr = POSIX::strftime("%Y%m%d",localtime($date));
521  $url .= "&dd=" . $datestr;
522
523  &log("parse_page debug: url $url") if $opt->{debug};
524
525  my $data;
526  if (not $opt->{debug}) { 
527    $data = &Shepherd::Common::get_url(url => $url);
528  } else {
529    if (open(INFILE, "eguide.asp.$datestr")) {
530      my @lines = <INFILE>;
531      close INFILE;
532      $data = join ' ',@lines;
533    } else {
534      $data = &Shepherd::Common::get_url(url => $url);
535      if ($data) {
536        sleep(12);
537        $stats{slept_for} += 12;
538        open(OUTFILE, ">eguide.asp.$datestr") or 
539            die "Can't open >eguide.asp.$datestr: $!";
540        print OUTFILE $data;
541        close OUTFILE;
542      }
543    }
544  }
545
546  if (!$data) {
547    &log("Didn't return a valid response for url '$url'");
548    return ($progs, 0);
549  }
550
551  my $tree = HTML::TreeBuilder->new_from_content($data);
552  if (!$tree) {
553    &log("Can't parse url '$url'");
554    $stats{bad_html}++;
555    return ($progs, 0);
556  }
557
558  &log("parse_page debug: page title: " . 
559      $tree->find_by_tag_name('title')->as_trimmed_text())
560      if $opt->{debug};
561
562  my @table_rows = $tree->find_by_tag_name('table');
563  if (@table_rows < 1) {
564    &log("Format has changed for url '$url'");
565    $stats{bad_html}++;
566    return ($progs, 0);
567  }
568
569  my $progs_in_day = 0;
570  my @tds = $table_rows[0]->find_by_tag_name('td');
571  foreach (@tds)
572  {
573      print " - " . $_->as_trimmed_text() . "\n";
574  }
575  my $td = shift @tds;
576  my $text = $td->as_trimmed_text() if $td;
577  while (@tds > 0) {
578    my $prog;
579
580    if ($text !~ /(\d\d):(\d\d)/) { # time
581      &log("parse_page debug: skipped   : " . $td->as_HTML()) if $opt->{debug};
582      $td = shift @tds || last;
583      $text = $td->as_trimmed_text();
584      next;
585    }
586    $prog->{start} = $date + ((($1 < 6 ? 24 : 0) - 6 + $1)*60 + $2)*60;
587    &log("parse_page debug: time      : $1:$2  " .
588        POSIX::strftime("%Y%m%d%H%M", localtime($prog->{start}))) 
589        if $opt->{debug};
590
591    $td = shift @tds || last;
592    my $title = $td->as_trimmed_text();
593
594    $td = shift @tds || last;
595    my $rating = &Shepherd::Common::subrating($td->as_trimmed_text());
596    &log("parse_page debug: rating : $rating") if $opt->{debug};
597
598    $td = shift @tds || last;
599    $td = shift @tds || last;
600    my $text = $td->as_trimmed_text();
601
602    # <b>Saturday Late Night Movie</b> - Run Lola Run
603    # <b>Sunday Afternoon Premiere Movie</b> - Rat
604    # <b>Sunday Afternoon Movie</b> - Evolution
605    my (%type);
606    my $post_desc="";
607    if ($title =~ /^(Friday|Saturday|Sunday) .* Movie$/i) {
608      $type{movie} = 1;
609      $type{premiere} = 1 if ($title =~ /Premiere/i);
610    }
611    # (Network Premiere)
612    ## (Special Encore Presentation) # repeat of a (Network Premiere)
613    # (Series Premiere)
614    # (Series Finale)
615    # (Series Final)
616    # (Series Return)
617    # (Replay)
618    ## (New Time)
619    ## (Qualifying)
620    ## (Special Presentation)
621    # $subtitle =~ , live from Aurora
622    if ($title =~ s/(\((.*)\))//) {
623        $post_desc .= $1;
624        my $tag = $2;
625        $type{series} = 1 if ($tag =~ /(^|\W)Series(\W|$)/i);
626        $type{premiere} = 1 if ($tag =~ /(^|\W)Premiere(\W|$)/i);
627        $type{final} = 1 if ($tag =~ /(^|\W)Finale?(\W|$)/i);
628        $type{return} = 1 if ($tag =~ /(^|\W)Return(\W|$)/i);
629        $type{repeat} = 1 if ($tag =~ /(^|\W)Repeat(\W|$)/i); # unseen
630        $type{repeat} = 1 if ($tag =~ /(^|\W)Replay(\W|$)/i);
631    }
632    $prog->{title} = [[$title, $opt->{lang}]];
633    &log("parse_page debug: title     : $title $post_desc") if ($opt->{debug});
634
635    if ($text) {
636        # - Live from Aurora
637        # Hosted by Mike Goldman & Bree Amer, this ... # how to parse all
638        # Hosted by Mike Goldman, Ryan Fitzgerald and Bree Amer.
639        # ^Guest Stars: Mel Gibson Studio ... # how to parse all
640        # Guests include: Damien Leith, Young Divas and Darren Hayes.
641        # Starring: $
642        ## Insert Cast: Damon Wayans, Adam Sandler, Kristen Wilson, James Caan,
643        ## James Farentino (Cons Adv: Some Coarse Language, Some Violence, Sexual References)
644        ## Featuring cartoons: Yu Gi Oh! Gx (Final), All Grown Up (Repeat),
645        ## Teenage Mutant Ninja Turtules (repeat)
646        $type{live} = 1 if ($text =~ /Live from/i);
647        my $cast = "";
648        $cast .= "$1," if ($text =~ /Hosted by (\w+ \w+)( |\.|,|$)/i);
649        $cast .= "$1," if ($text =~ /Guest Stars: (\w+ \w+)( |\.|,|$)/i);
650        $cast .= "$1," if ($text =~ /Guests Include: (.+?)\./i);
651        $cast .= $1 if ($text =~ s/Starring: (.+)$//i);
652        $prog->{'credits'}{'actor'} = [ split(/,|&amp;|&|\Wand\W/i, $cast) ] if $cast;
653        $prog->{desc} = [[$text, $opt->{lang}]] if $text; # desc
654        &log("parse_page debug: desc      : $text") if $opt->{debug};
655    }
656
657  PART:
658    my $category = category_from_title($title);
659    $category = "Children" if !$category && $rating && $rating =~ /^(C|P)$/i;
660    $prog->{'category'} = [ &Shepherd::Common::generate_category(
661        $title, $category, %type) ];
662    $prog->{'premiere'} = ['premiere', $opt->{lang}] if $type{premiere};
663    $prog->{'last-chance'} = ['final', $opt->{lang}] if $type{final};
664    $prog->{'previously-shown'} = { } if $type{repeat};
665    $prog->{rating}  = [[$rating, "ABA", undef]] if $rating;
666#     if ($post_desc) {
667#       if (not $prog->{desc}) {
668#         $prog->{desc} = [[$post_desc, $opt->{lang}]];
669#       } else {
670#         $prog->{desc}->[0]->[0] .= "\n" . $post_desc;
671#       }
672#     }
673
674    $progs->{$prog->{start}} = $prog;
675    $progs_in_day++;
676  }
677
678  $tree->delete();
679
680  $stats{programs_found} += $progs_in_day;
681
682  &log("WARNING: Only $progs_in_day programmes seen for url $url") 
683      if ($progs_in_day < 10);
684
685  return ($progs, $progs_in_day);
686}
687
688sub category_from_title
689{
690  my $title = shift;
691  return "News" if $title=~/(^|\W)News(\W|$)/i;
692  return "sports" if $title=~/(^|\W)Sports Tonight(\W|$)/i;
693  return "sports" if $title=~/(^|\W)AFL(\W|$)/i;
694  return "Infotainment" if $title=~/(^|\W)Infomercials?(\W|$)/i;
695  return "Reality" if $title=~/(^|\W)Big Brother(\W|$)/i;
696  return "Animation" if $title=~/(^|\W)The( All New)? Simpsons(\W|$)/i;
697  return "Music" if $title=~/(^|\W)Video Hits(\W|$)/i;
698  return "Soap" if $title=~/(^|\W)The Bold (&amp;|&|and) The Beautiful(\W|$)/i;
699  return "Lifestyle/Cooking, Food & Wine" if $title=~/(^|\W)Hueys Cooking Adventures(\W|$)/i;
700  return "Variety" if $title=~/(^|\W)The Late Show With David Letterman(\W|$)/i;
701}
Note: See TracBrowser for help on using the browser.