| 1 | #!/usr/bin/perl -w |
|---|
| 2 | |
|---|
| 3 | # Southern Cross Broadcasting grabber |
|---|
| 4 | # grabs data from http://www.southerncrossbroadcasting.com.au/TVGuide |
|---|
| 5 | # http://www.southerncrossbroadcasting.com.au/TVGuide/eguide.asp?ch=MAC:Southern%20Cross%20TEN%20-%20Mackay&dd=20070522 |
|---|
| 6 | # has more than 7 days (up to 18 days) but missing all quote marks (') |
|---|
| 7 | |
|---|
| 8 | # alternative site http://www.mytalk.com.au/NewDesign/Pages/TVGuide.aspx |
|---|
| 9 | # has more than 7 days (upto 12 days) + part day, page a bit more complex, |
|---|
| 10 | # alot more regions but MISSING SUBTITLES |
|---|
| 11 | |
|---|
| 12 | # weekly guide on website doesn't work |
|---|
| 13 | # http://www.southerncrossbroadcasting.com.au/TVGuide/Weekly.asp?ch=MAC:Southern%20Cross%20TEN%20-%20Mackay&gd=20070520 |
|---|
| 14 | # <script language="javascript"> |
|---|
| 15 | # function OpenGuide(GuideName, GuideDate) { |
|---|
| 16 | # sFeatures="location=no,menubar=no,resizable=no,status=no,toolbar=no,width=640,height=500,scrollbars=yes"; |
|---|
| 17 | # window.open("Weekly.asp?ch="+GuideName+"&gd="+GuideDate,"weekly",sFeatures); |
|---|
| 18 | # } |
|---|
| 19 | # </script> |
|---|
| 20 | # <!-- |
|---|
| 21 | # <tr><td align="center" colspan=7 style="font-size:10px;font-weight:normal; color:#FF0000">View the full weeks guide [<a href="javascript:OpenGuide('MAC:Southern Cross TEN - Mackay','20070520');">This Week</a>] [<a href="javascript:OpenGuide('MAC:Southern Cross TEN - Mackay','20070527');">Next Week</a>]</td></tr> |
|---|
| 22 | # --> |
|---|
| 23 | |
|---|
| 24 | # test with nice ./southerncross_website -channels_file=channels.conf -do-extra-days -debug 2>&1 | tee log.txt |
|---|
| 25 | |
|---|
| 26 | # todo |
|---|
| 27 | # micro grab |
|---|
| 28 | # add setable do-extra-days |
|---|
| 29 | # ~/.shepherd/tv_grab_au --component-set southerncross_website:do_extra_days:1 |
|---|
| 30 | # add other supported regions (done except for "Southern Cross" in two regions without url and |
|---|
| 31 | # "Central GTS/BKN" in two regions with url, these also require parser changes) |
|---|
| 32 | |
|---|
| 33 | use strict; |
|---|
| 34 | |
|---|
| 35 | my $progname = "southerncross_website"; |
|---|
| 36 | my $version = "0.99"; |
|---|
| 37 | |
|---|
| 38 | #use LWP::UserAgent; |
|---|
| 39 | #use LWP::ConnCache; |
|---|
| 40 | use XMLTV; |
|---|
| 41 | use POSIX qw(strftime mktime); |
|---|
| 42 | use Getopt::Long; |
|---|
| 43 | use HTML::TreeBuilder; |
|---|
| 44 | #use Data::Dumper; |
|---|
| 45 | #use Compress::Zlib; |
|---|
| 46 | #use Digest::MD5; |
|---|
| 47 | #use Storable; |
|---|
| 48 | use Shepherd::Common; |
|---|
| 49 | |
|---|
| 50 | # |
|---|
| 51 | # global variables and settings |
|---|
| 52 | # |
|---|
| 53 | |
|---|
| 54 | $| = 1; |
|---|
| 55 | my $script_start_time = time; |
|---|
| 56 | my %stats; |
|---|
| 57 | my $channels, my $opt_channels, my $gaps; |
|---|
| 58 | # my $data_cache; |
|---|
| 59 | my $ua; |
|---|
| 60 | my $conn_cache; |
|---|
| 61 | my $prev_url; |
|---|
| 62 | my $d; |
|---|
| 63 | my $opt; |
|---|
| 64 | |
|---|
| 65 | |
|---|
| 66 | # |
|---|
| 67 | # parse command line |
|---|
| 68 | # |
|---|
| 69 | |
|---|
| 70 | $opt->{days} = 7; # default |
|---|
| 71 | $opt->{offset} = 0; # default |
|---|
| 72 | $opt->{outputfile} = "output.xmltv"; # default |
|---|
| 73 | $opt->{lang} = "en"; |
|---|
| 74 | $opt->{region} = 253; |
|---|
| 75 | |
|---|
| 76 | GetOptions( |
|---|
| 77 | # 'log-http' => \$opt->{log_http}, |
|---|
| 78 | 'region=i' => \$opt->{region}, |
|---|
| 79 | 'days=i' => \$opt->{days}, |
|---|
| 80 | 'offset=i' => \$opt->{offset}, |
|---|
| 81 | 'do-extra-days' => \$opt->{do_extra_days}, |
|---|
| 82 | # 'timezone=s' => \$opt->{timezone}, |
|---|
| 83 | 'channels_file=s' => \$opt->{channels_file}, |
|---|
| 84 | 'gaps_file=s' => \$opt->{gaps_file}, |
|---|
| 85 | 'output=s' => \$opt->{outputfile}, |
|---|
| 86 | # 'cache-file=s' => \$opt->{cache_file}, |
|---|
| 87 | 'fast' => \$opt->{fast}, |
|---|
| 88 | # 'no-cache' => \$opt->{no_cache}, |
|---|
| 89 | # 'no-details' => \$opt->{no_details}, |
|---|
| 90 | 'debug+' => \$opt->{debug}, |
|---|
| 91 | 'warper' => \$opt->{warper}, |
|---|
| 92 | 'lang=s' => \$opt->{lang}, |
|---|
| 93 | # 'no-hdtv-flags' => \$opt->{no_hdtv_flags}, |
|---|
| 94 | 'obfuscate' => \$opt->{obfuscate}, |
|---|
| 95 | |
|---|
| 96 | 'help' => \$opt->{help}, |
|---|
| 97 | 'verbose' => \$opt->{help}, |
|---|
| 98 | 'version' => \$opt->{version}, |
|---|
| 99 | 'ready' => \$opt->{version}, |
|---|
| 100 | 'v' => \$opt->{help}); |
|---|
| 101 | |
|---|
| 102 | &help if ($opt->{help}); |
|---|
| 103 | |
|---|
| 104 | if ($opt->{version}) { |
|---|
| 105 | printf "%s %s\n",$progname,$version; |
|---|
| 106 | exit(0); |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | die "No channel file specified, see --help for instructions\n", if (!$opt->{channels_file}); |
|---|
| 110 | |
|---|
| 111 | # set defaults |
|---|
| 112 | &Shepherd::Common::set_default("debug", (defined $opt->{debug} ? ($opt->{debug} * 2) : 0)); |
|---|
| 113 | &Shepherd::Common::set_default("webwarper", 1) if (defined $opt->{warper}); |
|---|
| 114 | &Shepherd::Common::set_default("squid", 1) if (defined $opt->{obfuscate}); |
|---|
| 115 | &Shepherd::Common::set_defaults(stats => \%stats, referer => "last"); |
|---|
| 116 | |
|---|
| 117 | # |
|---|
| 118 | # go go go! |
|---|
| 119 | # |
|---|
| 120 | |
|---|
| 121 | my $start_message = sprintf "Going to %sgrab %s%d%s days%s into %s (%s%s) for region %d", |
|---|
| 122 | (defined $opt->{gaps_file} ? "micro-gap " : ""), |
|---|
| 123 | ($opt->{do_extra_days} ? "somewhere between " : ""), |
|---|
| 124 | $opt->{days}, |
|---|
| 125 | ($opt->{do_extra_days} ? " to 28" : ""), |
|---|
| 126 | ($opt->{offset} ? " (skipping first $opt->{offset} days)" : ""), |
|---|
| 127 | $opt->{outputfile}, |
|---|
| 128 | (defined $opt->{fast} ? "with haste" : "slowly"), |
|---|
| 129 | (defined $opt->{warper} ? ", anonymously" : ""), |
|---|
| 130 | $opt->{region}; |
|---|
| 131 | &log($start_message); |
|---|
| 132 | |
|---|
| 133 | $opt->{days} = 28 if ($opt->{do_extra_days} and $opt->{days} < 28); |
|---|
| 134 | |
|---|
| 135 | my ($channel, $url) = &set_region($opt->{region}); |
|---|
| 136 | |
|---|
| 137 | # read channels file |
|---|
| 138 | if (-r $opt->{channels_file}) { |
|---|
| 139 | local (@ARGV, $/) = ($opt->{channels_file}); |
|---|
| 140 | no warnings 'all'; eval <>; die "$@" if $@; |
|---|
| 141 | } else { |
|---|
| 142 | die "WARNING: channels file $opt->{channels_file} could not be read\n"; |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | die "$channel not listed as a channel in ".$opt->{channels_file}.", nothing to do!\n" |
|---|
| 146 | if (!defined $channels->{$channel}); |
|---|
| 147 | |
|---|
| 148 | if (defined $opt->{gaps_file}) { |
|---|
| 149 | if (-r $opt->{gaps_file}) { |
|---|
| 150 | local (@ARGV, $/) = ($opt->{gaps_file}); |
|---|
| 151 | no warnings 'all'; eval <>; die "$@" if $@; |
|---|
| 152 | } else { |
|---|
| 153 | die "WARNING: gaps file $opt->{gaps_file} could not be read: $!\n"; |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | die "no $channel channel in gaps lineup, nothing to do!\n" |
|---|
| 157 | unless ((defined $gaps) && (defined $gaps->{$channel})); |
|---|
| 158 | } |
|---|
| 159 | |
|---|
| 160 | my $progs = &get_pages($url); |
|---|
| 161 | |
|---|
| 162 | &write_xmltv($channel, $progs); |
|---|
| 163 | |
|---|
| 164 | &Shepherd::Common::print_stats($progname, $version, $script_start_time, %stats); |
|---|
| 165 | exit(0); |
|---|
| 166 | |
|---|
| 167 | ############################################################################## |
|---|
| 168 | # help |
|---|
| 169 | |
|---|
| 170 | sub help |
|---|
| 171 | { |
|---|
| 172 | print<<EOF |
|---|
| 173 | $progname $version |
|---|
| 174 | |
|---|
| 175 | options are as follows: |
|---|
| 176 | --help show these help options |
|---|
| 177 | --days=N fetch 'n' days of data (default: $opt->{days}) |
|---|
| 178 | --output=file send xml output to file (default: "$opt->{outputfile}") |
|---|
| 179 | --fast don't run slow - get data as quick as you can - not recommended |
|---|
| 180 | --anonsocks=(ip:port) use SOCKS4A server at (ip):(port) (for Tor: recommended) |
|---|
| 181 | |
|---|
| 182 | --debug increase debug level |
|---|
| 183 | --warper fetch data using WebWarper web anonymizer service |
|---|
| 184 | --obfuscate pretend to be a proxy servicing multiple clients |
|---|
| 185 | --lang=[s] set language of xmltv output data (default $opt->{lang}) |
|---|
| 186 | |
|---|
| 187 | --region=N set region for where to collect data from (default: $opt->{region}) |
|---|
| 188 | --channels_file=file where to get channel data from |
|---|
| 189 | --gaps_file=file micro-fetch gaps only |
|---|
| 190 | |
|---|
| 191 | EOF |
|---|
| 192 | ; |
|---|
| 193 | |
|---|
| 194 | exit(0); |
|---|
| 195 | } |
|---|
| 196 | |
|---|
| 197 | ############################################################################## |
|---|
| 198 | |
|---|
| 199 | sub log |
|---|
| 200 | { |
|---|
| 201 | &Shepherd::Common::log(@_); |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | ############################################################################## |
|---|
| 205 | |
|---|
| 206 | sub write_xmltv |
|---|
| 207 | { |
|---|
| 208 | my ($channel, $progs) = @_; |
|---|
| 209 | |
|---|
| 210 | my %writer_args = ( encoding => 'ISO-8859-1' ); |
|---|
| 211 | if ($opt->{outputfile}) { |
|---|
| 212 | my $fh = new IO::File(">$opt->{outputfile}") || die "can't open $opt->{outputfile}: $!"; |
|---|
| 213 | $writer_args{OUTPUT} = $fh; |
|---|
| 214 | } |
|---|
| 215 | |
|---|
| 216 | my $writer = new XMLTV::Writer(%writer_args); |
|---|
| 217 | |
|---|
| 218 | $writer->start |
|---|
| 219 | ( { 'source-info-name' => "$progname $version", |
|---|
| 220 | 'generator-info-name' => "$progname $version"} ); |
|---|
| 221 | |
|---|
| 222 | $writer->write_channel( { |
|---|
| 223 | 'display-name' => [[ "$channel", $opt->{lang} ]], 'id' => $channels->{$channel} } ); |
|---|
| 224 | |
|---|
| 225 | # only return programmes for requested days |
|---|
| 226 | my @timeattr = localtime($script_start_time); # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst |
|---|
| 227 | $timeattr[0] = 0; # zero seconds |
|---|
| 228 | $timeattr[1] = 0; # zero mintues |
|---|
| 229 | $timeattr[2] = 0; # zero hours |
|---|
| 230 | $timeattr[3] += $opt->{offset}; # day |
|---|
| 231 | my $start_day = mktime(@timeattr); |
|---|
| 232 | $timeattr[3] += $opt->{days} - $opt->{offset}; # day |
|---|
| 233 | my $stop_day = mktime(@timeattr); |
|---|
| 234 | |
|---|
| 235 | foreach my $key (sort keys %{$progs}) { |
|---|
| 236 | my $prog = $progs->{$key}; |
|---|
| 237 | |
|---|
| 238 | # if we are micro-gap fetching, only include programmes which match our micro gaps |
|---|
| 239 | if ($opt->{gaps_file} and defined $gaps and defined $gaps->{$channel}) { |
|---|
| 240 | my $found_gap_matches = 0; |
|---|
| 241 | foreach my $gap (@{($gaps->{$channel})}) { |
|---|
| 242 | my ($s, $e) = split(/-/,$gap); |
|---|
| 243 | if ((($s >= $prog->{start}) && ($s <= $prog->{stop})) || |
|---|
| 244 | (($e >= $prog->{start}) && ($e <= $prog->{stop})) || |
|---|
| 245 | (($s <= $prog->{start}) && ($e >= $prog->{stop}))) { |
|---|
| 246 | printf "gap-fetching: including prog '%s' at %d to %d for gap %d to %d\n", |
|---|
| 247 | $prog->{title}->[0]->[0], $prog->{start}, $prog->{stop}, $s, $e if $opt->{debug}; |
|---|
| 248 | $found_gap_matches++ |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | next if (not $found_gap_matches); |
|---|
| 252 | $stats{gaps_matched} += $found_gap_matches; |
|---|
| 253 | } elsif ($prog->{stop} <= $start_day or $prog->{start} >= $stop_day) { |
|---|
| 254 | printf "skipping program because before or after requested days, '%s' at %d to %d\n", |
|---|
| 255 | $prog->{title}->[0]->[0], $prog->{start}, $prog->{stop} if $opt->{debug}; |
|---|
| 256 | next; |
|---|
| 257 | } |
|---|
| 258 | |
|---|
| 259 | $prog->{channel} = $channels->{$channel}; |
|---|
| 260 | $prog->{start} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{start})); |
|---|
| 261 | $prog->{stop} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{stop})); |
|---|
| 262 | &Shepherd::Common::cleanup($prog); |
|---|
| 263 | # printf "DEBUG: programme xmltv: ".Dumper($prog) if (defined $opt->{debug}); |
|---|
| 264 | $writer->write_programme($prog); |
|---|
| 265 | $stats{programs_output}++; |
|---|
| 266 | } |
|---|
| 267 | |
|---|
| 268 | $writer->end(); |
|---|
| 269 | |
|---|
| 270 | die "No programs output. ($start_message)" if (not $stats{programs_output} and not $opt->{gaps_file}); |
|---|
| 271 | } |
|---|
| 272 | |
|---|
| 273 | ############################################################################## |
|---|
| 274 | |
|---|
| 275 | sub set_region |
|---|
| 276 | { |
|---|
| 277 | my $region = shift; |
|---|
| 278 | |
|---|
| 279 | # From http://www.southerncrossbroadcasting.com.au/TVGuide |
|---|
| 280 | my $channel; |
|---|
| 281 | # my $url = "http://www.southerncrossbroadcasting.com.au/TVGuide/eguide.asp?ch="; |
|---|
| 282 | my $url = "http://www.scmedia.com.au/content/eguide.aspx?ch="; |
|---|
| 283 | |
|---|
| 284 | |
|---|
| 285 | # non "Sthn Cross TEN" has different page layout (all Capitals, (year)) |
|---|
| 286 | # if ($region == 74) # NT - Darwin |
|---|
| 287 | # { |
|---|
| 288 | # $channel = "Southern Cross"; |
|---|
| 289 | # $url .= ""; |
|---|
| 290 | # } |
|---|
| 291 | # elsif ($region == 88) # TAS - Tasmania |
|---|
| 292 | # { |
|---|
| 293 | # $channel = "Southern Cross"; |
|---|
| 294 | # $url .= ""; |
|---|
| 295 | # # maybe TDT maybe 10 |
|---|
| 296 | # } |
|---|
| 297 | if ($region == 90) # VIC - Ballarat |
|---|
| 298 | { |
|---|
| 299 | $url .= "BAL:Southern%20Cross%20TEN%20-%20Ballarat"; |
|---|
| 300 | } |
|---|
| 301 | elsif ($region == 98) # VIC - Gippsland |
|---|
| 302 | { |
|---|
| 303 | $url .= "GIP:Southern%20Cross%20TEN%20-%20Gippsland"; |
|---|
| 304 | } |
|---|
| 305 | elsif ($region == 266) # VIC - Bendigo |
|---|
| 306 | { |
|---|
| 307 | $url .= "BEN:Southern%20Cross%20TEN%20-%20Bendigo"; |
|---|
| 308 | } |
|---|
| 309 | elsif ($region == 267) # VIC - Shepparton |
|---|
| 310 | { |
|---|
| 311 | $url .= "SHE:Southern%20Cross%20TEN%20-%20Shepparton"; |
|---|
| 312 | } |
|---|
| 313 | elsif ($region == 268) # VIC - Albury/Wodonga |
|---|
| 314 | { |
|---|
| 315 | $url .= "ALB:Southern%20Cross%20TEN%20-%20Albury"; |
|---|
| 316 | } |
|---|
| 317 | elsif ($region == 66) # NSW - Central Coast |
|---|
| 318 | { # http://www.atn.com.au/nsw/syd/cen-map.htm |
|---|
| 319 | $url .= "WOL:Southern%20Cross%20TEN%20-%20Wollongong"; |
|---|
| 320 | } |
|---|
| 321 | elsif ($region == 69) # NSW - Tamworth |
|---|
| 322 | { |
|---|
| 323 | $url .= "TAM:Southern%20Cross%20TEN%20-%20Tamworth"; |
|---|
| 324 | } |
|---|
| 325 | elsif ($region == 71 || $region == 259) # NSW - Wollongong & NSW - Far South Coast |
|---|
| 326 | { |
|---|
| 327 | $url .= "WOL:Southern%20Cross%20TEN%20-%20Wollongong"; |
|---|
| 328 | } |
|---|
| 329 | elsif ($region == 261) # NSW - Lismore/Coffs Harbour |
|---|
| 330 | { |
|---|
| 331 | $url .= "LIS:Southern%20Cross%20TEN%20-%20Lismore"; |
|---|
| 332 | } |
|---|
| 333 | elsif ($region == 262) # NSW - Orange/Dubbo |
|---|
| 334 | { |
|---|
| 335 | $url .= "ORA:Southern%20Cross%20TEN%20-%20Orange/Dubbo"; |
|---|
| 336 | } |
|---|
| 337 | elsif ($region == 263) # NSW - Taree/Port Macquarie |
|---|
| 338 | { |
|---|
| 339 | $url .= "TAR:Southern%20Cross%20TEN%20-%20Taree"; |
|---|
| 340 | } |
|---|
| 341 | elsif ($region == 264) # NSW - Wagga Wagga |
|---|
| 342 | { |
|---|
| 343 | $url .= "WAG:Southern%20Cross%20TEN%20-%20Wagga"; |
|---|
| 344 | } |
|---|
| 345 | elsif ($region == 184) # NSW - Newcastle |
|---|
| 346 | { |
|---|
| 347 | $url .= "NEW:Southern%20Cross%20TEN%20-%20Newcastle"; |
|---|
| 348 | } |
|---|
| 349 | elsif ($region == 63) # NSW - Broken Hill |
|---|
| 350 | { |
|---|
| 351 | $url .= "BRO%2010:Southern%20Cross%20TEN%20-%20Broken%20Hill"; |
|---|
| 352 | } |
|---|
| 353 | elsif ($region == 86) # SA - Spencer Gulf (Port Pirie or Port Lincoln) |
|---|
| 354 | { # http://www.centralonline.com.au/contact.asp |
|---|
| 355 | $url .= "PIR%2010:Southern%20Cross%20TEN%20-%20Port%20Pirie"; |
|---|
| 356 | } |
|---|
| 357 | elsif ($region == 78) # QLD - Gold Coast |
|---|
| 358 | { |
|---|
| 359 | $url .= "GOL:Southern%20Cross%20TEN%20-%20Gold%20Coast"; |
|---|
| 360 | } |
|---|
| 361 | elsif ($region == 255) # QLD - Sunshine Coast |
|---|
| 362 | { |
|---|
| 363 | $url .= "SUN:Southern%20Cross%20TEN%20-%20Sunshine%20Coast"; |
|---|
| 364 | } |
|---|
| 365 | elsif ($region == 256) # QLD - Toowoomba |
|---|
| 366 | { |
|---|
| 367 | $url .= "TOO:Southern%20Cross%20TEN%20-%20Toowoomba"; |
|---|
| 368 | } |
|---|
| 369 | elsif ($region == 258) # QLD - Wide Bay (Bundaberg) |
|---|
| 370 | { |
|---|
| 371 | $url .= "BUN:Southern%20Cross%20TEN%20-%20Bundaberg"; |
|---|
| 372 | } |
|---|
| 373 | elsif ($region == 254) # QLD - Rockhampton |
|---|
| 374 | { |
|---|
| 375 | $url .= "ROC:Southern%20Cross%20TEN%20-%20Rockhampton"; |
|---|
| 376 | } |
|---|
| 377 | elsif ($region == 253) # QLD - Mackay |
|---|
| 378 | { |
|---|
| 379 | $url .= "MAC:Southern%20Cross%20TEN%20-%20Mackay"; |
|---|
| 380 | } |
|---|
| 381 | elsif ($region == 257) # QLD - Townsville |
|---|
| 382 | { |
|---|
| 383 | $url .= "TOW:Southern%20Cross%20TEN%20-%20Townsville"; |
|---|
| 384 | } |
|---|
| 385 | elsif ($region == 79) # QLD - Cairns |
|---|
| 386 | { |
|---|
| 387 | $url .= "CAI:Southern%20Cross%20TEN%20-%20Cairns"; |
|---|
| 388 | } |
|---|
| 389 | elsif ($region == 126) # ACT - ACT |
|---|
| 390 | { |
|---|
| 391 | $url .= "CAN:Southern%20Cross%20TEN%20-%20Canberra"; |
|---|
| 392 | } |
|---|
| 393 | else |
|---|
| 394 | { |
|---|
| 395 | &log("ERROR: No suitable region guide found using default"); |
|---|
| 396 | $url .= "MAC:Southern%20Cross%20TEN%20-%20Mackay"; |
|---|
| 397 | } |
|---|
| 398 | |
|---|
| 399 | $url =~ /-%20(.*)$/; |
|---|
| 400 | &log("Setting region $region to $1"); |
|---|
| 401 | |
|---|
| 402 | return ('SC10', $url); |
|---|
| 403 | |
|---|
| 404 | # <p class="Heading1">Southern Cross Television</p> |
|---|
| 405 | # <select size="1" name="SCTV" networkname="Southern Cross Television" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px"> |
|---|
| 406 | # <option>Select...</option> |
|---|
| 407 | # <option value="BRO 7">Broken Hill</option> NSW - Broken Hill |
|---|
| 408 | # <option value="CEN">Central/Satellite</option> |
|---|
| 409 | # <option value="PIR 7">Port Pirie</option> SA - Spencer Gulf |
|---|
| 410 | # <option value="LIN 7">Port Lincoln</option> SA - Spencer Gulf |
|---|
| 411 | # </select><br> |
|---|
| 412 | # <p class="Heading1">Southern Cross TEN</p> |
|---|
| 413 | # <select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px"> |
|---|
| 414 | # <option>Queensland...</option> |
|---|
| 415 | # <option value="BUN">Bundaberg</option> Wide Bay |
|---|
| 416 | # <option value="CAI">Cairns</option> Cairns |
|---|
| 417 | # <option value="GOL">Gold Coast</option> Gold Coast |
|---|
| 418 | # <option value="MAC">Mackay</option> Mackay |
|---|
| 419 | # <option value="ROC">Rockhampton</option> Rockhampton |
|---|
| 420 | # <option value="SUN">Sunshine Coast</option>Sunshine Coast |
|---|
| 421 | # <option value="TOO">Toowoomba</option> Toowoomba |
|---|
| 422 | # <option value="TOW">Townsville</option> Townsville |
|---|
| 423 | # </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px"> |
|---|
| 424 | # <option>New South Wales/ACT...</option> |
|---|
| 425 | # <option value="CAN">Canberra</option> ACT |
|---|
| 426 | # <option value="COF">Coffs Harbour</option> 261 NSW: Lismore/Coffs Harbour |
|---|
| 427 | # <option value="LIS">Lismore</option> 261 NSW: Lismore/Coffs Harbour |
|---|
| 428 | # <option value="NEW">Newcastle</option> Newcastle |
|---|
| 429 | # <option value="ORA">Orange/Dubbo</option> 262 NSW: Orange/Dubbo |
|---|
| 430 | # <option value="TAR">Taree</option> 263 NSW: Taree/Port Macquarie |
|---|
| 431 | # <option value="TAM">Tamworth</option> 69 NSW: Tamworth |
|---|
| 432 | # <option value="WAG">Wagga</option> 264 NSW: Wagga Wagga |
|---|
| 433 | # <option value="WOL">Wollongong</option> 71 NSW: Wollongong & 259 NSW: Far South Coast |
|---|
| 434 | # </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px"> |
|---|
| 435 | # <option>Victoria...</option> |
|---|
| 436 | # <option value="ALB">Albury</option> Eastern Victoria |
|---|
| 437 | # <option value="BAL">Ballarat</option> Western Victoria |
|---|
| 438 | # <option value="BEN">Bendigo</option> Western Victoria |
|---|
| 439 | # <option value="GIP">Gippsland</option> Eastern Victoria |
|---|
| 440 | # <option value="SHE">Shepparton</option> Western Victoria |
|---|
| 441 | # </select><select size="1" name="SCTEN" networkname="Southern Cross TEN" onchange="OpenNewGuide(this);" style="font-family:Tahoma;font-size:11px;margin:10px"> |
|---|
| 442 | # <option>Spencer Gulf...</option> |
|---|
| 443 | # <option value="BRO 10">Broken Hill</option> NSW - Broken Hill |
|---|
| 444 | # <option value="LIN 10">Port Lincoln</option> SA - Spencer Gulf |
|---|
| 445 | # <option value="PIR 10">Port Pirie</option> SA - Spencer Gulf |
|---|
| 446 | # </select> |
|---|
| 447 | # |
|---|
| 448 | } |
|---|
| 449 | |
|---|
| 450 | ############################################################################## |
|---|
| 451 | |
|---|
| 452 | sub get_pages |
|---|
| 453 | { |
|---|
| 454 | my $url = shift; |
|---|
| 455 | |
|---|
| 456 | my @timeattr = localtime($script_start_time); |
|---|
| 457 | # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst |
|---|
| 458 | $timeattr[0] = 0; # zero sec |
|---|
| 459 | $timeattr[1] = 0; # zero min |
|---|
| 460 | $timeattr[2] = 6; # 6am |
|---|
| 461 | # guide pages start at 6am so grab day before |
|---|
| 462 | $timeattr[3] += $opt->{offset} - 1; # day |
|---|
| 463 | |
|---|
| 464 | my $progs; |
|---|
| 465 | foreach my $day (($opt->{offset}-1) .. ($opt->{days}-1)) { |
|---|
| 466 | my $date = mktime(@timeattr); |
|---|
| 467 | $timeattr[3]++; # day |
|---|
| 468 | |
|---|
| 469 | # if we are micro-gap fetching, only grab days which match our micro gaps |
|---|
| 470 | if ($opt->{gaps_file} and defined $gaps and defined $gaps->{$channel}) { |
|---|
| 471 | my $found_gap_matches = 0; |
|---|
| 472 | foreach my $gap (@{($gaps->{$channel})}) { |
|---|
| 473 | my ($s, $e) = split(/-/,$gap); |
|---|
| 474 | if ((($s >= $date) && ($s <= ($date + 24*60*60))) || |
|---|
| 475 | (($e >= $date) && ($e <= ($date + 24*60*60))) || |
|---|
| 476 | (($s <= $date) && ($e >= ($date + 24*60*60)))) { |
|---|
| 477 | $found_gap_matches++; |
|---|
| 478 | printf "gap-fetching: including day %d at %d for gap %d to %d\n", |
|---|
| 479 | $day, $date, $s, $e if $opt->{debug}; |
|---|
| 480 | } |
|---|
| 481 | } |
|---|
| 482 | next if (not $found_gap_matches); |
|---|
| 483 | } |
|---|
| 484 | |
|---|
| 485 | &log("Fetching day $day"); |
|---|
| 486 | ($progs, my $progs_in_day) = &parse_page($progs, $url, $date); |
|---|
| 487 | if ($progs_in_day == 0) { |
|---|
| 488 | $timeattr[3]--; # day |
|---|
| 489 | last; |
|---|
| 490 | } |
|---|
| 491 | |
|---|
| 492 | if ($opt->{fast} or $opt->{debug} or $day == ($opt->{days}-1)) { |
|---|
| 493 | &log(" found $progs_in_day programmes."); |
|---|
| 494 | } else { |
|---|
| 495 | my $sleep_for = 10 + int(rand(5)); |
|---|
| 496 | &log(" found $progs_in_day programmes, " . |
|---|
| 497 | "sleeping for $sleep_for seconds."); |
|---|
| 498 | sleep $sleep_for; |
|---|
| 499 | $stats{slept_for} += $sleep_for; |
|---|
| 500 | } |
|---|
| 501 | } |
|---|
| 502 | |
|---|
| 503 | # reverse-sort thru list to set prog stop times |
|---|
| 504 | $timeattr[2] = 6; # 6am |
|---|
| 505 | my $last_start = mktime(@timeattr); # 6am on last day + 1 |
|---|
| 506 | foreach my $key (sort {$b <=> $a} keys %$progs) { |
|---|
| 507 | $progs->{$key}->{stop} = $last_start; |
|---|
| 508 | $last_start = $progs->{$key}->{start}; |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | return $progs; |
|---|
| 512 | } |
|---|
| 513 | |
|---|
| 514 | ############################################################################## |
|---|
| 515 | |
|---|
| 516 | sub parse_page |
|---|
| 517 | { |
|---|
| 518 | my ($progs, $url, $date) = @_; |
|---|
| 519 | |
|---|
| 520 | my $datestr = POSIX::strftime("%Y%m%d",localtime($date)); |
|---|
| 521 | $url .= "&dd=" . $datestr; |
|---|
| 522 | |
|---|
| 523 | &log("parse_page debug: url $url") if $opt->{debug}; |
|---|
| 524 | |
|---|
| 525 | my $data; |
|---|
| 526 | if (not $opt->{debug}) { |
|---|
| 527 | $data = &Shepherd::Common::get_url(url => $url); |
|---|
| 528 | } else { |
|---|
| 529 | if (open(INFILE, "eguide.asp.$datestr")) { |
|---|
| 530 | my @lines = <INFILE>; |
|---|
| 531 | close INFILE; |
|---|
| 532 | $data = join ' ',@lines; |
|---|
| 533 | } else { |
|---|
| 534 | $data = &Shepherd::Common::get_url(url => $url); |
|---|
| 535 | if ($data) { |
|---|
| 536 | sleep(12); |
|---|
| 537 | $stats{slept_for} += 12; |
|---|
| 538 | open(OUTFILE, ">eguide.asp.$datestr") or |
|---|
| 539 | die "Can't open >eguide.asp.$datestr: $!"; |
|---|
| 540 | print OUTFILE $data; |
|---|
| 541 | close OUTFILE; |
|---|
| 542 | } |
|---|
| 543 | } |
|---|
| 544 | } |
|---|
| 545 | |
|---|
| 546 | if (!$data) { |
|---|
| 547 | &log("Didn't return a valid response for url '$url'"); |
|---|
| 548 | return ($progs, 0); |
|---|
| 549 | } |
|---|
| 550 | |
|---|
| 551 | my $tree = HTML::TreeBuilder->new_from_content($data); |
|---|
| 552 | if (!$tree) { |
|---|
| 553 | &log("Can't parse url '$url'"); |
|---|
| 554 | $stats{bad_html}++; |
|---|
| 555 | return ($progs, 0); |
|---|
| 556 | } |
|---|
| 557 | |
|---|
| 558 | &log("parse_page debug: page title: " . |
|---|
| 559 | $tree->find_by_tag_name('title')->as_trimmed_text()) |
|---|
| 560 | if $opt->{debug}; |
|---|
| 561 | |
|---|
| 562 | my @table_rows = $tree->find_by_tag_name('table'); |
|---|
| 563 | if (@table_rows < 1) { |
|---|
| 564 | &log("Format has changed for url '$url'"); |
|---|
| 565 | $stats{bad_html}++; |
|---|
| 566 | return ($progs, 0); |
|---|
| 567 | } |
|---|
| 568 | |
|---|
| 569 | my $progs_in_day = 0; |
|---|
| 570 | my @tds = $table_rows[0]->find_by_tag_name('td'); |
|---|
| 571 | foreach (@tds) |
|---|
| 572 | { |
|---|
| 573 | print " - " . $_->as_trimmed_text() . "\n"; |
|---|
| 574 | } |
|---|
| 575 | my $td = shift @tds; |
|---|
| 576 | my $text = $td->as_trimmed_text() if $td; |
|---|
| 577 | while (@tds > 0) { |
|---|
| 578 | my $prog; |
|---|
| 579 | |
|---|
| 580 | if ($text !~ /(\d\d):(\d\d)/) { # time |
|---|
| 581 | &log("parse_page debug: skipped : " . $td->as_HTML()) if $opt->{debug}; |
|---|
| 582 | $td = shift @tds || last; |
|---|
| 583 | $text = $td->as_trimmed_text(); |
|---|
| 584 | next; |
|---|
| 585 | } |
|---|
| 586 | $prog->{start} = $date + ((($1 < 6 ? 24 : 0) - 6 + $1)*60 + $2)*60; |
|---|
| 587 | &log("parse_page debug: time : $1:$2 " . |
|---|
| 588 | POSIX::strftime("%Y%m%d%H%M", localtime($prog->{start}))) |
|---|
| 589 | if $opt->{debug}; |
|---|
| 590 | |
|---|
| 591 | $td = shift @tds || last; |
|---|
| 592 | my $title = $td->as_trimmed_text(); |
|---|
| 593 | |
|---|
| 594 | $td = shift @tds || last; |
|---|
| 595 | my $rating = &Shepherd::Common::subrating($td->as_trimmed_text()); |
|---|
| 596 | &log("parse_page debug: rating : $rating") if $opt->{debug}; |
|---|
| 597 | |
|---|
| 598 | $td = shift @tds || last; |
|---|
| 599 | $td = shift @tds || last; |
|---|
| 600 | my $text = $td->as_trimmed_text(); |
|---|
| 601 | |
|---|
| 602 | # <b>Saturday Late Night Movie</b> - Run Lola Run |
|---|
| 603 | # <b>Sunday Afternoon Premiere Movie</b> - Rat |
|---|
| 604 | # <b>Sunday Afternoon Movie</b> - Evolution |
|---|
| 605 | my (%type); |
|---|
| 606 | my $post_desc=""; |
|---|
| 607 | if ($title =~ /^(Friday|Saturday|Sunday) .* Movie$/i) { |
|---|
| 608 | $type{movie} = 1; |
|---|
| 609 | $type{premiere} = 1 if ($title =~ /Premiere/i); |
|---|
| 610 | } |
|---|
| 611 | # (Network Premiere) |
|---|
| 612 | ## (Special Encore Presentation) # repeat of a (Network Premiere) |
|---|
| 613 | # (Series Premiere) |
|---|
| 614 | # (Series Finale) |
|---|
| 615 | # (Series Final) |
|---|
| 616 | # (Series Return) |
|---|
| 617 | # (Replay) |
|---|
| 618 | ## (New Time) |
|---|
| 619 | ## (Qualifying) |
|---|
| 620 | ## (Special Presentation) |
|---|
| 621 | # $subtitle =~ , live from Aurora |
|---|
| 622 | if ($title =~ s/(\((.*)\))//) { |
|---|
| 623 | $post_desc .= $1; |
|---|
| 624 | my $tag = $2; |
|---|
| 625 | $type{series} = 1 if ($tag =~ /(^|\W)Series(\W|$)/i); |
|---|
| 626 | $type{premiere} = 1 if ($tag =~ /(^|\W)Premiere(\W|$)/i); |
|---|
| 627 | $type{final} = 1 if ($tag =~ /(^|\W)Finale?(\W|$)/i); |
|---|
| 628 | $type{return} = 1 if ($tag =~ /(^|\W)Return(\W|$)/i); |
|---|
| 629 | $type{repeat} = 1 if ($tag =~ /(^|\W)Repeat(\W|$)/i); # unseen |
|---|
| 630 | $type{repeat} = 1 if ($tag =~ /(^|\W)Replay(\W|$)/i); |
|---|
| 631 | } |
|---|
| 632 | $prog->{title} = [[$title, $opt->{lang}]]; |
|---|
| 633 | &log("parse_page debug: title : $title $post_desc") if ($opt->{debug}); |
|---|
| 634 | |
|---|
| 635 | if ($text) { |
|---|
| 636 | # - Live from Aurora |
|---|
| 637 | # Hosted by Mike Goldman & Bree Amer, this ... # how to parse all |
|---|
| 638 | # Hosted by Mike Goldman, Ryan Fitzgerald and Bree Amer. |
|---|
| 639 | # ^Guest Stars: Mel Gibson Studio ... # how to parse all |
|---|
| 640 | # Guests include: Damien Leith, Young Divas and Darren Hayes. |
|---|
| 641 | # Starring: $ |
|---|
| 642 | ## Insert Cast: Damon Wayans, Adam Sandler, Kristen Wilson, James Caan, |
|---|
| 643 | ## James Farentino (Cons Adv: Some Coarse Language, Some Violence, Sexual References) |
|---|
| 644 | ## Featuring cartoons: Yu Gi Oh! Gx (Final), All Grown Up (Repeat), |
|---|
| 645 | ## Teenage Mutant Ninja Turtules (repeat) |
|---|
| 646 | $type{live} = 1 if ($text =~ /Live from/i); |
|---|
| 647 | my $cast = ""; |
|---|
| 648 | $cast .= "$1," if ($text =~ /Hosted by (\w+ \w+)( |\.|,|$)/i); |
|---|
| 649 | $cast .= "$1," if ($text =~ /Guest Stars: (\w+ \w+)( |\.|,|$)/i); |
|---|
| 650 | $cast .= "$1," if ($text =~ /Guests Include: (.+?)\./i); |
|---|
| 651 | $cast .= $1 if ($text =~ s/Starring: (.+)$//i); |
|---|
| 652 | $prog->{'credits'}{'actor'} = [ split(/,|&|&|\Wand\W/i, $cast) ] if $cast; |
|---|
| 653 | $prog->{desc} = [[$text, $opt->{lang}]] if $text; # desc |
|---|
| 654 | &log("parse_page debug: desc : $text") if $opt->{debug}; |
|---|
| 655 | } |
|---|
| 656 | |
|---|
| 657 | PART: |
|---|
| 658 | my $category = category_from_title($title); |
|---|
| 659 | $category = "Children" if !$category && $rating && $rating =~ /^(C|P)$/i; |
|---|
| 660 | $prog->{'category'} = [ &Shepherd::Common::generate_category( |
|---|
| 661 | $title, $category, %type) ]; |
|---|
| 662 | $prog->{'premiere'} = ['premiere', $opt->{lang}] if $type{premiere}; |
|---|
| 663 | $prog->{'last-chance'} = ['final', $opt->{lang}] if $type{final}; |
|---|
| 664 | $prog->{'previously-shown'} = { } if $type{repeat}; |
|---|
| 665 | $prog->{rating} = [[$rating, "ABA", undef]] if $rating; |
|---|
| 666 | # if ($post_desc) { |
|---|
| 667 | # if (not $prog->{desc}) { |
|---|
| 668 | # $prog->{desc} = [[$post_desc, $opt->{lang}]]; |
|---|
| 669 | # } else { |
|---|
| 670 | # $prog->{desc}->[0]->[0] .= "\n" . $post_desc; |
|---|
| 671 | # } |
|---|
| 672 | # } |
|---|
| 673 | |
|---|
| 674 | $progs->{$prog->{start}} = $prog; |
|---|
| 675 | $progs_in_day++; |
|---|
| 676 | } |
|---|
| 677 | |
|---|
| 678 | $tree->delete(); |
|---|
| 679 | |
|---|
| 680 | $stats{programs_found} += $progs_in_day; |
|---|
| 681 | |
|---|
| 682 | &log("WARNING: Only $progs_in_day programmes seen for url $url") |
|---|
| 683 | if ($progs_in_day < 10); |
|---|
| 684 | |
|---|
| 685 | return ($progs, $progs_in_day); |
|---|
| 686 | } |
|---|
| 687 | |
|---|
| 688 | sub category_from_title |
|---|
| 689 | { |
|---|
| 690 | my $title = shift; |
|---|
| 691 | return "News" if $title=~/(^|\W)News(\W|$)/i; |
|---|
| 692 | return "sports" if $title=~/(^|\W)Sports Tonight(\W|$)/i; |
|---|
| 693 | return "sports" if $title=~/(^|\W)AFL(\W|$)/i; |
|---|
| 694 | return "Infotainment" if $title=~/(^|\W)Infomercials?(\W|$)/i; |
|---|
| 695 | return "Reality" if $title=~/(^|\W)Big Brother(\W|$)/i; |
|---|
| 696 | return "Animation" if $title=~/(^|\W)The( All New)? Simpsons(\W|$)/i; |
|---|
| 697 | return "Music" if $title=~/(^|\W)Video Hits(\W|$)/i; |
|---|
| 698 | return "Soap" if $title=~/(^|\W)The Bold (&|&|and) The Beautiful(\W|$)/i; |
|---|
| 699 | return "Lifestyle/Cooking, Food & Wine" if $title=~/(^|\W)Hueys Cooking Adventures(\W|$)/i; |
|---|
| 700 | return "Variety" if $title=~/(^|\W)The Late Show With David Letterman(\W|$)/i; |
|---|
| 701 | } |
|---|