| 1 | #!/usr/bin/env perl |
|---|
| 2 | |
|---|
| 3 | use strict; |
|---|
| 4 | use warnings; |
|---|
| 5 | use XMLTV; |
|---|
| 6 | use Shepherd::Common; |
|---|
| 7 | use POSIX qw(strftime mktime); |
|---|
| 8 | use Data::Dumper; |
|---|
| 9 | use HTML::TreeBuilder; |
|---|
| 10 | |
|---|
| 11 | # |
|---|
| 12 | # global variables and settings |
|---|
| 13 | # |
|---|
| 14 | |
|---|
| 15 | $| = 1; |
|---|
| 16 | my $details_cache; |
|---|
| 17 | my $writer; |
|---|
| 18 | my $baseurl = "http://www.acctv.com.au/"; |
|---|
| 19 | my $prev_stop = undef; |
|---|
| 20 | |
|---|
| 21 | # |
|---|
| 22 | # go go go! |
|---|
| 23 | # |
|---|
| 24 | |
|---|
| 25 | my %stats; |
|---|
| 26 | my $o; |
|---|
| 27 | Shepherd::Common::program_begin(\$o, "acctv_website", "0.09", \%stats); |
|---|
| 28 | |
|---|
| 29 | my ($channels, $opt_channels, $gaps) = Shepherd::Common::read_channels($o, ("ACC")); |
|---|
| 30 | my $channel_xmlid = $channels->{ACC} || $opt_channels->{ACC}; |
|---|
| 31 | |
|---|
| 32 | &read_cache unless (defined $o->{no_cache}); |
|---|
| 33 | &start_writing_xmltv; |
|---|
| 34 | &get_days(); |
|---|
| 35 | $writer->end(); |
|---|
| 36 | &write_cache unless (defined $o->{no_cache}); |
|---|
| 37 | |
|---|
| 38 | Shepherd::Common::program_end($o, %stats); |
|---|
| 39 | exit(0); |
|---|
| 40 | |
|---|
| 41 | ############################################################################## |
|---|
| 42 | # populate cache |
|---|
| 43 | |
|---|
| 44 | sub read_cache |
|---|
| 45 | { |
|---|
| 46 | my $store = Shepherd::Common::read_cache(\$o->{cache_file}); |
|---|
| 47 | |
|---|
| 48 | if ($store) { |
|---|
| 49 | $details_cache = $store->{details_cache} if (defined $store->{details_cache}); |
|---|
| 50 | } |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | ############################################################################## |
|---|
| 54 | # write out updated cache |
|---|
| 55 | |
|---|
| 56 | sub write_cache |
|---|
| 57 | { |
|---|
| 58 | # cleanup old prog entries from cache |
|---|
| 59 | for my $cache_key (keys %{$details_cache}) { |
|---|
| 60 | # no real way to detect changes so check every four weeks |
|---|
| 61 | if ($details_cache->{$cache_key}->{added} < ($o->{script_start_time} - (28*24*60*60))) { |
|---|
| 62 | delete $details_cache->{$cache_key}; |
|---|
| 63 | $stats{cache_expire}++; |
|---|
| 64 | } |
|---|
| 65 | } |
|---|
| 66 | |
|---|
| 67 | my $store = { }; |
|---|
| 68 | $store->{details_cache} = $details_cache; |
|---|
| 69 | Shepherd::Common::write_cache($o->{cache_file}, $store); |
|---|
| 70 | } |
|---|
| 71 | |
|---|
| 72 | ############################################################################## |
|---|
| 73 | |
|---|
| 74 | sub start_writing_xmltv |
|---|
| 75 | { |
|---|
| 76 | my %writer_args = ( encoding => 'ISO-8859-1' ); |
|---|
| 77 | if ($o->{outputfile}) { |
|---|
| 78 | my $fh = new IO::File(">$o->{outputfile}") || die "can't open $o->{outputfile}: $!"; |
|---|
| 79 | $writer_args{OUTPUT} = $fh; |
|---|
| 80 | } |
|---|
| 81 | |
|---|
| 82 | $writer = new XMLTV::Writer(%writer_args); |
|---|
| 83 | |
|---|
| 84 | $writer->start |
|---|
| 85 | ( { 'source-info-name' => "$o->{program_name} v$o->{version_number}", |
|---|
| 86 | 'generator-info-name' => "$o->{program_name} v$o->{version_number}"} ); |
|---|
| 87 | |
|---|
| 88 | $writer->write_channel( {'display-name' => [[ "ACC", $o->{lang} ]], 'id' => $channel_xmlid } ); |
|---|
| 89 | } |
|---|
| 90 | |
|---|
| 91 | ############################################################################## |
|---|
| 92 | |
|---|
| 93 | sub get_days |
|---|
| 94 | { |
|---|
| 95 | Shepherd::Common::log("Setting region"); |
|---|
| 96 | |
|---|
| 97 | my $state = Shepherd::Common::which_state($o->{region}); |
|---|
| 98 | |
|---|
| 99 | my $intnewtzstate = 0; |
|---|
| 100 | $intnewtzstate = 0 if ($state eq "NSW"); |
|---|
| 101 | $intnewtzstate = 1 if ($state eq "VIC"); |
|---|
| 102 | $intnewtzstate = 2 if ($state eq "QLD"); |
|---|
| 103 | $intnewtzstate = 3 if ($state eq "SA"); |
|---|
| 104 | $intnewtzstate = 4 if ($state eq "WA"); |
|---|
| 105 | $intnewtzstate = 5 if ($state eq "TAS"); |
|---|
| 106 | $intnewtzstate = 6 if ($state eq "NT"); |
|---|
| 107 | $intnewtzstate = 7 if ($state eq "ACT"); |
|---|
| 108 | |
|---|
| 109 | my $data = &Shepherd::Common::get_url(url => $baseurl."schedule.asp", |
|---|
| 110 | postvars => "intnewtzstate=" . $intnewtzstate); |
|---|
| 111 | |
|---|
| 112 | my @timeattr = localtime($o->{script_start_time}); |
|---|
| 113 | # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst |
|---|
| 114 | $timeattr[0] = 0; # zero sec |
|---|
| 115 | $timeattr[1] = 0; # zero min |
|---|
| 116 | $timeattr[2] = 0; # zero hour |
|---|
| 117 | $timeattr[3] += $o->{offset}; # day |
|---|
| 118 | my $first_day = mktime(@timeattr); # don't return anything before first day |
|---|
| 119 | # guide pages have part days so grab day before |
|---|
| 120 | $timeattr[3]--; # day |
|---|
| 121 | |
|---|
| 122 | my $progs; |
|---|
| 123 | foreach my $day (($o->{offset}-1) .. ($o->{days}-1)) { |
|---|
| 124 | my $date = mktime(@timeattr); |
|---|
| 125 | $timeattr[3]++; # day |
|---|
| 126 | |
|---|
| 127 | &Shepherd::Common::log("Fetching day $day"); |
|---|
| 128 | |
|---|
| 129 | my $progs_in_day = &get_day($first_day, $date); |
|---|
| 130 | last if ($progs_in_day == 0 && $day >= 0); |
|---|
| 131 | |
|---|
| 132 | &Shepherd::Common::log(" found $progs_in_day programmes."); |
|---|
| 133 | } |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | ############################################################################## |
|---|
| 137 | |
|---|
| 138 | sub get_day |
|---|
| 139 | { |
|---|
| 140 | my ($first_day, $date) = @_; |
|---|
| 141 | |
|---|
| 142 | my $url = $baseurl."schedule.asp?day=" . POSIX::strftime("%m%%2F%d%%2F%Y", localtime($date)); |
|---|
| 143 | |
|---|
| 144 | my $data = &Shepherd::Common::get_url($url); |
|---|
| 145 | if (!$data) { |
|---|
| 146 | Shepherd::Common::log("Failed to get html for '$url'"); |
|---|
| 147 | $stats{failed_html_get}++; |
|---|
| 148 | return 0; |
|---|
| 149 | } |
|---|
| 150 | Shepherd::Common::log("DEBUG: html: $data") if ((defined $o->{debug}) && ($o->{debug} > 2)); |
|---|
| 151 | |
|---|
| 152 | my $tree = HTML::TreeBuilder->new_from_content($data); |
|---|
| 153 | if (!$tree) { |
|---|
| 154 | Shepherd::Common::log("Can't parse html for '$url'"); |
|---|
| 155 | $stats{failed_html_parse}++; |
|---|
| 156 | return 0; |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | my @tables = $tree->find_by_tag_name('table'); |
|---|
| 160 | if (@tables < 2) { |
|---|
| 161 | Shepherd::Common::log("Format has changed for '$url'"); |
|---|
| 162 | $stats{failed_html_change}++; |
|---|
| 163 | return 0; |
|---|
| 164 | } |
|---|
| 165 | |
|---|
| 166 | my $progs_in_day = 0; |
|---|
| 167 | my @tds = $tables[1]->find_by_tag_name('td'); # from second table |
|---|
| 168 | my $td = shift @tds; |
|---|
| 169 | my $text = $td->as_trimmed_text() if $td; |
|---|
| 170 | while (@tds > 0) { |
|---|
| 171 | my $prog; |
|---|
| 172 | |
|---|
| 173 | if ($text !~ /(\d+):(\d+) (am|pm)/) { # start |
|---|
| 174 | Shepherd::Common::log("skipped : " . $td->as_HTML()) if $o->{debug}; |
|---|
| 175 | $td = shift @tds || last; |
|---|
| 176 | $text = $td->as_trimmed_text(); |
|---|
| 177 | next; |
|---|
| 178 | } |
|---|
| 179 | $prog->{start} = $date + (($1)*60 + $2)*60; |
|---|
| 180 | $prog->{start} += 12*60*60 if ($3 eq 'pm' && $1 != 12); |
|---|
| 181 | $prog->{start} += 24*60*60 if ($3 eq 'am' && $1 < 6); |
|---|
| 182 | $prog->{start} += 12*60*60 if ($3 eq 'am' && $1 == 12); |
|---|
| 183 | Shepherd::Common::log("\ntime : $1:$2 $3 " . |
|---|
| 184 | POSIX::strftime("%Y%m%d%H%M", localtime($prog->{start}))) |
|---|
| 185 | if $o->{debug}; |
|---|
| 186 | $td = shift @tds || last; |
|---|
| 187 | $text = $td->as_trimmed_text(); |
|---|
| 188 | |
|---|
| 189 | my $a = $td->find_by_tag_name('a') || next; # title |
|---|
| 190 | $prog->{title} = [[$a->as_trimmed_text(), $o->{lang}]]; |
|---|
| 191 | my $link = $a->attr('href'); # link |
|---|
| 192 | Shepherd::Common::log("title link: $prog->{title}->[0]->[0] \t$link") if ($o->{debug}); |
|---|
| 193 | $td = shift @tds || goto PART; |
|---|
| 194 | $text = $td->as_trimmed_text(); |
|---|
| 195 | |
|---|
| 196 | PART: |
|---|
| 197 | # don't return anything before first day |
|---|
| 198 | next if ($prog->{start} <= ($first_day - 2*60*60)); |
|---|
| 199 | |
|---|
| 200 | $prog = get_details($prog, $link); |
|---|
| 201 | |
|---|
| 202 | # don't return anything before first day |
|---|
| 203 | next if (defined $prog->{stop} && $prog->{stop} <= $first_day); |
|---|
| 204 | |
|---|
| 205 | gapfill($prog->{start},(defined($prog->{stop}) ? $prog->{stop} : undef)); |
|---|
| 206 | |
|---|
| 207 | $prog->{start} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{start})); |
|---|
| 208 | $prog->{stop} = POSIX::strftime("%Y%m%d%H%M",localtime($prog->{stop})) if defined $prog->{stop}; |
|---|
| 209 | $prog->{channel} = $channel_xmlid; |
|---|
| 210 | push(@{$prog->{'category'}}, ["Religion", $o->{lang}]) |
|---|
| 211 | if !grep($_->[0] eq "Religion", @{$prog->{'category'}}); |
|---|
| 212 | |
|---|
| 213 | Shepherd::Common::cleanup($prog); |
|---|
| 214 | |
|---|
| 215 | Shepherd::Common::log("DEBUG: xmltv: ".Dumper($prog)) |
|---|
| 216 | if ((defined $o->{debug}) && ($o->{debug} > 1)); |
|---|
| 217 | |
|---|
| 218 | $writer->write_programme($prog); |
|---|
| 219 | |
|---|
| 220 | $progs_in_day++; |
|---|
| 221 | $stats{programmes}++; |
|---|
| 222 | } |
|---|
| 223 | |
|---|
| 224 | $tree->delete(); |
|---|
| 225 | |
|---|
| 226 | Shepherd::Common::log("WARNING: Only $progs_in_day programmes seen for url $url") |
|---|
| 227 | if ($progs_in_day < 5); |
|---|
| 228 | |
|---|
| 229 | return $progs_in_day; |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | sub gapfill |
|---|
| 233 | { |
|---|
| 234 | my ($next_start, $next_stop) = @_; |
|---|
| 235 | |
|---|
| 236 | if (defined($prev_stop) && ($prev_stop != $next_start)) { |
|---|
| 237 | my $prog; |
|---|
| 238 | $prog->{title} = [["Gap", $o->{lang}]]; |
|---|
| 239 | $prog->{start} = POSIX::strftime("%Y%m%d%H%M",localtime($prev_stop)); |
|---|
| 240 | $prog->{stop} = POSIX::strftime("%Y%m%d%H%M",localtime($next_start)); |
|---|
| 241 | $prog->{channel} = $channel_xmlid; |
|---|
| 242 | |
|---|
| 243 | Shepherd::Common::cleanup($prog); |
|---|
| 244 | |
|---|
| 245 | Shepherd::Common::log("DEBUG: xmltv: ".Dumper($prog)) |
|---|
| 246 | if ((defined $o->{debug}) && ($o->{debug} > 1)); |
|---|
| 247 | |
|---|
| 248 | $writer->write_programme($prog); |
|---|
| 249 | |
|---|
| 250 | } |
|---|
| 251 | $prev_stop = $next_stop; |
|---|
| 252 | } |
|---|
| 253 | |
|---|
| 254 | sub get_details |
|---|
| 255 | { |
|---|
| 256 | my ($prog, $link) = @_; |
|---|
| 257 | |
|---|
| 258 | my $cache_key = sprintf("%s:%s", $prog->{title}->[0]->[0], $link); |
|---|
| 259 | if (defined $details_cache->{$cache_key}) { |
|---|
| 260 | foreach ('desc', 'length', 'category', 'credits') { |
|---|
| 261 | next if !defined $details_cache->{$cache_key}->{$_}; |
|---|
| 262 | Shepherd::Common::log(" cache use $_ \t: $details_cache->{$cache_key}->{$_}") if ($o->{debug}); |
|---|
| 263 | $prog->{$_} = $details_cache->{$cache_key}->{$_}; |
|---|
| 264 | } |
|---|
| 265 | $prog->{stop} = $prog->{start} + $prog->{length} if (defined $prog->{length}); |
|---|
| 266 | $stats{cached_used}++; |
|---|
| 267 | return $prog; |
|---|
| 268 | } |
|---|
| 269 | |
|---|
| 270 | my $url = $baseurl . $link; |
|---|
| 271 | if (($stats{failed_details_html_get} && $stats{failed_details_html_get} > 5) || |
|---|
| 272 | ($stats{failed_details_html_parse} && $stats{failed_details_html_parse} > 5) || |
|---|
| 273 | ($stats{failed_details_html_change} && $stats{failed_details_html_change} > 5)) { |
|---|
| 274 | Shepherd::Common::log("Too many errors retrieving details, skipping '$url'"); |
|---|
| 275 | return $prog; |
|---|
| 276 | } |
|---|
| 277 | |
|---|
| 278 | my $data = &Shepherd::Common::get_url($url); |
|---|
| 279 | if (!$data) { |
|---|
| 280 | Shepherd::Common::log("Failed to get html for details '$url'"); |
|---|
| 281 | $stats{failed_details_html_get}++; |
|---|
| 282 | return $prog; |
|---|
| 283 | } |
|---|
| 284 | Shepherd::Common::log("DEBUG: html: $data") if ((defined $o->{debug}) && ($o->{debug} > 2)); |
|---|
| 285 | |
|---|
| 286 | my $tree = HTML::TreeBuilder->new_from_content($data); |
|---|
| 287 | if (!$tree) { |
|---|
| 288 | Shepherd::Common::log("Can't parse html for details '$url'"); |
|---|
| 289 | $stats{failed_details_html_parse}++; |
|---|
| 290 | return $prog; |
|---|
| 291 | } |
|---|
| 292 | |
|---|
| 293 | my @tables = $tree->find_by_tag_name('table'); |
|---|
| 294 | if (@tables < 3) { |
|---|
| 295 | Shepherd::Common::log("Format has changed for details '$url'"); |
|---|
| 296 | $stats{failed_details_html_change}++; |
|---|
| 297 | return $prog; |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | my $h4 = $tables[1]->find_by_tag_name('h4') || return; # from second table |
|---|
| 301 | my $desc = $h4->right(); |
|---|
| 302 | $prog->{desc} = [[$desc, $o->{lang}]] if !ref($desc); # desc |
|---|
| 303 | Shepherd::Common::log("desc : $desc") if ($o->{debug}); |
|---|
| 304 | |
|---|
| 305 | my @categories; |
|---|
| 306 | |
|---|
| 307 | my @tds = $tables[2]->find_by_tag_name('td'); # from third table |
|---|
| 308 | my $td = shift @tds; |
|---|
| 309 | my $header = $td->as_trimmed_text() if $td; |
|---|
| 310 | $header =~ s/[^\x20-\x7f]//g; # remove |
|---|
| 311 | my $content; |
|---|
| 312 | while (@tds > 0) { |
|---|
| 313 | $td = shift @tds || last; |
|---|
| 314 | $content = $td->as_trimmed_text(); |
|---|
| 315 | Shepherd::Common::log("header content : $header\t$content") if ($o->{debug}); |
|---|
| 316 | |
|---|
| 317 | if ($header eq 'Length') { |
|---|
| 318 | if ($content !~ /^(\d+) /) { # length |
|---|
| 319 | Shepherd::Common::log("skipped detail length : " . $td->as_HTML()); |
|---|
| 320 | $td = shift @tds || last; |
|---|
| 321 | $header = $td->as_trimmed_text(); |
|---|
| 322 | $header =~ s/[^\x20-\x7f]//g; # remove |
|---|
| 323 | next; |
|---|
| 324 | } |
|---|
| 325 | my $length = ($1 < 3 ? ($1*60*60) : ($1*60)); |
|---|
| 326 | $prog->{length} = $length; |
|---|
| 327 | $prog->{stop} = $prog->{start} + $length; |
|---|
| 328 | Shepherd::Common::log("set length : $length") if ($o->{debug}); |
|---|
| 329 | } elsif ($header eq 'Genre') { |
|---|
| 330 | push(@categories, [$content, $o->{lang}]); # category |
|---|
| 331 | Shepherd::Common::log("set category : $content") if ($o->{debug}); |
|---|
| 332 | } elsif ($header eq 'Speaker/Host') { |
|---|
| 333 | #Amanda Wells and Chris Byers |
|---|
| 334 | my @actors = split(/,|and/, $content); |
|---|
| 335 | $prog->{credits}->{actor} = [@actors]; # actor |
|---|
| 336 | Shepherd::Common::log("set actors : @actors") if ($o->{debug}); |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | $td = shift @tds || last; |
|---|
| 340 | $header = $td->as_trimmed_text(); |
|---|
| 341 | $header =~ s/[^\x20-\x7f]//g; # remove |
|---|
| 342 | } |
|---|
| 343 | |
|---|
| 344 | $prog->{category} = [@categories]; |
|---|
| 345 | |
|---|
| 346 | foreach ('desc', 'length', 'category', 'credits') { |
|---|
| 347 | next if !defined $prog->{$_}; |
|---|
| 348 | Shepherd::Common::log(" cache add $_ \t: $prog->{$_}") if ($o->{debug}); |
|---|
| 349 | $details_cache->{$cache_key}->{$_} = $prog->{$_}; |
|---|
| 350 | } |
|---|
| 351 | $details_cache->{$cache_key}->{added} = $o->{script_start_time}; |
|---|
| 352 | |
|---|
| 353 | $stats{cached_added}++; |
|---|
| 354 | if (($stats{cached_added} % 32) == 0) |
|---|
| 355 | { |
|---|
| 356 | &write_cache unless (defined $o->{no_cache}); |
|---|
| 357 | Shepherd::Common::log(" cache write"); |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | return $prog; |
|---|
| 361 | } |
|---|