#!/usr/bin/perl -w

# yahoo7_widget au_tv guide grabber - runs from "Shepherd" master grabber
#  * written by ltd
#  * uses yahoo7 widget for ABC/7/9/10/SBS (all they have)
#  * when used in conjunction with Shepherd, shepherd can collect other channels
#    using other grabbers
#  * this does NOT use any config file - all settings are passed in from shepherd

#  changelog:
#    1.50  22sep06	added support for "shepherd" master grabber script
#    1.51  02oct06      --ready option
#    1.52  03oct06      split out yahoo7 grabber into its own grabber
#    1.54  16oct06      put date/cast/credits/year into correct xmltv fields

use strict;

my $progname = "yahoo7widget";
my $version = "1.54_16oct06";

use LWP::UserAgent;
use Time::HiRes qw(gettimeofday tv_interval);
use XMLTV;
use XML::DOM;
use XML::DOM::NodeList;
use POSIX qw(strftime mktime);
use Getopt::Long;
use Data::Dumper;
use Cwd;

#
# some initial cruft
#

my $script_start_time = [gettimeofday];
my %stats;
my $channels;
my $tv_guide;

# lets make sure we look exactly like the yahoo widget engine...
my $ua;
BEGIN {
	$ua = LWP::UserAgent->new(
		'timeout' => 30,
		'keep_alive' => 1,
		'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-us)'
		);
	$ua->env_proxy;
	# $ua->cookie_jar({});
	$| = 1;
}

#
# parse command line
#

my $opt_days = 		7;					# default
my $opt_offset = 	0;					# default
my $opt_timezone = 	"1000";					# default
my $opt_outputfile = 	cwd() . "/yahoo7widget.xmltv";		# default
my $opt_configfile = 	cwd() . "/yahoo7widget.conf"; 		# ignored
my $opt_channels_file= 	"";
my $opt_fast =		0;
my $opt_warper =	0;
my $opt_obfuscate = 	0;
my $opt_help =		0;
my $opt_version =	0;
my $opt_desc =		0;
my $opt_dont_retry = 	0;
my $debug = 		1;
my $lang = 		"en";
my $region =		94;
my $time_offset =	0;
my $opt_raw;

GetOptions(
	'region=i'	=> \$region,
	'days=i'	=> \$opt_days,
	'offset=i'	=> \$opt_offset,
	'timezone=s'	=> \$opt_timezone,
	'channels_file=s' => \$opt_channels_file,
	'output=s'	=> \$opt_outputfile,
	'config-file=s'	=> \$opt_configfile,
	'fast'		=> \$opt_fast,
	'debug+'	=> \$debug,
	'warper'	=> \$opt_warper,
	'lang=s'	=> \$lang,
	'obfuscate'	=> \$opt_obfuscate,
	'no-retry'	=> \$opt_dont_retry,
	'help'		=> \$opt_help,
	'verbose'	=> \$opt_help,
	'version'	=> \$opt_version,
	'ready'		=> \$opt_version,
	'desc'		=> \$opt_desc,
	'raw=s'		=> \$opt_raw,
	'v'		=> \$opt_help);

&help if ($opt_help);

if ($opt_version || $opt_desc) {
	printf "%s %s\n",$progname,$version;
	printf "%s is a details-aware grabber that collects very high quality data (full title/subtitle/description/genre and year/cast/credits data) using the Yahoo7 widget for ABC/7/9/10/SBS only.",$progname if $opt_desc;
	exit(0);
}

die "no channel file specified, see --help for instructions\n", if ($opt_channels_file eq "");

#
# go go go!
#

# normalize starttime to an hour..
my $starttime = time;
my ($sec,$min,@rest) = localtime($starttime);
$starttime -= ((60 * $min) + $sec);
my $endtime = $starttime + ($opt_days * 86400);
$starttime += (86400 * $opt_offset);

&log(sprintf "going to grab %d days%s of data into %s (%s%s) timezone %s region %s",
	$opt_days,
	($opt_offset ? " (skipping first %d days)" : ""),
	$opt_outputfile,
	($opt_fast ? "with haste" : "slowly"),
	($opt_warper ? ", anonymously" : ""),
	$opt_timezone, $region);

$time_offset = 3600*((($opt_timezone / 100)-10) + (0.1 * (($opt_timezone % 100) / 60))) if ($opt_timezone ne "1000");

if (-r $opt_channels_file) {
	local (@ARGV, $/) = ($opt_channels_file);
	no warnings 'all'; eval <>; die "$@" if $@;
} else {
	die "WARNING: channels file $opt_channels_file could not be read\n";
}

unlink $opt_raw if ($opt_raw);

for (my $currtime = $starttime; $currtime < $endtime; $currtime += 86400) {
	&parse_xml_data( get_url(
		(sprintf "http://au.tv.yahoo.com/widget.html?rg=%d&st=%d&et=%d", $region, $currtime,($currtime+86400)),
		(sprintf "yahoo7widget detailed data: day %d of %d",((($currtime-$starttime)/86400)+1),(($endtime-$starttime)/86400))));
}

&write_data;
&print_stats;
exit(0);

######################################################################################################
# help

sub help
{
	print<<EOF
$progname $version

options are as follows:
	--help			show these help options
	--days=N		fetch 'n' days of data (default: $opt_days)
	--output=file		send xml output to file (default: "$opt_outputfile")
	--config-file=file	(ignored - historically used by grabbers not not this one)
	--fast			don't run slow - get data as quick as you can - not recommended
	--debug			increase debug level
	--warper		fetch data using WebWarper web anonymizer service
	--obfuscate		pretend to be a proxy servicing multiple clients
	--no-retry		if webserver is rejecting our request, don't retry (default: do retry)
	--lang=[s]		set language of xmltv output data (default $lang)

	--region=N		set region for where to collect data from (default: $region)
	--channels_file=file	where to get channel data from (MANDATORY)
	--timezone=HHMM		timezone for channel data (default: $opt_timezone)
EOF
;

	exit(0);
}

######################################################################################################
# transcode ywe-octet-stream back into text

sub transform_output
{
	my ($datasize, $data) = @_;

	my @xform_map = (
	  0x39, 0x9E, 0x05, 0x72, 0x6C, 0x06, 0x38, 0x15, 0x42, 0x1E, 0xB9, 0xFD, 0x4D, 0x08, 0x0C, 0x2E,
	  0x57, 0xC7, 0x62, 0x6E, 0xC5, 0x3A, 0x3C, 0xA4, 0x1D, 0xC6, 0x3D, 0x18, 0x2D, 0x1B, 0x83, 0x20,
	  0x78, 0xFC, 0xA5, 0xDE, 0x28, 0xE8, 0x3E, 0x9B, 0x7C, 0x22, 0x1C, 0x89, 0xFF, 0x52, 0x54, 0x43,
	  0x51, 0x7F, 0x71, 0x40, 0x7A, 0xCF, 0x65, 0xE4, 0x36, 0xEB, 0xC9, 0x1F, 0x80, 0x9A, 0x31, 0x4A,
	  0x45, 0xD4, 0x2B, 0x02, 0x4C, 0xF4, 0x53, 0xBD, 0xA8, 0xF9, 0x50, 0x61, 0x8A, 0xD5, 0xBF, 0x81,
	  0xC0, 0xDB, 0xFE, 0xF7, 0xBA, 0xEC, 0xFA, 0x73, 0xA9, 0x8F, 0xB1, 0x70, 0x33, 0xCE, 0x60, 0xAC,
	  0xB2, 0x58, 0x26, 0x85, 0x6B, 0x7D, 0x93, 0x03, 0x64, 0x47, 0x04, 0x88, 0x01, 0xA6, 0x3B, 0x90,
	  0x98, 0xF5, 0x97, 0x3F, 0xF6, 0xD3, 0x94, 0xB7, 0x29, 0x07, 0x96, 0x6F, 0x14, 0x35, 0x8D, 0x2A,
	  0x16, 0x17, 0x8B, 0xD1, 0x48, 0xD6, 0xF1, 0xE2, 0x79, 0x2C, 0x41, 0x5B, 0xBC, 0xB5, 0x68, 0xDC,
	  0x49, 0xD2, 0x6A, 0xCC, 0x25, 0xB4, 0xAA, 0x63, 0x9C, 0x56, 0x4B, 0xB8, 0x87, 0x5E, 0x86, 0x09,
	  0xC4, 0x95, 0xB6, 0x12, 0xF8, 0x84, 0x4E, 0x21, 0x32, 0xCA, 0x66, 0xC3, 0xBB, 0x27, 0xEE, 0xE0,
	  0x1A, 0xD8, 0x6D, 0x4F, 0xAF, 0x82, 0xEF, 0xCD, 0x5F, 0x8C, 0x67, 0xA2, 0xCB, 0xED, 0xAB, 0xB0,
	  0xA7, 0x92, 0x75, 0x5A, 0xF2, 0x0A, 0x0E, 0xE6, 0x7E, 0xC8, 0xE9, 0x19, 0x24, 0x37, 0x11, 0xA0,
	  0xE3, 0xDD, 0xD7, 0x23, 0x9F, 0x00, 0xA1, 0xC1, 0x74, 0xF0, 0x99, 0x77, 0xAE, 0x91, 0x7B, 0xFB,
	  0xD9, 0xDA, 0xC2, 0x44, 0x0D, 0x76, 0x10, 0x9D, 0xEA, 0xE7, 0xE5, 0x59, 0xF3, 0xD0, 0x5D, 0x2F,
	  0x69, 0xAD, 0x34, 0x0F, 0x5C, 0x8E, 0xBE, 0x13, 0x30, 0x55, 0xE1, 0xDF, 0x0B, 0xB3, 0x46, 0xA3);
	my ($xlate_pos1, $xlate_pos2, $xlate_pos3, $xlate_pos4) = (0,0,0,0);
	my $outputdata;

	if (($datasize < 1) || (ord(substr($data,0,1)) != 1)) {
		# not valid
		return(undef);
	}

	for (my $pos = 1; $pos < $datasize; $pos++) {
		$xlate_pos1 = ($xlate_pos1 + 1) % 256;
		$xlate_pos3 = $xform_map[$xlate_pos1];
		$xlate_pos4 = ($xlate_pos2 + $xlate_pos3) % 256;
		$xlate_pos2 = $xform_map[$xlate_pos4];
		$xform_map[$xlate_pos1] = $xlate_pos2;
		$xlate_pos2 += $xlate_pos3;
		$xform_map[$xlate_pos4] = $xlate_pos3;
		$xlate_pos2 = $xlate_pos2 % 256;
		$xlate_pos3 = $xform_map[$xlate_pos2];
		$xlate_pos2 = $xlate_pos4;
		$outputdata .= chr((((ord(substr($data,$pos,1))) % 256) ^ ($xlate_pos3 % 256)) % 256);
	}

	if ($opt_raw) {
		if (open(F,">>$opt_raw")) {
			print F $outputdata;
			close F;
		}
	}

	return($outputdata);
}

######################################################################################################
# logic to fetch a page via http
#  retries up to 3 times to get a page with 5 second pauses inbetween

sub get_url
{
	my ($url,$status,$dontretry) = @_;
	my $response;
	my $attempts = 0;
	my ($raw, $page, $base);

	$url =~ s#^http://#http://webwarper.net/ww/# if $opt_warper;
	my $request = HTTP::Request->new(GET => $url);
	$request->header('Accept-Encoding' => 'gzip');

	if ($opt_obfuscate) {
		my $randomaddr = sprintf "203.%d.%d.%d",rand(255),rand(255),(rand(254)+1);
		$request->header('Via' => '1.0 proxy:81 (Squid/2.3.STABLE3)');
		$request->header('X-Forwarded-For' => $randomaddr);
	}
	&log(sprintf "fetching %s%s: %s",$status,($opt_obfuscate ? "[obfuscate]" : ""),$url);
	for (1..3) {
		$response = $ua->request($request);
		last if ($response->is_success || $dontretry);

		$stats{http_failed_requests}++;
		$stats{slept_for} += 10;
		$attempts++;
		sleep 10;
	}
	if (!($response->is_success)) {
		if ($dontretry == 0) {
			&log("aborting after $attempts attempts to fetch url $url") if $debug;
			printf STDERR "ERROR: could not open url %s in %d attempts\n",$url,$attempts;
		}
		return undef;
	}

	$stats{bytes_fetched} += do {use bytes; length($response->content)};
	$stats{http_successful_requests}++;

	if (!$opt_fast) {
		my $sleeptimer = int(rand(5)) + 1;  # sleep anywhere from 1 to 5 seconds
		$stats{slept_for} += $sleeptimer;
		sleep $sleeptimer;
	}

	if ($response->header('Content-Encoding') &&
	    $response->header('Content-Encoding') eq 'gzip') {
		$stats{compressed_pages} += do {use bytes; length($response->content)};
		$response->content(Compress::Zlib::memGunzip($response->content));
	}

	if ($response->header('Content-type') eq 'xapplication/ywe-octet-stream') {
		$stats{transformed_pages}++;
		$base = &transform_output(length($response->content), $response->content);
	} else {
		$base = $response->content;
	}
	return $base; 
}

######################################################################################################

sub log
{
	my ($entry) = @_;
	printf STDERR "%s [%d] %s\n",$progname,time,$entry;
}

######################################################################################################

sub print_stats
{
	printf STDERR "%s v%s [%d] completed in %0.2f seconds",$progname,$version,time,tv_interval($script_start_time);
	foreach my $key (sort keys %stats) {
		printf STDERR ", %d %s",$stats{$key},$key;
	}
	printf STDERR "\n";
}

######################################################################################################
# given yahoo7 xml data, parse it into 'shows' ..
# parse it into $tv_guide->{$channel}->{data}->{$event_id}-> structures..

sub parse_xml_data
{
	my $data = shift;
	my $parser = new XML::DOM::Parser;
	my $tree = $parser->parse($data);
	my $tree_channels = $tree->getElementsByTagName("venue");
	for (my $i = 0; $i < $tree_channels->getLength; $i++) {
		my $channel = $tree_channels->item($i)->getAttributeNode("co_short")->getValue;

		# for this channel get every programme ('event')
		my $events = $tree_channels->item($i)->getElementsByTagName("event");
		for (my $j = 0; $j < $events->getLength; $j++) {
			my $event = $events->item($j);
			my $event_id = $event->getElementsByTagName("event_id")->item(0)->getFirstChild->getNodeValue;

			# mandatory fields
			my $event_start = 	$event->getElementsByTagName("event_date")->item(0)->getFirstChild->getNodeValue;
			my $event_end =   	$event->getElementsByTagName("end_date")->item(0)->getFirstChild->getNodeValue;
			$event_id .= $event_start . $event_end; # event_id actually isn't unique - so make it so

			$stats{programmes}++;
			$stats{duplicate_programmes}++ if ($tv_guide->{$channel}->{data}->{$event_id});

			# wrap these non-mandatory fields in an eval so if they don't exist the script doesn't barf out
			my %e;
			foreach my $field ('title', 'subtitle', 'description_1', 'main_cast', 'year_released', 'rating',
			  'genre', 'running_time', 'repeat', 'country', 'movie', 'premiere', 'final', 'captions', 'warnings', 
			  'color', 'language', 'director', 'live') {
				eval { $e{$field} = $event->getElementsByTagName("$field")->item(0)->getFirstChild->getNodeValue; };
			}
			# other fields we dont pick up but exist in source xml data include:
			#  other_title, description_2, genre_id, highlight, ext_url, y7_url

			my @categories;
			push(@categories,"movie") if (($e{movie}) && ($e{movie} == 1));
			push(@categories,"premiere") if (($e{premiere}) && ($e{premiere} == 1));
			push(@categories,"final") if (($e{final}) && ($e{final} == 1));
			push(@categories,"live") if (($e{live}) && ($e{live} == 1));
			push(@categories,translate_category($e{genre})) if (($e{genre}) && ($e{genre} ne ""));

			my %video_details;
			$video_details{'colour'} = "yes" if $e{color};

			my @ratings;
			push(@ratings, [$e{rating}, 'ABA', undef]) if $e{rating};
			push(@ratings, [$e{warnings}, 'Warnings', undef]) if $e{warnings};

			# store it in the correct XMLTV schema!
			$tv_guide->{$channel}->{data}->{$event_id}->{'channel'} =	$channels->{$channel};
			$tv_guide->{$channel}->{data}->{$event_id}->{'start'} = 	strftime "%Y%m%d%H%M", localtime($event_start-$time_offset);
			$tv_guide->{$channel}->{data}->{$event_id}->{'stop'} = 		strftime "%Y%m%d%H%M", localtime($event_end-$time_offset);
			$tv_guide->{$channel}->{data}->{$event_id}->{'title'} = 	[[ $e{title}, $lang ]] if $e{title};
			$tv_guide->{$channel}->{data}->{$event_id}->{'sub-title'} = 	[[ $e{subtitle}, $lang ]] if $e{subtitle};
			$tv_guide->{$channel}->{data}->{$event_id}->{'desc'} = 		[[ $e{description_1}, $lang ]] if $e{description_1};
			$tv_guide->{$channel}->{data}->{$event_id}->{'category'} = 	[[ @categories ]] if @categories;
			$tv_guide->{$channel}->{data}->{$event_id}->{'country'} = 	[[ $e{country}, $lang ]] if $e{country};
			$tv_guide->{$channel}->{data}->{$event_id}->{'premiere'} = 	[ 'premiere', $lang ] if $e{premiere};
			$tv_guide->{$channel}->{data}->{$event_id}->{'rating'} =	[ @ratings ];
			$tv_guide->{$channel}->{data}->{$event_id}->{'credits'}{'actor'} = [ split(/, /, $e{main_cast}) ] if $e{main_cast};
			$tv_guide->{$channel}->{data}->{$event_id}->{'credits'}{'director'} = [ split(/, /, $e{director}) ] if $e{director};
			$tv_guide->{$channel}->{data}->{$event_id}->{'date'} = 		$e{year_released} if $e{year_released};
			$tv_guide->{$channel}->{data}->{$event_id}->{'previously-shown'} = { } if $e{repeat};
			$tv_guide->{$channel}->{data}->{$event_id}->{'subtitles'} = 	[ { 'type' => 'teletext' } ] if $e{captions};
			$tv_guide->{$channel}->{data}->{$event_id}->{'last-chance'} = 	[ 'final', $lang ] if $e{final};
			$tv_guide->{$channel}->{data}->{$event_id}->{'video'} = 	\%video_details;
			$tv_guide->{$channel}->{data}->{$event_id}->{'length'} = 	($e{running_time} * 60) if $e{running_time};
			$tv_guide->{$channel}->{data}->{$event_id}->{'language'} = 	[ split(/, /, $e{language}) ] if $e{language};
		}
	}
	$tree->dispose;
}

######################################################################################################

sub translate_category
{
	my $genre = shift;
	my %translation = (
		'Sport' => 'sports',
		'Soap Opera' => 'Soap',
		'Science and Technology' => 'Science/Nature',
		'Real Life' => 'Reality',
		'Cartoon' => 'Animation',
		'Family' => 'Children',
		'Murder' => 'Crime' );

	return $translation{$genre} if defined $translation{$genre};
	return $genre; 
}     

######################################################################################################
# descend a structure and clean up various things, including stripping
# leading/trailing spaces in strings, translations of html stuff etc
#   -- taken & modified from Michael 'Immir' Smith's excellent tv_grab_au

my %amp;
BEGIN { %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ) }

sub cleanup {
	my $x = shift;
	if    (ref $x eq "REF")   { cleanup($_) }
	elsif (ref $x eq "HASH")  { cleanup(\$_) for values %$x }
	elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x }
	elsif (defined $$x) {
		$$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg;
		# $$x =~ s/[^\x20-\x7f]/ /g;
		$$x =~ s/(^\s+|\s+$)//g;
	}
}

######################################################################################################

sub write_data
{
	my %writer_args = ( encoding => 'ISO-8859-1' );
	if ($opt_outputfile) {
		my $fh = new IO::File(">$opt_outputfile")  or die "can't open $opt_outputfile: $!";
		$writer_args{OUTPUT} = $fh;
	}

	my $writer = new XMLTV::Writer(%writer_args);

	$writer->start
	  ( { 'source-info-url'    => "about:blank",
	      'source-info-name'   => "$progname $version",
	      'generator-info-name' => "$progname $version"} );

	for my $channel (sort keys %{$channels}) {
		$writer->write_channel( {
			'display-name' => [[ $channel, $lang ]],
			'id' => $channels->{$channel}
			} );
	}

	for my $channel (sort keys %{$channels}) {
		for my $event_id (sort {$a <=> $b} keys %{($tv_guide->{$channel}->{data})}) {
			my $show = $tv_guide->{$channel}->{data}->{$event_id};
			&cleanup($show);
			$writer->write_programme($show);
		}
	}

	$writer->end();
}

######################################################################################################

