Changeset 84

Show
Ignore:
Timestamp:
10/17/06 02:28:15 (7 years ago)
Author:
max
Message:

Some more code for intelligent-yet-randomish grabber ordering, including support for grabbing timeslices (only particular days and/or channels)

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • shepherd

    r80 r84  
    3030#           data sources like IceTV 
    3131 
    32 BEGIN { *CORE::GLOBAL::die = \&my_die; } 
     32#BEGIN { *CORE::GLOBAL::die = \&my_die; } 
    3333 
    3434use strict; 
     
    4444use POSIX qw(strftime); 
    4545use Time::HiRes qw(gettimeofday tv_interval); 
     46use Algorithm::Diff; 
    4647 
    4748# --------------------------------------------------------------------------- 
     
    8990my $channels_file = "$CWD/channels.conf"; 
    9091my $days = 7; 
     92my $timeslice; 
    9193 
    9294# postprocessing 
     
    238240        my $comm = "$CWD/grabbers/$grabber/$grabber " . 
    239241                   "--region $region " . 
    240                    "--channels_file $channels_file " . 
    241242                   "--output $output"; 
    242243 
    243         # NOTE: ideally a grabber could be instructed to fetch partial data through --channel, --starttime & --endtime 
    244         # we don't have that for now so instead whenever there is missing data, ALL 7 days for all channels will be collected 
    245         # FIXME FUTURE: call grabbers just with what we want... 
    246         $comm .= " --days $days" if ($days); 
    247         $comm .= " --offset $opt->{offset}" if ($opt->{offset}); 
     244        # Category 1 grabbers (i.e. slow ones) are requested to only fetch the timeslice 
     245        # that we need. Category 2 grabbers are requested to get everything, since there's 
     246        # very little cost in grabbing that extra data, and we can use it in the reconciler 
     247        # to verify that everything looks OK. 
     248        if ($components->{$grabber}->{config}->{category} == 1) 
     249        { 
     250            print "CAT1 grabber: grabbing timeslice.\n"; 
     251            if ($timeslice->{start} != 1) 
     252            { 
     253                $comm .= " " .  
     254                         $components->{$grabber}->{config}->{option_offset} . 
     255                         " " . 
     256                         ($timeslice->{start} - 1); 
     257            } 
     258 
     259            my $n = $timeslice->{stop}; 
     260            if ($timeslice->{start} != 1  
     261                    and  
     262                !$components->{$grabber}->{config}->{option_offset_eats_days}) 
     263            { 
     264                $n -= $timeslice->{start}; 
     265            } 
     266            $comm .= " " . 
     267                     $components->{$grabber}->{config}->{option_days} . 
     268                     " " .  
     269                     $n; 
     270             
     271            # Write a temporary channels file specifying only the channels we want 
     272            my $tmpchans; 
     273            foreach (@{$timeslice->{chans}}) 
     274            { 
     275                $tmpchans->{$_} = $channels->{$_}; 
     276            } 
     277            my $tmpcf = "$CWD/channels.conf.tmp"; 
     278            write_file($tmpcf, 'temporary channels', [ $tmpchans ], [ 'channels' ]); 
     279            $comm .= " --channels_file $tmpcf"; 
     280        } 
     281        else 
     282        { 
     283            $comm .= " --days $days" if ($days); 
     284            $comm .= " --offset $opt->{offset}" if ($opt->{offset}); 
     285            $comm .= " --channels_file $channels_file"; 
     286        } 
    248287        $comm .= " --debug" if ($debug); 
    249288        $comm .= " @ARGV" if (@ARGV); 
     
    283322} 
    284323 
     324#  
     325# 1. If set order, run them in that order 
     326# 2. If first time ever run, run the transition grabber 
     327# 3. Randomly select a grabber with a bias towards efficient and high-quality ones 
     328# 
    285329sub choose_grabber 
    286330{ 
     
    292336            { 
    293337                $gscore->{$_} = 0; 
     338# Cache stuff: not enabled yet 
     339#               if ($components->{$_}->{config}->{category} == 1 
     340#                       and 
     341#                   $components->{$_}->{config}->{cache}) 
     342#               { 
     343#                   $gscore->{$_ . ' [cache]'} = 0; 
     344#               } 
    294345            } 
    295346        } 
     
    308359    # score grabbers 
    309360    my $total = score_grabbers(); 
    310  
    311361    return undef unless ($total); 
    312  
     362     
    313363    if ($debug) 
    314364    { 
    315365        print "Grabber selection probabilities:\n"; 
    316         foreach (keys %$gscore) 
     366        foreach (sort { $gscore->{$b} <=> $gscore->{$a} } keys %$gscore) 
    317367        { 
    318             printf "%15s %6.1f%% %12s\n",  
     368            printf "%25s %6.1f%% %12s\n",  
    319369                   $_, 100 * $gscore->{$_} / $total, "($gscore->{$_} pts)"; 
    320370        } 
     
    326376 
    327377    my $c = 0; 
    328     foreach (keys %$gscore) 
    329     { 
    330         next if (!$gscore->{$_}); 
    331         if ($r >= $c and $r < ($c + $gscore->{$_})) 
     378    foreach my $grabber (keys %$gscore) 
     379    { 
     380        next if (!$gscore->{$grabber}); 
     381        if ($r >= $c and $r < ($c + $gscore->{$grabber})) 
    332382        { 
    333             delete $gscore->{$_}; 
    334             print "Selected $_.\n" if ($debug); 
    335             return $_; 
    336         } 
    337         $c += $gscore->{$_}; 
     383            delete $gscore->{$grabber}; 
     384            print "Selected $grabber.\n" if ($debug); 
     385            if ($grabber =~ /(.*) \[cache\]/) 
     386            { 
     387                return $1; 
     388            } 
     389            else 
     390            { 
     391                if ($components->{$grabber}->{config}->{category} == 2) 
     392                { 
     393                    # We might want to run C1 grabbers multiple times 
     394                    # to grab various timeslices, but not C2 grabbers, 
     395                    # which should get everything at once. 
     396                    delete $gscore->{$grabber}; 
     397                } 
     398                return $grabber; 
     399            } 
     400        } 
     401        $c += $gscore->{$grabber}; 
    338402    } 
    339403    die "ERROR: failed to choose grabber."; 
    340404} 
    341405 
     406# **************************************** 
     407# Scoring grabbers: 
     408# 
     409# Earn points for each slot you can fill 
     410# Multipliers: 
     411# * C2 grabbers 
     412# * C1 grabbers that have the data we're looking for cached 
     413# * High 'quality' grabbers get a BIG bonus: low-quals should basically  
     414#   not run unless we need them 
     415# **************************************** 
    342416sub score_grabbers 
    343417{ 
    344     my ($score, $total, $niceness, $granularity, $m); 
     418    my ($score, $total, $niceness, $granularity, $day, $hits, $cat, $dq, $mult); 
    345419 
    346420    my $missing = detect_missing_data(); 
    347421 
     422    calculate_best_timeslice($missing); 
     423     
     424    print "Best timeslice: " . Dumper($timeslice) . "\n" if ($debug); 
     425 
     426    my $missing_slice = create_missing_slice(); 
     427 
     428    # So! Compare C2 grabbers against the raw missing file, because we'll get 
     429    # everything. But compare C1 grabbers against the timeslice, because we'll 
     430    # only ask them for a slice. This goes for the [cache] and regular C1s. 
    348431    foreach my $grabber (keys %$gscore) 
    349432    { 
     
    351434        # whether we support this channel and this day #. 
    352435         
    353         my $hits = 0; 
    354         foreach (@$missing) 
     436        $hits = 0; 
     437 
     438        if ($grabber =~ /(.*) \[cache\]$/) 
    355439        { 
    356             my ($ch, $day) = split(/:/); 
    357             my $result = (can_support_channel($grabber, $ch) 
    358                           and 
    359                           can_support_day($grabber, $day)); 
    360             $hits += $result; 
    361 #           print "$grabber vs $ch:$day: " . ($result ? "OK" : "no") . "\n"; 
    362         } 
    363         $niceness = $components->{$grabber}->{config}->{niceness}; 
    364         unless ($niceness) 
     440            $hits = find_cache_hits($1, $missing); 
     441            $cat = 2; 
     442            $dq = $components->{$1}->{config}->{quality}; 
     443        } 
     444        else 
    365445        { 
    366             print "WARNING: Grabber $grabber has no niceness support " . 
    367                   "specified in config.\n"; 
    368             $niceness = 5; 
    369         } 
    370         $granularity = $components->{$grabber}->{config}->{granularity}; 
    371         unless (defined $granularity) 
    372         { 
    373             print "WARNING: Grabber $grabber has no granularity support " . 
    374                 "specified in config.\n"; 
    375             $granularity = ''; 
    376         } 
    377         # TODO: use granularity more intelligently ('c' vs 'd') -- Max. 
    378         $granularity = length ($granularity); 
    379         my $total_channeldays = $days * scalar (keys %$channels); 
    380         $granularity *= int((($total_channeldays - 1 ) / scalar(@$missing))/2); 
    381  
    382         $score = $hits * ($niceness + $granularity); 
    383         print "Grabber $grabber can fill $hits empty slots with $niceness niceness and $granularity granularity: scoring $score pts.\n"; 
     446            my $key = $missing; 
     447            if ($components->{$grabber}->{config}->{category} == 1) 
     448            { 
     449                $key = $missing_slice; 
     450            } 
     451            foreach my $day (sort keys %$key) 
     452            { 
     453                my $val = supports_day($grabber, $day); 
     454                next unless ($val); 
     455                print "Day $day:"; 
     456                foreach my $ch (@{$key->{$day}}) 
     457                { 
     458                    if (supports_channel($grabber, $ch)) 
     459                    { 
     460                        print " $ch"; 
     461                        $hits += $val; 
     462                    } 
     463                } 
     464                print "\n"; 
     465                $hits = 1 if ($hits > 0 and $hits < 1); 
     466 
     467                $cat = $components->{$grabber}->{config}->{category}; 
     468                unless ($cat) 
     469                { 
     470                    print "WARNING: Grabber $grabber has no category support ". 
     471                    "in config.\n"; 
     472                    $cat = 1; 
     473                } 
     474 
     475                $dq = $components->{$grabber}->{config}->{quality}; 
     476                unless ($dq) 
     477                { 
     478                    print "WARNING: Grabber $grabber has no quality support ". 
     479                    "in config.\n"; 
     480                    $dq = 1; 
     481                } 
     482            } 
     483        } 
     484        $mult = 1; 
     485        $mult++ if ($cat == 2); 
     486        $mult *= 2 ** ($dq-1); 
     487 
     488        $score = int($hits * $mult); 
     489        print "Grabber $grabber can fill $hits slots with multiplier $mult (cat: $cat, dq: $dq): scoring $score pts.\n"; 
    384490        $gscore->{$grabber} = $score; 
    385491        $total += $score; 
     
    388494} 
    389495 
    390 sub can_support_channel 
     496sub supports_channel 
    391497{ 
    392498    my ($grabber, $ch) = @_; 
     
    407513} 
    408514 
    409 sub can_support_day 
     515sub supports_day 
    410516{ 
    411517    my ($grabber, $day) = @_; 
    412518 
    413     my $days_supported = $components->{$grabber}->{config}->{max_days}; 
    414     unless ($days_supported) 
    415     { 
    416         print "WARNING: Grabber $grabber has no max_days support " . 
    417               "specified in config.\n"; 
    418         $days_supported = 2; 
    419     } 
    420     return $day <= $days_supported; 
     519    return 0 unless ($day <= $components->{$grabber}->{config}->{max_days}); 
     520    return 0.5 if ($day > $components->{$grabber}->{config}->{max_reliable_days}); 
     521    return 1; 
     522} 
     523 
     524sub find_cache_hits 
     525{ 
     526    my ($grabber, $missing) = @_; 
     527 
     528    return 5; 
    421529} 
    422530 
     
    428536sub detect_missing_data 
    429537{ 
    430     my @missing; 
     538    my $missing = { }; 
    431539 
    432540    my $timeslots_per_day = (24 * 60 * 60) / $timeslot_size; 
     
    436544        if (defined $channel_data->{$ch}) 
    437545        { 
    438             my $slotnum; 
    439             for ($slotnum = 0; $slotnum < $num_timeslots-1; $slotnum++)  
     546            for (my $slotnum = 0; $slotnum < $num_timeslots-1; $slotnum++)  
    440547            { 
    441548                if (!@{$channel_data->{$ch}->{timeslots}}[$slotnum]) 
    442549                { 
    443550                    my $day = int($slotnum / $timeslots_per_day) + 1; 
    444                     push @missing, "$ch:$day"; 
     551                    push @{$missing->{$day}}, $ch; 
    445552                    $slotnum += $timeslots_per_day - 
    446553                                ($slotnum % $timeslots_per_day); 
     
    451558        else 
    452559        { 
    453             for my $i (1 .. $days) 
     560            for my $day (1 .. $days) 
    454561            { 
    455                 push (@missing, "$ch:$i"); 
    456             } 
    457         } 
    458  
    459     } 
    460  
    461     print "Need data for @missing.\n" if ($debug); 
    462     return \@missing; 
    463 } 
    464  
     562                push @{$missing->{$day}}, $ch; 
     563            } 
     564        } 
     565 
     566    } 
     567 
     568    foreach my $day (keys %$missing) 
     569    { 
     570        $missing->{$day} = [ sort @{$missing->{$day}} ]; 
     571    } 
     572 
     573    if ($debug) 
     574    { 
     575        print "Need data for days " . join(", ", sort keys %$missing) . ".\n"; 
     576    } 
     577    return $missing; 
     578} 
     579 
     580sub calculate_best_timeslice 
     581{ 
     582    my $missing = shift; 
     583 
     584    my ($overlap, $a); 
     585    $timeslice = { 'chandays' => 0 }; 
     586    foreach my $day (1 .. $days) 
     587    { 
     588        consider_slice($day, $day, @{$missing->{$day}}); 
     589        $overlap = $missing->{$day}; 
     590        foreach my $nextday (($day + 1) .. $days) 
     591        { 
     592            last unless ($missing->{$nextday}); 
     593            $a = Algorithm::Diff::LCS($overlap, $missing->{$nextday}); 
     594            # print "Overlap: " . Dumper($a); 
     595            last unless ($a and @{$a}); 
     596            consider_slice($day, $nextday, @{$a}); 
     597            $overlap = $a; 
     598        } 
     599    } 
     600} 
     601 
     602sub consider_slice 
     603{ 
     604    my ($startday, $stopday, @chans) = @_; 
     605 
     606    my $challenger = ($stopday - $startday + 1) * scalar(@chans); 
     607    return unless ($challenger > $timeslice->{chandays}); 
     608 
     609    # We have a winner! 
     610    $timeslice->{start} = $startday; 
     611    $timeslice->{stop} = $stopday; 
     612    $timeslice->{chans} = [ @chans ]; 
     613    $timeslice->{chandays} = $challenger; 
     614} 
     615 
     616sub create_missing_slice 
     617{ 
     618    my $ret; 
     619 
     620    foreach my $day ($timeslice->{start} .. $timeslice->{stop}) 
     621    { 
     622        $ret->{$day} = [ @{$timeslice->{chans}} ]; 
     623    } 
     624    return $ret; 
     625} 
    465626 
    466627# interpret xmltv data from this grabber/postprocessor 
     
    12501411sub write_config_file 
    12511412{ 
    1252     open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; 
    1253     print CONF Data::Dumper->Dump( 
     1413    write_file($config_file, 'configuration',  
    12541414        [$region,  $pref_order,  $mirror_site,  $components, $title_translation_table  ], 
    12551415        ["region", "pref_order", "mirror_site", "components", "title_translation_table" ]); 
    1256     close CONF; 
    1257     print "\nUpdated configuration file $config_file.\n" if ($debug); 
    12581416} 
    12591417 
    12601418sub write_channels_file 
    12611419{ 
    1262     open(CHAN, ">$channels_file") or die "cannot write to $channels_file: $!"; 
    1263     print CHAN Data::Dumper->Dump([$channels], ["channels"]); 
    1264     close CHAN; 
    1265     print "Updated channels file $channels_file.\n" if ($debug); 
     1420    write_file($channels_file, 'channels', [ $channels ], [ 'channels' ]); 
     1421} 
     1422 
     1423sub write_file 
     1424{ 
     1425    my ($fn, $name, $vars, $varnames) = @_; 
     1426    open (FN, ">$fn") or die "Can't write to $name file $fn: $!"; 
     1427    print FN Data::Dumper->Dump($vars, $varnames); 
     1428    close FN; 
     1429    print "Wrote $name file $fn.\n" if ($debug); 
    12661430} 
    12671431