# Subs: Reconciling data
# -----------------------------------------
| | 528 | |
| | 529 | # for all the data we have, try to pick the best bits! |
| | 530 | sub reconcile_data |
| | 531 | { |
| | 532 | printf "\nReconciling data:\n\n"; |
| | 533 | |
| | 534 | my @proglist = [ ]; |
| | 535 | my @position_pointer = [ ]; |
| | 536 | my $num_grabbers = 0; |
| | 537 | my $pref_order; |
| | 538 | |
| | 539 | printf "Preference for whose data we prefer as follows:\n"; |
| | 540 | foreach my $proggy (sort { $grabbers->{$a}->{order} <=> $grabbers->{$b}->{order} } keys %$grabbers) { |
| | 541 | if ((!$grabbers->{$proggy}->{disabled}) && ($plugin_data->{$proggy}) && ($plugin_data->{$proggy}->{valid})) { |
| | 542 | my $orig_prog = $plugin_data->{$proggy}->{xmltv}->[3]; |
| | 543 | my $prognum = 0; |
| | 544 | foreach my $new_prog (sort order_channel_time @{$orig_prog}) { |
| | 545 | $proglist[$num_grabbers]->[$prognum] = $new_prog; |
| | 546 | $prognum++; |
| | 547 | } |
| | 548 | |
| | 549 | printf " %d. %s (%d programmes)\n",($num_grabbers+1),$proggy,$prognum; |
| | 550 | $num_grabbers++; |
| | 551 | } |
| | 552 | } |
| | 553 | |
| | 554 | my %writer_args = ( encoding => 'ISO-8859-1' ); |
| | 555 | my $fh = new IO::File(">input_postprocess_file") || die "can't open outputfile $input_postprocess_file: $!"; |
| | 556 | $writer_args{OUTPUT} = $fh; |
| | 557 | my $writer = new XMLTV::Writer(%writer_args); |
| | 558 | |
| | 559 | $writer->start({'source-info-url' => "about:blank", 'source-info-name' => "$progname $version", 'generator-info-name' => "$progname $version"} ); |
| | 560 | |
| | 561 | for my $ch (sort keys %$channels) { |
| | 562 | $writer->write_channel({'display-name' => [[ $ch, $langs ]], 'id' => $channels->{$ch}} ); |
| | 563 | } |
| | 564 | |
| | 565 | for my $ch (sort keys %$channels) { |
| | 566 | printf "Reconciling channel: %s (%s)\n",$ch,$channels->{$ch}; |
| | 567 | |
| | 568 | # |
| | 569 | # 1. position pointers to first piece of data for this channel |
| | 570 | # |
| | 571 | |
| | 572 | printf "REC#1: processing channel %s\n",$channels->{$ch} if $debug; |
| | 573 | my $position_pointer; |
| | 574 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 575 | $position_pointer[$i] = 0; |
| | 576 | while ((defined $proglist[$i]->[($position_pointer[$i])]) && ($proglist[$i]->[($position_pointer[$i])]->{'channel'} ne $channels->{$ch})) { |
| | 577 | $position_pointer[$i]++; |
| | 578 | } |
| | 579 | if (!defined $proglist[$i]->[($position_pointer[$i])]) { |
| | 580 | $position_pointer[$i] = -1; |
| | 581 | printf "REC#1: no programmes found for channel %s from gradder %d\n",$channels->{$ch},$i if $debug; |
| | 582 | } else { |
| | 583 | printf "REC#1: advanced position_pointer to %d for grabber %d (first programme is \"%s\", start %d, stop %d)\n", |
| | 584 | $position_pointer[$i],$i,${XMLTV::best_name($langs,$proglist[$i]->[($position_pointer[$i])]->{title})}[0], |
| | 585 | $proglist[$i]->[($position_pointer[$i])]->{start_epoch},$proglist[$i]->[($position_pointer[$i])]->{stop_epoch} if $debug; |
| | 586 | } |
| | 587 | } |
| | 588 | |
| | 589 | my $all_done_on_this_channel; |
| | 590 | do { |
| | 591 | # |
| | 592 | # 2. find 'earliest' programme from all the choices |
| | 593 | # |
| | 594 | |
| | 595 | $all_done_on_this_channel = 1; # unless proven otherwise |
| | 596 | my $earliest_programme_time = undef; |
| | 597 | my $earliest_programme_slot = undef; |
| | 598 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 599 | next if ($position_pointer[$i] == -1); # skip if no programmes on this channel from this grabber |
| | 600 | my $this_programme = $proglist[$i]->[($position_pointer[$i])]; |
| | 601 | |
| | 602 | if ((!defined $this_programme) || (!defined $this_programme->{title}) || ($this_programme->{channel} ne $channels->{$ch})) { |
| | 603 | # no more programmes on this channel for this grabber! |
| | 604 | printf "REC#2: no more programmes on grabber %d for this channel\n",$i if $debug; |
| | 605 | $position_pointer[$i] = -1; |
| | 606 | } else { |
| | 607 | if ((!defined $earliest_programme_time) || ($earliest_programme_time > $this_programme->{'start_epoch'})) { |
| | 608 | $earliest_programme_time = $this_programme->{'start_epoch'}; |
| | 609 | $earliest_programme_slot = $i; |
| | 610 | printf "REC#2: earliest programme (so far) on grabber %d, start %d (end %d) programme \"%s\"\n", |
| | 611 | $i,$this_programme->{start_epoch},$this_programme->{stop_epoch},${XMLTV::best_name($langs,$this_programme->{title})}[0] if $debug; |
| | 612 | } else { |
| | 613 | printf "REC#2: programme on grabber %d was not earlier start %d (end %d) programme \"%s\"\n", |
| | 614 | $i,$this_programme->{start_epoch},$this_programme->{stop_epoch},${XMLTV::best_name($langs,$this_programme->{title})}[0] if $debug; |
| | 615 | } |
| | 616 | } |
| | 617 | } |
| | 618 | if (!defined $earliest_programme_slot) { |
| | 619 | # no programmes available on ANY grabber for this channel, skip this channel |
| | 620 | printf "REC#2: no programmes on any grabbers for channel %s, skipping this channel\n",$channels->{$ch} if $debug; |
| | 621 | } else { |
| | 622 | # |
| | 623 | # 3a. compare how many programmes on other grabbers overlap with it |
| | 624 | # TODO (FUTURE): enhance this to do "majority voting" acrosa all grabbers based on identical start/stop |
| | 625 | # TODO (FUTURE): where star/stop times match exactly but title names dont, record the mapping for known-as_to_most-preferred-title so we can use this in future transformations |
| | 626 | |
| | 627 | my $preferred_programme = $proglist[$earliest_programme_slot]->[($position_pointer[$earliest_programme_slot])]; |
| | 628 | my $startpoint = $preferred_programme->{'start_epoch'}; |
| | 629 | my $stoppoint = $preferred_programme->{'stop_epoch'}; |
| | 630 | my $duration = $stoppoint - $startpoint; |
| | 631 | my $extraduration = 0; |
| | 632 | $extraduration = int($duration * ($reclogic{compare_overlapping_programmes_extra_overtime_duration_percent}/100)) if ($reclogic{compare_overlapping_programmes_extra_overtime_duration_percent} > 0); |
| | 633 | $extraduration = $reclogic{compare_overlapping_programmes_extra_overtime_max} if ($extraduration > $reclogic{compare_overlapping_programmes_extra_overtime_max}); # upper limit |
| | 634 | $stoppoint += $extraduration; |
| | 635 | printf "REC#3: comparing other grabbers to see how many programmes fit in timeslot %d to %d (%d+%d)\n",$startpoint,$stoppoint,$duration,$extraduration if $debug; |
| | 636 | |
| | 637 | my $max_progs_found = 0; |
| | 638 | my $max_progs_slot = undef; |
| | 639 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 640 | next if ($i == $earliest_programme_slot); |
| | 641 | next if ($position_pointer[$i] == -1); |
| | 642 | my $position_offset = 0; |
| | 643 | my $progs_found = 0; |
| | 644 | |
| | 645 | while ((defined $proglist[$i]->[($position_pointer[$i]+$position_offset)]) && |
| | 646 | ($proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'channel'} eq $channels->{$ch}) && |
| | 647 | ($proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start_epoch'} >= $startpoint) && |
| | 648 | ($proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'stop_epoch'} <= $stoppoint)) { |
| | 649 | my $this_prog_duration = $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'stop_epoch'} - $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start_epoch'}; |
| | 650 | |
| | 651 | printf "REC#3a. programme on grabber %d matched (start %d end %d), \"%s\"%s\n", $i, |
| | 652 | $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start_epoch'}, |
| | 653 | $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'stop_epoch'}, |
| | 654 | ${XMLTV::best_name($langs,$proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'title'})}[0], |
| | 655 | ($this_prog_duration <= $reclogic{min_time_override_for_duplicate} ? |
| | 656 | ", but ignored because of min_time_override_for_duplicate ($reclogic{min_time_override_for_duplicate} sec)" : "") if $debug; |
| | 657 | |
| | 658 | $progs_found++ if ($this_prog_duration > $reclogic{min_time_override_for_duplicate}); |
| | 659 | $position_offset++; |
| | 660 | } |
| | 661 | |
| | 662 | printf "REC#3a: %d programmes on grabber %d within timeslot\n",$progs_found,$i if $debug; |
| | 663 | if ($progs_found > $max_progs_found) { |
| | 664 | $max_progs_found = $progs_found; |
| | 665 | $max_progs_slot = $i; |
| | 666 | } |
| | 667 | } |
| | 668 | |
| | 669 | # 3b. if there are 2 or more programmes on other channels, use those - otherwise use this one |
| | 670 | if ((!defined $max_progs_slot) || ($max_progs_found <= 1)) { |
| | 671 | printf "REC#3b: no grabbers with more programmes in timeslot found\n" if $debug; |
| | 672 | } else { |
| | 673 | printf "REC#3b: grabber %d has %d programmes between %d and %d - using THAT grabber as our preference now!\n", |
| | 674 | $max_progs_slot,$max_progs_found,$startpoint,$stoppoint if $debug; |
| | 675 | $earliest_programme_slot = $max_progs_slot; |
| | 676 | } |
| | 677 | |
| | 678 | # |
| | 679 | # 4. populate our "reconciled" programme list with the programming chosen |
| | 680 | # |
| | 681 | |
| | 682 | my $chosen_prog = $proglist[$earliest_programme_slot]->[($position_pointer[$earliest_programme_slot])]; |
| | 683 | $startpoint = $chosen_prog->{'start_epoch'}; |
| | 684 | $stoppoint = $chosen_prog->{'stop_epoch'}; |
| | 685 | my $new_prog_entry = $chosen_prog; |
| | 686 | |
| | 687 | printf "REC#4: chosen programme is from grabber %d: start %d, end %d, duration %d: \"%s\"\n", |
| | 688 | $earliest_programme_slot,$startpoint,$stoppoint,($stoppoint-$startpoint), |
| | 689 | ${XMLTV::best_name($langs,$chosen_prog->{'title'})}[0] if $debug; |
| | 690 | |
| | 691 | # |
| | 692 | # 5a. see if we have it duplicated from multiple grabbers (with fuzz -/+ 5 mins max), -/+ 2.5mins for programmes <= 15 mins |
| | 693 | # TODO (FUTURE): should really do an exact start/stop match as a first pass, then do the fuzz afterwards.. |
| | 694 | |
| | 695 | my $start1, my $start2, my $stop1, my $stop2; |
| | 696 | |
| | 697 | if (($stoppoint - $startpoint) <= $reclogic{duplicate_programme_augment_data_short_cutoff}) { |
| | 698 | $start1 = $startpoint - $reclogic{duplicate_programme_augment_data_short_duration_threshold}; |
| | 699 | $start2 = $startpoint + $reclogic{duplicate_programme_augment_data_short_duration_threshold}; |
| | 700 | $stop1 = $stoppoint - $reclogic{duplicate_programme_augment_data_short_duration_threshold}; |
| | 701 | $stop2 = $stoppoint + $reclogic{duplicate_programme_augment_data_short_duration_threshold}; |
| | 702 | } else { |
| | 703 | $start1 = $startpoint - $reclogic{duplicate_programme_augment_data_long_duration_threshold}; |
| | 704 | $start2 = $startpoint + $reclogic{duplicate_programme_augment_data_long_duration_threshold}; |
| | 705 | $stop1 = $stoppoint - $reclogic{duplicate_programme_augment_data_long_duration_threshold}; |
| | 706 | $stop2 = $stoppoint + $reclogic{duplicate_programme_augment_data_long_duration_threshold}; |
| | 707 | } |
| | 708 | if ($start2 >= $stop1) { |
| | 709 | $start2 = $startpoint; |
| | 710 | $stop1 = $stoppoint; |
| | 711 | } |
| | 712 | |
| | 713 | printf "REC#5: looking in other grabbers for matching programmes within timeslot start %d-%d (%d) and end %d-%d (%d)\n", |
| | 714 | $start1,$start2,($start2-$start1),$stop1,$stop2,($stop2-$stop1) if $debug; |
| | 715 | |
| | 716 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 717 | next if ($i == $earliest_programme_slot); |
| | 718 | next if ($position_pointer[$i] == -1); |
| | 719 | |
| | 720 | if ((defined $proglist[$i]->[($position_pointer[$i])]) && |
| | 721 | ($proglist[$i]->[($position_pointer[$i])]->{'channel'} eq $channels->{$ch}) && |
| | 722 | ($proglist[$i]->[($position_pointer[$i])]->{'start_epoch'} >= $start1) && |
| | 723 | ($proglist[$i]->[($position_pointer[$i])]->{'start_epoch'} < $start2) && |
| | 724 | ($proglist[$i]->[($position_pointer[$i])]->{'stop_epoch'} >= $stop1) && |
| | 725 | ($proglist[$i]->[($position_pointer[$i])]->{'stop_epoch'} < $stop2)) { |
| | 726 | # winner .. matches our criteria ... |
| | 727 | my $match_prog = $proglist[$i]->[($position_pointer[$i])]; |
| | 728 | |
| | 729 | printf "REC#5: found programme on grabber %d: start %d, end %d: \"%s\"\n", $i, |
| | 730 | $match_prog->{'start_epoch'},$match_prog->{'stop_epoch'}, |
| | 731 | ${XMLTV::best_name($langs,$match_prog->{'title'})}[0] if $debug; |
| | 732 | |
| | 733 | foreach my $field (keys %{$match_prog}) { |
| | 734 | # 5b. pick fields from each one in order of our preferences |
| | 735 | next if ($field eq "start_epoch"); |
| | 736 | next if ($field eq "stop_epoch"); |
| | 737 | if (!defined $new_prog_entry->{$field}) { |
| | 738 | printf "REC#5b: adding field \"%s\"\n",$field; |
| | 739 | $new_prog_entry->{$field} = $match_prog->{$field}; |
| | 740 | # TODO (FUTURE): should we add to programme description to say where we got what data from? |
| | 741 | } |
| | 742 | } |
| | 743 | } |
| | 744 | } |
| | 745 | |
| | 746 | # 6. write out new entry |
| | 747 | printf "REC#6: writing out programme entry\n" if $debug; |
| | 748 | #delete $new_prog_entry->{'start_epoch'}; |
| | 749 | #delete $new_prog_entry->{'stop_epoch'}; |
| | 750 | &cleanup($new_prog_entry); |
| | 751 | $writer->write_programme($new_prog_entry); |
| | 752 | |
| | 753 | # 7a. remove all programmes that end before this endtime |
| | 754 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 755 | next if ($position_pointer[$i] == -1); |
| | 756 | while ((defined $proglist[$i]->[($position_pointer[$i])]) && |
| | 757 | ($proglist[$i]->[($position_pointer[$i])]->{'channel'} eq $channels->{$ch}) && |
| | 758 | ($proglist[$i]->[($position_pointer[$i])]->{'stop_epoch'} <= $stoppoint)) { |
| | 759 | printf "REC#7a: removing programme on grabber %d slot %d since it ends before inserted start: start %d end %s: \"%s\"\n", |
| | 760 | $i, $position_pointer[$i], |
| | 761 | $proglist[$i]->[($position_pointer[$i])]->{'start_epoch'}, |
| | 762 | $proglist[$i]->[($position_pointer[$i])]->{'stop_epoch'}, |
| | 763 | ${XMLTV::best_name($langs,$proglist[$i]->[($position_pointer[$i])]->{'title'})}[0] if $debug; |
| | 764 | delete $proglist[$i]->[($position_pointer[$i])]; |
| | 765 | $position_pointer[$i]++; |
| | 766 | } |
| | 767 | } |
| | 768 | |
| | 769 | # 7b. adjust starttimes of any programmes to match endtime (with fuzz of +5 mins max) |
| | 770 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 771 | next if ($position_pointer[$i] == -1); |
| | 772 | my $position_offset = 0; |
| | 773 | while ((defined $proglist[$i]->[($position_pointer[$i]+$position_offset)]) && |
| | 774 | ($proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'channel'} eq $channels->{$ch}) && |
| | 775 | ($proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start_epoch'} < $stoppoint)) { |
| | 776 | my $this_prog = $proglist[$i]->[($position_pointer[$i]+$position_offset)]; |
| | 777 | if (($this_prog->{'start_epoch'} + $reclogic{readjust_starttime_for_nonmatched_programmes}) >= $stoppoint) { |
| | 778 | printf "REC#7b: adjusting starttime on grabber %d slot %d since it starts <5mins before inserted end:\n", |
| | 779 | $i,($position_pointer[$i]+$position_offset) if $debug; |
| | 780 | printf "REC#7b: orig: start %d end %d, now: start %d end %d: \"%s\"\n", |
| | 781 | $this_prog->{'start_epoch'}, $this_prog->{'stop_epoch'}, $stoppoint, $this_prog->{'stop_epoch'}, |
| | 782 | ${XMLTV::best_name($langs,$this_prog->{'title'})}[0] if $debug; |
| | 783 | $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start_epoch'} = $stoppoint; |
| | 784 | $proglist[$i]->[($position_pointer[$i]+$position_offset)]->{'start'} = sprintf "%s",(strftime "%Y%m%d%H%M", localtime($stoppoint)); |
| | 785 | $position_offset++; |
| | 786 | } else { |
| | 787 | printf "REC#7b: removing grabber %d slot %d programme because it started too long before chosen programme: start %d end %d: \"%s\"\n", |
| | 788 | $i, ($position_pointer[$i]+$position_offset), $this_prog->{'start_epoch'}, $this_prog->{'stop_epoch'}, |
| | 789 | ${XMLTV::best_name($langs,$this_prog->{'title'})}[0] if $debug; |
| | 790 | $position_pointer[$i]++; |
| | 791 | } |
| | 792 | } |
| | 793 | } |
| | 794 | } |
| | 795 | |
| | 796 | # 8. check that we still have at least one pointer on current channel |
| | 797 | for (my $i=0; $i < $num_grabbers; $i++) { |
| | 798 | next if ($position_pointer[$i] == -1); |
| | 799 | printf "REC#8: grabber %d is now at slot %d\n",$i,$position_pointer[$i] if $debug; |
| | 800 | if ((defined $proglist[$i]->[($position_pointer[$i])]) && |
| | 801 | ($proglist[$i]->[($position_pointer[$i])]->{'channel'} eq $channels->{$ch})) { |
| | 802 | $all_done_on_this_channel = 0; |
| | 803 | printf "REC#8: grabber %d is at slot %d is still on channel \"%s\"\n",$i,$position_pointer[$i],$channels->{$ch} if $debug; |
| | 804 | } |
| | 805 | } |
| | 806 | printf "REC#9:\n" if $debug; |
| | 807 | } until ($all_done_on_this_channel); |
| | 808 | } |
| | 809 | $writer->end(); |
| | 810 | } |
| | 811 | |
# Comparator for "sort order_channel_time LIST": orders programme hashes
# by their {channel} string first and, within one channel, by ascending
# numeric {start_epoch}.  Reads the two items from sort's $a and $b.
sub order_channel_time {
    return ($a->{channel} cmp $b->{channel})
        || ($a->{start_epoch} <=> $b->{start_epoch});
}
| | 818 | |
# Descend a data structure and tidy every string found in it, in place:
# decode HTML/XML character references and strip leading/trailing
# whitespace.
#   cleanup($ref) - $ref may be a hash ref, an array ref, a reference to a
#                   scalar, or a reference to another reference; anything
#                   else (and undefined strings) is left untouched.
# Known named entities are listed in %amp; unknown ones become a space.
# -- taken & modified from Michael 'Immir' Smith's excellent tv_grab_au
my %amp;
BEGIN { %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ) }

# Replacement text for one character reference: exactly one of the decimal
# code, the hex code or the entity name is defined.
sub _decode_entity {
    my ($dec, $hex, $name) = @_;
    if (defined $name) {
        return exists $amp{$name} ? $amp{$name} : ' ';
    }
    my $code = defined $dec ? $dec : hex($hex);
    return $code ? chr($code) : ' ';    # a zero code point becomes a space
}

sub cleanup {
    my $x = shift;
    my $type = ref $x;
    if    ($type eq "REF")   { cleanup($$x) }   # follow the reference itself, not whatever $_ happens to hold
    elsif ($type eq "HASH")  { cleanup(\$_) for values %$x }
    elsif ($type eq "ARRAY") { cleanup(\$_) for @$x }
    elsif ($type eq "SCALAR" && defined $$x) {
        # only well-formed references (&#65; &#x41; &name;) are touched, so a
        # bare "&" followed later by ";" no longer swallows the text between
        $$x =~ s/&(?:#(\d+)|#[xX]([0-9A-Fa-f]+)|(\w+));/_decode_entity($1,$2,$3)/eg;
        # $$x =~ s/[^\x20-\x7f]/ /g;
        $$x =~ s/(^\s+|\s+$)//g;
    }
}
| | 835 | |
| | 836 | |
# -----------------------------------------