| 1 | #!/usr/bin/perl -w |
|---|
| 2 | |
|---|
| 3 | # |
|---|
| 4 | # test harness for evaluating different title-matching algorithms |
|---|
| 5 | # |
|---|
| 6 | |
|---|
| 7 | use strict; |
|---|
| 8 | $| = 1; |
|---|
| 9 | |
|---|
| 10 | use XMLTV; |
|---|
| 11 | use POSIX qw(strftime mktime); |
|---|
| 12 | use Getopt::Long; |
|---|
| 13 | use Data::Dumper; |
|---|
| 14 | use Time::HiRes qw(gettimeofday tv_interval); |
|---|
| 15 | |
|---|
| 16 | my %algorithms = ( |
|---|
| 17 | 'imdb_type' => \&alternativeTitles_match, |
|---|
| 18 | 'canonicalize' => \&canonicalizeTitles_match, |
|---|
| 19 | # ... add more algorithms here ... |
|---|
| 20 | 'simple_string' => \&string_match); |
|---|
| 21 | |
|---|
| 22 | my %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ); |
|---|
| 23 | my $gmt_offset; |
|---|
| 24 | my $num_programmes = 0; |
|---|
| 25 | my $datafile, my $datafile_num = 0; |
|---|
| 26 | my $prog_lineup; |
|---|
| 27 | |
|---|
| 28 | my $opt = { }; |
|---|
| 29 | GetOptions( |
|---|
| 30 | 'create=s' => \$opt->{create}, |
|---|
| 31 | 'show_non_matches' => \$opt->{show_non_matches}, |
|---|
| 32 | 'show_matches' => \$opt->{show_matches}, |
|---|
| 33 | 'test=s' => \$opt->{test}, |
|---|
| 34 | 'compare=s' => \$opt->{compare}, |
|---|
| 35 | 'help' => \$opt->{help}); |
|---|
| 36 | |
|---|
| 37 | printf "tiele-matching test harness\n"; |
|---|
| 38 | if (($opt->{help}) || (!defined $opt->{test}) && (!defined $opt->{create}) && !defined $opt->{compare} ) { |
|---|
| 39 | print " usage:\n". |
|---|
| 40 | "\t$0 --create (title_file) [FILE(s)]\n". |
|---|
| 41 | "\t\tcreate a listing of programmes from input xmltv files\n". |
|---|
| 42 | "\t\t(supplied from stdin or a list of files).\n". |
|---|
| 43 | "\t\tthe resulting {titles_file} can then be used as input\n". |
|---|
| 44 | "\t\tinto testing (--test) various title-matching algorithms\n". |
|---|
| 45 | "\n". |
|---|
| 46 | "\t$0 --test {algorithm|all} [--show_non_matches] [--show_matches]\n". |
|---|
| 47 | "\t\ttest the various title-matching algorithms based on\n". |
|---|
| 48 | "\t\tstdin from a list of titles {titles_file}.\n". |
|---|
| 49 | "\n". |
|---|
| 50 | "\t$0 --compare algorithm1,alghrithm2\n". |
|---|
| 51 | "\t\tcompare two title-matching algorithms against each other,\n". |
|---|
| 52 | "\t\tprinting only cases where they differ.\n". |
|---|
| 53 | "\n". |
|---|
| 54 | "\t\tAlgorithm names:\n\t\t '".join("'\n\t\t '",sort keys %algorithms)."'\n"; |
|---|
| 55 | exit(0); |
|---|
| 56 | } |
|---|
| 57 | |
|---|
| 58 | @ARGV = ('-') if not @ARGV; |
|---|
| 59 | $opt->{test} = $opt->{compare} if ($opt->{compare}); |
|---|
| 60 | &create_programme_lineup($opt->{create}) if ($opt->{create}); |
|---|
| 61 | &test_algorithms($opt->{test}) if ($opt->{test}); |
|---|
| 62 | |
|---|
| 63 | exit(0); |
|---|
| 64 | |
|---|
| 65 | ############################################################################## |
|---|
| 66 | # descend a structure and clean up various things, including stripping |
|---|
| 67 | # leading/trailing spaces in strings, translations of html stuff etc |
|---|
| 68 | # -- taken & modified from Michael 'Immir' Smith's excellent tv_grab_au |
|---|
| 69 | |
|---|
| 70 | sub cleanup { |
|---|
| 71 | my $x = shift; |
|---|
| 72 | if (ref $x eq "REF") { cleanup($_) } |
|---|
| 73 | elsif (ref $x eq "HASH") { cleanup(\$_) for values %$x } |
|---|
| 74 | elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x } |
|---|
| 75 | elsif (defined $$x) { |
|---|
| 76 | $$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; |
|---|
| 77 | # $$x =~ s/[^\x20-\x7f]/ /g; |
|---|
| 78 | $$x =~ s/(^\s+|\s+$)//g; |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | |
|---|
| 82 | ############################################################################## |
|---|
| 83 | # strptime type date parsing - BUT - if no timezone is present, treat time as |
|---|
| 84 | # being in localtime rather than the various other perl implementation which |
|---|
| 85 | # treats it as being in UTC/GMT |
|---|
| 86 | |
|---|
| 87 | sub parse_xmltv_date |
|---|
| 88 | { |
|---|
| 89 | my $datestring = shift; |
|---|
| 90 | my @t; # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst |
|---|
| 91 | my $tz_offset = 0; |
|---|
| 92 | |
|---|
| 93 | # work out GMT offset - we only do this once |
|---|
| 94 | if (!$gmt_offset) { |
|---|
| 95 | my @l = localtime(43200), my @g = gmtime(43200); |
|---|
| 96 | $gmt_offset = (($l[2] - $g[2])*(60*60)) + (($l[1] - $g[1])*60); |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | if ($datestring =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})/) { |
|---|
| 100 | ($t[5],$t[4],$t[3],$t[2],$t[1],$t[0]) = (int($1)-1900,int($2)-1,int($3),int($4),int($5),0); |
|---|
| 101 | ($t[6],$t[7],$t[8]) = (-1,-1,-1); |
|---|
| 102 | |
|---|
| 103 | # if input data has a timezone offset, then offset by that |
|---|
| 104 | if ($datestring =~ /\+(\d{2})(\d{2})/) { |
|---|
| 105 | $tz_offset = $gmt_offset - (($1*(60*60)) + ($2*60)); |
|---|
| 106 | } elsif ($datestring =~ /\-(\d{2})(\d{2})/) { |
|---|
| 107 | $tz_offset = $gmt_offset + (($1*(60*60)) + ($2*60)); |
|---|
| 108 | } |
|---|
| 109 | |
|---|
| 110 | my $e = mktime(@t); |
|---|
| 111 | return ($e+$tz_offset) if ($e > 1); |
|---|
| 112 | } |
|---|
| 113 | return undef; |
|---|
| 114 | } |
|---|
| 115 | |
|---|
| 116 | ############################################################################## |
|---|
| 117 | |
|---|
| 118 | sub dontcare_cb( $ ) |
|---|
| 119 | { |
|---|
| 120 | return; |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | ############################################################################## |
|---|
| 124 | |
|---|
| 125 | sub programme_cb( $ ) |
|---|
| 126 | { |
|---|
| 127 | my $prog=shift; |
|---|
| 128 | &cleanup($prog); |
|---|
| 129 | # print "got programme from $datafile: ".Dumper($prog); |
|---|
| 130 | |
|---|
| 131 | # make sure programme has a title |
|---|
| 132 | if (!defined $prog->{title} || !defined $prog->{title}->[0]) { |
|---|
| 133 | printf STDERR "programme with no title in %s; ignored.", $datafile; |
|---|
| 134 | return; |
|---|
| 135 | } |
|---|
| 136 | my $prog_title = $prog->{title}->[0]->[0]; |
|---|
| 137 | |
|---|
| 138 | my $t = &parse_xmltv_date($prog->{start}); |
|---|
| 139 | |
|---|
| 140 | if (!$t) { |
|---|
| 141 | printf STDERR "programme '%s' from %s had invalid start time (%s); ignored", |
|---|
| 142 | $prog_title, ($prog->{start} ? $prog->{start} : "undef"); |
|---|
| 143 | return; |
|---|
| 144 | } |
|---|
| 145 | |
|---|
| 146 | # make programme time within a half-hour increment (0-47) |
|---|
| 147 | my $inc = int((($t + $gmt_offset) % (60*60*24)) / (60*30)); |
|---|
| 148 | |
|---|
| 149 | # print out programmes half-hour increments (0-47) & programme title |
|---|
| 150 | printf O "%d\t%d\t%s\n",$datafile_num,$inc,$prog_title; |
|---|
| 151 | |
|---|
| 152 | $num_programmes++; |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | ############################################################################## |
|---|
| 156 | |
|---|
| 157 | sub create_programme_lineup |
|---|
| 158 | { |
|---|
| 159 | my $output = shift; |
|---|
| 160 | |
|---|
| 161 | if (!(open(O,">$output"))) { |
|---|
| 162 | die "could not create output file $output: $!\n"; |
|---|
| 163 | } |
|---|
| 164 | |
|---|
| 165 | printf STDERR "creating programme lineup in file $output\n"; |
|---|
| 166 | printf O "# datafile_num\thalf-hour-slot\tprogramme_name\n"; |
|---|
| 167 | |
|---|
| 168 | foreach my $file (@ARGV) { |
|---|
| 169 | $datafile = $file; |
|---|
| 170 | printf STDERR "parsing: %s\n",($datafile eq "-" ? |
|---|
| 171 | "(from-stdin, hit control-D to finiah)" : $datafile); |
|---|
| 172 | eval { XMLTV::parsefiles_callback(\&dontcare_cb, \&dontcare_cb, |
|---|
| 173 | \&dontcare_cb, \&programme_cb, $datafile); }; |
|---|
| 174 | $datafile_num++; |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | printf STDERR "finished, $num_programmes programmes from $datafile_num sources in $output\n"; |
|---|
| 178 | close(O); |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | |
|---|
| 182 | |
|---|
| 183 | ############################################################################## |
|---|
| 184 | |
|---|
| 185 | sub test_algorithms |
|---|
| 186 | { |
|---|
| 187 | my $alg = shift; |
|---|
| 188 | printf STDERR "testing %s algorithm%s against programmes from stdin\n", |
|---|
| 189 | $alg, ($alg eq "all" ? "s" : ""); |
|---|
| 190 | |
|---|
| 191 | while (<STDIN>) { |
|---|
| 192 | chop; |
|---|
| 193 | next if ($_ =~ /^#/); |
|---|
| 194 | my ($n,$t,$p) = split(/\t/,$_); |
|---|
| 195 | push(@{($prog_lineup->[$t]->[$n])},$p); |
|---|
| 196 | $num_programmes++; |
|---|
| 197 | $datafile_num = $n if ($n > $datafile_num); |
|---|
| 198 | } |
|---|
| 199 | printf STDERR "%d programmes from %d sources across %d timeslots\n", |
|---|
| 200 | $num_programmes, $datafile_num, $#{$prog_lineup}; |
|---|
| 201 | |
|---|
| 202 | foreach my $a (sort keys %algorithms) { |
|---|
| 203 | if (($alg eq $a) || ($alg eq "all")) { |
|---|
| 204 | # printf STDERR "testing algorithm '%s' against %d programmes\n", $a, $num_programmes; |
|---|
| 205 | &test_alg($a); |
|---|
| 206 | } |
|---|
| 207 | } |
|---|
| 208 | |
|---|
| 209 | &test_alg() if ($opt->{compare}); |
|---|
| 210 | |
|---|
| 211 | printf STDERR "\n"; |
|---|
| 212 | printf STDERR "to see details on matches, use the --show_matches option.\n" unless $opt->{show_matches}; |
|---|
| 213 | printf STDERR "to see details on non-matches, use the --show_non_matches option.\n" unless $opt->{show_non_matches}; |
|---|
| 214 | } |
|---|
| 215 | |
|---|
| 216 | ############################################################################## |
|---|
| 217 | |
|---|
| 218 | sub test_alg |
|---|
| 219 | { |
|---|
| 220 | my $alg = shift; |
|---|
| 221 | my $matches = 0; |
|---|
| 222 | my $non_matches = 0; |
|---|
| 223 | my $alg_start = [gettimeofday]; |
|---|
| 224 | |
|---|
| 225 | my ($al1, $al2); |
|---|
| 226 | if ($opt->{compare} and $opt->{compare} =~ /(.*),(.*)/) { |
|---|
| 227 | if ($algorithms{$1} and $algorithms{$2}) { |
|---|
| 228 | ($al1, $al2) = ($1, $2); |
|---|
| 229 | print "Comparing $al1 to $al2.\n"; |
|---|
| 230 | } |
|---|
| 231 | } |
|---|
| 232 | |
|---|
| 233 | foreach my $timeslot (0..$#{$prog_lineup}) { |
|---|
| 234 | foreach my $source (0..$datafile_num) { |
|---|
| 235 | next if (!defined $prog_lineup->[$timeslot]->[$source]); |
|---|
| 236 | |
|---|
| 237 | # within each timeslot, compare programmes between |
|---|
| 238 | # data sources using the algorithm routine passed in |
|---|
| 239 | my @c1 = @{($prog_lineup->[$timeslot]->[$source])}; |
|---|
| 240 | |
|---|
| 241 | for my $source2 (0..$datafile_num) { |
|---|
| 242 | next if ($source == $source2); |
|---|
| 243 | next if (!defined $prog_lineup->[$timeslot]->[$source2]); |
|---|
| 244 | |
|---|
| 245 | my @c2 = @{($prog_lineup->[$timeslot]->[$source2])}; |
|---|
| 246 | |
|---|
| 247 | # see how many matches we get between c1 and c2 |
|---|
| 248 | foreach my $prog1 (@c1) { |
|---|
| 249 | foreach my $prog2 (@c2) { |
|---|
| 250 | |
|---|
| 251 | # No point comparing the same string, is there? |
|---|
| 252 | next if ($prog1 eq $prog2); |
|---|
| 253 | |
|---|
| 254 | if ($al1 and $al2) { |
|---|
| 255 | my $m1 = $algorithms{$al1}->($prog1,$prog2); |
|---|
| 256 | my $m2 = $algorithms{$al2}->($prog1,$prog2); |
|---|
| 257 | if ($m1 != $m2) { |
|---|
| 258 | print "\"$prog1\" vs \"$prog2\": $al1 says " . |
|---|
| 259 | ($m1 ? "MATCH" : "NON-MATCH") . |
|---|
| 260 | " and $al2 says " . |
|---|
| 261 | ($m2 ? "MATCH" : "NON-MATCH") . |
|---|
| 262 | ".\n"; |
|---|
| 263 | } |
|---|
| 264 | } |
|---|
| 265 | else { |
|---|
| 266 | if ($algorithms{$alg}->($prog1,$prog2)) { |
|---|
| 267 | printf "algorithm $alg said '$prog1' and '$prog2' DO match\n" if ($opt->{show_matches}); |
|---|
| 268 | $matches++; |
|---|
| 269 | } else { |
|---|
| 270 | printf "algorithm $alg said '$prog1' and '$prog2' don't match\n" if ($opt->{show_non_matches}); |
|---|
| 271 | $non_matches++; |
|---|
| 272 | } |
|---|
| 273 | } |
|---|
| 274 | } |
|---|
| 275 | } |
|---|
| 276 | } |
|---|
| 277 | } |
|---|
| 278 | } |
|---|
| 279 | |
|---|
| 280 | printf STDERR " - algorithm %s took %0.2fsec to record %d matches and %d non-matches.\n", |
|---|
| 281 | $alg, tv_interval($alg_start), $matches, $non_matches; |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | ############################################################################## |
|---|
| 285 | # simple string match |
|---|
| 286 | |
|---|
| 287 | sub string_match |
|---|
| 288 | { |
|---|
| 289 | my ($s1,$s2) = @_; |
|---|
| 290 | return 1 if (lc($s1) eq lc($s2)); |
|---|
| 291 | return 0; |
|---|
| 292 | } |
|---|
| 293 | |
|---|
| 294 | ############################################################################## |
|---|
| 295 | # original alternativeTitles() from XMLTV::IMDB, this one knows about even more translations.. |
|---|
| 296 | |
|---|
| 297 | sub alternativeTitles($) |
|---|
| 298 | { |
|---|
| 299 | my $title=shift; |
|---|
| 300 | my @titles; |
|---|
| 301 | |
|---|
| 302 | push(@titles, $title); # seed with original title |
|---|
| 303 | |
|---|
| 304 | # try "&" -> "and" conversion ('Spicks & Specks' => 'Spicks and Specks') |
|---|
| 305 | if ( $title=~m/\&/o ) { |
|---|
| 306 | my $t=$title; |
|---|
| 307 | push(@titles, $t) while ( $t=~s/(\s)\&(\s)/$1and$2/o ); |
|---|
| 308 | } |
|---|
| 309 | |
|---|
| 310 | # try the "and" -> "&" conversion ('Spicks and Specks' => 'Spicks & Specks') |
|---|
| 311 | if ( $title=~m/\sand\s/io ) { |
|---|
| 312 | my $t=$title; |
|---|
| 313 | push(@titles, $t) while ( $t=~s/(\s)and(\s)/$1\&$2/io ); |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | # 'Barney Friends' => 'Barney & Friends' |
|---|
| 317 | foreach my $t (@titles) { |
|---|
| 318 | if ($t =~ /^(.*) (.*)$/) { |
|---|
| 319 | push(@titles,"$1 & $2"); |
|---|
| 320 | } |
|---|
| 321 | } |
|---|
| 322 | |
|---|
| 323 | # 'Barney Friends' => 'Barney - Friends' |
|---|
| 324 | foreach my $t (@titles) { |
|---|
| 325 | if ($t =~ /^(.*) (.*)$/) { |
|---|
| 326 | push(@titles,"$1 - $2"); |
|---|
| 327 | } |
|---|
| 328 | } |
|---|
| 329 | |
|---|
| 330 | # "Creflo A. Dollar, Jr" => "Creflo A Dollar" |
|---|
| 331 | # "House, M.D." => "House" |
|---|
| 332 | foreach my $t (@titles) { |
|---|
| 333 | push(@titles,"$1") if ($t =~ /^(.*), Jr$/); |
|---|
| 334 | push(@titles,"$1") if ($t =~ /^(.*), M\.D\.$/); |
|---|
| 335 | } |
|---|
| 336 | |
|---|
| 337 | # "quotes" -> quotes |
|---|
| 338 | foreach my $t (@titles) { |
|---|
| 339 | push(@titles,$1) if ( $t =~ /^\"(.*)\"$/); |
|---|
| 340 | } |
|---|
| 341 | |
|---|
| 342 | # remove full stops - e.g. will translate 'Dr. Dog.' to 'Dr Dog.' and 'Dr Dog' |
|---|
| 343 | foreach my $t (@titles) { |
|---|
| 344 | if ($t =~ /\./) { |
|---|
| 345 | my @new_titles; |
|---|
| 346 | push(@new_titles, $t) while ( $t=~s/(.*)\.(.*)/$1$2/io ); |
|---|
| 347 | push(@titles,@new_titles); |
|---|
| 348 | } |
|---|
| 349 | } |
|---|
| 350 | |
|---|
| 351 | # remove dashes |
|---|
| 352 | foreach my $t (@titles) { |
|---|
| 353 | if ($t =~ /\-/) { |
|---|
| 354 | my @new_titles; |
|---|
| 355 | push(@new_titles, $t) while ( $t=~s/(.*)\-(.*)/$1$2/io ); |
|---|
| 356 | push(@titles,@new_titles); |
|---|
| 357 | } |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | # # strip *'s: "M*A*S*H" => "MASH"; |
|---|
| 361 | foreach my $t (@titles) { |
|---|
| 362 | if ($t =~ /\*/) { |
|---|
| 363 | my @new_titles; |
|---|
| 364 | push(@new_titles, $t) while ( $t=~s/(.*)\*(.*)/$1$2/io ); |
|---|
| 365 | push(@titles,@new_titles); |
|---|
| 366 | } |
|---|
| 367 | } |
|---|
| 368 | |
|---|
| 369 | # remove !s |
|---|
| 370 | foreach my $t (@titles) { |
|---|
| 371 | if ($t =~ /\'/) { |
|---|
| 372 | my @new_titles; |
|---|
| 373 | push(@new_titles, $t) while ( $t=~s/(.*)\'(.*)/$1$2/io ); |
|---|
| 374 | push(@titles,@new_titles); |
|---|
| 375 | } |
|---|
| 376 | } |
|---|
| 377 | |
|---|
| 378 | # remove :s |
|---|
| 379 | foreach my $t (@titles) { |
|---|
| 380 | if ($t =~ /:/) { |
|---|
| 381 | my @new_titles; |
|---|
| 382 | push(@new_titles, $t) while ( $t=~s/(.*):(.*)/$1$2/io ); |
|---|
| 383 | push(@titles,@new_titles); |
|---|
| 384 | } |
|---|
| 385 | } |
|---|
| 386 | |
|---|
| 387 | # remove apostrophies |
|---|
| 388 | foreach my $t (@titles) { |
|---|
| 389 | if ($t =~ /\!/) { |
|---|
| 390 | my @new_titles; |
|---|
| 391 | push(@new_titles, $t) while ( $t=~s/(.*)!(.*)/$1$2/io ); |
|---|
| 392 | push(@titles,@new_titles); |
|---|
| 393 | } |
|---|
| 394 | } |
|---|
| 395 | |
|---|
| 396 | # remove commas |
|---|
| 397 | foreach my $t (@titles) { |
|---|
| 398 | if ($t =~ /\,/) { |
|---|
| 399 | my @new_titles; |
|---|
| 400 | push(@new_titles, $t) while ( $t=~s/(.*)\,(.*)/$1$2/io ); |
|---|
| 401 | push(@titles,@new_titles); |
|---|
| 402 | } |
|---|
| 403 | } |
|---|
| 404 | |
|---|
| 405 | # strip text after !s provided we have at least one word |
|---|
| 406 | # e.g. will fix 'Jakers! The Adventures of Piggley Winks' to match 'Jakers' |
|---|
| 407 | # or 'Jakers! The Adventures of Piggley Winks' to match 'The Adventures of Piggley Winks' |
|---|
| 408 | foreach my $t (@titles) { |
|---|
| 409 | if ($t =~ /! /) { |
|---|
| 410 | my @t2 = split(/! /,$t); |
|---|
| 411 | push(@titles, @t2); |
|---|
| 412 | } |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | # 'Behind The News (5 Min)' => 'Behind the News 5 Min' |
|---|
| 416 | # |
|---|
| 417 | # also used to do: |
|---|
| 418 | # 'Behind The News (5 Min)' => 'Behind the News' |
|---|
| 419 | # 'Message Stick (Shorts)' => 'Message Stick' |
|---|
| 420 | # but removed because it caused false positives on "Stateline (VIC)", "Stateline (TAS)" etc. |
|---|
| 421 | foreach my $t (@titles) { |
|---|
| 422 | if ($t =~ /(.+)\((.+)\)$/) { |
|---|
| 423 | # my $t2 = $1; |
|---|
| 424 | my $t3 = $1.$2; |
|---|
| 425 | # $t2 =~ s/(^\s+|\s+$)//g; # strip leading/trailing spaces |
|---|
| 426 | $t3 =~ s/(^\s+|\s+$)//g; # strip leading/trailing spaces |
|---|
| 427 | # push(@titles,$t2); |
|---|
| 428 | push(@titles,$t3); |
|---|
| 429 | } |
|---|
| 430 | } |
|---|
| 431 | |
|---|
| 432 | # 'Family Story: The Longest Season' => 'Family Story' |
|---|
| 433 | foreach my $t (@titles) { |
|---|
| 434 | if ($t =~ /: /) { |
|---|
| 435 | my @t2 = split(/: /,$t); |
|---|
| 436 | push(@titles, @t2); |
|---|
| 437 | } |
|---|
| 438 | } |
|---|
| 439 | |
|---|
| 440 | # removed: causes a false positive on "One World USA" / "One World Sri Lanka" |
|---|
| 441 | # # 'Family Story The Longest Season' => 'Family Story: The Longest Season' |
|---|
| 442 | # foreach my $t (@titles) { |
|---|
| 443 | # if ($t =~ / /) { |
|---|
| 444 | # my @t2 = split(/ /,$t); |
|---|
| 445 | # push(@titles, @t2); |
|---|
| 446 | # } |
|---|
| 447 | # } |
|---|
| 448 | |
|---|
| 449 | # #39; -> ' ('He's Having A Baby' => 'He\'s Having a Baby') |
|---|
| 450 | foreach my $t (@titles) { |
|---|
| 451 | my $t2 = $t; |
|---|
| 452 | $t2 =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; |
|---|
| 453 | push(@titles,$t2) if ($t2 ne $t); |
|---|
| 454 | } |
|---|
| 455 | |
|---|
| 456 | # 'Head 2 Head' => 'Head To Head' |
|---|
| 457 | foreach my $t (@titles) { |
|---|
| 458 | push(@titles, "$1 to $2") if ($t =~ /(.*) 2 (.*)/); |
|---|
| 459 | } |
|---|
| 460 | |
|---|
| 461 | # remove leading 'The': |
|---|
| 462 | # "Iron Chef" alternatives "The Iron Chef" |
|---|
| 463 | # "The Newshour with Jim Lehrer" alternative "Newshour with Jim Lehrer" |
|---|
| 464 | foreach my $t (@titles) { |
|---|
| 465 | push(@titles,"$1") if ($t =~ /^The (.*)$/); |
|---|
| 466 | } |
|---|
| 467 | |
|---|
| 468 | # strip "With [..]" if at least 10 chars before With |
|---|
| 469 | # "The 7.30 Report With Kerry O'brien" alternatives "The 7.30 Report" |
|---|
| 470 | # "Seven News With Peter Mitchell" alternatives "Seven News" |
|---|
| 471 | # "Today Tonight With Naomi Robson" alternatives "Today Tonight" |
|---|
| 472 | foreach my $t (@titles) { |
|---|
| 473 | if ($t =~/^(.*) [Ww]ith (.*)$/) { |
|---|
| 474 | my $t2 = $1; |
|---|
| 475 | push (@titles, $t2) if (length($t2) >= 10); |
|---|
| 476 | } |
|---|
| 477 | } |
|---|
| 478 | |
|---|
| 479 | # Place the articles last |
|---|
| 480 | # 'The Daily Show' => 'Daily Show, The' |
|---|
| 481 | foreach (@titles) { |
|---|
| 482 | if ( m/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/io ) { |
|---|
| 483 | my $t=$_; |
|---|
| 484 | $t=~s/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/iog; |
|---|
| 485 | push(@titles, $t); |
|---|
| 486 | } |
|---|
| 487 | } |
|---|
| 488 | |
|---|
| 489 | #foreach my $t (@titles) { printf "alternativeTitles: alternative \"$t\"\n"; } |
|---|
| 490 | |
|---|
| 491 | return(@titles); |
|---|
| 492 | } |
|---|
| 493 | |
|---|
| 494 | sub alternativeTitles_match |
|---|
| 495 | { |
|---|
| 496 | my ($s1,$s2) = @_; |
|---|
| 497 | |
|---|
| 498 | my @alt1 = alternativeTitles($s1); |
|---|
| 499 | my @alt2 = alternativeTitles($s2); |
|---|
| 500 | |
|---|
| 501 | foreach my $a1 (@alt1) { |
|---|
| 502 | foreach my $a2 (@alt2) { |
|---|
| 503 | return 1 if (lc($a1) eq lc($a2)); |
|---|
| 504 | } |
|---|
| 505 | } |
|---|
| 506 | return 0; |
|---|
| 507 | } |
|---|
| 508 | |
|---|
| 509 | ############################################################################## |
|---|
| 510 | |
|---|
| 511 | sub canonicalizeTitle |
|---|
| 512 | { |
|---|
| 513 | my $title=shift; |
|---|
| 514 | $title =~ s/^\s+//; |
|---|
| 515 | $title =~ s/\s+$//; |
|---|
| 516 | $title =~ s/\s+/ /; |
|---|
| 517 | $title =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; |
|---|
| 518 | $title =~ s/ *\& */ and /g; |
|---|
| 519 | $title =~ s/[^ a-zA-Z0-9]//g; |
|---|
| 520 | return(lc($title)); |
|---|
| 521 | } |
|---|
| 522 | |
|---|
| 523 | my %alternatives = ( |
|---|
| 524 | to => ("2"), |
|---|
| 525 | too => ("2"), |
|---|
| 526 | "2" => ("to", "too") |
|---|
| 527 | ); |
|---|
| 528 | |
|---|
| 529 | sub alternative |
|---|
| 530 | { |
|---|
| 531 | my $word=shift; |
|---|
| 532 | return($alternatives{$word}); |
|---|
| 533 | } |
|---|
| 534 | |
|---|
| 535 | sub canonicalizeTitles_match |
|---|
| 536 | { |
|---|
| 537 | my $shorter=canonicalizeTitle(shift); |
|---|
| 538 | my $longer =canonicalizeTitle(shift); |
|---|
| 539 | |
|---|
| 540 | if (length($shorter) > length($longer)) { |
|---|
| 541 | ($shorter,$longer) = ($longer,$shorter); |
|---|
| 542 | } |
|---|
| 543 | |
|---|
| 544 | #printf "got shorter: '%s', longer '%s'\n",$shorter,$longer; |
|---|
| 545 | WORD: for my $word (split(/\s/,$shorter)) { |
|---|
| 546 | #printf "got word: '%s'\n",$word; |
|---|
| 547 | next WORD if ($longer =~ s/\b$word\b//); |
|---|
| 548 | for my $alt (&alternative($word)) { |
|---|
| 549 | next if (!defined $alt); |
|---|
| 550 | next WORD if ($longer =~ s/$alt//); |
|---|
| 551 | } |
|---|
| 552 | return(0); |
|---|
| 553 | } |
|---|
| 554 | return(1); |
|---|
| 555 | } |
|---|
| 556 | |
|---|
| 557 | ############################################################################## |
|---|