00001 #!/usr/bin/perl -w
00002
00003 #
00004 # This perl script is intended to perform movie data lookups based on
00005 # the popular www.imdb.com website
00006 #
00007 # For more information on MythVideo's external movie lookup mechanism, see
00008 # the README file in this directory.
00009 #
00010 # Author: Tim Harvey (tharvey AT alumni.calpoly DOT edu)
00011 # Modified: Andrei Rjeousski
00012 # v1.1
00013 # - Added amazon.com covers and improved handling for imdb posters
00014 # v1.2
00015 # - when searching amazon, try searching for main movie name and if nothing
00016 # is found, search for informal name
00017 # - better handling for amazon posters, see if movie title is a substring
00018 # in the search results returned by amazon
00019 # - fixed redirects for some movies on impawards
00020 # v1.3
00021 # - fixed search for low res images (imdb changed the page layout)
00022 # - added cinemablend poster search
00023 # - added nexbase poster search
00024 # - removed amazon.com searching for now
00025
00026 # changes:
00027 # 10-26-2007:
00028 # Added release date (in ISO 8601 form) to output
00029 # 9-10-2006: Anduin Withers
00030 # Changed output to utf8
00031
00032 use LWP::Simple; # libwww-perl providing simple HTML get actions
00033 use HTML::Entities;
00034 use URI::Escape;
00035
00036 eval "use DateTime::Format::Strptime"; my $has_date_format = $@ ? 0 : 1;
00037
00038 use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
00039 use Getopt::Std;
00040
# Program identification strings, printed by version().
our ($title, $version, $author) =
   ("IMDB Query", "v1.3.5", "Tim Harvey, Andrei Rjeousski");

# Country labels tried, in this order, when a runtime is listed per country.
my @countries = ("USA", "UK", "Canada", "Japan");

# Everything written to STDOUT goes through the utf8 layer.
binmode STDOUT, ":utf8";
00048
# Print the command line synopsis, one line per option, then terminate
# the program with exit status -1.
sub usage {
   my @help_lines = (
      "usage: $0 -hdrviMPD [parameters]",
      " -h help",
      " -d debug",
      " -r dump raw query result data only",
      " -v display version",
      " -i display info",
      "",
      " -M [options] <query> get movie list",
      " some known options are:",
      " type=[fuzy] looser search",
      " from_year=[int] limit matches to year",
      " to_year=[int] limit matches to year",
      " sort=[smart] ??",
      " tv=[no|both|only] limits between tv and movies",
      " Note: multiple options must be separated by ';'",
      " -P <movieid> get movie poster",
      " -D <movieid> get movie data",
   );

   for my $help_line (@help_lines) {
      print $help_line . "\n";
   }

   exit(-1);
}
00070
# Print a single line naming the program, its version and its authors,
# taken from the package variables $title, $version and $author.
sub version {
   printf("%s (%s) by %s\n", $title, $version, $author);
}
00075
# Print a single line describing which kind of lookup this script performs.
sub info {
   my $summary = "Performs queries using the www.imdb.com website.";
   print $summary, "\n";
}
00080
# Detailed help: the version line, the info line, then the usage text.
# usage() ends the program, so this sub does not return to its caller.
sub help {
   for my $step (\&version, \&info, \&usage) {
      $step->();
   }
}
00087
# Return a copy of the given string with leading and trailing whitespace
# removed; whitespace inside the string is left untouched.
#
#   $str - the text to clean up
sub trim {
   my ($str) = @_;
   $str =~ s/^\s+//;   # drop leading whitespace
   $str =~ s/\s+$//;   # drop trailing whitespace (including newlines)
   return $str;
}
00094
# Return the text found in 'data' between the first occurrence of 'beg'
# and the next occurrence of 'end' after it.  Both markers are located
# case-insensitively; the returned text keeps its original case and has
# its HTML entities decoded.
#
#   $data - text to search; may be undef (e.g. a failed download)
#   $beg  - opening marker; an empty string matches at the very start
#   $end  - closing marker
#
# Returns "" when $data is undefined or either marker is missing.
sub parseBetween {
   my ($data, $beg, $end) = @_; # grab parameters

   # a failed page fetch hands us undef; treat that as "nothing found"
   return "" unless defined $data;

   my $ldata = lc($data);

   my $beg_pos = index($ldata, lc($beg));
   return "" if $beg_pos == -1;

   my $start  = $beg_pos + length($beg);
   my $finish = index($ldata, lc($end), $start);
   return "" if $finish == -1;

   my $result = substr($data, $start, $finish - $start);
   # return w/ decoded numeric character references
   # (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
   decode_entities($result);
   return $result;
}
00111
# Fetch the IMDB title page for a movie and print its details.
#
#   $movieid - the numeric part of the IMDB id (the digits after "tt")
#
# One "Field:value" line is printed per field (Title, Year, ReleaseDate,
# Director, Plot, UserRating, MovieRating, Runtime, Writers, Cast, Genres,
# Countries).  Fields that cannot be located in the page are printed with
# an empty value; if the page cannot be downloaded every field is empty.
# With -d, progress lines starting with "# " are printed as well; with -r
# the raw page is dumped before the fields.
sub getMovieData {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # matches a link to a person page and captures the person's name
   my $name_link_pat = qr'<a href="/name/[^"]*">([^<]*)</a>'m;

   # get the title page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   if (!defined $response) {
      # a failed download yields undef; carry on with an empty page so
      # the caller still receives every field name
      if (defined $opt_d) { print "# no response received\n"; }
      $response = "";
   }
   if (defined $opt_r) { printf("%s", $response); }

   # parse title and year
   my $year = "";
   my $title = parseBetween($response, "<title>", "</title>");
   if ($title =~ m#(.+) \((\d+).*\)#) # Note some years have a /II after them?
   {
      $title = $1;
      $year = $2;
   }
   elsif ($title =~ m#(.+) \(\?\?\?\?\)#)
   {
      # year not known yet: IMDB shows it as (????)
      $title = $1;
   }

   # parse director
   my $data = parseBetween($response, ">Director:</h5>", "</div>");
   if (!length($data)) {
      $data = parseBetween($response, ">Directors:</h5>", "</div>");
   }
   my $director = join(",", ($data =~ m/$name_link_pat/g));

   # parse writers, trying each heading variant in turn
   $data = parseBetween($response, ">Writers <a href=\"/wga\">(WGA)</a>:</h5>", "</div>");
   if (!length($data)) {
      $data = parseBetween($response, ">Writer:</h5>", "</div>");
   }
   if (!length($data)) {
      $data = parseBetween($response, ">Writers:</h5>", "</div>");
   }
   my $writer = join(",", ($data =~ m/$name_link_pat/g));

   # parse release date (only when DateTime::Format::Strptime was loaded)
   my $releasedate = '';
   if ($has_date_format) {
      # Class->new rather than indirect object syntax: the module is
      # loaded through a string eval, so the class is unknown at compile
      # time
      my $dtp = DateTime::Format::Strptime->new(pattern => '%d %b %Y',
                                                on_error => 'undef');
      my $dt = $dtp->parse_datetime(parseBetween($response,
                                                 ">Release Date:</h5> ", "<a "));
      if (defined($dt)) {
         $releasedate = $dt->strftime("%F");
      }
   }

   # parse plot, trying each heading variant in turn
   my $plot = parseBetween($response, ">Plot Outline:</h5> ", "</div>");
   if (!$plot) {
      $plot = parseBetween($response, ">Plot Summary:</h5> ", "</div>");
   }
   if (!$plot) {
      $plot = parseBetween($response, ">Plot:</h5>", "</div>");
   }

   if ($plot) {
      # replace name links in plot (example 0388795)
      $plot =~ s/$name_link_pat/$1/g;

      # replace title links
      my $title_link_pat = qr!<a href="/title/[^"]*">([^<]*)</a>!m;
      $plot =~ s/$title_link_pat/$1/g;

      # plot ends at first remaining link
      my $plot_end = index($plot, "<a ");
      if ($plot_end != -1) {
         $plot = substr($plot, 0, $plot_end);
      }
      $plot = trim($plot);
   }

   # parse user rating
   my $userrating = parseBetween($response, ">User Rating:</b>", "</b>");
   $userrating = parseBetween($userrating, "<b>", "/");

   # parse MPAA rating, falling back to the certification for one country
   my $ratingcountry = "USA";
   my $movierating = trim(parseBetween($response, ">MPAA</a>:</h5>", "</div>"));
   if (!$movierating) {
      $movierating = parseBetween($response, ">Certification:</h5>", "</div>");
      $movierating = parseBetween($movierating, "certificates=$ratingcountry",
                                  "/a>");
      $movierating = parseBetween($movierating, ">", "<");
   }

   # parse movie length: first try the text before the first " min"; if
   # that does not start with a number, try the per-country entries
   my $rawruntime = trim(parseBetween($response, ">Runtime:</h5>", "</div>"));
   my $runtime = trim(parseBetween($rawruntime, "", " min"));
   for my $country (@countries) {
      last if ($runtime =~ /^-?\d/);
      $runtime = trim(parseBetween($rawruntime, "$country:", " min"));
   }

   # parse cast
   # Note: full cast would be from url:
   # www.imdb.com/title/<movieid>/fullcredits
   my $cast = "";
   $data = parseBetween($response, "Cast overview, first billed only",
                        "/table>");
   if (!$data) {
      $data = parseBetween($response, "Series Cast Summary",
                           "/table>");
   }
   if (!$data) {
      $data = parseBetween($response, "Complete credited cast",
                           "/table>");
   }
   if ($data) {
      $cast = join(',', ($data =~ m/$name_link_pat/g));
      $cast = trim($cast);
   }

   # parse genres
   my $lgenres = "";
   $data = parseBetween($response, "<h5>Genre:</h5>","</div>");
   if ($data) {
      my $genre_pat = qr'/Sections/Genres/(?:[a-z ]+/)*">([^<]+)<'im;
      $lgenres = join(',', ($data =~ /$genre_pat/g));
   }

   # parse countries
   $data = parseBetween($response, "Country:</h5>","</div>");
   my $country_pat = qr'/Sections/Countries/[A-Z]+/">([^<]+)</a>'i;
   my $lcountries = trim(join(",", ($data =~ m/$country_pat/g)));

   # output fields (these field names must match what MythVideo is looking for)
   print "Title:$title\n";
   print "Year:$year\n";
   print "ReleaseDate:$releasedate\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "UserRating:$userrating\n";
   print "MovieRating:$movierating\n";
   print "Runtime:$runtime\n";
   print "Writers: $writer\n";
   print "Cast:$cast\n";
   print "Genres: $lgenres\n";
   print "Countries: $lcountries\n";
}
00264
# Look up a poster image for a movie and print its URL on one line.
#
#   $movieid - the numeric part of the IMDB id (the digits after "tt")
#
# Sources are tried in this order, stopping at the first hit:
#   1. an impawards.com link on the IMDB page (following one redirect)
#   2. a nexbase link on the IMDB page
#   3. a cinemablend link on the IMDB page
#   4. a posters.imdb.com reference on the IMDB page
#   5. the low resolution poster on the IMDB title page
# An empty line is printed when nothing is found.  Nothing at all is
# printed when the first IMDB page cannot be downloaded.
# With -d, progress lines starting with "# " are printed as well; with -r
# the downloaded pages are dumped.
sub getMoviePoster {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the IMDB page that lists poster links
   # NOTE(review): the tail of this URL was lost in the copy this fix was
   # made from; "/posters" is the presumed original - confirm against the
   # upstream script.
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);

   # a failed download yields undef: nothing to dump, nothing to search
   if (!defined $response) {return;}
   if (defined $opt_r) { printf("%s", $response); }

   my $uri = "";

   # look for references to impawards.com posters - they are high quality
   my $site = "http://www.impawards.com";
   my $impsite = parseBetween($response, "<a href=\"".$site, "\">".$site);

   # jersey girl fix: the link may lack the "www." prefix
   if ($impsite eq "") {
      $impsite = parseBetween($response, "<a href=\"http://impawards.com",
                              "\">http://impawards.com");
   }

   if ($impsite) {
      $impsite = $site . $impsite;

      if (defined $opt_d) { print "# Searching for poster at: ".$impsite."\n"; }
      my $impres = get($impsite);
      $impres = "" unless defined $impres;   # failed download
      if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
      if (defined $opt_r) { printf("%s", $impres); }

      # making sure it isnt redirect
      $uri = parseBetween($impres, "0;URL=..", "\">");
      if ($uri ne "") {
         if (defined $opt_d) { printf("# processing redirect to %s\n",$uri); }
         # this was redirect
         $impsite = $site . $uri;
         $impres = get($impsite);
         $impres = "" unless defined $impres;   # failed download
      }

      # do stuff normally
      $uri = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");
      # uri here is relative... patch it up to make a valid uri
      if ($uri =~ /\.(jpe?g|gif|png)$/) {
         if ($uri !~ /http:/) {
            my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
            $uri = $path."posters/".$uri;
         }
         if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
      }
      else {
         $uri = "";
      }
   }

   # try looking on nexbase
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)nexbase/i) {
      my $page = $1;   # copy before any other match can replace $1
      if ($page ne "") {
         if (defined $opt_d) { print "# found nexbase poster page: $page \n"; }
         my $cinres = get($page);
         if (defined $cinres) {
            if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
            if (defined $opt_r) { printf("%s", $cinres); }

            if ($cinres =~ m/<a id="photo_url" href="([^"]*?)" ><\/a>/i) {
               if (defined $opt_d) { print "# nexbase url retreived\n"; }
               $uri = $1;
            }
         }
      }
   }

   # try looking on cinemablend
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)cinemablend/i) {
      my $page = $1;   # copy before any other match can replace $1
      if ($page ne "") {
         if (defined $opt_d) { print "# found cinemablend poster page: $page \n"; }
         my $cinres = get($page);
         if (defined $cinres) {
            if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
            if (defined $opt_r) { printf("%s", $cinres); }
            if ($cinres =~ m#<img\b[^>]+\bsrc="(/images/reviews/[^"]*?)"#i) {
               if (defined $opt_d) { print "# cinemablend url retreived\n"; }
               # the captured path already starts with '/'
               $uri = "http://www.cinemablend.com".$1;
            }
         }
      }
   }

   # if the attempts above didn't give a filename grab it from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for imdb posters\n"; }
      my $host = "http://posters.imdb.com/posters/";

      $uri = parseBetween($response, $host, "\"><td><td><a href=\"");
      if ($uri ne "") {
         $uri = $host.$uri;
      } else {
         if (defined $opt_d) { print "# no poster found\n"; }
      }
   }

   # no poster found, take lowres image from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
      my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
      $response = get($host);
      $response = "" unless defined $response;   # failed download

      # No /g on this match: a /g match in a condition would leave the
      # search position set on $response and make the title scan below
      # start in the middle of the page.
      if ($response =~ m/<a name="poster".*<img.*src="([^"]*).*<\/a>/i) {
         if (defined $opt_d) { print "# found low res poster at: $1\n"; }
         $uri = $1;
      } else {
         if (defined $opt_d) { print "# no low res poster found\n"; }
      }

      # The candidate titles are only reported in debug output; they do
      # not influence the printed URL.
      if (defined $opt_d) {
         print "# starting to look for movie title\n";
         print "# Getting possible movie titles:\n";
         print "# Title: ".parseBetween($response, "<title>", "<\/title>")."\n";

         # all other titles the page marks as "(informal title)"
         while ($response =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
            print "# Title: ".trim($1)."\n";
         }
      }
   }

   print "$uri\n";
}
00403
# Search IMDB for a title and print the matches, one per line, each line
# as 'movieid:Movie Title (year)'.
#
#   $filename - file name or title to search for; it may be URI-escaped.
#               The file extension, anything from the last '(' or '[' on,
#               and a trailing ", The" are removed before searching.
#   $options  - optional ';'-separated IMDB search options.  tv=no,
#               tv=only, from_year=N, to_year=N and video=no are also
#               applied locally to the returned entries.
#
# When IMDB answers with a movie page instead of a result list, the single
# line 'movieid:Title' is printed and the program exits with status 0.
# With -r the raw response is printed and the program exits with status 0.
# With -d, progress lines are printed as well.
# Video game entries are always dropped.  Nothing is printed when there
# are no usable results.
sub getMovieList {
   my ($filename, $options)=@_; # grab parameters

   # If we wanted to inspect the file for any reason we can do that now

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = $filename;
   $query = uri_unescape($query); # in case it was escaped
   # Strip off the file extension
   if (rindex($query, '.') != -1) {
      $query = substr($query, 0, rindex($query, '.'));
   }
   # Strip off anything following '(' - people use this for general comments
   if (rindex($query, '(') != -1) {
      $query = substr($query, 0, rindex($query, '('));
   }
   # Strip off anything following '[' - people use this for general comments
   if (rindex($query, '[') != -1) {
      $query = substr($query, 0, rindex($query, '['));
   }

   # IMDB searches do better if any trailing ,The is left off.
   # $1 is only meaningful when the match succeeded, so test the match.
   if ($query =~ /(.*), The$/i && $1) {
      $query = $1;
   }

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = "" ;}
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results page
   # some known IMDB options are:
   #   type=[fuzy]        looser search
   #   from_year=[int]    limit matches to year (broken at imdb)
   #   to_year=[int]      limit matches to year (broken at imdb)
   #   sort=[smart]       ??
   #   tv=[no|both|only]  limits between tv and movies (broken at imdb)
   my $request = "http://www.imdb.com/find?q=$query;$options";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   $response = "" unless defined $response;   # failed download
   if (defined $opt_r) {
      print $response;
      exit(0);
   }

   # check to see if we got a results page or a movie page
   # looking for 'add=<movieid>" target=' which only exists
   # in a movie description page
   my $movienum = parseBetween($response, "add=", "\"");
   if (!$movienum) {
      $movienum = parseBetween($response, ";add=", "'");
   }
   if ($movienum) {
      if ($movienum !~ m/^[0-9]+$/) {
         if (defined $opt_d) {
            # print, not printf: the scraped text must not be interpreted
            # as a format string
            print "# Error: IMDB movie number ($movienum), isn't.\n";
         }
         exit(0);
      }

      if (defined $opt_d) { printf("# redirected to movie page\n"); }
      my $movietitle = parseBetween($response, "<title>", "</title>");
      # Drop the trailing "(year)" part; the year may carry a suffix such
      # as /I.  Keep the full text if the title has no such part.
      if ($movietitle =~ m#(.+) \((\d+).*\)#) {
         $movietitle = $1;
      }
      print "$movienum:$movietitle\n";
      exit(0);
   }

   # extract possible matches
   # possible matches are grouped in several catagories:
   # exact, partial, and approximate
   my $popular_results = parseBetween($response, "<b>Popular Titles</b>",
                                      "</table>");
   my $exact_matches = parseBetween($response, "<b>Titles (Exact Matches)</b>",
                                    "</table>");
   my $partial_matches = parseBetween($response, "<b>Titles (Partial Matches)</b>",
                                      "</table>");

   my $data = $popular_results.$exact_matches;
   # resort to partial matches if no popular or exact
   if ($data eq "") { $data = $partial_matches; }
   if ($data eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # parse movie list from matches: every <tr>...</tr> pair is one entry.
   # A row without a closing tag simply ends the scan.
   my @movies;
   while ($data =~ m{<tr>(.*?)</tr>}gs) {
      my $entry = $1;

      my $entry_id = "";
      my $title = "";
      my $year = "";
      my $type = "";

      # Some titles are identical, IMDB indicates this by appending /I /II to
      # the release year.
      # e.g. "Mon meilleur ami" 2006/I vs "Mon meilleur ami" 2006/II
      if ($entry =~ m/<a href="\/title\/tt(\d+)\/.*\">(.+)<\/a> \((\d+)\/?[a-z]*\)(?: \((.+)\))?/i) {
         $entry_id = $1;
         $title = $2;
         $year = $3;
         $type = $4 if ($4);
      } else {
         if (defined $opt_d) {
            print("Unrecognized entry format ($entry)\n");
         }
         next;
      }

      my $skip = 0;

      # fix broken 'tv=no' option
      if ($options =~ /tv=no/) {
         if ($type eq "TV") {
            if (defined $opt_d) {printf("# skipping TV program: %s\n", $title);}
            $skip = 1;
         }
      }
      if ($options =~ /tv=only/) {
         if ($type eq "") {
            if (defined $opt_d) {printf("# skipping Movie: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'from_year=' option
      if ($options =~ /from_year=(\d+)/) {
         if ($year < $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'to_year=' option
      if ($options =~ /to_year=(\d+)/) {
         if ($year > $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }

      # option to strip out videos (I think that's what '(V)' means anyway?)
      if ($options =~ /video=no/) {
         if ($type eq "V") {
            if (defined $opt_d) {
               printf("# skipping Video program: %s\n", $title);
            }
            $skip = 1;
         }
      }

      # (always) strip out video game's (why does IMDB give these anyway?)
      if ($type eq "VG") {
         if (defined $opt_d) {printf("# skipping videogame: %s\n", $title);}
         $skip = 1;
      }

      # add to array
      if (!$skip) {
         my $moviename = $title;
         if ($year ne "") {
            $moviename .= " ($year)";
         }
         push(@movies, $entry_id . ":" . $moviename);
      }
   }

   # display array of values
   for my $movie (@movies) { print "$movie\n"; }
}
00597
#
# Main Program
#

# parse command line arguments
getopts('ohrdivDMP');

# -v and -i each print one line and stop
if (defined $opt_v) {
   version();
   exit 1;
}
if (defined $opt_i) {
   info();
   exit 1;
}

# show the help text when asked for it, or when no parameters remain
help() if (defined $opt_h || @ARGV == 0);

if (defined $opt_D) {
   # movie data: the movie id is the next command line argument
   my $id = shift(@ARGV) or die "Usage : $0 -D <movieid>\n";
   getMovieData($id);
}
elsif (defined $opt_P) {
   # movie poster: the movie id is the next command line argument
   my $id = shift(@ARGV) or die "Usage : $0 -P <movieid>\n";
   getMoviePoster($id);
}
elsif (defined $opt_M) {
   # movie list: either "<options> <query>" or just "<query>"
   my $first = shift(@ARGV) or die "Usage : $0 -M [options] <query>\n";
   my $second = shift(@ARGV);

   # with a single argument, that argument is the query and there are
   # no options
   my ($search, $search_opts) = $second ? ($second, $first) : ($first, "");
   getMovieList($search, $search_opts);
}
00634 # vim: set expandtab ts=3 sw=3 :