00001 #!/usr/bin/env perl
00002 # @(#)$Header: /home/mythtv/mythtvrep/scripts/maws.pl,v 1.9 2011/01/24 07:05:01 mythtv Exp $
00003 # Auric 2010/01/10 http://web.aanet.com.au/auric/
00004 #
00005 # MAWS metadata Grabber Script
00006 #
00007 ################################################################################
00008 use strict;
00009 use warnings;
00010 use Getopt::Std;
00011 use LWP::Simple;
00012 use HTML::TreeBuilder;
00013 use HTML::Entities;
00014 use Data::Dumper;
00015 use Encode;
00016
#################################### Settings #################################
# Verbosity of info/progress messages: 0 - off, 1 - low, 2 - high.
my $info = 0;
# Where info messages go: 0 = stderr, a file name = that file.
my $infoop = 0;

#################################### Globals ##################################
my $site      = 'MAWS';
my $baseurl   = 'http://maws.mameworld.info';
my $searchurl = $baseurl . '/maws/srch.php?search_text=';

# Opening and closing lines of the XML document written to stdout.
my $header = qq{<?xml version="1.0" encoding="UTF-8"?>\n<metadata>};
my $footer = '</metadata>';

# Filled in by getopts('M:D:v').
our ($opt_M, $opt_D, $opt_v);

# Result records gathered by search()/queryinetref(), emitted by printitems().
my @metaitems;

# rcs tag populated; reduce the revision keyword to its bare number.
(my $version = '$Revision: 1.9 $') =~ s/\D*([\d\.]+)\D*/$1/;

my $command          = "maws.pl";
my $commandthumbnail = "maws.png";
my $author           = "Auric";
00032 #################################### Util Subs ############################################
# If you copy this for another site, hopefully these won't need to be changed.
00034
# Print the grabber self-description (a <grabber> element) to stdout.
# Takes no arguments; reads the file-level $site, $command, $author,
# $commandthumbnail and $version. No XML declaration line is emitted.
sub printversion {
    my @fields = (
        [ 'name',        $site ],
        [ 'command',     $command ],
        [ 'author',      $author ],
        [ 'thumbnail',   $commandthumbnail ],
        [ 'type',        'games' ],
        [ 'description', 'Search and Metadata downloads from the MAME MAWS db' ],
        [ 'version',     $version ],
    );

    print "<grabber>\n";
    foreach my $field (@fields) {
        my ($tag, $value) = @{$field};
        print " <$tag>$value</$tag>\n";
    }
    print "</grabber>\n";
}
00047
# Close the info log handle FH if it is open, then terminate the process.
#   $esig - a numeric exit status, or (when installed as a %SIG handler)
#           the signal name. Any value containing a non-digit exits with 1;
#           otherwise the value itself is the exit status.
# Never returns.
sub cleanexit {
    my ($esig) = @_;

    close(FH) if fileno(FH);

    # A non-numeric argument means we were called by the signal handler.
    my $status = ($esig =~ /\D/) ? 1 : $esig;
    exit $status;
}
00059
# Emit an info/progress message if the configured verbosity allows it.
#   $level - verbosity level of this message; it is dropped when the
#            file-level $info is lower than $level.
#   $mesg  - message text; it is UTF-8 encoded before being written.
# When $infoop contains a non-digit it is treated as a log file name: the file
# is opened once (truncating it) and each message is written with a timestamp
# and the level. Otherwise, or when the log file cannot be opened, the message
# goes to STDERR.
sub infomsg {
    my ($level, $mesg) = @_;

    ($info < $level) and return;
    $mesg = encode('utf8', $mesg);

    if ($infoop =~ /\D/) {
        # The bareword handle FH is kept on purpose: cleanexit() closes it.
        # Three-argument open so that mode characters in the file name are
        # never interpreted, and the result is checked so a failed open does
        # not silently discard messages.
        if (fileno(FH) or open(FH, '>', $infoop)) {
            my $t = localtime();
            print FH "$t $level $mesg\n";
        }
        else {
            print STDERR "Could not open $infoop for writing: $!\n";
            print STDERR "$mesg\n";
        }
    }
    else {
        print STDERR "$mesg\n";
    }
}
00074
# Normalise a text fragment for embedding in the XML output.
#   $text - raw text.
# Newlines become spaces, leading and trailing whitespace is removed and the
# result is passed through encode_entities().
# Returns the cleaned text, or nothing (undef in scalar context) when $text is
# undefined or empty.
sub cleantext {
    my ($text) = @_;

    # Test for presence rather than truth so that the text "0" is kept.
    return unless defined $text && length $text;

    $text =~ s/\n/ /g;
    # Trim both ends; the substitution needs its (empty) replacement part.
    $text =~ s/^\s+|\s+$//g;
    $text = encode_entities($text);
    return $text;
}
00084
# Write every record in the file-level @metaitems to stdout as an <item>
# element. Takes no arguments. Field values are printed exactly as stored;
# no escaping is performed here.
sub printitems {
    foreach my $item (@metaitems) {
        my @lines = (
            ' <item>',
            ' <title>' . $item->{'title'} . '</title>',
            ' <inetref>' . $item->{'inetref'} . '</inetref>',
            ' <description>' . $item->{'description'} . '</description>',
            ' <system>MAME</system>',
            ' <categories>',
            ' <category type="genre" name="' . $item->{'genre'} . '"/>',
            ' </categories>',
            ' <studios>',
            ' <studio name="' . $item->{'studio'} . '"/>',
            ' </studios>',
            ' <year>' . $item->{'year'} . '</year>',
            ' <images>',
            ' <image type="screenshot" thumb="' . $item->{'screenshotthumb'}
                . '" url="' . $item->{'screenshoturl'} . '"/>',
            ' <image type="coverart" thumb="' . $item->{'coverthumb'}
                . '" url="' . $item->{'coverurl'} . '"/>',
            ' </images>',
            ' <popularity>' . $item->{'popularity'} . '</popularity>',
            ' </item>',
        );
        print map { "$_\n" } @lines;
    }
}
00109
00110 #################################### Site Specific Subs ##########################
# Search MAWS for $searchstr and append one record per matching table row to
# the file-level @metaitems. Only 'title' and 'inetref' are filled in; every
# other field is the empty string.
#   $searchstr - search text, appended to $searchurl as given.
# Dies when the search page cannot be retrieved or parsed. Returns 0.
# NOTE(review): $searchstr is not URL-escaped here - confirm whether callers
# are expected to pass it already escaped.
sub search {
    my ($searchstr) = @_;

    my $content = get($searchurl . $searchstr);
    unless ($content) {
        die "Could not retrieve ${searchurl}${searchstr}";
    }
    my $tree = HTML::TreeBuilder->new;
    eval { $tree->parse($content); };
    if ($@) {
        die "$searchurl parse failed, $@";
    }
    $tree->eof();

    foreach my $tr ($tree->find_by_tag_name('tr')) {
        my @links = $tr->find_by_tag_name('a');

        # Only the third link of a row is ever examined: the first two are
        # skipped and the row is abandoned after the third either way.
        next if @links < 3;
        my $link = $links[2];

        my $label = $link->as_trimmed_text();
        my $href  = $link->attr('href');
        next unless $label;
        # A link without an href cannot be a romset link.
        next unless defined $href && $href =~ /romset/;

        my $title   = cleantext($label);
        my $inetref = $href;
        $inetref =~ s{\#.*$}{};     # drop any fragment
        $inetref =~ s{^/maws/}{};   # drop a leading /maws/ before re-adding it
        $inetref = cleantext($baseurl . "/maws/" . $inetref);

        push(@metaitems, {
            'title'           => $title,
            'inetref'         => $inetref,
            'description'     => "",
            'genre'           => "",
            'studio'          => "",
            'year'            => "",
            'coverthumb'      => "",
            'coverurl'        => "",
            'screenshotthumb' => "",
            'screenshoturl'   => "",
            'popularity'      => ""
        });
    }

    # Only plain strings were copied out, so the tree can be released now.
    $tree->delete;
    return 0;
}
00156
# Return the element immediately to the right of $td. Returns nothing when
# there is no right sibling or the sibling is not an element object.
sub _right_cell {
    my ($td) = @_;

    my $right = $td->right();
    return unless ref $right;
    return $right;
}

# Fetch the page at $inetref, extract the game details from its table cells
# and append one record to the file-level @metaitems.
#   $inetref - URL of the page to fetch; stored unchanged in the record.
# Cells whose text is "title", "manufacturer", "year" or "genre" supply the
# corresponding field from the cell to their right. A "snapshots" cell
# triggers a scan of every link in the page for "in game" and "flyer" links.
# A "rating" cell supplies the popularity as a percentage scaled to tenths.
# 'description', 'coverthumb' and 'screenshotthumb' are always left empty.
# Dies when the page cannot be retrieved or parsed. Returns 0.
sub queryinetref {
    my ($inetref) = @_;

    my $content = get($inetref);
    unless ($content) {
        die "Could not retrieve ${inetref}";
    }
    my $tree = HTML::TreeBuilder->new;
    eval { $tree->parse($content); };
    if ($@) {
        die "$inetref parse failed, $@";
    }
    $tree->eof();

    my %item = (
        'title'           => "",
        'inetref'         => $inetref,
        'description'     => "",
        'genre'           => "",
        'studio'          => "",
        'year'            => "",
        'coverthumb'      => "",
        'coverurl'        => "",
        'screenshotthumb' => "",
        'screenshoturl'   => "",
        'popularity'      => ""
    );

    # Cell label => record field taken from the cell to its right.
    my %field_for = (
        'title'        => 'title',
        'manufacturer' => 'studio',
        'year'         => 'year',
        'genre'        => 'genre',
    );

    foreach my $tr ($tree->find_by_tag_name('tr')) {
        foreach my $td ($tr->find_by_tag_name('td')) {
            my $label = $td->as_trimmed_text();

            if (exists $field_for{$label}) {
                my $right = _right_cell($td) or next;
                my $text = cleantext($right->as_trimmed_text());
                # cleantext() returns nothing for empty text; store "".
                $item{ $field_for{$label} } = defined $text ? $text : "";
            }
            elsif ($label eq "snapshots") {
                # Every link in the page is scanned, not only those in this row.
                foreach my $link ($tree->find_by_tag_name('a')) {
                    my $linktext = $link->as_trimmed_text();

                    # The first "in game" link with an onClick handler wins.
                    if ((!$item{'screenshoturl'}) && $linktext =~ /in game/) {
                        my $onclick = $link->attr('onClick');
                        if ($onclick) {
                            # The image path is the quoted argument of the handler.
                            (my $path = $onclick) =~ s/.*\'(.*)\'.*/$1/;
                            $path =~ s/^\.\.//;
                            $path =~ s/^\///;
                            $item{'screenshoturl'} = cleantext($baseurl . "/" . $path);
                        }
                    }
                    # The last "flyer" link with an href wins.
                    if ($linktext =~ /flyer/) {
                        my $href = $link->attr('href');
                        if (defined $href) {
                            $item{'coverurl'} = cleantext($baseurl . $href);
                        }
                    }
                }
            }
            elsif ($label eq "rating") {
                # Either the sibling cell or its <div> may be missing.
                my $right = _right_cell($td) or next;
                my $div = $right->find_by_tag_name('div') or next;
                my $rating = $div->attr('title');
                if ($rating) {
                    (my $percent = $rating) =~ s/(.*)%.*/$1/;
                    # Scale only when what is left is a plain number.
                    if ($percent =~ /^\s*\d+(?:\.\d+)?\s*$/) {
                        $item{'popularity'} = int(($percent / 10) + 0.5);
                    }
                }
            }
        }
    }

    # Only plain strings were copied out, so the tree can be released now.
    $tree->delete;

    push(@metaitems, \%item);
    return 0;
}
00244
00245 #################################### Main #####################################
getopts('M:D:v');

# -v: describe this grabber and stop.
if ($opt_v) {
    printversion();
    cleanexit(0);
}

# One of -M (search string) or -D (inetref) is required.
if (!$opt_M && !$opt_D) {
    print "Error must have either -M search str or -D inetref\n";
    cleanexit(1);
}

# Route termination signals through cleanexit().
foreach my $signame (qw(INT HUP TERM QUIT)) {
    $SIG{$signame} = \&cleanexit;
}

print "$header\n";

# -M takes precedence when both options are given; one of the two is
# guaranteed to be set at this point.
if ($opt_M) {
    search($opt_M);
}
else {
    queryinetref($opt_D);
}
printitems();

print "$footer\n";

cleanexit(0);
00276