00001 #!/usr/bin/env perl
00002 # @(#)$Header: /home/mythtv/mythtvrep/scripts/topdocumentaryfilm.pl,v 1.17 2010/07/24 23:28:11 mythtv Exp $
00003 # Auric 2010/01/10 http://web.aanet.com.au/auric/
00004 #
00005 # MythNetvision Grabber Script for topdocumentaryfilm site.
00006 #
00007 # If you want to alter any of the default settings.
00008 # Create/Change $HOME/.mythtv/MythNetvision/userGrabberPrefs/topdocumentaryfilm.cfg
00009 # Format of file
00010 # player=mplayer
00011 # playerargs=-fs -zoom %MEDIAURL%
00012 #
00013 # Some settings you can have in this are
00014 # Print info/progress message: 0 - off, 1 - low ,2 - high
00015 # mnvinfo
00016 # Info messages go to: 0 = stderr, filename = filename
00017 # mnvinfoop
00018 # External player to use
00019 # player
00020 # Args to external player %MEDIAURL% will be replaced with content url
00021 # playerargs
00022 # External download to use
00023 # download
00024 # Args to external download %MEDIAURL% will be replaced with content url
00025 # downloadargs
00026 # A network player like a flash or html5 html. TODO 0.24 May not be approved
00027 # netplayer
00028 # Type flash or html5
00029 # netplayertype
00030 # Seconds to cache results Default 72000
00031 # cachetime
00032 #
00033 ################################################################################
00034 use strict;
00035 use warnings;
00036 use Getopt::Std;
00037 use LWP::Simple;
00038 use HTML::TreeBuilder;
00039 use HTML::Entities;
00040 use Data::Dumper;
00041 use Date::Parse;
00042 use Date::Format;
00043 use Encode;
00044 use Storable;
00045 use File::stat;
00046 use File::Basename;
00047 use FindBin '$Bin', '$Script';
00048 use lib "$Bin/nv_perl_libs";
00049 use mnvcommonsubs;
00050
00051 #################################### Settings #################################
00052 # Load from config file. May overwrite above.
00053 mnvloadconfig(fileparse($Script, '.pl'), "notused");
00054 # SKIP completely skips the video
00055 my %autoplay = (
00056 'youtube.com' => '&autoplay=1',
00057 '220.ro' => '&aplay=true',
00058 'megavideo.com' => 'SKIP',
00059 'veoh.com' => 'videoAutoPlay=1',
00060 'crunchyroll.com' => 'auto_play=true',
00061 'mediaservices.myspace.com' => ',AutoPlay=true',
00062 );
00063
00064 #################################### Globals ##################################
00065 my $version = '$Revision: 1.17 $'; $version =~ s/\D*([\d\.]+)\D*/$1/; # rcs tag populated
00066 my $command = "topdocumentaryfilm.pl"; my $commandthumbnail = "topdocumentaryfilm.png"; my $author = "Auric";
00067 my $site = 'TopDocumentaryFilms';
00068 my $description = 'Great collection of documentary movies';
00069 my $baseurl = 'http://topdocumentaryfilms.com/';
00070 my $baseicon = 'http://www.danaroc.com/ezine_pics_031510_websites.jpg';
00071 my $store = "/tmp/.${site}.diritemsref.store";
00072 our ($opt_v, $opt_T, $opt_p, $opt_S);
00073 my %diritems;
00074
00075 #################################### Site Specific Subs ##########################
00076 # Build all vid items for all directories
00077 # input hash ref to { "directory name" => [array of anonymous hash's] }
00078 # anonymous hash {
00079 # 'dirthumbnail' => $icon,
00080 # 'title' => $title,
00081 # 'mythtv:subtitle' => "",
00082 # 'author' => $author,
00083 # 'pubDate' => $pubDate,
00084 # 'description' => $description,
00085 # 'link' => $url,
00086 # 'player' => $player,
00087 # 'playerargs' => $playerargs,
00088 # 'download' => $download,
00089 # 'downloadargs' => $downloadargs,
00090 # 'media:thumbnailurl' => "",
00091 # 'media:contenturl' => $contenturl,
00092 # 'media:contentlength' => $length,
00093 # 'media:contentduration' => "",
00094 # 'media:contentwidth' => "",
00095 # 'media:contentheight' => "",
00096 # 'media:contentlanguage' => $language,
00097 # 'rating' => ""
00098 # 'mythtv:country' => ""
00099 # 'mythtv:season' => ""
00100 # 'mythtv:episode' => ""
00101 # 'mythtv:customhtml' => ""
00102 # }
00103 # Basically this hash ref is what you need to build.
00104 # input base url
00105 # output items found
00106
00107 sub builddiritems {
00108 my $diritemsref = shift @_;
00109 my $baseurl = shift @_;
00110
00111 my $dirurlsref = builddirurls($baseurl);
00112 my $vidurlsref = buildvidurls($dirurlsref);
00113 my $itemsfound = 0;
00114 foreach my $dir (keys(%$vidurlsref)) {
00115 my $diritemsfound = 0;
00116 foreach my $urltitle (@{$vidurlsref->{$dir}}) {
00117 my($url, $title) = @{$urltitle};
00118 my $found = builditems($diritemsref, $dir, $url, $title);
00119 $itemsfound += $found;
00120 $diritemsfound += $found;
00121 }
00122 mnvinfomsg(1, "$dir Items found $diritemsfound");
00123 }
00124 return $itemsfound;
00125 }
00126
00127 sub addautoplay {
00128 my $link = shift @_;
00129
00130 $link = decode_entities($link);
00131 unless ($link =~ s/(.*[?&]autoplay=)false(.*)/${1}true${2}/i) {
00132 unless ($link =~ s/(.*[?&]autostart=)false(.*)/${1}true${2}/i) {
00133 unless ($link =~ s/(.*[?&]aplay=)false(.*)/${1}true${2}/i) {
00134 unless ($link =~ s/(.*[?&]autoplay=)0(.*)/${1}1${2}/i) {
00135 unless ($link =~ s/(.*[?&]autostart=)0(.*)/${1}1${2}/i) {
00136 unless ($link =~ s/(.*[?&]aplay=)0(.*)/${1}1${2}/i) {
00137 foreach my $ap (keys(%autoplay)) {
00138 if ($link =~ /$ap/) {
00139 ($autoplay{$ap}) or return encode_entities($link);
00140 ($autoplay{$ap} eq 'SKIP') and return 0;
00141 if ($autoplay{$ap} =~ /^[\?\&,]/) {
00142 $link .= $autoplay{$ap};
00143 } else {
00144 if ($link =~ /\?/) {
00145 $link .= '&' . $autoplay{$ap};
00146 } else {
00147 $link .= '?' . $autoplay{$ap};
00148 }
00149 }
00150 return encode_entities($link);
00151 }
00152 }
00153 if ($link =~ /\?/) {
00154 $link .= '&' . mnvgetconfig('defaultautoplay');
00155 } else {
00156 $link .= '?' . mnvgetconfig('defaultautoplay');
00157 }
00158 return encode_entities($link);
00159 }
00160 }
00161 }
00162 }
00163 }
00164 }
00165 }
00166
00167 # Collect url's of all the podcasts
00168 # input base url
00169 # return hash ref to { "directory name" => "url" }
00170 sub builddirurls {
00171 my $baseurl = shift @_;
00172
00173 my %dirurls;
00174
00175 mnvinfomsg(1, "Getting $baseurl");
00176 my $content = get($baseurl);
00177 unless ($content) {
00178 die "Could not retrieve $baseurl";
00179 }
00180 my $tree = HTML::TreeBuilder->new;
00181 eval { $tree->parse($content); };
00182 if ($@) {
00183 die "$baseurl parse failed, $@";
00184 }
00185 $tree->eof();
00186
00187 my @ptrs = $tree->find_by_tag_name('a');
00188 foreach my $ptr (@ptrs) {
00189 if ($ptr->attr('href') =~ /topdocumentaryfilms.com\/category\
00190 my $dir = $ptr->as_trimmed_text();
00191 $dirurls{$dir} = mnvcleantext($ptr->attr('href'));
00192 }
00193 }
00194 (keys(%dirurls)) or die "No urls found";
00195
00196 return \%dirurls;
00197 }
00198
00199 # Collect url's to all vids
00200 # return hash ref to { "directory name" => "url" }
00201 # return hash ref to { "directory name" => [[url,title]] }
00202 sub buildvidurls {
00203 my $dirurls = shift @_;
00204
00205 my %vidurls;
00206
00207 foreach my $dir (sort(keys(%$dirurls))) {
00208 mnvinfomsg(1, "Getting $dir $dirurls->{$dir}");
00209 my $content = get($dirurls->{$dir});
00210 unless ($content) {
00211 warn "Could not retrieve $dirurls->{$dir}";
00212 next;
00213 }
00214 my $tree = HTML::TreeBuilder->new;
00215 eval { $tree->parse($content); };
00216 if ($@) {
00217 warn "$dirurls->{$dir} parse failed, $@";
00218 next;
00219 }
00220 $tree->eof();
00221
00222 my @ptrs = $tree->find_by_tag_name('h2');
00223 (@ptrs) or next;
00224
00225 foreach my $ptr (@ptrs) {
00226 my $a = $ptr->find_by_tag_name('a');
00227 ($a) or next;
00228 my $url = mnvcleantext($a->attr('href'));
00229 my $title = mnvcleantext($a->as_trimmed_text());
00230 push(@{$vidurls{$dir}}, [$url, $title]);
00231 }
00232 }
00233 return \%vidurls;;
00234 }
00235
00236 # Build all items
00237 # input hash ref to { "directory name" => [array of anonymous hash's] }
00238 # input "directory name"
00239 # input url
00240 # input title
00241 # output number of items added
00242 sub builditems {
00243 my $diritemsref = shift @_;
00244 my $dir = shift @_;
00245 my $url = shift @_;
00246 my $title = shift @_;
00247
00248 mnvinfomsg(2, "Getting $dir Episode $url");
00249 my $content = get($url);
00250 unless ($content) {
00251 warn "Could not retrieve $url";
00252 return 0;
00253 }
00254 my $tree = HTML::TreeBuilder->new;
00255 eval { $tree->parse($content); };
00256 if ($@) {
00257 warn "$url parse failed, $@";
00258 return 0;
00259 }
00260 $tree->eof();
00261
00262 my $desc = ""; my $icon = $baseicon; my @links;
00263 my $pc = $tree->look_down('class', 'postContent');
00264 ($pc) or return 0;
00265 my $ptr = $pc->find_by_tag_name('p');
00266 ($ptr) and $desc = mnvcleantext($ptr->as_trimmed_text());
00267 $ptr = $pc->find_by_tag_name('img');
00268 ($ptr) and $icon = mnvcleantext($ptr->attr('src'));
00269 my @ptrs = $pc->find_by_tag_name('embed');
00270 foreach my $ptr (@ptrs) {
00271 my $l = mnvcleantext($ptr->attr('src'));
00272 ($l) or next;
00273 my $lap = addautoplay($l);
00274 if ($lap) {
00275 push(@links, $lap);
00276 } else {
00277 mnvinfomsg(2, "Skipped $l");
00278 }
00279 }
00280 (@links) or return 0;
00281
00282 my $country = "";
00283 my $addpart = 1;
00284 my $oldtitle = $title;
00285 foreach my $link (@links) {
00286 if ($#links > 0) {
00287 $title = "$oldtitle Pt $addpart";
00288 $addpart++;
00289 }
00290 push(@{$diritemsref->{$dir}}, {
00291 'dirthumbnail' => $icon,
00292 'title' => $title,
00293 'mythtv:subtitle' => "",
00294 'author' => "",
00295 'pubDate' => "",
00296 'description' => $desc,
00297 'link' => $link,
00298 'player' => mnvgetconfig('player'),
00299 'playerargs' => mnvgetconfig('playerargs'),
00300 'download' => mnvgetconfig('download'),
00301 'downloadargs' => mnvgetconfig('downloadargs'),
00302 'media:thumbnailurl' => $icon,
00303 'media:contenturl' => $link,
00304 'media:contentlength' => "",
00305 'media:contentduration' => "",
00306 'media:contentwidth' => "",
00307 'media:contentheight' => "",
00308 'media:contentlanguage' => "",
00309 'rating' => "",
00310 'mythtv:country' => $country,
00311 'mythtv:season' => "",
00312 'mythtv:episode' => "",
00313 'mythtv:customhtml' => "no"
00314 });
00315
00316 mnvinfomsg(2, "Added $title");
00317 }
00318 return $#links + 1;
00319 }
00320
00321 #################################### Main #####################################
00322 # If you copy this for another site, hopefully these won't need to changed
00323 getopts('vtTp:S:');
00324
00325 if ($opt_v) {
00326 ($mnvcommonsubs::netvisionver == 23) and print "$site|TS\n";
00327 ($mnvcommonsubs::netvisionver > 23) and mnvprintversion($site, $command, $author, $commandthumbnail, $version, $description);
00328 exit 0;
00329 }
00330
00331 my $type; my $page = 1; my $search = "";
00332 if ($opt_T) {
00333 $type = "tree";
00334 } elsif ($opt_S) {
00335 $type = "search";
00336 $search = $opt_S;
00337 ($opt_p) and $page = $opt_p;
00338 } else {
00339 print STDERR "Must have -T or -S option\n";
00340 exit 1;
00341 }
00342
00343 $SIG{'INT'} = \&mnvcleanexit;
00344 $SIG{'HUP'} = \&mnvcleanexit;
00345 $SIG{'TERM'} = \&mnvcleanexit;
00346 $SIG{'QUIT'} = \&mnvcleanexit;
00347
00348 my $diritemsref = \%diritems;
00349 my $totalitems = 0; my $filtereditems = 0;
00350 my $ss = stat($store);
00351 if (($ss) && (time() - $ss->mtime) < mnvgetconfig('cachetime')) {
00352 eval { $diritemsref = retrieve($store); };
00353 if ($@) {
00354 die "Could not load store, $@";
00355 }
00356 $totalitems = mnvnumresults($diritemsref);
00357 mnvinfomsg(1, "Using previous run data");
00358 } else {
00359 $totalitems = builddiritems($diritemsref, $baseurl);
00360 eval { store($diritemsref, $store); };
00361 if ($@) {
00362 warn "Could not save store, $@";
00363 }
00364 }
00365
00366 mnvrssheader();
00367 print '<channel>
00368 <title>'.$site.'</title>
00369 <link>'.$baseurl.'</link>
00370 <description>'.$description.'</description>'."\n";
00371 if ($type eq "search") {
00372 $filtereditems = mnvfilter($diritemsref, $search);
00373 mnvprintsearch($diritemsref, $page);
00374 mnvinfomsg(1, "Total Items match $filtereditems of $totalitems");
00375 } else {
00376 mnvprinttree($diritemsref);
00377 mnvinfomsg(1, "Total Items found $totalitems");
00378 }
00379 print "</channel>\n";
00380 mnvrssfooter();
00381
00382 mnvcleanexit 0;