00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
# Module metadata
__title__ = "tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML"
__author__ = "R.D. Vaughan"
__purpose__ = '''
This python script is intended to perform a variety of utility functions
for the conversion of data to the MNV standard RSS output format.
See this link for the specifications:
http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
'''

__version__ = "v0.1.1"

# Names of the classes in this module that provide XPath functions
# (presumably read by the code that loads this module -- confirm against the loader)
__xpathClassList__ = ['xpathFunctions']

# Counterpart list for XSLT extensions; this module declares none
__xsltExtentionList__ = []
00035
00036 import os, sys, re, time, datetime, shutil, urllib, string
00037 from copy import deepcopy
00038
00039
class OutStreamEncoder(object):
    """Stream wrapper that encodes unicode text before handing it to the real stream"""

    def __init__(self, outstream, encoding=None):
        """Remember the wrapped stream and the encoding used for unicode text.

        outstream - the stream object that receives the output
        encoding  - name of the encoding; when empty or None the file system
                    encoding reported by "sys" is used instead
        """
        self.out = outstream
        self.encoding = encoding or sys.getfilesystemencoding()

    def write(self, obj):
        """Write "obj" to the wrapped stream, encoding it first when it is unicode.

        An IOError raised while writing is ignored on purpose so that a failing
        output stream does not abort the script.
        """
        try:
            if isinstance(obj, unicode):
                obj = obj.encode(self.encoding)
            self.out.write(obj)
        except IOError:
            pass

    def __getattr__(self, attr):
        """Forward every attribute not defined on the wrapper to the wrapped stream"""
        return getattr(self.out, attr)
# Route both standard streams through the encoder so unicode text is written as UTF-8
sys.stdout = OutStreamEncoder(sys.stdout, 'utf8')
sys.stderr = OutStreamEncoder(sys.stderr, 'utf8')

# Report a readable error and stop when the required libraries cannot be imported
try:
    from StringIO import StringIO
    from lxml import etree
except Exception as e:
    sys.stderr.write(u'\n! Error - Importing the "lxml" and "StringIO" python libraries failed on error(%s)\n' % e)
    sys.exit(1)
00074
00075
00076
00077
00078
# Check that the "libxml" library used by lxml is recent enough.
# The version is compared as a tuple of integers: comparing the dotted strings
# orders them character by character and would wrongly treat e.g. "2.10.0"
# as older than "2.7.2".
version = '.'.join([str(digit) for digit in etree.LIBXML_VERSION])
if tuple(etree.LIBXML_VERSION) < (2, 7, 2):
    sys.stderr.write(u'''
! Error - The installed version of the "lxml" python library "libxml" version is too old.
          At least "libxml" version 2.7.2 must be installed. Your version is (%s).
''' % version)
    sys.exit(1)
00089
00090
class xpathFunctions(object):
    """XPath extension functions for the TedTalks RSS/HTML sources.

    The method names listed in "self.functList" are the ones shown being called
    from XPath expressions in the "Call example" of each method docstring.

    NOTE(review): "common" is used by "tedtalksMakeItem" but is neither imported
    nor defined in this file; presumably the code that loads this module injects
    it as a module global -- confirm against the loader.
    """
    def __init__(self):
        # Names of the methods of this class offered as XPath functions
        self.functList = ['tedtalksMakeItem', 'tedtalksGetItem', 'tedtalksMakeLink', 'tedtalksTitleRSS', ]
        self.namespaces = {
            'media': u"http://search.yahoo.com/mrss/",
            'xhtml': u"http://www.w3.org/1999/xhtml",
            'mythtv': "http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format",
            }
        # Pre-compiled XPath expressions applied to a talk's HTML page
        self.descriptionFilter = etree.XPath('//p[@id="tagline"]', namespaces=self.namespaces)
        self.durationFilter = etree.XPath('//dl[@class="talkMedallion clearfix"]//em[@class="date"]/text()', namespaces=self.namespaces)
        # Elements built by tedtalksMakeItem, keyed by page URL, until tedtalksGetItem collects them
        self.persistence = {}
        self.flvPlayerLink = u'http://static.hd-trailers.net/mediaplayer/player.swf?autostart=true&backcolor=000000&frontcolor=999999&lightcolor=000000&screencolor=000000&controlbar=over&file=%s'

    def tedtalksMakeItem(self, context, *arg):
        '''Generate item elements from a Video HTML page on the TedTalks site.
        Call example: 'mnvXpath:tedtalksMakeItem(concat('http://www.ted.com', normalize-space(./@href), $paraMeter))/link'
        arg[0] is the URL of the talk's page, arg[1] the parameter string
        understood by "parameterArgs" (the keys 'flv' and 'download' are read).
        A copy of the result is kept in "self.persistence" under the page URL
        for a later "tedtalksGetItem" call.
        return an element holding the pubDate, description, link and media:group
        children, or an empty "xml" element when the page could not be read
        raise KeyError when a needed parameter key is missing
        '''
        webURL = arg[0]
        parmDict = self.parameterArgs(arg[1])

        # Read the talk's HTML page; the handle is closed even if reading or decoding fails
        try:
            tmpHandle = urllib.urlopen(webURL)
            try:
                htmlString = tmpHandle.read().decode('utf-8')
            finally:
                tmpHandle.close()
        except Exception as errmsg:
            # Any failure here is reported and an empty element returned
            sys.stderr.write(u'! Error: TedTalk web page read issue for URL(%s)\nerror(%s)\n' % (webURL, errmsg))
            return etree.XML(u"<xml></xml>")

        htmlElementTree = etree.HTML(htmlString)

        # Create the base element that will contain the item's data
        mediaNamespace = "http://search.yahoo.com/mrss/"
        media = "{%s}" % mediaNamespace
        NSMAP = {'media': mediaNamespace}
        elementTmp = etree.Element(media + "media", nsmap=NSMAP)

        # Publication date: the page carries text after 'pd:"' which is parsed as "%b %Y";
        # a day of "1" is prepended to make it a full date
        tmpPubDate = self.stripSubstring(htmlString, '\tpd:"', '"')
        if tmpPubDate:
            tmpPubDate = common.pubDate('dummy', u'1 ' + tmpPubDate, "%d %b %Y")
        else:
            tmpPubDate = common.pubDate('dummy', u'')

        # Flash player link: swap 'high' in the video path for the requested 'flv' value,
        # or fall back to the web page itself when no video path is found
        flvPath = self.stripSubstring(htmlString, '\ths:"', '"')
        if flvPath:
            flvURL = u'http://video.ted.com/%s' % flvPath.replace('high', parmDict['flv'])
            tmpFlvLink = self.flvPlayerLink % flvURL
        else:
            tmpFlvLink = webURL

        # Download link built from the file name found between 'talks/dynamic/' and the next '-'
        tmpFileName = self.stripSubstring(htmlString, '\ths:"talks/dynamic/', '-')
        tmpDownloadLink = u'http://video.ted.com/talks/podcast/%s' % tmpFileName
        if parmDict['download'] == 'HD':
            tmpDownloadLink += '_480.mp4'
        else:
            tmpDownloadLink += '.mp4'

        # Thumbnail URL
        tmpThumbNail = self.stripSubstring(htmlString, 'amp;su=', '&')

        # Description: text of the first "tagline" paragraph, if any
        tmpDesc = self.descriptionFilter(htmlElementTree)
        if len(tmpDesc):
            tmpDesc = tmpDesc[0].text
        else:
            tmpDesc = u''

        # Duration: the part of the first matching text node before its first space
        tmpDuration = u''
        durationText = self.durationFilter(htmlElementTree)
        if len(durationText):
            index = durationText[0].find(' ')
            if index != -1:
                tmpDuration = common.convertDuration('dummy', durationText[0][:index])

        # Add the collected data to the element
        etree.SubElement(elementTmp, "pubDate").text = tmpPubDate
        etree.SubElement(elementTmp, "description").text = tmpDesc
        etree.SubElement(elementTmp, "link").text = tmpFlvLink
        tmpgroup = etree.SubElement(elementTmp, media + "group")
        tmpTNail = etree.SubElement(tmpgroup, media + "thumbnail")
        tmpTNail.attrib['url'] = tmpThumbNail
        tmpContent = etree.SubElement(tmpgroup, media + "content")
        tmpContent.attrib['url'] = tmpDownloadLink
        tmpContent.attrib['duration'] = tmpDuration
        tmpContent.attrib['lang'] = u'en'

        self.persistence[webURL] = deepcopy(elementTmp)
        return elementTmp

    def tedtalksGetItem(self, context, *arg):
        '''Return item elements that were previously created in "tedtalksMakeItem" call
        Call example: 'mnvXpath:tedtalksGetItem(concat('http://www.ted.com', normalize-space(./@href))/*'
        The stored element is removed from "self.persistence" once returned.
        return an number of item elements
        raise KeyError when nothing is stored for the URL in arg[0]
        '''
        return self.persistence.pop(arg[0])

    def tedtalksMakeLink(self, context, *arg):
        '''Build the flash player link for a talk from its download link
        Call example: 'mnvXpath:tedtalksMakeLink(enclosure/@url, $paraMeter)'
        arg[0] is the download link, arg[1] the parameter string understood by
        "parameterArgs" (the key 'flv' is read).
        return a link for playing the flv file
        raise KeyError when the 'flv' parameter is missing
        '''
        tmpDownloadLink = arg[0]
        parmDict = self.parameterArgs(arg[1])
        index = tmpDownloadLink.rfind('/')
        if index == -1:
            # No path separator: the whole value is the file name
            fileName = u'/' + tmpDownloadLink
        else:
            fileName = tmpDownloadLink[index:]
        videoFileName = u'http://video.ted.com/talks/dynamic%s' % fileName.replace('_480', u'').replace('.mp4', u'')
        videoFileName += u'-%s.flv' % parmDict['flv']
        return self.flvPlayerLink % videoFileName

    def tedtalksTitleRSS(self, context, *arg):
        '''Remove the text that follows the last '-' from a title
        Call example: 'mnvXpath:tedtalksTitleRSS(string(title))'
        A title without a '-' is returned unchanged.
        return a massaged title string
        '''
        title = arg[0]
        index = title.rfind('-')
        if index == -1:
            return title
        return title[:index].strip()

    def stripSubstring(self, string, startText, terminatorChar):
        '''Return the text found between "startText" and the next "terminatorChar"
        Surrounding white space is removed from the result. An empty string is
        returned when either the start text or the terminator is not found.
        return a substring
        '''
        start = string.find(startText)
        if start == -1:
            return u''
        start += len(startText)
        end = string.find(terminatorChar, start)
        if end == -1:
            return u''
        return string[start:end].strip()

    def parameterArgs(self, parameters, terminatorChar=u';'):
        '''Set the parameters for TedTalks
        "parameters" is a string of "key=value" entries separated by
        "terminatorChar". Entries without an '=' (such as the empty entry left
        by a trailing separator) are skipped, and only the first '=' of an entry
        separates the key from its value.
        return a dictionary of parameters
        '''
        paramDict = {}
        for arg in parameters.split(terminatorChar):
            key, separator, value = arg.partition('=')
            if not separator:
                continue
            paramDict[key] = value
        return paramDict
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275