(.*?):(.*?)
(.*?)
')
- if not html_title:
- html_title = findRe(data, '
(.*?) ')
- if html_title:
- html_title = html_title.replace('
', ' ').replace(' ', ' ')
- title = decodeHtml(html_title)
- title = stripTags(title)
- year = findRe(title, '\((\d{4})\)')
- if not year:
- year = findRe(title, '\((\d{4})')
- _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
- if _y:
- title = title.replace(_y, '')
- for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
- title = title.replace(t, '')
- title = title.strip()
- if title.find(u'\xa0') > -1:
- title = title[:title.find(u'\xa0')].strip()
- if title.startswith('"') and title.endswith('"'):
- title = title[1:-1]
- info['title'] = title
- info['year'] = year
+ #get Title
+ title = ''
+ year = ''
+ html_title = findRe(data, '
(.*?)
')
+ if not html_title:
+ html_title = findRe(data, '
(.*?) ')
+ if html_title:
+ html_title = html_title.replace('
', ' ').replace(' ', ' ')
+ title = decodeHtml(html_title)
+ title = stripTags(title)
+ year = findRe(title, '\((\d{4})\)')
+ if not year:
+ year = findRe(title, '\((\d{4})')
+ _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
+ if _y:
+ title = title.replace(_y, '')
+ for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
+ title = title.replace(t, '')
+ title = title.strip()
+ if title.find(u'\xa0') > -1:
+ title = title[:title.find(u'\xa0')].strip()
+ if title.startswith('"') and title.endswith('"'):
+ title = title[1:-1]
+ info['title'] = title
+ info['year'] = year
- #Rating
- rating = findRe(data, '
([\d\.]*?)/10 ')
- if rating:
- info['rating'] = float(rating)
- else:
- info['rating'] = -1
+ #Rating
+ rating = findRe(data, '
([\d\.]*?)/10 ')
+ if rating:
+ info['rating'] = float(rating)
+ else:
+ info['rating'] = -1
- #Votes
- votes = findRe(data, '
\((.*?) votes \) ')
- if votes:
- info['votes'] = int(votes.replace(',', ''))
- else:
- info['votes'] = -1
- return info
+ #Votes
+ votes = findRe(data, '
\((.*?) votes \) ')
+ if votes:
+ info['votes'] = int(votes.replace(',', ''))
+ else:
+ info['votes'] = -1
+ return info
def getMoviePoster(imdbId):
    """Return the 'poster' entry of the dict built by getMovieInfo(imdbId)."""
    info = getMovieInfo(imdbId)
    return info['poster']
def getMovieYear(imdbId):
    """Return the 'year' entry of the dict built by getMovieInfo(imdbId)."""
    info = getMovieInfo(imdbId)
    return info['year']
def getMovieTitle(imdbId):
    """Return the 'title' entry of the dict built by getMovieInfo(imdbId)."""
    info = getMovieInfo(imdbId)
    return info['title']
def creditList(data, section=None):
- if section == 'cast':
- credits_ = re.compile('''
(.*?).*? (.*?) ''').findall(data)
- else:
- credits_ = re.compile('''
.*?(.*?) (.*?) ''').findall(data)
- credits = []
- for c_ in credits_:
- c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
- if section=='writers':
- c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','')
- if c[1].endswith(' and'): c[1] = c[1][:-4]
- credits.append(c)
- return credits
+ if section == 'cast':
+ credits_ = re.compile('''
(.*?).*? (.*?) ''').findall(data)
+ else:
+ credits_ = re.compile('''
.*?(.*?) (.*?) ''').findall(data)
+ credits = []
+ for c_ in credits_:
+ c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
+ if section=='writers':
+ c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','')
+ if c[1].endswith(' and'): c[1] = c[1][:-4]
+ credits.append(c)
+ return credits
def getMovieCredits(imdbId):
- credits = dict()
- url = "%s/fullcredits" % getUrlBase(imdbId)
- data = getUrlUnicode(url)
- groups = data.split('
')
- for g in groups:
- section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
- if section:
- credits[section[0]] = creditList(g, section[0])
- return credits
+ credits = dict()
+ url = "%s/fullcredits" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ groups = data.split('')
+ for g in groups:
+ section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
+ if section:
+ credits[section[0]] = creditList(g, section[0])
+ return credits
def getMovieTrailers(imdbId):
    """Collect the trailers linked from the title's "/trailers" page.

    The page is fetched with getUrlUnicode and parsed with BeautifulSoup.
    Only the first <div class="video-gallery"> is inspected; every anchor
    inside it produces one dict with the keys:

      'title'  - anchor markup with tags stripped
      'url'    - 'http://www.imdb.com' + the anchor's href
      'iframe' - player URL built from the 'vi...' id found in 'url'
      'flv'    - unquoted value of the player's addVariable("file", ...)

    Returns a list of such dicts, empty when no gallery div is present.
    Note that one extra request is made per trailer to read its player page.
    """
    pageUrl = "%s/trailers" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(pageUrl))
    videos = soup('div', {'class':"video-gallery"})
    if not videos:
        return []

    trailers = []
    for a in videos[0]('a'):
        title = stripTags(unicode(a)).strip()
        url = 'http://www.imdb.com' + a['href']
        videoId = findRe(url, r'/(vi\d*?)/')
        iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
        iframe = getUrlUnicode(iframeUrl)
        videoUrl = unquote(findRe(iframe, r'addVariable\("file", "(.*?)"'))
        trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
    return trailers
def getMovieQuotes(imdbId):
- url = "%s/quotes" % getUrlBase(imdbId)
- data = getUrlUnicode(url)
- quotes = re.compile('(.*?) :(.*?) ', re.DOTALL).findall(findString(data, '(.*?):(.*?) ', re.DOTALL).findall(findString(data, ' (.*?)')
- return plot
+ url = "%s/plotsummary" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ plot = findRe(data, ' (.*?)')
+ return plot
def getMovieTechnical(imdbId):
    """Return a dict of technical details scraped from the "/technical" page.

    Every match of the two-group pattern contributes one entry: the first
    group (stripped) is the key and the second group (stripped) the value.
    A key that occurs more than once keeps its last value.
    """
    url = "%s/technical" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # NOTE(review): this pattern contains only capture groups and spaces; it
    # looks like the HTML tags that delimited the groups were lost. Restore
    # the original markup delimiters before relying on this function.
    pairs = re.compile('(.*?) (.*?) ', re.DOTALL).findall(data)
    return dict((name.strip(), value.strip()) for name, value in pairs)
def getMovieCompanyCredits(imdbId):
    """Return a dict mapping a credit field name to a list of companies.

    The "/companycredits" page is scanned for (field, block) pairs; each
    block is then scanned again for the company entries of that field.
    """
    url = "%s/companycredits" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # NOTE(review): both patterns below are the same single-group expression
    # with no markup in it; the HTML tags they matched appear to have been
    # lost. With a single group, findall() yields plain strings, which the
    # "field, c" unpacking below cannot handle in general. Restore the
    # original two-group outer pattern and the inner pattern before use.
    pattern = re.compile('(.*?) ')
    results = {}
    for field, c in pattern.findall(data):
        results[field.strip()] = pattern.findall(c)
    return results
def getMovieLocations(imdbId):
    """Return the filming locations listed on the title's "/locations" page.

    Each anchor whose href starts with "/List" contributes its text, passed
    through decodeHtml. Returns a list, empty when no such anchor exists.
    """
    url = "%s/locations" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(url))
    anchors = soup('a', {'href': re.compile('^/List')})
    return [decodeHtml(anchor.string) for anchor in anchors]
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
- photos = {}
- for key in keys:
- url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
- data = getUrlUnicode(url)
- photos[key] = {}
- for s in re.compile(''' ', '').strip()
- if t.startswith('') and t.endswith(' '):
- t = t[4:-5].strip()
- t=decodeHtml(t)
- trivia.append(t)
- return trivia
+ url = "%s/trivia" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ soup = BeautifulSoup(data)
+ trivia = []
+ triviaList = []
+ for i in soup('ul', {'class': "trivia"}):
+ for t in i('li'):
+ t = unicode(t).replace(' ', '').strip()
+ if t.startswith('') and t.endswith(' '):
+ t = t[4:-5].strip()
+ t=decodeHtml(t)
+ trivia.append(t)
+ return trivia
def getMovieConnections(imdbId):
- url = "%s/movieconnections" % getUrlBase(imdbId)
- data = getUrl(url)
- connections={}
- for c in re.compile('''(.*?) (.*?)\n\n''', re.DOTALL).findall(data):
- connections[unicode(c[0])] = re.compile('''''').findall(c[1])
- return connections
+ url = "%s/movieconnections" % getUrlBase(imdbId)
+ data = getUrl(url)
+ connections={}
+ for c in re.compile('''(.*?) (.*?)\n\n''', re.DOTALL).findall(data):
+ connections[unicode(c[0])] = re.compile(''' ''').findall(c[1])
+ return connections
def getMovieKeywords(imdbId):
- url = "%s/keywords" % getUrlBase(imdbId)
- data = getUrlUnicode(url)
- keywords = []
- for keyword in re.compile(''' (.*?)
''').findall(data):
- keyword = decodeHtml(keyword)
- keyword = keyword.replace(u'\xa0', ' ')
- keywords.append(keyword)
- return keywords
+ url = "%s/keywords" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ keywords = []
+ for keyword in re.compile('''(.*?)''').findall(data):
+ keyword = decodeHtml(keyword)
+ keyword = keyword.replace(u'\xa0', ' ')
+ keywords.append(keyword)
+ return keywords
def getMovieExternalReviews(imdbId):
    """Return {href: link text} for the title's "/externalreviews" page.

    Only the first <ol> on the page is inspected. For each <li> in it the
    first anchor's href and first child node are recorded. List items that
    cannot be read (for example, ones without an anchor) are skipped.
    Returns an empty dict when the page has no <ol>.
    """
    url = "%s/externalreviews" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(url))
    lists = soup('ol')
    if not lists:
        return {}

    ret = {}
    for li in lists[0]('li'):
        try:
            a = li('a')[0]
            href = a.get('href')
            txt = a.contents[0]
            ret[href] = txt
        except Exception:
            # Best effort: a malformed item must not abort the whole scrape.
            # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
            pass
    return ret
def getMovieReleaseDate(imdbId):
    """Return the earliest release date reported by getMovieReleaseDates.

    The date is the second element of each release tuple. Dates are compared
    as strings, which orders 'YYYY-MM-DD' values chronologically; entries
    that getMovieReleaseDates could not parse stay in their raw form and are
    compared lexicographically too. Returns '' when there are no releases.
    """
    first_release = ''
    for release in getMovieReleaseDates(imdbId):
        date = release[1]
        # An empty running value is always replaced, so a blank date never
        # wins over a later non-blank one.
        if not first_release or date < first_release:
            first_release = date
    return first_release
def getMovieReleaseDates(imdbId):
    """Return the release entries found on the title's "/releaseinfo" page.

    Each entry is a 3-tuple built from the three groups of one pattern match:
    (group 1 with tags stripped, group 2 as a date, group 3 with tags
    stripped and HTML entities decoded). The date is rewritten from
    "%d %B %Y" to "%Y-%m-%d" when it parses, and left untouched otherwise.
    """
    url = "%s/releaseinfo" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # NOTE(review): this pattern has no markup left in it; the HTML tags that
    # delimited the three groups appear to have been lost. Restore the
    # original delimiters before relying on the result.
    regexp = '''(.*?) .*?(.*?) .*?(.*?) '''

    def _parse_date(d):
        # Normalise "1 January 2000" to "2000-01-01"; hand back anything
        # that does not match that format unchanged.
        try:
            return time.strftime('%Y-%m-%d', time.strptime(d, "%d %B %Y"))
        except (ValueError, TypeError):
            return d

    releasedates = []
    for r in re.compile(regexp, re.DOTALL).findall(data):
        releasedates.append((
            stripTags(r[0]).strip(),
            _parse_date(stripTags(r[1]).strip()),
            decodeHtml(stripTags(r[2]).strip()),
        ))
    return releasedates
def getMovieBusinessSum(imdbId):
    """Summarise the figures returned by getMovieBusiness as three totals.

    Returns a dict with:
      'budget' - sum of all 'budget' entries (0 if absent)
      'gross'  - sum of all 'gross' entries plus all 'weekend gross' entries
      'profit' - gross minus budget, computed only when both are non-zero
    Each entry has its commas removed and is passed through intValue before
    being converted with int().
    """
    business = getMovieBusiness(imdbId)

    def _total(key):
        # Add up every entry stored under *key*.
        return sum([int(intValue(i.replace(',', ''))) for i in business[key]])

    b_ = {'budget': 0, 'gross': 0, 'profit': 0}
    if 'budget' in business:
        b_['budget'] = _total('budget')
    if 'gross' in business:
        b_['gross'] = _total('gross')
    if 'weekend gross' in business:
        b_['gross'] += _total('weekend gross')
    if b_['budget'] and b_['gross']:
        b_['profit'] = b_['gross'] - b_['budget']
    return b_
def getMovieFlimingDates(imdbId):
    """Return the first 'filming dates' entry from getMovieBusiness, or ''.

    The misspelt function name is kept because callers depend on it.
    """
    business = getMovieBusiness(imdbId)
    if 'filming dates' in business:
        dates = business['filming dates']
        if dates:
            return dates[0]
    return ''
def getMovieBusiness(imdbId):
    """Return the sections of the title's "/business" page as a dict.

    Each pattern match yields one entry: the key is the first group with
    tags stripped, whitespace trimmed and lower-cased; the value is a list
    obtained by splitting the second group and cleaning every piece with
    stripTags, strip and decodeHtml.
    """
    url = "%s/business" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # NOTE(review): the pattern and the split separator below contain no
    # markup; the HTML tags they referred to appear to have been lost.
    # Restore the original delimiters before relying on the result.
    pattern = re.compile('''(.*?) (.*?) . ''', re.DOTALL)
    business = {}
    for heading, body in pattern.findall(data):
        key = stripTags(heading).strip().lower()
        business[key] = [decodeHtml(stripTags(b).strip()) for b in body.split(' ')]
    return business
def getMovieEpisodes(imdbId):
    """Return the episodes listed on the title's "/episodes" page.

    The result maps "SxxEyy" (zero-padded season and episode numbers) to a
    dict with the keys 'imdb', 'title', 'description' and 'date'.

    * 'title' is blanked when it is just the placeholder "Episode #<season>...".
    * 'description' is cut at "Next US airings:" and has its tags stripped.
    * 'date' is "%Y-%m-%d" when the air date parses as "%d %B %Y", else ''.

    A match that cannot be processed prints a traceback and is skipped;
    whatever was stored for it before the failure stays in the result.
    """
    url = "%s/episodes" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    episodes = {}
    # NOTE(review): as shown, this pattern has five groups while the loop
    # reads r[5]; the HTML tags between the groups (and probably one group)
    # appear to have been lost. Restore the original pattern before use.
    regexp = r'''Season (.*?), Episode (.*?): (.*?) (.*?) (.*?) '''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            season = int(r[0])
            episode = "S%02dE%02d" % (season, int(r[1]))
            entry = episodes[episode] = {}
            entry['imdb'] = r[2]
            entry['title'] = r[3].strip()
            if entry['title'].startswith('Episode #%d' % season):
                entry['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            entry['description'] = description.strip()
            entry['date'] = ''
            try:
                d = stripTags(r[4])
                d = d.replace('Original Air Date: ', '')
                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
                entry['date'] = d
            except Exception:
                # Unparseable air date: keep the empty default.
                pass
        except Exception:
            import traceback
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in a print.
            traceback.print_exc()
    return episodes
'''the old code below'''
class IMDb:
- def __init__(self, imdbId):
- self.imdb = imdbId
- self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
+ def __init__(self, imdbId):
+ self.imdb = imdbId
+ self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
- def getPage(self):
- return getUrlUnicode(self.pageUrl)
+ def getPage(self):
+ return getUrlUnicode(self.pageUrl)
- def parse_raw_value(self, key, value):
- if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
- value = stripTags(value).strip()
- if key == 'runtime':
- parsed_value = findRe(value, '(.*?) min')
- parsed_value = findRe(parsed_value, '([0-9]+)')
- if not parsed_value:
- parsed_value = findRe(value, '(.*?) sec')
- parsed_value = findRe(parsed_value, '([0-9]+)')
- if not parsed_value:
- parsed_value = 0
+ def parse_raw_value(self, key, value):
+ if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
+ value = stripTags(value).strip()
+ if key == 'runtime':
+ parsed_value = findRe(value, '(.*?) min')
+ parsed_value = findRe(parsed_value, '([0-9]+)')
+ if not parsed_value:
+ parsed_value = findRe(value, '(.*?) sec')
+ parsed_value = findRe(parsed_value, '([0-9]+)')
+ if not parsed_value:
+ parsed_value = 0
+ else:
+ parsed_value = int(parsed_value)
+ else:
+ parsed_value = int(parsed_value) * 60
+ elif key in ('country', 'language'):
+ parsed_value = value.split(' / ')
+ if len(parsed_value) == 1:
+ parsed_value = parsed_value[0].split(' | ')
+ parsed_value = [v.strip() for v in parsed_value]
+ elif key == 'genre':
+ parsed_value = value.replace('more', '').strip().split(' / ')
+ if len(parsed_value) == 1:
+ parsed_value = parsed_value[0].split(' | ')
+ parsed_value = [v.strip() for v in parsed_value]
+ elif key == 'tagline':
+ parsed_value = value.replace('more', '').strip()
+ elif key == 'plot_outline':
+ parsed_value = value.replace('(view trailer)', '').strip()
+ if parsed_value.endswith('more'):
+ parsed_value = parsed_value[:-4].strip()
+ elif key == 'tv_series':
+ m = re.compile('(.*?) ').findall(value)
+ if m:
+ parsed_value = m[0][0]
+ else:
+ parsed_value = ''
+ elif key == 'also_known_as':
+ parsed_value = ''
+ m = re.compile('(.*) \(International: English title').findall(value)
+ if m:
+ parsed_value = m[0]
+ else:
+ m = re.compile('(.*) \(USA').findall(value)
+ if m:
+ parsed_value = m[0]
+ parsed_value = parsed_value.split(' ')[-1].split('(')[0]
+ director = self.getCredits().get('director', None)
+ if director:
+ director = director[0]
+ parsed_value = parsed_value.replace(director, '')
+ if parsed_value.startswith("'s"):
+ parsed_value = parsed_value[2:].strip()
+ parsed_value = decodeHtml(parsed_value.strip())
else:
- parsed_value = int(parsed_value)
- else:
- parsed_value = int(parsed_value) * 60
- elif key in ('country', 'language'):
- parsed_value = value.split(' / ')
- if len(parsed_value) == 1:
- parsed_value = parsed_value[0].split(' | ')
- parsed_value = [v.strip() for v in parsed_value]
- elif key == 'genre':
- parsed_value = value.replace('more', '').strip().split(' / ')
- if len(parsed_value) == 1:
- parsed_value = parsed_value[0].split(' | ')
- parsed_value = [v.strip() for v in parsed_value]
- elif key == 'tagline':
- parsed_value = value.replace('more', '').strip()
- elif key == 'plot_outline':
- parsed_value = value.replace('(view trailer)', '').strip()
- if parsed_value.endswith('more'):
- parsed_value = parsed_value[:-4].strip()
- elif key == 'tv_series':
- m = re.compile('(.*?) ').findall(value)
- if m:
- parsed_value = m[0][0]
- else:
- parsed_value = ''
- elif key == 'also_known_as':
- parsed_value = ''
- m = re.compile('(.*) \(International: English title').findall(value)
- if m:
- parsed_value = m[0]
- else:
- m = re.compile('(.*) \(USA').findall(value)
+ print value
+ parsed_value = value
+ return parsed_value
+
+ def parseTitle(self):
+ title = getMovieTitle(self.imdb)
+ title = normalizeTitle(title)
+ if title.startswith('"') and title.find('"',1) > 0 and \
+ title.find('"',1) == title.rfind('"'):
+ data = self.getPage()
+ se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
+ if se:
+ se = se[0]
+ se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
+ title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
+ else:
+ part2 = title[title.rfind('"')+1:]
+ part2 = re.sub("[\d\?-]", "", part2).strip()
+ title = normalizeTitle(title[1:title.rfind('"')])
+ if part2:
+ title += ':' + part2
+ return normalizeTitle(title)
+
+ def parseYear(self):
+ year = ''
+ data = self.getPage()
+ soup = BeautifulSoup(data)
+ html_title = soup('div', {'id': 'tn15title'})
+ if not html_title:
+ html_title = soup('title')
+ if html_title:
+ html_title = unicode(html_title[0])
+ html_title = stripTags(html_title)
+ year = re.compile('\((\d{4})\)').findall(html_title)
+ if not year:
+ year = re.compile('\((\d{4})/').findall(html_title)
+ if year:
+ year = year[0]
+ else: year = ''
+ return year
+
+ def parse(self):
+ data = self.getPage()
+ IMDbDict ={}
+ #Poster
+ IMDbDict['poster'] = getMoviePoster(self.imdb)
+ if not IMDbDict['poster']:
+ IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
+ #Title, Year
+ IMDbDict['year'] = self.parseYear()
+ IMDbDict['title'] = self.parseTitle()
+
+ #Rating
+ m = re.compile('(.*?)/10 ', re.IGNORECASE).search(data)
if m:
- parsed_value = m[0]
- parsed_value = parsed_value.split(' ')[-1].split('(')[0]
- director = self.getCredits().get('director', None)
- if director:
- director = director[0]
- parsed_value = parsed_value.replace(director, '')
- if parsed_value.startswith("'s"):
- parsed_value = parsed_value[2:].strip()
- parsed_value = decodeHtml(parsed_value.strip())
- else:
- print value
- parsed_value = value
- return parsed_value
+ IMDbDict['rating'] = int(float(m.group(1)) * 1000)
+ else:
+ IMDbDict['rating'] = -1
+ #Votes
+ m = re.compile('\((.*?) votes \) ', re.IGNORECASE).findall(data)
+ if m:
+ IMDbDict['votes'] = int(m[0].replace(',', ''))
+ else:
+ IMDbDict['votes'] = -1
- def parseTitle(self):
- title = getMovieTitle(self.imdb)
- title = normalizeTitle(title)
- if title.startswith('"') and title.find('"',1) > 0 and \
- title.find('"',1) == title.rfind('"'):
- data = self.getPage()
- se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
- if se:
- se = se[0]
- se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
- title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
- else:
- part2 = title[title.rfind('"')+1:]
- part2 = re.sub("[\d\?-]", "", part2).strip()
- title = normalizeTitle(title[1:title.rfind('"')])
- if part2:
- title += ':' + part2
- return normalizeTitle(title)
+ data = data.replace('\n',' ')
+ #some values
+ keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
+ for key in keys:
+ IMDbDict[key] = ''
+ IMDbDict['runtime'] = 0
+ soup = BeautifulSoup(data)
+ for info in soup('div', {'class': 'info'}):
+ key = unicode(info).split(' ')[0].split('')
+ if len(key) > 1:
+ raw_value = unicode(info).split(' ')[1]
+ key = key[1][:-1].lower().replace(' ', '_')
+ if key in keys:
+ IMDbDict[key] = self.parse_raw_value(key, raw_value)
+ IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
+ #is episode
+ IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
- def parseYear(self):
- year = ''
- data = self.getPage()
- soup = BeautifulSoup(data)
- html_title = soup('div', {'id': 'tn15title'})
- if not html_title:
- html_title = soup('title')
- if html_title:
- html_title = unicode(html_title[0])
- html_title = stripTags(html_title)
- year = re.compile('\((\d{4})\)').findall(html_title)
- if not year:
- year = re.compile('\((\d{4})/').findall(html_title)
- if year:
- year = year[0]
- else: year = ''
- return year
+ IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
+ if IMDbDict['episodes']:
+ IMDbDict['tvshow'] = True
+ else:
+ IMDbDict['tvshow'] = False
+ IMDbDict['credits'] = self.getCredits()
+ IMDbDict['plot'] = getMoviePlot(self.imdb)
+ IMDbDict['keywords'] = getMovieKeywords(self.imdb)
+ IMDbDict['trivia'] = getMovieTrivia(self.imdb)
+ IMDbDict['connections'] = getMovieConnections(self.imdb)
+ IMDbDict['locations'] = getMovieLocations(self.imdb)
+ IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
+ IMDbDict['business'] = getMovieBusinessSum(self.imdb)
+ IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
+ IMDbDict['stills'] = getMovieStills(self.imdb)
+ #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
+ self.IMDbDict = IMDbDict
- def parse(self):
- data = self.getPage()
- IMDbDict ={}
- #Poster
- IMDbDict['poster'] = getMoviePoster(self.imdb)
- if not IMDbDict['poster']:
- IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
- #Title, Year
- IMDbDict['year'] = self.parseYear()
- IMDbDict['title'] = self.parseTitle()
+ if IMDbDict['episode_of']:
+ episode_of =IMDb(IMDbDict['episode_of']).parse()
+ for key in ('country', 'language'):
+ if not IMDbDict[key]:
+ IMDbDict[key] = episode_of[key]
+ return self.IMDbDict
- #Rating
- m = re.compile('(.*?)/10 ', re.IGNORECASE).search(data)
- if m:
- IMDbDict['rating'] = int(float(m.group(1)) * 1000)
- else:
- IMDbDict['rating'] = -1
- #Votes
- m = re.compile('\((.*?) votes \) ', re.IGNORECASE).findall(data)
- if m:
- IMDbDict['votes'] = int(m[0].replace(',', ''))
- else:
- IMDbDict['votes'] = -1
+ def getCredits(self):
+ raw_credits = getMovieCredits(self.imdb)
+ credits = {}
- data = data.replace('\n',' ')
- #some values
- keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
- for key in keys:
- IMDbDict[key] = ''
- IMDbDict['runtime'] = 0
- soup = BeautifulSoup(data)
- for info in soup('div', {'class': 'info'}):
- key = unicode(info).split(' ')[0].split('
')
- if len(key) > 1:
- raw_value = unicode(info).split(' ')[1]
- key = key[1][:-1].lower().replace(' ', '_')
- if key in keys:
- IMDbDict[key] = self.parse_raw_value(key, raw_value)
- IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
- #is episode
- IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
+ def getNames(creditList):
+ return [stripTags(decodeHtml(c[0])) for c in creditList]
- IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
- if IMDbDict['episodes']:
- IMDbDict['tvshow'] = True
- else:
- IMDbDict['tvshow'] = False
- IMDbDict['credits'] = self.getCredits()
- IMDbDict['plot'] = getMoviePlot(self.imdb)
- IMDbDict['keywords'] = getMovieKeywords(self.imdb)
- IMDbDict['trivia'] = getMovieTrivia(self.imdb)
- IMDbDict['connections'] = getMovieConnections(self.imdb)
- IMDbDict['locations'] = getMovieLocations(self.imdb)
- IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
- IMDbDict['business'] = getMovieBusinessSum(self.imdb)
- IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
- IMDbDict['stills'] = getMovieStills(self.imdb)
- #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
- self.IMDbDict = IMDbDict
+ credits['director'] = getNames(raw_credits.get('directors', ''))
+ credits['writer'] = getNames(raw_credits.get('writers', ''))
+ credits['producer'] = getNames(raw_credits.get('producers', ''))
+ credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
- if IMDbDict['episode_of']:
- episode_of =IMDb(IMDbDict['episode_of']).parse()
- for key in ('country', 'language'):
- if not IMDbDict[key]:
- IMDbDict[key] = episode_of[key]
- return self.IMDbDict
-
- def getCredits(self):
- raw_credits = getMovieCredits(self.imdb)
- credits = {}
-
- def getNames(creditList):
- return [stripTags(decodeHtml(c[0])) for c in creditList]
-
- credits['director'] = getNames(raw_credits.get('directors', ''))
- credits['writer'] = getNames(raw_credits.get('writers', ''))
- credits['producer'] = getNames(raw_credits.get('producers', ''))
- credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
-
- self.credits = credits
- return self.credits
+ self.credits = credits
+ return self.credits
def guess(title, director=''):
- #FIXME: proper file -> title
- title = title.split('-')[0]
- title = title.split('(')[0]
- title = title.split('.')[0]
- title = title.strip()
- imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
- return_url = ''
+ #FIXME: proper file -> title
+ title = title.split('-')[0]
+ title = title.split('(')[0]
+ title = title.split('.')[0]
+ title = title.strip()
+ imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
+ return_url = ''
- #lest first try google
- #i.e. site:imdb.com Michael Stevens Sin
- if director:
- search = 'site:imdb.com %s "%s"' % (director, title)
- else:
- search = 'site:imdb.com "%s"' % title
- for (name, url, desc) in google.find(search, 2):
- if url.startswith('http://www.imdb.com/title/tt'):
- return url[28:35]
+ #lest first try google
+ #i.e. site:imdb.com Michael Stevens Sin
+ if director:
+ search = 'site:imdb.com %s "%s"' % (director, title)
+ else:
+ search = 'site:imdb.com "%s"' % title
+ for (name, url, desc) in google.find(search, 2):
+ if url.startswith('http://www.imdb.com/title/tt'):
+ return url[28:35]
- try:
+ try:
+ req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
+ u = urllib2.urlopen(req)
+ data = u.read()
+ return_url = u.url
+ u.close()
+ except:
+ return None
+ if return_url.startswith('http://www.imdb.com/title/tt'):
+ return return_url[28:35]
+ if data:
+ imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
.*?')
- for string in strings:
- if string.find('') != -1:
- key = findRe(string, '(.*?)')
- type = findRe(string, '<(.*?)>')
- if type == 'true/':
- value = True
- else:
- value = findRe(string, '<%s>(.*?)%s>' % (type, type))
- if type == 'integer':
- value = int(value)
- elif type == 'string':
- value = decodeHtml(value)
- values[key] = value
- return values
+ values = {}
+ strings = xml.split('')
+ for string in strings:
+ if string.find(' ') != -1:
+ key = findRe(string, '(.*?)')
+ type = findRe(string, '<(.*?)>')
+ if type == 'true/':
+ value = True
+ else:
+ value = findRe(string, '<%s>(.*?)%s>' % (type, type))
+ if type == 'integer':
+ value = int(value)
+ elif type == 'string':
+ value = decodeHtml(value)
+ values[key] = value
+ return values
def parseCast(xml, title):
    """Return the names found in the section of *xml* headed by *title*.

    The section is located with findRe using title[:-1].upper() as heading,
    split into chunks, and the last chunk dropped; one value is extracted
    from each remaining chunk. Any failure ends the scan and returns the
    names collected so far (possibly an empty list).
    """
    names = []
    try:
        # NOTE(review): the pattern and the split separator contain no
        # markup; the tags they referred to appear to have been lost. If
        # findRe returns a str, split('') raises ValueError (empty
        # separator), which the handler below swallows, so the result is
        # always []. Restore the original delimiters before use.
        chunks = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
        chunks.pop()
        for chunk in chunks:
            names.append(findRe(chunk, '(.*?) '))
    except Exception:
        # Best effort: return whatever was gathered before the failure.
        pass
    return names
def parseMovies(xml, title):
    """Return the movies found in the section of *xml* headed by *title*.

    The section is located with findRe using title[:-1].upper() as heading,
    split into chunks, and the last chunk dropped. Each remaining chunk
    yields a dict with 'id' (the viewMovie?id=... value) and 'title'. Any
    failure ends the scan and returns the entries collected so far
    (possibly an empty list).
    """
    movies = []
    try:
        # NOTE(review): the section pattern, the split separator and the
        # 'title' pattern contain no markup; the tags they referred to
        # appear to have been lost. If findRe returns a str, split('')
        # raises ValueError (empty separator), which the handler below
        # swallows, so the result is always []. Restore the original
        # delimiters before use.
        chunks = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
        chunks.pop()
        for chunk in chunks:
            movies.append({
                'id': findRe(chunk, r'viewMovie\?id=(.*?)&'),
                'title': findRe(chunk, '(.*?) ')
            })
    except Exception:
        # Best effort: return whatever was gathered before the failure.
        pass
    return movies
class ItunesAlbum:
- def __init__(self, id = '', title = '', artist = ''):
- self.id = id
- self.title = title
- self.artist = artist
- if not id:
- self.id = self.getId()
+ def __init__(self, id = '', title = '', artist = ''):
+ self.id = id
+ self.title = title
+ self.artist = artist
+ if not id:
+ self.id = self.getId()
- def getId(self):
- url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
- xml = getUrl(url, headers = ITUNES_HEADERS)
- id = findRe(xml, 'viewAlbum\?id=(.*?)&')
- return id
+ def getId(self):
+ url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
+ xml = getUrl(url, headers = ITUNES_HEADERS)
+ id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+ return id
- def getData(self):
- data = {'id': self.id}
- url = composeUrl('viewAlbum', {'id': self.id})
- xml = getUrl(url, None, ITUNES_HEADERS)
- data['albumName'] = findRe(xml, '(.*?) ')
- data['artistName'] = findRe(xml, '(.*?) ')
- data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
- data['genre'] = findRe(xml, 'Genre:(.*?)<')
- data['releaseDate'] = findRe(xml, 'Released(.*?)<')
- data['review'] = stripTags(findRe(xml, 'REVIEW .*?(.*?) '))
- data['tracks'] = []
- strings = findRe(xml, 'items .*?(.*?)$').split('')
- for string in strings:
- data['tracks'].append(parseXmlDict(string))
- data['type'] = findRe(xml, 'listType (.*?)<')
- return data
+ def getData(self):
+ data = {'id': self.id}
+ url = composeUrl('viewAlbum', {'id': self.id})
+ xml = getUrl(url, None, ITUNES_HEADERS)
+ data['albumName'] = findRe(xml, '(.*?) ')
+ data['artistName'] = findRe(xml, '(.*?) ')
+ data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+ data['genre'] = findRe(xml, 'Genre:(.*?)<')
+ data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+ data['review'] = stripTags(findRe(xml, 'REVIEW .*?(.*?) '))
+ data['tracks'] = []
+ strings = findRe(xml, 'items .*?(.*?)$').split('')
+ for string in strings:
+ data['tracks'].append(parseXmlDict(string))
+ data['type'] = findRe(xml, 'listType (.*?)<')
+ return data
class ItunesMovie:
- def __init__(self, id = '', title = '', director = ''):
- self.id = id
- self.title = title
- self.director = director
- if not id:
- self.id = self.getId()
+ def __init__(self, id = '', title = '', director = ''):
+ self.id = id
+ self.title = title
+ self.director = director
+ if not id:
+ self.id = self.getId()
- def getId(self):
- url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
- xml = getUrl(url, headers = ITUNES_HEADERS)
- id = findRe(xml, 'viewMovie\?id=(.*?)&')
- return id
+ def getId(self):
+ url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
+ xml = getUrl(url, headers = ITUNES_HEADERS)
+ id = findRe(xml, 'viewMovie\?id=(.*?)&')
+ return id
- def getData(self):
- data = {'id': self.id}
- url = composeUrl('viewMovie', {'id': self.id})
- xml = getUrl(url, None, ITUNES_HEADERS)
- f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
- f.write(xml)
- f.close()
- data['actors'] = parseCast(xml, 'actors')
- string = findRe(xml, 'Average Rating:(.*?)')
- data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
- data['directors'] = parseCast(xml, 'directors')
- data['format'] = findRe(xml, 'Format:(.*?)<')
- data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
- data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?) '))
- data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
- data['producers'] = parseCast(xml, 'producers')
- data['rated'] = findRe(xml, 'Rated(.*?)<')
- data['relatedMovies'] = parseMovies(xml, 'related movies')
- data['releaseDate'] = findRe(xml, 'Released(.*?)<')
- data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
- data['screenwriters'] = parseCast(xml, 'screenwriters')
- data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
- data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
- return data
+ def getData(self):
+ data = {'id': self.id}
+ url = composeUrl('viewMovie', {'id': self.id})
+ xml = getUrl(url, None, ITUNES_HEADERS)
+ f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
+ f.write(xml)
+ f.close()
+ data['actors'] = parseCast(xml, 'actors')
+ string = findRe(xml, 'Average Rating:(.*?)')
+ data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
+ data['directors'] = parseCast(xml, 'directors')
+ data['format'] = findRe(xml, 'Format:(.*?)<')
+ data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
+ data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?) '))
+ data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+ data['producers'] = parseCast(xml, 'producers')
+ data['rated'] = findRe(xml, 'Rated(.*?)<')
+ data['relatedMovies'] = parseMovies(xml, 'related movies')
+ data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+ data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+ data['screenwriters'] = parseCast(xml, 'screenwriters')
+ data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
+ data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+ return data
if __name__ == '__main__':
- import simplejson
- data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
- print simplejson.dumps(data, sort_keys = True, indent = 4)
- data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
- print simplejson.dumps(data, sort_keys = True, indent = 4)
- for v in data['relatedMovies']:
- data = ItunesMovie(id = v['id']).getData()
+ import simplejson
+ data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
- data = ItunesMovie(id='272960052').getData()
- print simplejson.dumps(data, sort_keys = True, indent = 4)
+ data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
+ print simplejson.dumps(data, sort_keys = True, indent = 4)
+ for v in data['relatedMovies']:
+ data = ItunesMovie(id = v['id']).getData()
+ print simplejson.dumps(data, sort_keys = True, indent = 4)
+ data = ItunesMovie(id='272960052').getData()
+ print simplejson.dumps(data, sort_keys = True, indent = 4)
+
diff --git a/ox/lyricsfly.py b/ox/lyricsfly.py
index 7ae489e..2b2fe8b 100644
--- a/ox/lyricsfly.py
+++ b/ox/lyricsfly.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
from oxutils.cache import getUrl
from oxutils.html import decodeHtml
from oxutils.text import findRe
@@ -16,4 +18,4 @@ def getLyrics(title, artist):
return lyrics
if __name__ == '__main__':
- print getLyrics('Election Day', 'Arcadia')
\ No newline at end of file
+ print getLyrics('Election Day', 'Arcadia')
diff --git a/ox/mininova.py b/ox/mininova.py
index c569e6a..36357e0 100644
--- a/ox/mininova.py
+++ b/ox/mininova.py
@@ -1,7 +1,5 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
@@ -14,116 +12,115 @@ import oxutils
from torrent import Torrent
-socket.setdefaulttimeout(10.0)
def _parseResultsPage(data, max_results=10):
- results=[]
- regexp = '''(.*?) (.*?)(.*?) .*? .*? '''
- for row in re.compile(regexp, re.DOTALL).findall(data):
- torrentDate = row[0]
- torrentExtra = row[1]
- torrentId = row[2]
- torrentTitle = decodeHtml(row[3]).strip()
- torrentLink = "http://www.mininova.org/tor/" + torrentId
- privateTracker = 'priv.gif' in torrentExtra
- if not privateTracker:
- results.append((torrentTitle, torrentLink, ''))
- return results
+ results=[]
+ regexp = '''(.*?) (.*?)(.*?) .*? .*? '''
+ for row in re.compile(regexp, re.DOTALL).findall(data):
+ torrentDate = row[0]
+ torrentExtra = row[1]
+ torrentId = row[2]
+ torrentTitle = decodeHtml(row[3]).strip()
+ torrentLink = "http://www.mininova.org/tor/" + torrentId
+ privateTracker = 'priv.gif' in torrentExtra
+ if not privateTracker:
+ results.append((torrentTitle, torrentLink, ''))
+ return results
def findMovie(query, max_results=10):
- '''search for torrents on mininova
- '''
- url = "http://www.mininova.org/search/%s/seeds" % quote(query)
- data = getUrlUnicode(url)
- return _parseResultsPage(data, max_results)
+ '''search for torrents on mininova
+ '''
+ url = "http://www.mininova.org/search/%s/seeds" % quote(query)
+ data = getUrlUnicode(url)
+ return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
- '''find torrents on mininova for a given imdb id
- '''
- results = []
- imdbId = normalizeImdbId(imdbId)
- data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
- return _parseResultsPage(data)
+ '''find torrents on mininova for a given imdb id
+ '''
+ results = []
+ imdbId = normalizeImdbId(imdbId)
+ data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
+ return _parseResultsPage(data)
def getId(mininovaId):
- mininovaId = unicode(mininovaId)
- d = findRe(mininovaId, "/(\d+)")
- if d:
- return d
- mininovaId = mininovaId.split('/')
- if len(mininovaId) == 1:
- return mininovaId[0]
- else:
- return mininovaId[-1]
+ mininovaId = unicode(mininovaId)
+ d = findRe(mininovaId, "/(\d+)")
+ if d:
+ return d
+ mininovaId = mininovaId.split('/')
+ if len(mininovaId) == 1:
+ return mininovaId[0]
+ else:
+ return mininovaId[-1]
def exists(mininovaId):
- mininovaId = getId(mininovaId)
- data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
- if not data or 'Torrent not found...' in data:
- return False
- if 'tracker of this torrent requires registration.' in data:
- return False
- return True
+ mininovaId = getId(mininovaId)
+ data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
+ if not data or 'Torrent not found...' in data:
+ return False
+ if 'tracker of this torrent requires registration.' in data:
+ return False
+ return True
def getData(mininovaId):
- _key_map = {
- 'by': u'uploader',
- }
- mininovaId = getId(mininovaId)
- torrent = dict()
- torrent[u'id'] = mininovaId
- torrent[u'domain'] = 'mininova.org'
- torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
- torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
- torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
+ _key_map = {
+ 'by': u'uploader',
+ }
+ mininovaId = getId(mininovaId)
+ torrent = dict()
+ torrent[u'id'] = mininovaId
+ torrent[u'domain'] = 'mininova.org'
+ torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
+ torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
+ torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
- data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
- if 'Torrent not found... ' in data:
- return None
+ data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
+ if 'Torrent not found... ' in data:
+ return None
- for d in re.compile('.(.*?): (.*?)
', re.DOTALL).findall(data):
- key = d[0].lower().strip()
- key = _key_map.get(key, key)
- value = decodeHtml(stripTags(d[1].strip()))
- torrent[key] = value
+ for d in re.compile('.(.*?): (.*?)
', re.DOTALL).findall(data):
+ key = d[0].lower().strip()
+ key = _key_map.get(key, key)
+ value = decodeHtml(stripTags(d[1].strip()))
+ torrent[key] = value
- torrent[u'title'] = findRe(data, '(.*?):.*? ')
- torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
- torrent[u'description'] = findRe(data, '(.*?)
')
- if torrent['description']:
- torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
- t = getUrl(torrent[u'torrent_link'])
- torrent[u'torrent_info'] = getTorrentInfo(t)
- return torrent
+ torrent[u'title'] = findRe(data, '(.*?):.*? ')
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+ torrent[u'description'] = findRe(data, '(.*?)
')
+ if torrent['description']:
+ torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+ t = getUrl(torrent[u'torrent_link'])
+ torrent[u'torrent_info'] = getTorrentInfo(t)
+ return torrent
class Mininova(Torrent):
- '''
- >>> Mininova('123')
- {}
- >>> Mininova('1072195')['infohash']
- '72dfa59d2338e4a48c78cec9de25964cddb64104'
- '''
- def __init__(self, mininovaId):
- self.data = getData(mininovaId)
- if not self.data:
- return
- Torrent.__init__(self)
- ratio = self.data['share ratio'].split(',')
- self['seeder'] = -1
- self['leecher'] = -1
- if len(ratio) == 2:
- val = intValue(ratio[0].replace(',','').strip())
- if val:
- self['seeder'] = int(val)
- val = intValue(ratio[1].replace(',','').strip())
- if val:
- self['leecher'] = int(val)
- val = intValue(self.data['downloads'].replace(',','').strip())
- if val:
- self['downloaded'] = int(val)
- else:
- self['downloaded'] = -1
- published = self.data['added on']
- published = published.split(' +')[0]
- self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
+ '''
+ >>> Mininova('123')
+ {}
+ >>> Mininova('1072195')['infohash']
+ '72dfa59d2338e4a48c78cec9de25964cddb64104'
+ '''
+ def __init__(self, mininovaId):
+ self.data = getData(mininovaId)
+ if not self.data:
+ return
+ Torrent.__init__(self)
+ ratio = self.data['share ratio'].split(',')
+ self['seeder'] = -1
+ self['leecher'] = -1
+ if len(ratio) == 2:
+ val = intValue(ratio[0].replace(',','').strip())
+ if val:
+ self['seeder'] = int(val)
+ val = intValue(ratio[1].replace(',','').strip())
+ if val:
+ self['leecher'] = int(val)
+ val = intValue(self.data['downloads'].replace(',','').strip())
+ if val:
+ self['downloaded'] = int(val)
+ else:
+ self['downloaded'] = -1
+ published = self.data['added on']
+ published = published.split(' +')[0]
+ self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
diff --git a/ox/opensubtitles.py b/ox/opensubtitles.py
index e7fca01..abb3cee 100644
--- a/ox/opensubtitles.py
+++ b/ox/opensubtitles.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
@@ -9,37 +8,34 @@ import oxutils
from oxutils.lang import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
- if len(language) == 2:
- language = langCode2To3(language)
- elif len(language) != 3:
- language = langTo3Code(language)
- url = "http://www.opensubtitles.org/en/search/"
- if language:
- url += "sublanguageid-%s/" % language
- url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
- data = getUrl(url)
- if "title>opensubtitles.com - search resultsopensubtitles.com - search results(.*?)'
- for f in re.compile(reg_exp, re.DOTALL).findall(data):
- name = oxutils.stripTags(f[1]).split('\n')[0]
- url = "http://www.opensubtitles.com%s" % f[0]
- srts[name] = getUrlUnicode(url)
- return srts
-
-
+ srts = {}
+ data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
+ reg_exp = 'href="(/en/download/file/.*?)">(.*?)'
+ for f in re.compile(reg_exp, re.DOTALL).findall(data):
+ name = oxutils.stripTags(f[1]).split('\n')[0]
+ url = "http://www.opensubtitles.com%s" % f[0]
+ srts[name] = getUrlUnicode(url)
+ return srts
+
diff --git a/ox/spiegel.py b/ox/spiegel.py
index 492171e..296c7b0 100644
--- a/ox/spiegel.py
+++ b/ox/spiegel.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time
@@ -8,6 +10,7 @@ import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net
+
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
@@ -287,4 +290,4 @@ if __name__ == '__main__':
print x
'''
# archiveIssues()
- archiveNews()
\ No newline at end of file
+ archiveNews()
diff --git a/ox/thepiratebay.py b/ox/thepiratebay.py
index f15be62..e6f52c3 100644
--- a/ox/thepiratebay.py
+++ b/ox/thepiratebay.py
@@ -1,14 +1,11 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError
-
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
@@ -16,107 +13,106 @@ import oxutils
from torrent import Torrent
-socket.setdefaulttimeout(10.0)
season_episode = re.compile("S..E..", re.IGNORECASE)
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
- headers = cache.DEFAULT_HEADERS
- headers['Cookie'] = 'language=en_EN'
- return cache.getUrl(url, data, headers, timeout)
+ headers = cache.DEFAULT_HEADERS
+ headers['Cookie'] = 'language=en_EN'
+ return cache.getUrl(url, data, headers, timeout)
def _getUrlUnicode(url):
- return cache.getUrlUnicode(url, _getUrl=_getUrl)
+ return cache.getUrlUnicode(url, _getUrl=_getUrl)
def findMovies(query, max_results=10):
- results = []
- next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
- page_count = 1
- while next and page_count < 4:
- page_count += 1
- url = next[0]
- if not url.startswith('http'):
- if not url.startswith('/'):
- url = "/" + url
- url = "http://thepiratebay.org" + url
- data = _getUrlUnicode(url)
- regexp = '''(.*?) .*?'''
- for row in re.compile(regexp, re.DOTALL).findall(data):
- torrentType = row[0]
- torrentLink = "http://thepiratebay.org" + row[1]
- torrentTitle = decodeHtml(row[2])
- # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
- if torrentType in ['201']:
- results.append((torrentTitle, torrentLink, ''))
- if len(results) >= max_results:
- return results
- next = re.compile('.*?next.gif.*?').findall(data)
- return results
+ results = []
+ next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
+ page_count = 1
+ while next and page_count < 4:
+ page_count += 1
+ url = next[0]
+ if not url.startswith('http'):
+ if not url.startswith('/'):
+ url = "/" + url
+ url = "http://thepiratebay.org" + url
+ data = _getUrlUnicode(url)
+ regexp = '''(.*?) .*?'''
+ for row in re.compile(regexp, re.DOTALL).findall(data):
+ torrentType = row[0]
+ torrentLink = "http://thepiratebay.org" + row[1]
+ torrentTitle = decodeHtml(row[2])
+ # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
+ if torrentType in ['201']:
+ results.append((torrentTitle, torrentLink, ''))
+ if len(results) >= max_results:
+ return results
+ next = re.compile('.*?next.gif.*?').findall(data)
+ return results
def findMovieByImdb(imdb):
- return findMovies("tt" + normalizeImdbId(imdb))
+ return findMovies("tt" + normalizeImdbId(imdb))
def getId(piratebayId):
- if piratebayId.startswith('http://torrents.thepiratebay.org/'):
- piratebayId = piratebayId.split('org/')[1]
- d = findRe(piratebayId, "tor/(\d+)")
- if d:
- piratebayId = d
- return piratebayId
+ if piratebayId.startswith('http://torrents.thepiratebay.org/'):
+ piratebayId = piratebayId.split('org/')[1]
+ d = findRe(piratebayId, "tor/(\d+)")
+ if d:
+ piratebayId = d
+ return piratebayId
def exists(piratebayId):
- piratebayId = getId(piratebayId)
- return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
+ piratebayId = getId(piratebayId)
+ return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
def getData(piratebayId):
- _key_map = {
- 'spoken language(s)': u'language',
- 'texted language(s)': u'subtitle language',
- 'by': u'uploader',
- 'leechers': 'leecher',
- 'seeders': 'seeder',
- }
- piratebayId = getId(piratebayId)
- torrent = dict()
- torrent[u'id'] = piratebayId
- torrent[u'domain'] = 'thepiratebay.org'
- torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
+ _key_map = {
+ 'spoken language(s)': u'language',
+ 'texted language(s)': u'subtitle language',
+ 'by': u'uploader',
+ 'leechers': 'leecher',
+ 'seeders': 'seeder',
+ }
+ piratebayId = getId(piratebayId)
+ torrent = dict()
+ torrent[u'id'] = piratebayId
+ torrent[u'domain'] = 'thepiratebay.org'
+ torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
- data = _getUrlUnicode(torrent['comment_link'])
- torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB ')
- if not torrent[u'title']:
- return None
- torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
- torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
- title = quote(torrent['title'].encode('utf-8'))
- torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
- for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data):
- key = d[0].lower().strip()
- key = _key_map.get(key, key)
- value = decodeHtml(stripTags(d[1].strip()))
- torrent[key] = value
- torrent[u'description'] = findRe(data, '(.*?)
')
- if torrent[u'description']:
- torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
- t = _getUrl(torrent[u'torrent_link'])
- torrent[u'torrent_info'] = getTorrentInfo(t)
- return torrent
+ data = _getUrlUnicode(torrent['comment_link'])
+ torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB ')
+ if not torrent[u'title']:
+ return None
+ torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+ title = quote(torrent['title'].encode('utf-8'))
+ torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
+ for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data):
+ key = d[0].lower().strip()
+ key = _key_map.get(key, key)
+ value = decodeHtml(stripTags(d[1].strip()))
+ torrent[key] = value
+ torrent[u'description'] = findRe(data, '(.*?)
')
+ if torrent[u'description']:
+ torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+ t = _getUrl(torrent[u'torrent_link'])
+ torrent[u'torrent_info'] = getTorrentInfo(t)
+ return torrent
class Thepiratebay(Torrent):
- '''
- >>> Thepiratebay('123')
- {}
+ '''
+ >>> Thepiratebay('123')
+ {}
- >>> Thepiratebay('3951349')['infohash']
- '4e84415d36ed7b54066160c05a0b0f061898d12b'
- '''
- def __init__(self, piratebayId):
- self.data = getData(piratebayId)
- if not self.data:
- return
- Torrent.__init__(self)
- published = self.data['uploaded']
- published = published.replace(' GMT', '').split(' +')[0]
- self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
+ >>> Thepiratebay('3951349')['infohash']
+ '4e84415d36ed7b54066160c05a0b0f061898d12b'
+ '''
+ def __init__(self, piratebayId):
+ self.data = getData(piratebayId)
+ if not self.data:
+ return
+ Torrent.__init__(self)
+ published = self.data['uploaded']
+ published = published.replace(' GMT', '').split(' +')[0]
+ self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
diff --git a/ox/torrent.py b/ox/torrent.py
index 785f604..51ce3c9 100644
--- a/ox/torrent.py
+++ b/ox/torrent.py
@@ -1,39 +1,37 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from oxutils import intValue
class Torrent(dict):
- '''
- >>> Torrent()
- {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
- '''
- _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
- 'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
- _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
- _dict_keys = ('torrent_info', )
- _list_keys = ()
- data = {'torrent_info': {}}
+ '''
+ >>> Torrent()
+ {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
+ '''
+ _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
+ 'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
+ _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
+ _dict_keys = ('torrent_info', )
+ _list_keys = ()
+ data = {'torrent_info': {}}
- def __init__(self):
- for key in self._string_keys:
- self[key] = self.data.get(key, u'')
- for key in self._dict_keys:
- self[key] = self.data.get(key, {})
- for key in self._list_keys:
- self[key] = self.data.get(key, [])
- for key in self._int_keys:
- value = self.data.get(key, -1)
- if not isinstance(value, int):
- value = int(intValue(value))
- self[key] = value
- self['infohash'] = self.data['torrent_info'].get('hash', '')
- self['size'] = self.data['torrent_info'].get('size', -1)
- self['announce'] = self.data['torrent_info'].get('announce', '')
- if 'files' in self.data['torrent_info']:
- self['files'] = len(self.data['torrent_info']['files'])
- else:
- self['files'] = 1
+ def __init__(self):
+ for key in self._string_keys:
+ self[key] = self.data.get(key, u'')
+ for key in self._dict_keys:
+ self[key] = self.data.get(key, {})
+ for key in self._list_keys:
+ self[key] = self.data.get(key, [])
+ for key in self._int_keys:
+ value = self.data.get(key, -1)
+ if not isinstance(value, int):
+ value = int(intValue(value))
+ self[key] = value
+ self['infohash'] = self.data['torrent_info'].get('hash', '')
+ self['size'] = self.data['torrent_info'].get('size', -1)
+ self['announce'] = self.data['torrent_info'].get('announce', '')
+ if 'files' in self.data['torrent_info']:
+ self['files'] = len(self.data['torrent_info']['files'])
+ else:
+ self['files'] = 1
diff --git a/ox/wikipedia.py b/ox/wikipedia.py
index a969e24..1d969bf 100644
--- a/ox/wikipedia.py
+++ b/ox/wikipedia.py
@@ -1,72 +1,72 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode
import simplejson
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml
+
def getMovieId(title, director='', year=''):
- query = '"%s" film %s %s' % (title, director, year)
- result = find(query, 1)
- if result:
- return result[0][1]
- return ''
+ query = '"%s" film %s %s' % (title, director, year)
+ result = find(query, 1)
+ if result:
+ return result[0][1]
+ return ''
def getUrlByImdb(imdbId):
- query = '"imdb_id = %s"'% imdbId
- result = find(query)
- if result:
- url = result[0][1]
- return url
- if str(imdbId).startswith('0'):
- imdbId = imdbId[1:]
- return getUrlByImdb(imdbId)
+ query = '"imdb_id = %s"'% imdbId
+ result = find(query)
+ if result:
+ url = result[0][1]
+ return url
+ if str(imdbId).startswith('0'):
+ imdbId = imdbId[1:]
+ return getUrlByImdb(imdbId)
def getUrlByAmbId(amg_id):
- query = '"amg_id = %s"'% amg_id
- result = find(query)
- if result:
- url = result[0][1]
- return url
- return ''
+ query = '"amg_id = %s"'% amg_id
+ result = find(query)
+ if result:
+ url = result[0][1]
+ return url
+ return ''
def getWikiData(wikipediaUrl):
- title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
- url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
- html = getUrlUnicode(url)
- data = decodeHtml(findRe(html, "(.*?)"))
- return data
+ title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
+ url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
+ html = getUrlUnicode(url)
+ data = decodeHtml(findRe(html, "(.*?)"))
+ return data
def getMovieData(wikipediaUrl):
- data = getWikiData(wikipediaUrl)
- filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
- filmbox = {}
- for row in filmbox_data.strip().split('|'):
- d = row.split('=')
- if len(d) == 2:
- key = d[0].strip()
- value = d[1].strip()
- filmbox[key] = value
- return filmbox
+ data = getWikiData(wikipediaUrl)
+ filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
+ filmbox = {}
+ for row in filmbox_data.strip().split('|'):
+ d = row.split('=')
+ if len(d) == 2:
+ key = d[0].strip()
+ value = d[1].strip()
+ filmbox[key] = value
+ return filmbox
def getAmgId(wikipediaUrl):
- data = getMovieData(wikipediaUrl)
- return data.get('amg_id', '')
+ data = getMovieData(wikipediaUrl)
+ return data.get('amg_id', '')
def find(query, max_results=10):
- query = {'action': 'query', 'list':'search', 'format': 'json',
- 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
- url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
- data = getUrl(url)
- if not data:
- data = getUrl(url, timeout=0)
- result = simplejson.loads(data)
- results = []
- for r in result['query']['search']:
- title = r['title']
- url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
- results.append((title, url, ''))
- return results
+ query = {'action': 'query', 'list':'search', 'format': 'json',
+ 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
+ url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
+ data = getUrl(url)
+ if not data:
+ data = getUrl(url, timeout=0)
+ result = simplejson.loads(data)
+ results = []
+ for r in result['query']['search']:
+ title = r['title']
+ url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
+ results.append((title, url, ''))
+ return results
diff --git a/ox/youtube.py b/ox/youtube.py
index f1efcb3..c17ebc3 100644
--- a/ox/youtube.py
+++ b/ox/youtube.py
@@ -1,6 +1,5 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote
import xml.etree.ElementTree as ET
@@ -8,49 +7,50 @@ import feedparser
from oxutils.cache import getUrl
from oxutils import findString
+
def getVideoUrl(youtubeId, format='mp4'):
- url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
- data = getUrl(url)
- xml = ET.fromstring(data)
- youtubeKey = xml.find('t').text
- if format == 'mp4':
- fmt=18
- url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
- else:
- url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
- return url
+ url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
+ data = getUrl(url)
+ xml = ET.fromstring(data)
+ youtubeKey = xml.find('t').text
+ if format == 'mp4':
+ fmt=18
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
+ else:
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
+ return url
def getMovieInfo(youtubeId):
- url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
- data = getUrl(url)
- fd = feedparser.parse(data)
- return getInfoFromAtom(fd.entries[0])
+ url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
+ data = getUrl(url)
+ fd = feedparser.parse(data)
+ return getInfoFromAtom(fd.entries[0])
def getInfoFromAtom(entry):
- info = dict()
- info['title'] = entry['title']
- info['description'] = entry['description']
- info['author'] = entry['author']
- info['published'] = entry['published_parsed']
- info['keywords'] = entry['media_keywords'].split(', ')
- info['url'] = entry['links'][0]['href']
- info['id'] = findString(info['url'], "/watch?v=")
- info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
- info['flv'] = getVideoUrl(info['id'], 'flv')
- info['mp4'] = getVideoUrl(info['id'], 'mp4')
- info['embed'] = ''' ''' % (info['id'], info['id'])
- return info
+ info = dict()
+ info['title'] = entry['title']
+ info['description'] = entry['description']
+ info['author'] = entry['author']
+ info['published'] = entry['published_parsed']
+ info['keywords'] = entry['media_keywords'].split(', ')
+ info['url'] = entry['links'][0]['href']
+ info['id'] = findString(info['url'], "/watch?v=")
+ info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
+ info['flv'] = getVideoUrl(info['id'], 'flv')
+ info['mp4'] = getVideoUrl(info['id'], 'mp4')
+ info['embed'] = ''' ''' % (info['id'], info['id'])
+ return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
- query = quote(query)
- url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
- data = getUrl(url)
- fd = feedparser.parse(data)
- videos = []
- for entry in fd.entries:
- v = getInfoFromAtom(entry)
- videos.append(v)
- if len(videos) >= max_results:
- return videos
- return videos
+ query = quote(query)
+ url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
+ data = getUrl(url)
+ fd = feedparser.parse(data)
+ videos = []
+ for entry in fd.entries:
+ v = getInfoFromAtom(entry)
+ videos.append(v)
+ if len(videos) >= max_results:
+ return videos
+ return videos
diff --git a/setup.py b/setup.py
index 4840537..e8a5096 100644
--- a/setup.py
+++ b/setup.py
@@ -1,33 +1,33 @@
#!/usr/bin/env python
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
from setuptools import setup, find_packages
import os
setup(
- name="ox",
- version="0.1",
- description="collection of scrapers for various websites",
- author="0x",
- author_email="code@0xdb.org",
- url="http://code.0xdb.org/ox",
- download_url="http://code.0xdb.org/ox/download",
- license="GPLv3",
- packages=find_packages(),
- zip_safe=False,
- install_requires=[
- 'oxutils',
- 'feedparser',
- 'beautifulsoup',
- ],
- keywords = [
- ],
- classifiers = [
- 'Development Status :: 3 - Alpha',
- 'Operating System :: OS Independent',
- 'Programming Language :: Python',
- 'Topic :: Software Development :: Libraries :: Python Modules',
- ],
- )
+ name="ox",
+ version="0.1",
+ description="collection of scrapers for various websites",
+ author="0x",
+ author_email="code@0xdb.org",
+ url="http://code.0xdb.org/ox",
+ download_url="http://code.0xdb.org/ox/download",
+ license="GPLv3",
+ packages=find_packages(),
+ zip_safe=False,
+ install_requires=[
+ 'oxutils',
+ 'feedparser',
+ 'beautifulsoup',
+ ],
+ keywords = [
+ ],
+ classifiers = [
+ 'Development Status :: 3 - Alpha',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ],
+)