Use raw strings for more regexp patterns
This commit is contained in:
parent
08636ba81a
commit
301babd1dd
1 changed file with 38 additions and 38 deletions
|
@ -43,8 +43,8 @@ def reference_section(id):
|
||||||
return {
|
return {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
|
r'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
|
||||||
'<a href="/name/.*?>(.*?)</a>'
|
r'<a href="/name/.*?>(.*?)</a>'
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
}
|
}
|
||||||
|
@ -54,8 +54,8 @@ def zebra_list(label, more=None):
|
||||||
conditions = {
|
conditions = {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'_label">' + label + '</td>.*?<ul(.*?)</ul>',
|
r'_label">' + label + '</td>.*?<ul(.*?)</ul>',
|
||||||
'<li.*?>(.*?)</li>'
|
r'<li.*?>(.*?)</li>'
|
||||||
],
|
],
|
||||||
'type': 'list',
|
'type': 'list',
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'):
|
||||||
conditions = {
|
conditions = {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'_label">' + label + '</td>.*?<td>(.*?)</td>',
|
r'_label">' + label + '</td>.*?<td>(.*?)</td>',
|
||||||
],
|
],
|
||||||
'type': type,
|
'type': type,
|
||||||
}
|
}
|
||||||
|
@ -97,9 +97,9 @@ def technical(label):
|
||||||
return {
|
return {
|
||||||
'page': 'technical',
|
'page': 'technical',
|
||||||
're': [
|
're': [
|
||||||
'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
|
r'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
|
||||||
lambda data: [
|
lambda data: [
|
||||||
re.sub('\s+', ' ', d.strip()) for d in data.strip().split('<br>')
|
re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('<br>')
|
||||||
] if data else []
|
] if data else []
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
|
@ -258,13 +258,13 @@ class Imdb(SiteParser):
|
||||||
'aspectratio': {
|
'aspectratio': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
||||||
parse_aspectratio,
|
parse_aspectratio,
|
||||||
],
|
],
|
||||||
'type': 'float',
|
'type': 'float',
|
||||||
},
|
},
|
||||||
'budget': zebra_table('Budget', more=[
|
'budget': zebra_table('Budget', more=[
|
||||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
|
||||||
], type='int'),
|
], type='int'),
|
||||||
'cast': {
|
'cast': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
|
@ -287,12 +287,12 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
||||||
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
||||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
|
||||||
], type='int'),
|
], type='int'),
|
||||||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||||
'originalTitle': {
|
'originalTitle': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
're': r'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'summary': zebra_table('Plot Summary', more=[
|
'summary': zebra_table('Plot Summary', more=[
|
||||||
|
@ -300,7 +300,7 @@ class Imdb(SiteParser):
|
||||||
]),
|
]),
|
||||||
'storyline': {
|
'storyline': {
|
||||||
'page': '',
|
'page': '',
|
||||||
're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
|
're': r'<h2>Storyline</h2>.*?<p>(.*?)</p>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'posterId': {
|
'posterId': {
|
||||||
|
@ -312,16 +312,16 @@ class Imdb(SiteParser):
|
||||||
'productionCompany': {
|
'productionCompany': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'Production Companies.*?<ul(.*?)</ul>',
|
r'Production Companies.*?<ul(.*?)</ul>',
|
||||||
'<a href="/company/.*?/">(.*?)</a>'
|
r'<a href="/company/.*?/">(.*?)</a>'
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
},
|
},
|
||||||
'rating': {
|
'rating': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'<div class="ipl-rating-star ">(.*?)</div>',
|
r'<div class="ipl-rating-star ">(.*?)</div>',
|
||||||
'ipl-rating-star__rating">([\d,.]+?)</span>',
|
r'ipl-rating-star__rating">([\d,.]+?)</span>',
|
||||||
],
|
],
|
||||||
'type': 'float'
|
'type': 'float'
|
||||||
},
|
},
|
||||||
|
@ -343,38 +343,38 @@ class Imdb(SiteParser):
|
||||||
'season': {
|
'season': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||||
'Season (\d+)',
|
r'Season (\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
'episode': {
|
'episode': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||||
'Episode (\d+)',
|
r'Episode (\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
'series': {
|
'series': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
|
're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'isSeries': {
|
'isSeries': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
're': r'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'title': {
|
'title': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': '<h2.*?>(.*?)</h2>',
|
're': r'<h2.*?>(.*?)</h2>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'trivia': {
|
'trivia': {
|
||||||
'page': 'trivia',
|
'page': 'trivia',
|
||||||
're': [
|
're': [
|
||||||
'<div class="sodatext">(.*?)<(br|/div)',
|
r'<div class="sodatext">(.*?)<(br|/div)',
|
||||||
lambda data: data[0]
|
lambda data: data[0]
|
||||||
],
|
],
|
||||||
'type': 'list',
|
'type': 'list',
|
||||||
|
@ -382,7 +382,7 @@ class Imdb(SiteParser):
|
||||||
'votes': {
|
'votes': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'class="ipl-rating-star__total-votes">\((.*?)\)',
|
r'class="ipl-rating-star__total-votes">\((.*?)\)',
|
||||||
lambda r: r.replace(',', '')
|
lambda r: r.replace(',', '')
|
||||||
],
|
],
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
|
@ -391,8 +391,8 @@ class Imdb(SiteParser):
|
||||||
'year': {
|
'year': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
'<span class="titlereference-title-year">(.*?)</span>',
|
r'<span class="titlereference-title-year">(.*?)</span>',
|
||||||
'<a.*?>(\d+)',
|
r'<a.*?>(\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
|
@ -400,7 +400,7 @@ class Imdb(SiteParser):
|
||||||
'page': 'fullcredits',
|
'page': 'fullcredits',
|
||||||
're': [
|
're': [
|
||||||
lambda data: data.split('<h4'),
|
lambda data: data.split('<h4'),
|
||||||
'>(.*?)</h4>.*?(<table.*?</table>)',
|
r'>(.*?)</h4>.*?(<table.*?</table>)',
|
||||||
lambda data: [d for d in data if d]
|
lambda data: [d for d in data if d]
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
|
@ -468,7 +468,7 @@ class Imdb(SiteParser):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
if title.startswith("'") and title.endswith("'"):
|
if title.startswith("'") and title.endswith("'"):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
title = re.sub('\(\#[.\d]+\)', '', title)
|
title = re.sub(r'\(\#[.\d]+\)', '', title)
|
||||||
return title.strip()
|
return title.strip()
|
||||||
|
|
||||||
for t in ('title', 'originalTitle'):
|
for t in ('title', 'originalTitle'):
|
||||||
|
@ -518,7 +518,7 @@ class Imdb(SiteParser):
|
||||||
self['actor'] = [c[0] for c in self['cast']]
|
self['actor'] = [c[0] for c in self['cast']]
|
||||||
def cleanup_character(c):
|
def cleanup_character(c):
|
||||||
c = c.replace('(uncredited)', '').strip()
|
c = c.replace('(uncredited)', '').strip()
|
||||||
c = re.sub('\s+', ' ', c)
|
c = re.sub(r'\s+', ' ', c)
|
||||||
return c
|
return c
|
||||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||||
for x in self['cast']]
|
for x in self['cast']]
|
||||||
|
@ -528,7 +528,7 @@ class Imdb(SiteParser):
|
||||||
del self['isSeries']
|
del self['isSeries']
|
||||||
self['isSeries'] = True
|
self['isSeries'] = True
|
||||||
if 'episodeTitle' in self:
|
if 'episodeTitle' in self:
|
||||||
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
|
self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle'])
|
||||||
|
|
||||||
|
|
||||||
#make lists unique but keep order
|
#make lists unique but keep order
|
||||||
|
@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1):
|
||||||
url = "http://www.imdb.com/find?" + params
|
url = "http://www.imdb.com/find?" + params
|
||||||
data = read_url(url, timeout=timeout, unicode=True)
|
data = read_url(url, timeout=timeout, unicode=True)
|
||||||
#if search results in redirect, get id of current page
|
#if search results in redirect, get id of current page
|
||||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
|
@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
||||||
|
|
||||||
data = read_url(url, timeout=timeout, unicode=True)
|
data = read_url(url, timeout=timeout, unicode=True)
|
||||||
#if search results in redirect, get id of current page
|
#if search results in redirect, get id of current page
|
||||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
#otherwise get first result
|
#otherwise get first result
|
||||||
r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
|
r = r'<td valign="top">.*?<a href="/title/tt(\d+)/"'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
|
@ -885,7 +885,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
||||||
results = duckduckgo.find(google_query, timeout=timeout)
|
results = duckduckgo.find(google_query, timeout=timeout)
|
||||||
if results:
|
if results:
|
||||||
for r in results[:2]:
|
for r in results[:2]:
|
||||||
imdbId = find_re(r[1], 'title/tt(\d+)')
|
imdbId = find_re(r[1], r'title/tt(\d+)')
|
||||||
if imdbId:
|
if imdbId:
|
||||||
return imdbId
|
return imdbId
|
||||||
#or nothing
|
#or nothing
|
||||||
|
@ -912,11 +912,11 @@ def get_episodes(imdbId, season=None):
|
||||||
if season:
|
if season:
|
||||||
url += '?season=%d' % season
|
url += '?season=%d' % season
|
||||||
data = cache.read_url(url).decode()
|
data = cache.read_url(url).decode()
|
||||||
for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
for e in re.compile(r'<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||||
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
||||||
else:
|
else:
|
||||||
data = cache.read_url(url)
|
data = cache.read_url(url)
|
||||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
match = re.compile(r'<strong>Season (\d+)</strong>').findall(data)
|
||||||
if match:
|
if match:
|
||||||
for season in range(1, int(match[0]) + 1):
|
for season in range(1, int(match[0]) + 1):
|
||||||
episodes.update(get_episodes(imdbId, season))
|
episodes.update(get_episodes(imdbId, season))
|
||||||
|
@ -927,7 +927,7 @@ def max_votes():
|
||||||
data = cache.read_url(url).decode('utf-8', 'ignore')
|
data = cache.read_url(url).decode('utf-8', 'ignore')
|
||||||
votes = max([
|
votes = max([
|
||||||
int(v.replace(',', ''))
|
int(v.replace(',', ''))
|
||||||
for v in re.compile('Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
|
for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
|
||||||
])
|
])
|
||||||
return votes
|
return votes
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue