Compare commits

...

2 commits

Author SHA1 Message Date
j
301babd1dd more raw regexp strings 2024-07-08 13:33:07 +01:00
j
08636ba81a update ua 2024-07-08 13:26:48 +01:00
2 changed files with 39 additions and 39 deletions

View file

@ -18,7 +18,7 @@ from chardet.universaldetector import UniversalDetector
DEBUG = False DEBUG = False
# Default headers for HTTP requests. # Default headers for HTTP requests.
DEFAULT_HEADERS = { DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4', 'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',

View file

@ -43,8 +43,8 @@ def reference_section(id):
return { return {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id), r'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
'<a href="/name/.*?>(.*?)</a>' r'<a href="/name/.*?>(.*?)</a>'
], ],
'type': 'list' 'type': 'list'
} }
@ -54,8 +54,8 @@ def zebra_list(label, more=None):
conditions = { conditions = {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'_label">' + label + '</td>.*?<ul(.*?)</ul>', r'_label">' + label + '</td>.*?<ul(.*?)</ul>',
'<li.*?>(.*?)</li>' r'<li.*?>(.*?)</li>'
], ],
'type': 'list', 'type': 'list',
} }
@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'):
conditions = { conditions = {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'_label">' + label + '</td>.*?<td>(.*?)</td>', r'_label">' + label + '</td>.*?<td>(.*?)</td>',
], ],
'type': type, 'type': type,
} }
@ -97,9 +97,9 @@ def technical(label):
return { return {
'page': 'technical', 'page': 'technical',
're': [ 're': [
'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label, r'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
lambda data: [ lambda data: [
re.sub('\s+', ' ', d.strip()) for d in data.strip().split('<br>') re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('<br>')
] if data else [] ] if data else []
], ],
'type': 'list' 'type': 'list'
@ -258,13 +258,13 @@ class Imdb(SiteParser):
'aspectratio': { 'aspectratio': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)', r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
parse_aspectratio, parse_aspectratio,
], ],
'type': 'float', 'type': 'float',
}, },
'budget': zebra_table('Budget', more=[ 'budget': zebra_table('Budget', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+') lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
], type='int'), ], type='int'),
'cast': { 'cast': {
'page': 'reference', 'page': 'reference',
@ -287,12 +287,12 @@ class Imdb(SiteParser):
}, },
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']), 'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
'gross': zebra_table('Cumulative Worldwide Gross', more=[ 'gross': zebra_table('Cumulative Worldwide Gross', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+') lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
], type='int'), ], type='int'),
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']), 'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': { 'originalTitle': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>', 're': r'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
'type': 'string' 'type': 'string'
}, },
'summary': zebra_table('Plot Summary', more=[ 'summary': zebra_table('Plot Summary', more=[
@ -300,7 +300,7 @@ class Imdb(SiteParser):
]), ]),
'storyline': { 'storyline': {
'page': '', 'page': '',
're': '<h2>Storyline</h2>.*?<p>(.*?)</p>', 're': r'<h2>Storyline</h2>.*?<p>(.*?)</p>',
'type': 'string' 'type': 'string'
}, },
'posterId': { 'posterId': {
@ -312,16 +312,16 @@ class Imdb(SiteParser):
'productionCompany': { 'productionCompany': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'Production Companies.*?<ul(.*?)</ul>', r'Production Companies.*?<ul(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>' r'<a href="/company/.*?/">(.*?)</a>'
], ],
'type': 'list' 'type': 'list'
}, },
'rating': { 'rating': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'<div class="ipl-rating-star ">(.*?)</div>', r'<div class="ipl-rating-star ">(.*?)</div>',
'ipl-rating-star__rating">([\d,.]+?)</span>', r'ipl-rating-star__rating">([\d,.]+?)</span>',
], ],
'type': 'float' 'type': 'float'
}, },
@ -343,38 +343,38 @@ class Imdb(SiteParser):
'season': { 'season': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>', r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Season (\d+)', r'Season (\d+)',
], ],
'type': 'int' 'type': 'int'
}, },
'episode': { 'episode': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>', r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Episode (\d+)', r'Episode (\d+)',
], ],
'type': 'int' 'type': 'int'
}, },
'series': { 'series': {
'page': 'reference', 'page': 'reference',
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)', 're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
'type': 'string' 'type': 'string'
}, },
'isSeries': { 'isSeries': {
'page': 'reference', 'page': 'reference',
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"', 're': r'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
'type': 'string' 'type': 'string'
}, },
'title': { 'title': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<h2.*?>(.*?)</h2>', 're': r'<h2.*?>(.*?)</h2>',
'type': 'string' 'type': 'string'
}, },
'trivia': { 'trivia': {
'page': 'trivia', 'page': 'trivia',
're': [ 're': [
'<div class="sodatext">(.*?)<(br|/div)', r'<div class="sodatext">(.*?)<(br|/div)',
lambda data: data[0] lambda data: data[0]
], ],
'type': 'list', 'type': 'list',
@ -382,7 +382,7 @@ class Imdb(SiteParser):
'votes': { 'votes': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'class="ipl-rating-star__total-votes">\((.*?)\)', r'class="ipl-rating-star__total-votes">\((.*?)\)',
lambda r: r.replace(',', '') lambda r: r.replace(',', '')
], ],
'type': 'string' 'type': 'string'
@ -391,8 +391,8 @@ class Imdb(SiteParser):
'year': { 'year': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
'<span class="titlereference-title-year">(.*?)</span>', r'<span class="titlereference-title-year">(.*?)</span>',
'<a.*?>(\d+)', r'<a.*?>(\d+)',
], ],
'type': 'int' 'type': 'int'
}, },
@ -400,7 +400,7 @@ class Imdb(SiteParser):
'page': 'fullcredits', 'page': 'fullcredits',
're': [ 're': [
lambda data: data.split('<h4'), lambda data: data.split('<h4'),
'>(.*?)</h4>.*?(<table.*?</table>)', r'>(.*?)</h4>.*?(<table.*?</table>)',
lambda data: [d for d in data if d] lambda data: [d for d in data if d]
], ],
'type': 'list' 'type': 'list'
@ -468,7 +468,7 @@ class Imdb(SiteParser):
title = title[1:-1] title = title[1:-1]
if title.startswith("'") and title.endswith("'"): if title.startswith("'") and title.endswith("'"):
title = title[1:-1] title = title[1:-1]
title = re.sub('\(\#[.\d]+\)', '', title) title = re.sub(r'\(\#[.\d]+\)', '', title)
return title.strip() return title.strip()
for t in ('title', 'originalTitle'): for t in ('title', 'originalTitle'):
@ -518,7 +518,7 @@ class Imdb(SiteParser):
self['actor'] = [c[0] for c in self['cast']] self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c): def cleanup_character(c):
c = c.replace('(uncredited)', '').strip() c = c.replace('(uncredited)', '').strip()
c = re.sub('\s+', ' ', c) c = re.sub(r'\s+', ' ', c)
return c return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']] for x in self['cast']]
@ -528,7 +528,7 @@ class Imdb(SiteParser):
del self['isSeries'] del self['isSeries']
self['isSeries'] = True self['isSeries'] = True
if 'episodeTitle' in self: if 'episodeTitle' in self:
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle']) self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle'])
#make lists unique but keep order #make lists unique but keep order
@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1):
url = "http://www.imdb.com/find?" + params url = "http://www.imdb.com/find?" + params
data = read_url(url, timeout=timeout, unicode=True) data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page #if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />' r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
if results: if results:
return results[0] return results[0]
@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
data = read_url(url, timeout=timeout, unicode=True) data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page #if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />' r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
if results: if results:
return results[0] return results[0]
#otherwise get first result #otherwise get first result
r = '<td valign="top">.*?<a href="/title/tt(\d+)/"' r = r'<td valign="top">.*?<a href="/title/tt(\d+)/"'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
if results: if results:
return results[0] return results[0]
@ -885,7 +885,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
results = duckduckgo.find(google_query, timeout=timeout) results = duckduckgo.find(google_query, timeout=timeout)
if results: if results:
for r in results[:2]: for r in results[:2]:
imdbId = find_re(r[1], 'title/tt(\d+)') imdbId = find_re(r[1], r'title/tt(\d+)')
if imdbId: if imdbId:
return imdbId return imdbId
#or nothing #or nothing
@ -912,11 +912,11 @@ def get_episodes(imdbId, season=None):
if season: if season:
url += '?season=%d' % season url += '?season=%d' % season
data = cache.read_url(url).decode() data = cache.read_url(url).decode()
for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): for e in re.compile(r'<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
else: else:
data = cache.read_url(url) data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data) match = re.compile(r'<strong>Season (\d+)</strong>').findall(data)
if match: if match:
for season in range(1, int(match[0]) + 1): for season in range(1, int(match[0]) + 1):
episodes.update(get_episodes(imdbId, season)) episodes.update(get_episodes(imdbId, season))
@ -927,7 +927,7 @@ def max_votes():
data = cache.read_url(url).decode('utf-8', 'ignore') data = cache.read_url(url).decode('utf-8', 'ignore')
votes = max([ votes = max([
int(v.replace(',', '')) int(v.replace(',', ''))
for v in re.compile('Votes</span>.*?([\d,]+)', re.DOTALL).findall(data) for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
]) ])
return votes return votes