fix release date parser
This commit is contained in:
parent
d7bd98d63a
commit
f7e9605828
1 changed file with 18 additions and 4 deletions
|
@@ -15,6 +15,7 @@ import ox.cache
|
||||||
from siteparser import SiteParser
|
from siteparser import SiteParser
|
||||||
import duckduckgo
|
import duckduckgo
|
||||||
|
|
||||||
|
from ..utils import datetime
|
||||||
|
|
||||||
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
|
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
|
||||||
headers = headers.copy()
|
headers = headers.copy()
|
||||||
|
@@ -210,8 +211,11 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'releasedate': {
|
'releasedate': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
|
're': [
|
||||||
'type': 'date'
|
'<td class="release_date">(.*?)</td>',
|
||||||
|
ox.strip_tags,
|
||||||
|
],
|
||||||
|
'type': 'list'
|
||||||
},
|
},
|
||||||
'reviews': {
|
'reviews': {
|
||||||
'page': 'externalreviews',
|
'page': 'externalreviews',
|
||||||
|
@@ -549,8 +553,18 @@ class Imdb(SiteParser):
|
||||||
self['profit'] = self['gross'] - self['budget']
|
self['profit'] = self['gross'] - self['budget']
|
||||||
|
|
||||||
if 'releasedate' in self:
|
if 'releasedate' in self:
|
||||||
if isinstance(self['releasedate'], list):
|
def parse_date(d):
|
||||||
self['releasedate'] = min(self['releasedate'])
|
try:
|
||||||
|
d = datetime.strptime(d, '%d %B %Y')
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
d = datetime.strptime(d, '%B %Y')
|
||||||
|
except:
|
||||||
|
return 'x'
|
||||||
|
return '%d-%02d-%02d' % (d.year, d.month, d.day)
|
||||||
|
self['releasedate'] = min([
|
||||||
|
parse_date(d) for d in self['releasedate']
|
||||||
|
])
|
||||||
if 'summary' in self:
|
if 'summary' in self:
|
||||||
self['summary'] = self['summary'].split('</p')[0].strip()
|
self['summary'] = self['summary'].split('</p')[0].strip()
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue