From 1b9c4d288c8894f22958b5ade8cd9723cf0dab31 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sat, 15 Oct 2011 19:03:32 +0200 Subject: [PATCH 01/14] check before use --- ox/web/imdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index a9d3d88..5771b5b 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -330,7 +330,8 @@ class Imdb(SiteParser): self[key] = filter(lambda x: x.lower() != 'home', self[key]) if 'creator' in self: - self['episodeDirector'] = self['director'] + if 'director' in self: + self['episodeDirector'] = self['director'] self['director'] = self['creator'] if 'series' in self: if 'episodeTitle' in self: From 0b380abaebf8cca2911f379c3ba06e709b4d52f6 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sat, 15 Oct 2011 21:32:32 +0200 Subject: [PATCH 02/14] series creator(s) fixed #25 --- ox/web/imdb.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 5771b5b..417d15b 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -88,7 +88,7 @@ class Imdb(SiteParser): 'creator': { 'page': 'combined', 're': [ - '
Creators:
.*?
(.*?)
', + '
Creator.?:
.*?
(.*?)
', ' Date: Mon, 17 Oct 2011 13:56:49 +0200 Subject: [PATCH 04/14] . --- ox/movie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/movie.py b/ox/movie.py index 75c5dab..c13d108 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -61,7 +61,7 @@ def parse_movie_path(path): director = [] #extension/language - fileparts = parts[-1].split('.') + fileparts = [x.replace('||', '. ') for x in parts[-1].replace('. ', '||').split('.')] extension = fileparts[-1] if len(fileparts[-2]) == 2: From 7a0fad1e03c16ad3022e9ed541ce6a8318714060 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Mon, 17 Oct 2011 22:53:02 +0200 Subject: [PATCH 05/14] series without episodes --- ox/movie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/movie.py b/ox/movie.py index c13d108..6b6eae0 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -152,7 +152,7 @@ def get_oxid(title, director=[], year='', return hashlib.sha1(string.encode('utf-8')).hexdigest().upper() director = ', '.join(director) episode_director = ', '.join(episode_director) - if not episode: + if not episode and not episode_title: oxid = get_hash(director)[:8] + get_hash('\n'.join([title, str(year)]))[:8] else: oxid = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \ From fc95c4797b24a6ec6da0265bc60df12c13e26f77 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 10:19:02 +0200 Subject: [PATCH 06/14] script titles --- ox/web/imdb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index c555ab3..ba2154e 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -266,6 +266,7 @@ class Imdb(SiteParser): #only list one country per alternative title def is_international_title(t): + if 'script title' in t[1].lower(): return False if 'recut version' in t[1].lower(): return False if 'working title' in t[1].lower(): return False if 'complete title' in t[1].lower(): return False From 2057e699bde0de7fcba10943728f778aec80d743 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 12:58:03 +0200 Subject: [PATCH 07/14] special case series without creator but a director --- ox/web/imdb.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index ba2154e..3696b8a 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -102,6 +102,14 @@ class Imdb(SiteParser): ], 'type': 'list' }, + '_director': { + 'page': 'combined', + 're': [ + '
Director:
.*?
(.*?)
', + '
>> decodeHtml('me & you and $&%') u'me & you and $&%' + >>> decodeHtml('€') + u'€' """ if type(html) != unicode: html = unicode(html)[:] @@ -146,7 +148,9 @@ def decodeHtml(html): uchr = lambda value: value > 255 and unichr(value) or chr(value) def entitydecode(match, uchr=uchr): entity = match.group(1) - if entity.startswith('#x'): + if entity == '#x80': + return u'€' + elif entity.startswith('#x'): return uchr(int(entity[2:], 16)) elif entity.startswith('#'): return uchr(int(entity[1:])) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 3696b8a..ad97bf4 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -356,6 +356,10 @@ class Imdb(SiteParser): for key in ['creator', 'year', 'country']: if key in series: self[key] = series[key] + + if not 'director' in self and 'director' in series: + self['director'] = series['director'] + if 'originalTitle' in self: del self['originalTitle'] else: From 71abfcc3078402ceeaac34fcbcadc73941163335 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 14:33:45 +0200 Subject: [PATCH 09/14] directors are creators --- ox/web/imdb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index ad97bf4..a9e9e2f 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -353,13 +353,13 @@ class Imdb(SiteParser): self['episode%s'%key] = self[key.lower()] series = Imdb(self['series']) + if not 'creator' in series and 'director' in series: + series['creator'] = series['director'] + for key in ['creator', 'year', 'country']: if key in series: self[key] = series[key] - if not 'director' in self and 'director' in series: - self['director'] = series['director'] - if 'originalTitle' in self: del self['originalTitle'] else: From 3e7b463ac8d2a0e9362655edd6f6eb75e312e375 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 14:45:00 +0200 Subject: [PATCH 10/14] International (Spanish --- ox/web/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index a9e9e2f..3147e9c 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -279,7 +279,7 @@ class Imdb(SiteParser): if 'working title' in t[1].lower(): return False if 'complete title' in t[1].lower(): return False if t[1].lower() == 'usa': return True - if 'international' in t[1].lower(): return True + #if 'international' in t[1].lower(): return True #fails if orignial is english... Japan (English title) #if 'english title' in t[1].lower(): return True return False From 23ea669b7d200f7a51fab3841141b7a497b055bd Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 14:57:31 +0200 Subject: [PATCH 11/14] cleanup summary --- ox/web/imdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 3147e9c..1e7fc49 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -377,6 +377,8 @@ class Imdb(SiteParser): if 'releaseDate' in self: if isinstance(self['releaseDate'], list): self['releaseDate'] = min(self['releaseDate']) + if 'summary' in self: + self['summary'] = self['summary'].split(' Date: Tue, 18 Oct 2011 15:30:16 +0200 Subject: [PATCH 12/14] _ --- ox/web/imdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 1e7fc49..1274a96 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -530,8 +530,8 @@ def getMoviePoster(imdbId): 'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg' ''' info = ImdbCombined(imdbId) - if 'poster_id' in info: - url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId) + if 'posterId' in info: + url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId) data = readUrl(url) poster = findRe(data, 'img id="primary-img".*?src="(.*?)"') return poster From 60d8c6bc05e6572af685e2ae3d8f2fd1a4a545bf Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 15:50:16 +0200 Subject: [PATCH 13/14] not everybody can be --- ox/web/imdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 1274a96..bfb1cba 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -355,6 +355,8 @@ class Imdb(SiteParser): if not 'creator' in series and 'director' in series: series['creator'] = series['director'] + if len(series['creator']) > 10: + series['creator'] = series['director'][:1] for key in ['creator', 'year', 'country']: if key in series: From c8dc06d68265e265957a44bde0d85ce78480cb68 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 18 Oct 2011 16:15:07 +0200 Subject: [PATCH 14/14] take International (English title) --- ox/web/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index bfb1cba..c3c2338 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -279,7 +279,7 @@ class Imdb(SiteParser): if 'working title' in t[1].lower(): return False if 'complete title' in t[1].lower(): return False if t[1].lower() == 'usa': return True - #if 'international' in t[1].lower(): return True + if 'international (english title)' in t[1].lower(): return True #fails if orignial is english... Japan (English title) #if 'english title' in t[1].lower(): return True return False