dates, reduce number of imdb pages loaded

This commit is contained in:
j 2010-07-10 13:54:33 +02:00
parent 18ce4cd92d
commit f3147437b6
4 changed files with 16 additions and 8 deletions

1
README
View file

@ -5,6 +5,7 @@ Depends:
python-chardet (http://chardet.feedparser.org/) python-chardet (http://chardet.feedparser.org/)
python-feedparser (http://www.feedparser.org/) python-feedparser (http://www.feedparser.org/)
python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/) python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
django (optional, otherwise dates < 1900 are not supported)
Usage: Usage:
import ox import ox

7
ox/utils.py Normal file
View file

@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# Export a single `datetime` name for the rest of the package.
# Django's datetime_safe variant is preferred when Django is installed; per the
# README, Django is optional and dates < 1900 are not supported without it.
try:
    from django.utils.datetime_safe import datetime
except ImportError:
    # Django (or its datetime_safe module) is unavailable: fall back to the
    # standard library. Only ImportError is caught so that unrelated failures
    # and KeyboardInterrupt/SystemExit are not silently swallowed.
    from datetime import datetime

View file

@ -26,7 +26,7 @@ class Imdb(SiteParser):
}, },
'cast': { 'cast': {
'page': 'fullcredits', 'page': 'combined',
're': [ 're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>', '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [stripTags(l) for l in ll] lambda ll: [stripTags(l) for l in ll]
@ -34,7 +34,7 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'cinematographers': { 'cinematographers': {
'page': 'fullcredits', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>', 'Cinematography by</a>(.*?)</table>',
@ -53,7 +53,7 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'directors': { 'directors': {
'page': 'fullcredits', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
'Directed by</a>(.*?)</table>', 'Directed by</a>(.*?)</table>',
@ -62,7 +62,7 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'editors': { 'editors': {
'page': 'fullcredits', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>', 'Film Editing by</a>(.*?)</table>',
@ -152,7 +152,7 @@ class Imdb(SiteParser):
'type': 'string' 'type': 'string'
}, },
'writers': { 'writers': {
'page': 'fullcredits', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>', 'Writing credits</a>(.*?)</table>',

View file

@ -1,10 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from datetime import datetime
from ox.cache import readUrlUnicode from ..cache import readUrlUnicode
from ox import stripTags, decodeHtml from .. import stripTags, decodeHtml
from ..utils import datetime
def cleanup(key, data, data_type): def cleanup(key, data, data_type):