commit ca2a42e773: add scrapeit

18 changed files with 1864 additions and 0 deletions

scrapeit/__init__.py (new file, 14 lines added)
@@ -0,0 +1,14 @@
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8

import btjunkie
import google
import imdb
import mininova
import thepiratebay
import torrent
import rottentomatoes


__version__ = '1.0.0'

scrapeit/btjunkie.py (new file, 32 lines added)
@@ -0,0 +1,32 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup

from utils import read_url, stripTags
from btutils import torrentsWeLike


def search(query):
  '''search for torrents on btjunkie
  '''
  url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  torrents = soup.findAll('a', {'class': 'BlckUnd'})
  torrents = filter(torrentsWeLike, torrents)
  torrent_links = []
  for t in torrents:
    tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href']
    tlink = tlink.replace('do=stat', 'do=download')
    torrent_links.append(tlink)
  return torrent_links

def searchByImdb(imdb):
  '''search for torrents by imdb, not supported on btjunkie right now
  '''
  return []

scrapeit/btutils.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from utils import stripTags


def torrentsWeLike(link):
  '''check if torrent title looks like something we want to see,
     dvdrip / no cam / no dubbed versions
  '''
  text = stripTags(unicode(link)).lower()
  #no cams / telesyncs or other stuff
  for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'):
    if word in text:
      return False
  #no dubbed versions
  for word in ('italian', 'german', 'spanish', 'french'):
    if word in text:
      return False
  #only dvdrips or dvdscrs
  for word in ('dvdrip', 'dvdscr', 'dvd screener'):
    if word in text:
      return True
  return False

scrapeit/djangohtml.py (new file, 115 lines added)
@@ -0,0 +1,115 @@
"HTML utilities suitable for global use."

import re, string

# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']

unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
    ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
    '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable

def escape(html):
    "Returns the given HTML with ampersands, quotes and carets encoded"
    if not isinstance(html, basestring):
        html = str(html)
    return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')

def linebreaks(value):
    "Converts newlines into <p> and <br />s"
    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
    paras = re.split('\n{2,}', value)
    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
    return '\n\n'.join(paras)

def strip_tags(value):
    "Returns the given HTML with all tags stripped"
    return re.sub(r'<[^>]*?>', '', value)

def strip_spaces_between_tags(value):
    "Returns the given HTML with spaces between tags normalized to a single space"
    return re.sub(r'>\s+<', '> <', value)

def strip_entities(value):
    "Returns the given HTML with all entities (&something;) stripped"
    return re.sub(r'&(?:\w+|#\d);', '', value)

def fix_ampersands(value):
    "Returns the given HTML with all unencoded ampersands encoded correctly"
    return unencoded_ampersands_re.sub('&amp;', value)

def urlize(text, trim_url_limit=None, nofollow=False):
    """
    Converts any URLs in text into clickable links. Works on http://, https:// and
    www. links. Links can have trailing punctuation (periods, commas, close-parens)
    and leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, the URLs in link text will be limited to
    trim_url_limit characters.

    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
    """
    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
    words = word_split_re.split(text)
    nofollow_attr = nofollow and ' rel="nofollow"' or ''
    for i, word in enumerate(words):
        match = punctuation_re.match(word)
        if match:
            lead, middle, trail = match.groups()
            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if middle.startswith('http://') or middle.startswith('https://'):
                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                    and simple_email_re.match(middle):
                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
            if lead + middle + trail != word:
                words[i] = lead + middle + trail
    return ''.join(words)

def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Converts hard-coded bullets into HTML unordered lists.
        * Removes stuff like "<p>&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    from djangotext import normalize_newlines
    text = normalize_newlines(text)
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fix_ampersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        s = match.group().replace('</p>', '</li>')
        for d in DOTS:
            s = s.replace('<p>%s' % d, '<li>')
        return '<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;</p>", but only if it's at the bottom of the text.
    text = trailing_empty_content_re.sub('', text)
    return text
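
A quick usage sketch of the helpers above (not part of the commit; the sample strings are arbitrary, and the printed results are what the functions as written produce):

    from djangohtml import escape, urlize, clean_html

    print escape('Tom & Jerry <3')           # Tom &amp; Jerry &lt;3
    print urlize('see www.example.com now')  # see <a href="http://www.example.com">www.example.com</a> now
    print clean_html('<b>hi</b> & bye<p>&nbsp;</p>')  # <strong>hi</strong> &amp; bye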

scrapeit/djangotext.py (new file, 111 lines added)
@@ -0,0 +1,111 @@
import re

# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]

def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines (\n).
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    return reduce(lambda line, word, width=width: '%s%s%s' %
                  (line,
                   ' \n'[(len(line[line.rfind('\n')+1:])
                         + len(word.split('\n',1)[0]
                              ) >= width)],
                   word),
                  text.split(' ')
                 )

def truncate_words(s, num):
    "Truncates a string after a certain number of words."
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return ' '.join(words)

def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and all non-filename-safe characters
    are removed.
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    s = s.strip().replace(' ', '_')
    return re.sub(r'[^-A-Za-z0-9_.]', '', s)

def get_text_list(list_, last_word='or'):
    """
    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    if len(list_) == 0: return ''
    if len(list_) == 1: return list_[0]
    return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])

def normalize_newlines(text):
    return re.sub(r'\r\n|\r|\n', '\n', text)

def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    # capwords = ()
    text = text.lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    # for capword in capwords:
    #     capwordRE = re.compile(r'\b%s\b' % capword, re.I)
    #     text = capwordRE.sub(capword, text)
    return text

def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
         'y': '9', 'x': '9'}.get(m.group(0).lower())
    return letters.sub(char2number, phone)

# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()

smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.
    >>> list(smart_split('This is "a person\'s" test.'))
    ['This', 'is', '"a person\'s"', 'test.']
    """
    for bit in smart_split_re.finditer(text):
        bit = bit.group(0)
        if bit[0] == '"':
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif bit[0] == "'":
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield bit

scrapeit/epguides.py (new file, 68 lines added)
@@ -0,0 +1,68 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re

from BeautifulSoup import BeautifulSoup

from google import google
from utils import read_url, read_url_utf8, stripTags
import tvcom
import imdb

def epguidesUrl(title):
  '''
  Search Epguide Url for Show via Show Title.
  Use Google to search the url, this is also done on Epguide.
  '''
  for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
    if url.startswith('http://epguides.com'):
      if re.search(title, name):
        return url
  return None

def getShowImdb(title):
  imdbid = None
  url = epguidesUrl(title)
  if url:
    data = read_url(url)
    soup = BeautifulSoup(data)
    links = soup('a', {'href': re.compile('imdb.com/title/tt')})
    if links:
      link = links[0].get('href')
      imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
  if not imdbid:
    imdbid = imdb.guess(title)
  return imdbid

def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  Returns dict with title, show, description and episode
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  description = u''
  data = u''
  if not show_url:
    show_url = epguidesUrl(title)
  if show_url:
    data = read_url_utf8(show_url)
  else:
    return imdb.getEpisodeData(title, episode)
  estring = u'' + episode.replace('S','').replace('E','-').replace('0',' ').strip()
  for line in data.split('\n'):
    a = line.split(estring)
    if len(a) == 2:
      soup = BeautifulSoup(line)
      episodeData['title'] = soup('a')[0].contents[0]
      tvcom_url = soup('a')[0].get('href')
      episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
      break
  return episodeData

scrapeit/google.py (new file, 375 lines added)
@@ -0,0 +1,375 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
Query Web search engines.

This module works by filtering the HTML returned by the search engine and thus tends to break when
search engines modify their HTML output.

Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5.

See L{examples} for a quick start. See L{description} for the full
explanation, precautions, and legal disclaimers.

"""

import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue

from utils import read_url

__version__ = '1.0.2'

# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}

# Default maximum number of results.
DEFAULT_MAX_RESULTS = 10

# Function names for supported search engines.
SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo']

__all__ = SEARCH_ENGINES + ['examples', 'description']

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.
  """
  return urllib.quote_plus(s.encode('utf-8'))


def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
#  if url.lower().find('http%3a//') > 0:
#    return 'http://' + url[url.lower().rindex('http%3a//')+9:]
#  if url.find('http://') > 0:
#    return url[url.rindex('http://'):]
  return url


def get_search_page_links(page, results_per_page, begin, end, link_re):
  """
  Given str contents of search result page, return list of links.

  Returns list of (name, url, desc) str tuples. See make_searcher()
  for a description of results_per_page and link_re.
  """
  if begin is not None and begin in page:
    page = page[page.index(begin):]
  if end is not None and end in page:
    page = page[:page.index(end)]
  ans = []
  for match in re.compile(link_re, re.DOTALL).finditer(page):
    (name, url, desc) = match.group('name', 'url', 'desc')
    url = fix_url(url)
    ans += [(html_to_text(name), url, html_to_text(desc))]
  return ans


def html_to_text(s):
  """
  Given an HTML formatted str, convert it to a text str.
  """
  s = re.sub(r'<.*?>', '', s)
  s = s.replace('\r', ' ')
  s = s.replace('\n', ' ')
  s = s.replace('\t', ' ')
  s = s.replace('&amp;', '&')
  s = s.replace('&lt;', '<')
  s = s.replace('&gt;', '>')
  s = s.replace('&quot;', '"')
  s = s.replace('&middot;', '\xb7')
  for i in range(256):
    s = s.replace('&#%d;' % i, chr(i))
  while s.replace('  ', ' ') != s:
    s = s.replace('  ', ' ')
  return s.strip()


def nonblocking(f, blocking_return=None, sleep_time=0.01):
  """
  Wrap a callable which returns an iter so that it no longer blocks.

  The wrapped iterator returns blocking_return while callable f is
  blocking. The callable f is called in a background thread. If the
  wrapped iterator is deleted, then the iterator returned by f is
  deleted also and the background thread is terminated.
  """
  def g(*args, **kwargs):
    f_iter = f(*args, **kwargs)
    g_iter = None
    def run():
      while True:
        g_obj = g_iter()
        if g_obj is None:
          return
        if g_obj.q.qsize() == 0:
          try:
            f_next = f_iter.next()
          except Exception, e:
            g_obj.exc = e
            return
          g_obj.q.put(f_next)
        else:
          del g_obj
          time.sleep(sleep_time)
    class Iter:
      def __init__(self):
        self.q = Queue.Queue()
        self.exc = None
        self.thread = threading.Thread(target=run)
        self.thread.setDaemon(True)
      def next(self):
        if self.exc is not None:
          raise self.exc
        try:
          return self.q.get_nowait()
        except Queue.Empty:
          return blocking_return
      def __iter__(self):
        return self

    obj = Iter()
    g_iter = weakref.ref(obj)
    obj.thread.start()
    try:
      return obj
    finally:
      del obj
  return g


def make_searcher(query_url, results_per_page, page_url, page_mode,
                  begin, end, link_re):
  """
  Return a search function for the given search engine.

  Here query_url is the URL for the initial search, with %(q)s for
  the query string, results_per_page is the number of search results
  per page, page_url is the URL for the 2nd and subsequent pages of
  search results, with %(q)s for the query string and %(n)s for the
  page "number." Here page_mode controls the actual value for the
  page "number:"

   - page_mode='page0':   Use 0-based index of the page.
   - page_mode='page1':   Use 1-based index of the page.
   - page_mode='offset0': Use 0-based index of the search result,
                          which is a multiple of results_per_page.
   - page_mode='offset1': Use 1-based index of the search result
                          (one plus a multiple of results_per_page).

  If begin is not None, then only text after the first occurrence of
  begin will be used in the search results page. If end is not None,
  then only text before the first occurrence of end will be used.

  Finally, link_re is a regex string (see module re) which matches
  three named groups: 'name', 'url', and 'desc'. These correspond to
  the name, URL and description of each search result. The regex is
  applied in re.DOTALL mode.

  Returns a search() function which has the same interface as
  described in the module docstring.
  """
  def search_blocking(query, max_results):
    last_links = None
    page_num = 0
#    done = False
    q = Queue.Queue()
    for i in range(max_results):
      if q.qsize() == 0:
        if page_num == 0:
          page = read_url(query_url % {'q': quote_plus(query)})
        else:
#          if done:
#            break
          if page_mode == 'page0':
            n = page_num
          elif page_mode == 'page1':
            n = page_num + 1
          elif page_mode == 'offset0':
            n = page_num * results_per_page
          elif page_mode == 'offset1':
            n = page_num * results_per_page + 1
          else:
            raise ValueError('unknown page mode')
          page = read_url(page_url % {'n': n, 'q': quote_plus(query)})
        page_num += 1
        links = get_search_page_links(page, results_per_page, begin, end, link_re)
        if len(links) == 0 or links == last_links:
          break
#        if len(links) < results_per_page:
#          done = True
        last_links = links
        for link in links:
          q.put(link)
      yield q.get()

  search_nonblocking = nonblocking(search_blocking)

  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
    """
    See docstring for web_search module.
    """
    if blocking:
      return search_blocking(query, max_results)
    else:
      return search_nonblocking(query, max_results)

  return search


def examples():
  """
  Examples of the web_search module.

  Example 1:

   >>> from web_search import google
   >>> for (name, url, desc) in google('python', 20):
   ...   print name, url
   ...
   (First 20 results for Google search of "python").

  Example 2:

   >>> from web_search import dmoz
   >>> list(dmoz('abc', 10))
   [('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...]

  """
  print examples.__doc__


def description():
  """
  Full explanation and precautions for web_search module.

  The search functions in this module follow a common interface::

    search(query, max_results=10, blocking=True) =>
      iterator of (name, url, description) search results.

  Here query is the query string, max_results gives the maximum number
  of search results, and the items in the returned iterator are string
  3-tuples containing the Website name, URL, and description for each
  search result.

  If blocking=False, then an iterator is returned which does not block
  execution: the iterator yields None when the next search result is
  not yet available (a background thread is created).

  Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn',
  'yahoo'. This module is not associated with or endorsed by any of
  these search engine corporations.

  Be warned that if searches are made too frequently, or max_results is
  large and you enumerate all search results, then you will be a drain
  on the search engine's bandwidth, and the search engine organization
  may respond by banning your IP address or IP address range.

  This software has been placed in the public domain with the
  following legal notice::

    http://oregonstate.edu/~barnesc/documents/public_domain.txt

  """
  print description.__doc__


# --------------------------------------------------------------------
# Search engines
# --------------------------------------------------------------------

ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10,
                    'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1',
                    None, None,
                    r'<a .*? class="L4" href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                    r'.*?</div>(?P<desc>.*?)</div>')

dmoz = make_searcher('http://search.dmoz.org/cgi-bin/search?search=%(q)s', 20,
                     'http://search.dmoz.org/cgi-bin/search?start=%(n)d&search=%(q)s', 'offset1',
                     None, None,
                     r'<li><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                     r'.*? - (?P<desc>.*?)<br>')

excite = make_searcher('http://msxml.excite.com/info.xcite/search/web/%(q)s', 20,
                       'http://msxml.excite.com/info.xcite/search/web/%(q)s/%(n)d', 'offset1',
                       None, None,
                       r'<div class="listingmain" style=""><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                       r'(?P<desc>.*?)</span>')

google = make_searcher('http://www.google.com/search?q=%(q)s', 10,
                       'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
                       None, None,
                       r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
                       r'.*?(?:<br>|<table.*?>)' +
                       r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')

msn = make_searcher('http://search.msn.com/results.aspx?q=%(q)s', 10,
                    'http://search.msn.com/results.aspx?q=%(q)s&first=%(n)d', 'offset1',
                    '<h2>Results</h2>', '<div id="ads_rightC">',
                    r'<h3><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                    r'(?P<desc>.*?)<li class="first">')

yahoo = make_searcher('http://search.yahoo.com/search?p=%(q)s', 10,
                      'http://search.yahoo.com/search?p=%(q)s&b=%(n)d', 'offset1',
                      None, None,
                      '<li><div><a class=yschttl.*?href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                      r'.*?<div class=yschabstr>(?P<desc>.*?)</div>')

# --------------------------------------------------------------------
# Unit tests
# --------------------------------------------------------------------

def test_engine(search):
  """
  Test a search engine function returned by make_searcher().
  """
  for query in ['abc', 'microsoft', 'love', 'pweropieiw', 'addfdae']:
    popular = query in ['abc', 'microsoft', 'love', 'run']
    for n in [6, 17, 31]:
      n1 = len(list(search(query, n)))
      if popular:
        assert n1 == n
      else:
        assert n1 <= n
      n2 = 0
      for item in search(query, n, False):
        if item is not None:
          n2 += 1
        else:
          time.sleep(0.01)
      if popular:
        assert n2 == n
      else:
        assert n2 <= n


def test():
  """
  Unit test main routine.
  """
  import inspect
  print 'Testing:'
  for name in SEARCH_ENGINES:
    print ' ' + (name + ':').ljust(20),
    test_engine(getattr(inspect.getmodule(test), name))
  print 'OK'


if __name__ == '__main__':
  test()
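
For orientation, a quick usage sketch of the searchers defined above (not part of the commit; it assumes the package is importable as scrapeit and that the search engine's result HTML still matches the regexes):

    from scrapeit.google import google

    # blocking: iterate the first 5 results
    for (name, url, desc) in google('python', 5):
      print name, url

    # non-blocking: the iterator yields None while the background thread is still fetching
    results = google('python', 5, blocking=False)
    item = results.next()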

scrapeit/googlemovie.py (new file, 34 lines added)
@@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
from urllib import quote
from BeautifulSoup import BeautifulSoup

from utils import read_url, read_url_utf8, stripTags

def getGoogleMovieId(title):
  url = 'http://google.com/movies?q=%s&btnG=Search+Movies' % quote(title)
  data = read_url(url)
  cids = re.compile('reviews\?cid=(.*?)&').findall(data)
  if cids:
    return cids[0]
  return ''

def getGoogleMovieData(title, year = None, cid = None):
  gdata = {
    'title': title,
    'year': year,
    'cid': cid,
    'rating': '',
  }
  if not cid:
    cid = getGoogleMovieId("%s (%s)" % (title, year))
  if cid:
    gdata['cid'] = cid
    data = read_url('http://www.google.com/movies/reviews?cid=%s' % cid)
    gdata['rating'] = re.compile('font size=.3><b><nobr>(.*?) / 5').findall(data)[0]
    gdata['reviews'] = re.compile('Based on (.*?) reviews').findall(data)[0]
    gdata['year'] = re.compile("<title>.*?\((.*?)\).*?</title").findall(data)[0]
  return gdata

scrapeit/imdb.py (new file, 441 lines added)
@@ -0,0 +1,441 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import urllib2
from urllib import quote
import re, time
import os

from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup

from google import google
from utils import stripTags, read_url_utf8, htmldecode

import utils

def read_url(url):
  base = "/var/cache/scrapeit/cache/"
  path = os.path.join(base, url.replace('http://',''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    f = open(path)
    data = f.read()
    f.close()
    return data
  else:
    data = utils.read_url(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
      os.makedirs(folder)
    f = open(path, 'w')
    f.write(data)
    f.close()
    return data

def _get_data(url):
  data = None
  try:
    data = read_url(url)
  except:
    print "error reading data from", url
  return data

def get_image(url):
  return read_url(url)

def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []

def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "warning, parsing failed for", regexp
  return term.encode('utf8')


class IMDb:
  def __init__(self, imdb):
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload = False):
    if forcereload or not self.pageSource:
      self.pageSource = read_url(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = stripTags(value).strip()
    if key == 'runtime':
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    else:
      print value
      parsed_value = value
    return parsed_value

  def parse(self):
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
    title = u''
    year = u''
    flat_data = data.replace('\n', '').replace('\r', '')
    html_title = re.compile('<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data)
    if html_title:
      title = html_title[0][0]
      IMDbDict['year'] = html_title[0][1]
      IMDbDict['title'] = stripTags(title).strip()
    else:
      title = _getTerm(data, '<title>(.*?)</title>').split('(')
      year = title[-1].split(')')[0].strip()
      title = title[0].strip().decode('utf-8')
      IMDbDict['title'] = title
      IMDbDict['year'] = year
    IMDbDict['title'] = htmldecode(IMDbDict['title'])
    if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
      IMDbDict['title'] = IMDbDict['title'][1:-1]

    #Votes
    m = re.compile('<b>(.*?)/10</b> \(<a href="ratings">(.*?) votes</a>\)', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
      IMDbDict['votes'] = int(m.group(2).replace(',', ''))
    else:
      IMDbDict['rating'] = -1
      IMDbDict['votes'] = -1

    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)

    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    #credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
    credits['cast'] = []
    soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...','').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    if forcereload or not self.plotSource:
      self.plotSource = read_url(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = str(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n',' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
    #regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
      except:
        import traceback
        print traceback.print_exc()
        pass
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload = False):
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('/keyword')}):
      keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    triviaList = []
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
          trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections

  def getReleaseinfo(self, forcereload = False):
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    soup = BeautifulSoup(self.getReleaseinfo())
    for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'):
      d = row('td', {'align':'right'})
      if d:
        try:
          possible_date = stripTags(str(d[0])).strip()
          rdate = time.strptime(possible_date, "%d %B %Y")
          rdate = time.strftime('%Y-%m-%d', rdate)
          return rdate
        except:
          pass
    return None

  def getBusiness(self, forcereload = False):
    if forcereload or not self.businessSource:
      self.businessSource = read_url(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      cs = BeautifulSoup(c)
      line = c.split('</h5>')
      if line:
        title = line[0]
        line = line[1]
        if title in ['Budget', 'Gross']:
          values = re.compile('\$(.*?) ').findall(line)
          values = [int(value.replace(',','')) for value in values]
          if values:
            business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business

def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  data = u.read()
  return_url = u.url
  u.close()

  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
  req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  data = u.read()
  return_url = u.url
  u.close()
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]

  return None

def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  Returns dict with title, show, description and episode
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  description = u''
  if not show_url:
    imdbid = guess(title)
  else:
    imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
  if imdbid:
    i = IMDb(imdbid).parse()
    episodeData['title'] = i['episodes'][episode]['title']
    episodeData['description'] = i['episodes'][episode]['description']
    episodeData['imdb'] = i['episodes'][episode]['imdb']
  return episodeData


if __name__ == '__main__':
  import sys
  #print parse(sys.argv[1])
  print "imdb:", guess(sys.argv[1])
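
A minimal lookup sketch using the scraper above (not part of the commit; the title is an arbitrary example, and network access plus the /var/cache/scrapeit cache directory are assumed to be available):

    from scrapeit.imdb import IMDb, guess

    imdb_id = guess('The Conversation')   # resolves a title to a 7-digit IMDb id, or None
    if imdb_id:
      info = IMDb(imdb_id).parse()        # dict with title, year, rating, credits, plot, ...
      print info['title'], info['year']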

scrapeit/mininova.py (new file, 40 lines added)
@@ -0,0 +1,40 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import socket
from urllib import quote

from BeautifulSoup import BeautifulSoup

from utils import read_url, read_url_utf8
from btutils import torrentsWeLike

socket.setdefaulttimeout(10.0)

def search(query):
  '''search for torrents on mininova
  '''
  torrents = []
  url = "http://www.mininova.org/search/%s/seeds" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    links = row('a', {'href':re.compile('/tor')})
    if links and torrentsWeLike(links[0]):
      torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get')
      torrents.append(torrent_url)
  return torrents

def searchByImdb(imdb):
  '''search for torrents on mininova by imdb
  '''
  torrents = []
  page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    links = row('a', {'href':re.compile('/get')})
    if links:
      torrent_url = "http://www.mininova.org%s" % links[0].get('href')
      torrents.append(torrent_url)
  return torrents

scrapeit/rottentomatoes.py (new file, 37 lines added)
@@ -0,0 +1,37 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup

from utils import read_url


def getRottenTomatoes(rating = 70):
  '''
  Get movie TITLES
  rated ABOVE 70 or value passed as first argument
  from RottenTomatoes
  '''
  movies = []
  offset = 0
  titles = ['1']
  while titles:
    url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
    page = read_url(url)
    soup = BeautifulSoup(page)
    titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
    data = str(soup)
    ratings = re.compile('<span class="bold">(.*?) %</span>').findall(data)

    ratings = ratings[len(ratings) - len(titles):]

    for title in titles:
      movies.append({'title': title, 'rating': ratings[titles.index(title)], 'torrent': ''})

    offset += 10
  return movies

scrapeit/scrapetorrent.py (new file, 16 lines added)
@@ -0,0 +1,16 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup


def search(query):
  '''search for torrents on scrapetorrent
  '''
  torrents = []
  return torrents
104
scrapeit/thepiratebay.py
Normal file
@@ -0,0 +1,104 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
import socket
from urllib import quote

from BeautifulSoup import BeautifulSoup

from google import google
from utils import read_url, read_url_utf8


socket.setdefaulttimeout(10.0)

season_episode = re.compile("S..E..", re.IGNORECASE)


def shows(name = None):
  data = read_url_utf8('http://thepiratebay.org/tv/all')
  shows = re.compile('<dt><a href="/tv/(.*?)/">(.*?)</a></dt>').findall(data)
  if not name:
    return shows
  for show in shows:
    id = show[0]
    if name == show[1]:
      return id
  return ''


def findMatch(data, reg):
  m = re.compile(reg).findall(data)
  if m:
    return m[0]
  return u''


def get_info(url):
  url = url.strip()
  if url.startswith('/'):
    url = 'http://thepiratebay.org' + url
  data = read_url(url)
  line = data.replace('\n', ' ')
  info = {}
  info['torrent'] = findMatch(data, '(http://.*?.torrent)"')
  info['files'] = findMatch(data, '<dd><a href="/details.php.id=.*?&fl#show">(.*?)</a></dd>')
  try:
    info['files'] = int(info['files'])
  except:
    info['files'] = 0
  info['spoken_language'] = findMatch(line, '<dt>Spoken language\(s\):</dt>.*?<dd>(.*?)</dd>')
  info['texted_language'] = findMatch(line, '<dt>Texted language\(s\):</dt>.*?<dd>(.*?)</dd>')
  return info


def get_episode_name(string):
  episode = ''
  ep = season_episode.findall(string)
  if ep:
    episode = ep[0].upper()
  return episode


def in_killwords(string):
  string = string.lower()
  match = False
  for w in ['swesub', 'mpeg']:
    if w in string:
      match = True
  return match


def get_episode(show_id, episode):
  if show_id <= 0:
    return ''
  tpbe = get_episodes(show_id)
  for e in tpbe:
    link = e[0]
    ep = get_episode_name(e[1])
    if ep == episode:
      info = get_info(link)
      if not in_killwords(info['torrent']) \
         and info['files'] > 0 and info['files'] < 10 \
         and (not info['texted_language'] or info['texted_language'] == info['spoken_language']):
        return info['torrent']
  return u''


def get_episodes(id):
  data = read_url("http://thepiratebay.org/tv/%s" % id)
  episodes = re.compile('<nobr><a href="(.*?)">(.*?)</a></nobr>').findall(data)
  return episodes


def search(query):
  torrents = []
  url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    torrentType = row.findAll('td', {'class': 'vertTh'})
    if torrentType:
      torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
      # 201 = Movies , 202 = Movie DVDR
      if torrentType in ['201']:
        torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
        torrents.append(torrent)
  return torrents


def searchByImdb(imdb):
  return search("tt" + imdb)
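For orientation, a minimal usage sketch of the module above (not part of the commit; the query, show name and episode tag are placeholder values, and the scraper only matches the thepiratebay.org markup of the time):

# hypothetical usage of scrapeit.thepiratebay (illustration only)
from scrapeit import thepiratebay

movie_links = thepiratebay.search('some movie title')        # .torrent URLs from the Movies category
show_id = thepiratebay.shows('Some Show')                    # resolve a show name to its /tv/ id
episode_link = thepiratebay.get_episode(show_id, 'S01E01')   # best matching .torrent URL, or u''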
18
scrapeit/torrent.py
Normal file
@@ -0,0 +1,18 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import mininova
import btjunkie
import thepiratebay


def search(query):
  '''meta function to search with the best known torrent search engine
  '''
  return btjunkie.search(query)


def searchByImdb(imdb):
  '''meta function to search by imdb with the best known torrent search engine
  '''
  return mininova.searchByImdb(imdb)
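A short usage sketch for these meta functions (illustrative only; the query string and IMDb id are placeholders):

# hypothetical usage of scrapeit.torrent (illustration only)
from scrapeit import torrent

links = torrent.search('some movie title')      # currently delegates to btjunkie.search
links_by_id = torrent.searchByImdb('0012345')   # currently delegates to mininova.searchByImdb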
34
scrapeit/tvcom.py
Normal file
@@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re

from BeautifulSoup import BeautifulSoup

from utils import read_url_utf8, stripTags


def getEpisodeData(url):
  ''' parses information on tvcom episode pages
  returns dict with title, show, description, score
  '''
  tvcom = {
    'description': u''
  }
  data = read_url_utf8(url).replace('\n',' ')
  regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
  reg = re.compile(regexp, re.IGNORECASE)
  m = reg.findall(data)
  for match in m:
    description = match.strip()
    description = stripTags(description).replace('Watch Video','')
    tvcom['description'] = description.strip()
  soup = BeautifulSoup(data)
  #optional data
  try:
    tvcom['show'] = soup('h1')[0].contents[0]
    tvcom['title'] = soup('h1')[1].contents[0]
    tvcom['score'] = soup("span", {'class':"f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})[0].contents[0]
  except:
    pass
  return tvcom
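A hedged usage sketch for getEpisodeData (the tv.com URL below is a made-up placeholder; keys other than 'description' are only present when the page parses cleanly):

# hypothetical usage of scrapeit.tvcom (illustration only)
from scrapeit import tvcom

data = tvcom.getEpisodeData('http://www.tv.com/some-show/some-episode/episode/12345/summary.html')
print data['description']
print data.get('show'), data.get('title'), data.get('score')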
219
scrapeit/tvrss.py
Executable file
@@ -0,0 +1,219 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
import Image
import StringIO

import feedparser

from utils import read_url

# NOTE: this revision also references turbogears, sqlobject (AND, SQLObjectNotFound)
# and the btvcr project's model, IMDb, metacritic, tvcom, log and display_item
# objects without importing them here.


hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')


def get_url(title):
  return title.replace(' ','_').replace('/', '_').lower()


def get_show(string):
  return string.split(';')[0].split(':')[1].strip()


def get_title(string):
  title = string.split(';')[1].split(':')[1].strip()
  if title != 'n/a':
    return title
  return ''


def get_season(string):
  try:
    season = int(string.split(';')[2].split(':')[1].strip())
  except:
    return None
  return season


def get_episode(string):
  try:
    episode = int(string.split(';')[3].split(':')[1].strip())
  except:
    return None
  return episode


def get_episodedate(string):
  s = string.split('Episode Date:')
  if len(s) == 2:
    return s[1].strip()
  return None


def choose_item(old, new):
  if old['link'] == new['link']:
    return False
  if not hdtv.search(old['title']):
    if hdtv.search(new['title']):
      display_item(new)
      log.debug("vs.")
      display_item(old)
      return True
  return False


def get_imdbdata(imdbid):
  thumbnail = None
  description = ''
  imdb = IMDb.parse(imdbid)
  if imdb:
    poster = imdb['poster']
    if poster != 'http://i.imdb.com/Heads/npa.gif':
      log.debug("getting poster %s" % poster)
      try:
        thumbnail = read_url(poster)
        im = Image.open(StringIO.StringIO(thumbnail))
        out = StringIO.StringIO()
        im.crop((0,0,100,100)).convert().save(out, 'JPEG')
        thumbnail = out.getvalue()
      except:
        thumbnail = None
    if imdb['summary']:
      description = imdb['summary']
    else:
      description = imdb['tagline']
    return (imdb, description, thumbnail)
  else:
    return (imdb, '', None)


def load():
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    season = get_season(item['description'])
    episode = get_episode(item['description'])
    episodedate = get_episodedate(item['description'])
    estring = None
    if season and episode:
      estring = "S%02dE%02d" % (season, episode)
    elif episodedate:
      estring = episodedate
    if estring:
      if show and not hr_hdtv.search(item['title']):
        if shows.has_key(show):
          if shows[show].has_key(estring):
            if choose_item(shows[show][estring], item):
              shows[show][estring] = item
          else:
            shows[show][estring] = item
        else:
          shows[show] = {}
          shows[show][estring] = item
  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      pass
    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None
    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description = ''
      ur = '-'
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        import traceback
        traceback.print_exc()
        pass
      s = model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)
    for episode in shows[show]:
      episode_title = get_title(shows[show][episode]['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        e = model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = shows[show][episode]['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" % (episode, show))
  log.debug("updating tvrss done.")


if __name__ == '__main__':
  # first look on the command line for a desired config file,
  # if it's not on the command line, then
  # look for setup.py in this directory. If it's not there, this script is
  # probably installed
  if len(sys.argv) > 1:
    turbogears.update_config(configfile=sys.argv[1],
      modulename="btvcr.config")
  elif exists(join(dirname(__file__), "setup.py")):
    turbogears.update_config(configfile="dev.cfg",
      modulename="btvcr.config")
  else:
    turbogears.update_config(configfile="prod.cfg",
      modulename="btvcr.config")

  from btvcr.controllers import Root
  load()
150
scrapeit/utils.py
Normal file
@@ -0,0 +1,150 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
scrape tools
"""

import re
import time
import urllib
import urllib2

import djangohtml


# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.
  """
  return urllib.quote_plus(s.encode('utf-8'))


def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the str page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the str page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  ctype = f.headers.getheader('content-type')
  charset = ctype.split('charset=')
  if len(charset)>1: charset = charset[1]
  else: charset = 'latin-1'
  data = unicode(data, charset)
  return data


def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the str page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the str page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  return data


def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  opens given str URL and returns the url after redirection.
  """
  rurl = url
  try:
    req = urllib2.Request(url, None, headers)
    rurl = urllib2.urlopen(req).url
    rurl = rurl.replace('&src=rss', '')
  except:
    rurl = url
  return rurl


def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url


_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs

def html_entity_decode(s, encoding = 'utf-8'):
  r = []
  p = 0
  mo = _html_entity_re.search(s, p)
  while mo:
    r.append(s[p:mo.start()].decode(encoding))
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        c = int(e)
      elif i == 3:
        c = int(e, 16)
      else:
        assert 0
      r.append(unichr(c))
    except KeyError:
      r.append(mo.group(0))

    p = mo.end()
    mo = _html_entity_re.search(s, p)
  r.append(s[p:].decode(encoding))
  return u''.join(r)


def stripTags(s):
  return djangohtml.strip_tags(htmldecode(s))


from htmlentitydefs import name2codepoint

# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

def htmldecode(text):
  """Decode HTML entities in the given text."""
  if type(text) != unicode:
    text = unicode(text)
  if type(text) is unicode:
    uchr = unichr
  else:
    uchr = lambda value: value > 255 and unichr(value) or chr(value)
  def entitydecode(match, uchr=uchr):
    entity = match.group(1)
    if entity.startswith('#x'):
      return uchr(int(entity[2:], 16))
    elif entity.startswith('#'):
      return uchr(int(entity[1:]))
    elif entity in name2codepoint:
      return uchr(name2codepoint[entity])
    else:
      return match.group(0)
  return charrefpat.sub(entitydecode, text)
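For reference, a small sketch of how the helpers in utils.py are typically combined by the scrapers above (the example URL and query are placeholders):

# hypothetical usage of scrapeit.utils helpers (illustration only)
from scrapeit.utils import read_url, read_url_utf8, stripTags, quote_plus

html = read_url('http://example.com/search?q=%s' % quote_plus(u'some query'))  # raw str body
text = stripTags(html)                        # entity-decoded text with HTML tags removed
page = read_url_utf8('http://example.com/')   # unicode body, decoded via the content-type charset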
31
setup.py
Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
from setuptools import setup, find_packages

import os

setup(
  name="scrapeit",
  version="0.1",

  # uncomment the following lines if you fill them out in release.py
  description="collection of scrapers for various websites",
  author="bot",
  author_email="bot@mailb.org",
  #url=url,
  #download_url=download_url,
  #license=license,
  packages=find_packages(),
  zip_safe=False,
  keywords = [
  ],
  classifiers = [
    'Development Status :: 3 - Alpha',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Topic :: Software Development :: Libraries :: Python Modules',
  ],
)
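Since this is a standard setuptools setup.py, the package can be installed in the usual way, for example:

# install the scrapeit package (run from the repository root)
python setup.py install
# or, for development:
python setup.py develop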