49 lines
2.3 KiB
Python
49 lines
2.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
import re
|
|
|
|
import ox
|
|
from ox import strip_tags, find_re
|
|
|
|
|
|
def get_data(id):
    """Scrape the metadata of one iStockphoto stock-photo page.

    The page ``http://www.istockphoto.com/stock-photo-<id>.php`` is fetched
    through ``ox.cache.read_url`` and picked apart with regular expressions.

    Args:
        id: Photo identifier as it appears in the page URL. The full value is
            used to build the URL; only its first run of digits is used to
            locate the thumbnail path.

    Returns:
        dict with the keys ``title``, ``thumbnail``, ``views``,
        ``collections`` (list), ``keywords`` (sorted list), ``uploaded``,
        ``downloads``, ``contributor``, ``description`` and ``similar``
        (list of ids found in "size=1&id=<n>" links).
    """
    base = 'http://www.istockphoto.com'
    url = base + '/stock-photo-%s.php' % id
    # The URL keeps the id exactly as passed in; the thumbnail path is matched
    # with the digits-only part.
    numeric_id = find_re(id, r'\d+')
    # NOTE(review): the patterns below are text patterns, so read_url is
    # assumed to return text here -- confirm against the ox version in use.
    data = ox.cache.read_url(url, timeout=-1)

    info = {}
    info['title'] = find_re(data, r'<title>(.*?) \|')
    info['thumbnail'] = base + find_re(
        data, 'src="(/file_thumbview_approve/%s.*?)"' % numeric_id)
    info['views'] = find_re(data, r'<tr><td>Views:</td><td>(\d+)</td>')

    collections = strip_tags(
        find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    # Build a real list (filter() is lazy on Python 3) and drop blank entries;
    # the kept entries are left unstripped, as before.
    info['collections'] = [name for name in collections if name.strip()]

    # Keywords come from the <meta name="keywords"> tag. The earlier scrape of
    # the truncated "Keywords:" table cell was overwritten immediately and has
    # been removed.
    keywords = find_re(
        data, '<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'] = sorted(keywords)

    info['uploaded'] = find_re(data, r'<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = find_re(data, r'<span class="fl">.*?(\d+) </span>')
    info['contributor'] = find_re(
        data,
        r'<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')

    description = strip_tags(
        find_re(data, r'artistsDescriptionData = \["(.*?)<br'))
    # Keep only the text in front of the "CLICK TO SEE" marker.
    info['description'] = description.split('CLICK TO SEE')[0].strip()

    info['similar'] = re.compile(r'size=1\&id=(\d+)').findall(data)
    return info
|
|
|
|
def get_collection_ids(collection, timeout=-1):
    """Collect the photo ids linked from every page of a browse collection.

    Args:
        collection: Path below ``/browse/`` on www.istockphoto.com, for
            example ``'vetta'`` or ``'latest/photo'``.
        timeout: Passed unchanged to ``ox.cache.read_url`` for every request.

    Returns:
        list of the id strings captured from ``/stock-photo-<id>.php`` links,
        first page first, duplicates not removed.
    """
    # Compiled once and reused for every page; the dot before "php" is escaped
    # so it only matches a literal ".php" suffix.
    photo_link = re.compile(r'<a href="/stock-photo-(.*?)\.php">')

    url = "http://www.istockphoto.com/browse/%s/" % (collection)
    data = ox.cache.read_url(url, timeout=timeout)
    ids = photo_link.findall(data)

    pages = re.compile(r'class="paginatorLink">(\d+)</a>').findall(data)
    if pages:
        last_page = max(map(int, pages))
        # range() excludes its stop value, so add 1 to fetch the highest
        # numbered page as well (it used to be skipped).
        for page in range(2, last_page + 1):
            url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
            data = ox.cache.read_url(url, timeout=timeout)
            ids += photo_link.findall(data)
    return ids
|
|
|
|
def get_ids():
    """Return the photo ids of a fixed set of browse collections.

    Each collection is looked up with ``get_collection_ids`` using its default
    timeout, and the results are concatenated in the order listed below.

    Returns:
        list of photo id strings; duplicates across collections are kept.
    """
    collections = (
        'vetta',
        'agency',
        'dollarbin',
        'latest/photo',
        'latest/photo/exclusive',
    )
    ids = []
    for collection in collections:
        # The previous call target, getCollectionIds, is not defined in this
        # module and raised NameError; the function is get_collection_ids.
        ids.extend(get_collection_ids(collection))
    return ids
|
|
|