# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox
from ox import strip_tags, find_re
# NOTE(review): this span looks damaged -- indentation is gone, two string
# literals are split across lines, and the statement that starts as the
# 'description' lookup ends as a `.findall(data)` call. It appears to be
# get_data() fused with the tail of a second function (one that returns
# `ids`); the lines in between are not present here. Restore from the
# original file before relying on any of it.
#
# get_data(id): fetches the stock-photo page for `id` and fills the `info`
# dict with fields extracted from the page text by regular expressions.
def get_data(id):
base = 'http://www.istockphoto.com'
# The URL is built from the argument as passed in; only afterwards is `id`
# replaced by the result of matching '\d+' against it.
url = base + '/stock-photo-%s.php' % id
id = find_re(id, '\d+')
# NOTE(review): the meaning of timeout=-1 is defined by ox.cache and is not
# visible here -- confirm.
data = ox.cache.read_url(url, timeout=-1)
info = {}
# NOTE(review): the pattern literal below is broken across two lines, which
# is not valid Python; part of the pattern seems to have been lost.
info['title'] = ox.find_re(data, '
(.*?) \|')
# The thumbnail path captured from the page is appended to `base`.
info['thumbnail'] = base + ox.find_re(data, 'src="(/file_thumbview_approve/%s.*?)"'%id)
# NOTE(review): a bare `|` in a regex is alternation, so the ' | ' runs in
# the patterns below split them into alternatives; they look like remnants
# of removed markup -- verify against the original patterns.
info['views'] = ox.find_re(data, 'Views: | (\d+) | ')
info['collections'] = strip_tags(ox.find_re(data, 'Collections: | (.*?) | ')).split(', ')
# Keeps only entries that are non-empty after stripping whitespace (on
# Python 3, filter() and map() return lazy iterators rather than lists).
info['collections'] = filter(lambda x: x.strip(), info['collections'])
info['keywords'] = map(lambda k: k.strip(), strip_tags(ox.find_re(data, 'Keywords: | .*?(.*?)\.\.\.<')).split(', '))
# NOTE(review): this second assignment overwrites the keyword list built on
# the previous line; the key was presumably meant to be a different one.
info['keywords'] = ox.find_re(data, '.*?(\d+) ')
info['contributor'] = ox.find_re(data, ' | Contributor: | .*?.*?alt="(.*?)"')
# NOTE(review): the literal opened on the next line is closed on the line
# after it as part of a different call. From there on, `collection`,
# `timeout` and `ids` are used without a visible definition.
info['description'] = strip_tags(ox.find_re(data, 'artistsDescriptionData = \["(.*?)
').findall(data)
# Page numbers are collected from the paginator links in the fetched page.
pages = re.compile('class="paginatorLink">(\d+)').findall(data)
if pages:
# range(2, max) stops before the highest page number found -- TODO confirm
# whether skipping the last page is intended.
for page in range(2, max(map(int, pages))):
url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
data = ox.cache.read_url(url, timeout=timeout)
# NOTE(review): the pattern here is empty, so findall() yields only empty
# strings; the real id pattern appears to have been lost.
ids += re.compile('').findall(data)
return ids
def get_ids():
    """Return one flat list with the ids of every browsed collection.

    Calls getCollectionIds() once per collection name, in the order listed,
    and concatenates the results.
    """
    collection_names = (
        'vetta',
        'agency',
        'dollarbin',
        'latest/photo',
        'latest/photo/exclusive',
    )
    return [
        found
        for name in collection_names
        for found in getCollectionIds(name)
    ]