openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/istockphoto.py

50 lines
2.3 KiB
Python
Raw Normal View History

2013-10-11 17:28:32 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox
from ox import strip_tags, find_re
def get_data(id):
    """Scrape metadata for one iStockphoto stock-photo page.

    The page at ``http://www.istockphoto.com/stock-photo-<id>.php`` is
    fetched through ``ox.cache.read_url`` and picked apart with regular
    expressions.

    ``id`` is the photo identifier exactly as it appears in the page URL.
    Only its first run of digits is used to locate the thumbnail path.

    Returns a dict with the keys ``title``, ``thumbnail``, ``views``,
    ``collections``, ``keywords``, ``uploaded``, ``downloads``,
    ``contributor``, ``description`` and ``similar``.  ``collections``,
    ``keywords`` (sorted) and ``similar`` are lists; every other value is
    whatever ``find_re`` extracted (presumably an empty string when the
    pattern does not match -- verify against ``ox.find_re``).
    """
    base = 'http://www.istockphoto.com'
    # Build the URL from the id as given; the numeric part is extracted
    # afterwards and is only needed for the thumbnail pattern below.
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, r'\d+')
    # NOTE(review): timeout=-1 presumably means "never expire the cached
    # copy" -- confirm against ox.cache.
    data = ox.cache.read_url(url, timeout=-1)

    info = {}
    info['title'] = find_re(data, r'<title>(.*?) \|')
    info['thumbnail'] = base + find_re(
        data, 'src="(/file_thumbview_approve/%s.*?)"' % id)
    info['views'] = find_re(data, r'<tr><td>Views:</td><td>(\d+)</td>')

    collections = strip_tags(
        find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    # Drop blank entries.  A list comprehension (rather than filter())
    # yields a real list on both Python 2 and Python 3.
    info['collections'] = [c for c in collections if c.strip()]

    # Keywords come from the <meta> tag.  The truncated "Keywords:" table
    # cell used to be parsed first, but that result was always overwritten.
    info['keywords'] = sorted(find_re(
        data, '<meta name="keywords" content="(.*?), stock image').split(', '))

    info['uploaded'] = find_re(data, r'<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = find_re(data, r'<span class="fl">.*?(\d+)&nbsp;</span>')
    info['contributor'] = find_re(
        data,
        r'<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')

    description = strip_tags(
        find_re(data, r'artistsDescriptionData = \["(.*?)<br'))
    # Keep only the text in front of the "CLICK TO SEE" marker.
    info['description'] = description.split('CLICK TO SEE')[0].strip()

    info['similar'] = re.compile(r'size=1&id=(\d+)').findall(data)
    return info
def get_collection_ids(collection, timeout=-1):
    """Collect the stock-photo ids listed on a collection's browse pages.

    ``collection`` is the path segment below ``/browse/`` (for example
    ``'vetta'`` or ``'latest/photo'``).  ``timeout`` is handed unchanged
    to ``ox.cache.read_url`` for every page that is fetched.

    The first page is always read.  If it contains paginator links, pages
    2 up to and including the highest linked page number are read as well.

    Returns a list of id strings in page order; duplicates are kept.
    """
    # The dot in ".php" is escaped so that it only matches a literal dot.
    photo_link = re.compile(r'<a href="/stock-photo-(.*?)\.php">')

    url = "http://www.istockphoto.com/browse/%s/" % collection
    data = ox.cache.read_url(url, timeout=timeout)
    ids = photo_link.findall(data)

    pages = re.compile(r'class="paginatorLink">(\d+)</a>').findall(data)
    if pages:
        last_page = max(int(number) for number in pages)
        # range() excludes its upper bound, so add 1 to reach the last page.
        for page in range(2, last_page + 1):
            url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
            data = ox.cache.read_url(url, timeout=timeout)
            ids += photo_link.findall(data)
    return ids
def get_ids():
    """Return the photo ids of a fixed set of collections as one list.

    Each collection is queried with ``get_collection_ids`` using its
    default timeout, and the results are concatenated in the order the
    collections are listed below.  Ids that appear in more than one
    collection are not de-duplicated.
    """
    collections = (
        'vetta',
        'agency',
        'dollarbin',
        'latest/photo',
        'latest/photo/exclusive',
    )
    ids = []
    for collection in collections:
        # This used to call getCollectionIds, a name this module does
        # not define.
        ids += get_collection_ids(collection)
    return ids