# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import ox
from ox import strip_tags, find_re


def get_data(id):
    """Scrape the metadata of a single istockphoto stock-photo page.

    Args:
        id: Photo identifier as it appears in the page URL
            (``/stock-photo-<id>.php``). The first run of digits in it is
            used to locate the thumbnail.

    Returns:
        dict with the keys ``title``, ``thumbnail``, ``views``,
        ``collections`` (list), ``keywords`` (sorted list), ``uploaded``,
        ``downloads``, ``contributor``, ``description`` and ``similar``
        (list of photo ids found on the page).
    """
    base = 'http://www.istockphoto.com'
    # The URL is built from the identifier exactly as given; only afterwards
    # is it reduced to its numeric part for the thumbnail lookup.
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, r'\d+')
    # NOTE(review): timeout=-1 is presumably ox's "cached copy never expires"
    # setting -- confirm against ox.cache.
    data = ox.cache.read_url(url, timeout=-1)

    info = {}
    # NOTE(review): this pattern has no left anchor; confirm that it captures
    # only the page title and not everything before the first " |".
    info['title'] = find_re(data, r'(.*?) \|')
    info['thumbnail'] = base + find_re(
        data, r'src="(/file_thumbview_approve/%s.*?)"' % id)
    info['views'] = find_re(data, r'<tr><td>Views:</td><td>(\d+)</td>')

    collections = strip_tags(
        find_re(data, r'<td>Collections:</td><td>(.*?)</td>')).split(', ')
    # Build a real list: filter() would leave a lazy iterator here on
    # Python 3. Blank entries are dropped, the kept ones are not stripped.
    info['collections'] = [name for name in collections if name.strip()]

    # The keywords come from the <meta name="keywords"> tag. An earlier
    # scrape of the (truncated) "Keywords:" table cell was dropped because
    # its result was overwritten immediately.
    info['keywords'] = sorted(find_re(
        data, r'<meta name="keywords" content="(.*?), stock image').split(', '))

    info['uploaded'] = find_re(data, r'<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = find_re(data, r'<span class="fl">.*?(\d+) </span>')
    info['contributor'] = find_re(
        data,
        r'<td class="m">Contributor:</td>.*?'
        r'<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')

    description = strip_tags(
        find_re(data, r'artistsDescriptionData = \["(.*?)<br'))
    # Keep only the text in front of the "CLICK TO SEE" marker.
    info['description'] = description.split('CLICK TO SEE')[0].strip()

    info['similar'] = re.compile(r'size=1\&id=(\d+)').findall(data)
    return info


def get_collection_ids(collection, timeout=-1):
    """Collect the photo ids listed on every page of a browse collection.

    Args:
        collection: Path below ``/browse/``, e.g. ``'vetta'`` or
            ``'latest/photo'``.
        timeout: Passed through to ``ox.cache.read_url`` for every request.

    Returns:
        list of id strings taken from the ``/stock-photo-<id>.php`` links,
        in page order.
    """
    photo_link = re.compile(r'<a href="/stock-photo-(.*?)\.php">')
    url = "http://www.istockphoto.com/browse/%s/" % (collection)
    data = ox.cache.read_url(url, timeout=timeout)
    ids = photo_link.findall(data)

    pages = re.compile(r'class="paginatorLink">(\d+)</a>').findall(data)
    if pages:
        last_page = max(map(int, pages))
        # Page 1 was fetched above. The upper bound is last_page + 1 so that
        # the final page is included (range() excludes its end value).
        for page in range(2, last_page + 1):
            url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
            data = ox.cache.read_url(url, timeout=timeout)
            ids += photo_link.findall(data)
    return ids


def get_ids():
    """Return the photo ids of all collections this module knows about."""
    ids = []
    for collection in ('vetta', 'agency', 'dollarbin', 'latest/photo',
                       'latest/photo/exclusive'):
        # Previously called the undefined name getCollectionIds (NameError).
        ids += get_collection_ids(collection)
    return ids