openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/istockphoto.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import ox
from ox import strip_tags, find_re


def get_data(id):
    """Scrape metadata for one iStockphoto item from its stock-photo page.

    The page is fetched via ``ox.cache.read_url`` and its HTML is picked
    apart with regular expressions.

    :param id: item identifier as used in the ``/stock-photo-<id>.php`` URL.
    :returns: dict with the keys ``title``, ``thumbnail``, ``views``,
        ``collections`` (list), ``keywords`` (sorted list), ``uploaded``,
        ``downloads``, ``contributor``, ``description`` and ``similar``
        (list of ids found in ``size=1&id=...`` links).

    NOTE(review): ``ox.find_re`` presumably returns an empty string when a
    pattern does not match, so missing fields would come back empty rather
    than raise -- confirm against the ox library.
    """
    base = 'http://www.istockphoto.com'
    # The URL is built from the id exactly as passed in; only afterwards is
    # the id reduced to its digits, which is what the thumbnail path uses.
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, r'\d+')
    data = ox.cache.read_url(url, timeout=-1)

    info = {}
    info['title'] = ox.find_re(data, r'<title>(.*?) \|')
    info['thumbnail'] = base + ox.find_re(
        data, 'src="(/file_thumbview_approve/%s.*?)"' % id)
    info['views'] = ox.find_re(data, r'<tr><td>Views:</td><td>(\d+)</td>')

    collections = strip_tags(
        ox.find_re(data, r'<td>Collections:</td><td>(.*?)</td>')).split(', ')
    # A list comprehension (rather than filter) keeps this a real list on
    # both Python 2 and Python 3; blank entries are dropped.
    info['collections'] = [name for name in collections if name.strip()]

    # Keywords come from the <meta name="keywords"> tag. An earlier scrape of
    # the truncated "Keywords:" table cell was always overwritten by this
    # value, so that dead computation has been removed.
    info['keywords'] = ox.find_re(
        data, r'<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'].sort()

    info['uploaded'] = ox.find_re(data, r'<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = ox.find_re(data, r'<span class="fl">.*?(\d+)&nbsp;</span>')
    info['contributor'] = ox.find_re(
        data,
        r'<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')

    description = strip_tags(
        ox.find_re(data, r'artistsDescriptionData = \["(.*?)<br'))
    # Keep only the text before any "CLICK TO SEE" marker in the description.
    info['description'] = description.split('CLICK TO SEE')[0].strip()

    info['similar'] = re.compile(r'size=1\&id=(\d+)').findall(data)
    return info

def get_collection_ids(collection, timeout=-1):
    """Collect the item ids listed on every page of a browse collection.

    :param collection: path segment below ``/browse/`` (e.g. ``'vetta'`` or
        ``'latest/photo'``).
    :param timeout: passed through unchanged to ``ox.cache.read_url``.
    :returns: list of id strings taken from ``/stock-photo-<id>.php`` links,
        in page order; duplicates are not removed.
    """
    # Compiled once and reused for every page; the dot before "php" is
    # escaped so it only matches a literal ".php" suffix.
    item_re = re.compile(r'<a href="/stock-photo-(.*?)\.php">')

    url = "http://www.istockphoto.com/browse/%s/" % (collection)
    data = ox.cache.read_url(url, timeout=timeout)
    ids = item_re.findall(data)

    pages = re.compile(r'class="paginatorLink">(\d+)</a>').findall(data)
    if pages:
        last_page = max(int(number) for number in pages)
        # The first page was fetched above; range() excludes its stop value,
        # so +1 is needed for the highest-numbered page to be fetched too.
        for page in range(2, last_page + 1):
            url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
            data = ox.cache.read_url(url, timeout=timeout)
            ids += item_re.findall(data)
    return ids

def get_ids():
    """Return the item ids from a fixed set of browse collections.

    The ids of each collection are fetched with ``get_collection_ids`` and
    concatenated in order; duplicates across collections are not removed.

    :returns: list of id strings.
    """
    collections = ('vetta', 'agency', 'dollarbin', 'latest/photo',
                   'latest/photo/exclusive')
    ids = []
    for collection in collections:
        # Previously called the undefined name getCollectionIds, which raised
        # NameError; the helper in this module is get_collection_ids.
        ids += get_collection_ids(collection)
    return ids