openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/istockphoto.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import ox
from ox import strip_tags, find_re


def get_data(id):
    """Scrape metadata for one iStockphoto stock-photo page.

    ``id`` is interpolated as-is into the page URL
    (``/stock-photo-<id>.php``); only its first run of digits is used to
    locate the thumbnail path inside the page.

    Returns a dict with the keys ``title``, ``thumbnail``, ``views``,
    ``collections`` (list), ``keywords`` (sorted list), ``uploaded``,
    ``downloads``, ``contributor``, ``description`` and ``similar``
    (list of ids captured from ``size=1&id=<digits>`` links).

    NOTE(review): values come straight from ``ox.find_re``; presumably it
    yields an empty string when a pattern does not match - confirm
    against the ox library.
    """
    base = 'http://www.istockphoto.com'
    url = base + '/stock-photo-%s.php' % id
    # Use a separate local instead of rebinding the ``id`` parameter.
    numeric_id = find_re(id, r'\d+')
    # NOTE(review): timeout=-1 presumably means "never expire the cached
    # copy" - confirm against ox.cache.read_url.
    data = ox.cache.read_url(url, timeout=-1)

    info = {}
    info['title'] = ox.find_re(data, r'<title>(.*?) \|')
    info['thumbnail'] = base + ox.find_re(
        data, 'src="(/file_thumbview_approve/%s.*?)"' % numeric_id)
    info['views'] = ox.find_re(data, r'<tr><td>Views:</td><td>(\d+)</td>')

    collections = strip_tags(
        ox.find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    # Drop blank entries; a list comprehension yields a real list on both
    # Python 2 and Python 3 (``filter`` is lazy on Python 3).
    info['collections'] = [name for name in collections if name.strip()]

    # Keywords are taken from the <meta name="keywords"> tag. The earlier
    # table-based extraction was overwritten immediately and has been removed.
    info['keywords'] = ox.find_re(
        data, '<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'].sort()

    info['uploaded'] = ox.find_re(data, r'<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = ox.find_re(data, r'<span class="fl">.*?(\d+)&nbsp;</span>')
    info['contributor'] = ox.find_re(
        data,
        r'<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')

    description = strip_tags(
        ox.find_re(data, r'artistsDescriptionData = \["(.*?)<br'))
    # Keep only the text before the first "CLICK TO SEE" marker.
    info['description'] = description.split('CLICK TO SEE')[0].strip()

    info['similar'] = re.compile(r'size=1\&id=(\d+)').findall(data)
    return info

def get_collection_ids(collection, timeout=-1):
    """Collect the stock-photo ids linked from a browse collection.

    Fetches ``/browse/<collection>/`` and, when paginator links are
    present, every following page up to and including the highest page
    number found in those links.

    ``collection`` is the path segment after ``/browse/`` (it may contain
    slashes); ``timeout`` is passed through to ``ox.cache.read_url``.

    Returns a list with the text captured between ``/stock-photo-`` and
    ``.php`` in each matching link, in page order; duplicates are kept.
    """
    # Compile once and reuse for every page; the dot before "php" is
    # escaped so it only matches a literal ".php" suffix.
    photo_link = re.compile(r'<a href="/stock-photo-(.*?)\.php">')

    first_url = "http://www.istockphoto.com/browse/%s/" % (collection)
    data = ox.cache.read_url(first_url, timeout=timeout)
    ids = photo_link.findall(data)

    pages = re.compile(r'class="paginatorLink">(\d+)</a>').findall(data)
    if pages:
        last_page = max(map(int, pages))
        # range() excludes its stop value, so add 1 to fetch the last page
        # as well (the original loop stopped one page short).
        for page in range(2, last_page + 1):
            url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)
            data = ox.cache.read_url(url, timeout=timeout)
            ids += photo_link.findall(data)
    return ids

def get_ids():
    """Return the photo ids gathered from a fixed set of browse collections.

    Calls ``get_collection_ids`` for each collection in turn and
    concatenates the results in that order; duplicates are not removed.
    """
    collections = ('vetta', 'agency', 'dollarbin',
                   'latest/photo', 'latest/photo/exclusive')
    ids = []
    for collection in collections:
        # Was ``getCollectionIds``, a name that does not exist in this
        # module and raised NameError on the first iteration.
        ids += get_collection_ids(collection)
    return ids
Open Media Library Platform 2013-10-11 17:28:32 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`

			`import ox`
			`from ox import strip_tags, find_re`


			`def get_data(id):`
			`base = 'http://www.istockphoto.com'`
			`url = base + '/stock-photo-%s.php' % id`
			`id = find_re(id, '\d+')`
			`data = ox.cache.read_url(url, timeout=-1)`
			`info = {}`
			`info['title'] = ox.find_re(data, '<title>(.*?) \|')`
			`info['thumbnail'] = base + ox.find_re(data, 'src="(/file_thumbview_approve/%s.*?)"'%id)`
			`info['views'] = ox.find_re(data, '<tr><td>Views:</td><td>(\d+)</td>')`
			`info['collections'] = strip_tags(ox.find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')`
			`info['collections'] = filter(lambda x: x.strip(), info['collections'])`
			`info['keywords'] = map(lambda k: k.strip(), strip_tags(ox.find_re(data, '<td>Keywords:</td>.*?<td>(.*?)\.\.\.<')).split(', '))`
			`info['keywords'] = ox.find_re(data, '<meta name="keywords" content="(.*?), stock image').split(', ')`
			`info['keywords'].sort()`
			`info['uploaded'] = ox.find_re(data, '<td>Uploaded on:</td>.*?<td>([\d\-]+)')`
			`info['downloads'] = ox.find_re(data, '<span class="fl">.*?(\d+)&nbsp;</span>')`
			`info['contributor'] = ox.find_re(data, '<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')`
			`info['description'] = strip_tags(ox.find_re(data, 'artistsDescriptionData = \["(.*?)<br'))`
			`info['description'] = info['description'].split('CLICK TO SEE')[0].strip()`
			`info['similar'] = re.compile('size=1\&id=(\d+)').findall(data)`
			`return info`

			`def get_collection_ids(collection, timeout=-1):`
			`url = "http://www.istockphoto.com/browse/%s/" % (collection)`
			`data = ox.cache.read_url(url, timeout=timeout)`
			`ids = []`
			`ids += re.compile('<a href="/stock-photo-(.*?).php">').findall(data)`
			`pages = re.compile('class="paginatorLink">(\d+)</a>').findall(data)`
			`if pages:`
			`for page in range(2, max(map(int, pages))):`
			`url = "http://www.istockphoto.com/browse/%s/%s" % (collection, page)`
			`data = ox.cache.read_url(url, timeout=timeout)`
			`ids += re.compile('<a href="/stock-photo-(.*?).php">').findall(data)`
			`return ids`

			`def get_ids():`
			`ids = []`
			`for collection in ('vetta', 'agency', 'dollarbin', 'latest/photo', 'latest/photo/exclusive'):`
			`ids += getCollectionIds(collection)`
			`return ids`