distributed_scraper/client.py

#!/usr/bin/python
# encoding: utf-8
# vi:si:et:sw=4:sts=4:ts=4
import json
import time
from urllib import quote

import ox
import requests
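
# Queue protocol (assumed interface, inferred from the requests made below):
#   POST {url}/get  with {'type': ..., 'client': ...}
#       -> JSON; contains 'url' while there is work queued for this type
#   PUT  {url}/save?url=...&client=...&type=...  with the fetched page as body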


class Client(object):
    def __init__(self, url, name, type):
        self.url = url
        self.name = name
        self.type = type

    def next(self):
        # ask the server for the next url queued for our type
        url = '%s/get' % self.url
        r = requests.post(url, {
            'type': self.type,
            'client': self.name
        })
        data = json.loads(r.content)
        if 'url' in data:
            print data['url']
            #result = ox.net.read_url(data['url'])
            result = ox.cache.read_url(data['url'])
            put_url = '%s/save?url=%s&client=%s&type=%s' % (
                self.url, quote(data['url']), quote(self.name), quote(self.type))
            # send the fetched page back, not the server's response dict
            r = requests.put(put_url, result)
            assert r.status_code == 200
            return True
        return False

    def run(self):
        delay = 10
        new = True
        while True:
            if not self.next():
                # print the idle message only once per idle period
                if new:
                    new = False
                    print "currently no more urls to fetch, polling again in %d seconds" % delay
                time.sleep(delay)
            else:
                new = True


if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print 'usage: %s <server url> <client name> <type>' % sys.argv[0]
        sys.exit(1)
    url = sys.argv[1]
    name = sys.argv[2]
    type = sys.argv[3]
    print 'processing "%s" urls as "%s"' % (type, name)
    c = Client(url, name, type)
    c.run()
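
# Example invocation (server address and names here are hypothetical):
#   ./client.py http://127.0.0.1:8000 worker-1 imdb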