- no unicode decoding when collecting ids, to speed up get_ids
- one function to get ids: get_ids() and get_ids(page=N)
This commit is contained in:
parent
62f5e84642
commit
21dd1c7903
1 changed file with 15 additions and 19 deletions
|
@ -75,30 +75,26 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||||
data['director'], data['year'], timeout=timeout)
|
data['director'], data['year'], timeout=timeout)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_ids():
|
def get_ids(page=None):
|
||||||
|
|
||||||
ids = []
|
ids = []
|
||||||
|
if page:
|
||||||
|
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||||
|
html = read_url(url)
|
||||||
|
results = re.compile("films/(\d+)").findall(html)
|
||||||
|
ids += results
|
||||||
|
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||||
|
for result in results:
|
||||||
|
html = read_url("http://www.criterion.com/boxsets/" + result)
|
||||||
|
results = re.compile("films/(\d+)").findall(html)
|
||||||
|
ids += results
|
||||||
|
return set(ids)
|
||||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||||
pages = max(map(int, results))
|
pages = max(map(int, results))
|
||||||
for page in range(1, pages):
|
for page in range(1, pages):
|
||||||
for id in get_idsByPage(page):
|
ids += get_ids(page)
|
||||||
ids.append(id)
|
return sorted(set(ids), key=int)
|
||||||
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
|
|
||||||
|
|
||||||
def get_idsByPage(page):
|
|
||||||
ids = []
|
|
||||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
|
||||||
html = read_url(url, unicode=True)
|
|
||||||
results = re.compile("films/(\d+)").findall(html)
|
|
||||||
for result in results:
|
|
||||||
ids.append(result)
|
|
||||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
|
||||||
for result in results:
|
|
||||||
html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
|
|
||||||
results = re.compile("films/(\d+)").findall(html)
|
|
||||||
for result in results:
|
|
||||||
ids.append(result)
|
|
||||||
return set(ids)
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print get_ids()
|
print get_ids()
|
||||||
|
|
Loading…
Reference in a new issue