python-ox/ox/web/epguides.py

51 lines
1.6 KiB
Python
Raw Permalink Normal View History

2010-07-07 23:25:57 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
2014-09-30 19:27:26 +00:00
from __future__ import print_function
2010-07-07 23:25:57 +00:00
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url
2010-07-07 23:25:57 +00:00
2016-06-08 13:32:46 +00:00
from . import google
2010-07-07 23:25:57 +00:00
2012-08-15 15:15:40 +00:00
def get_show_url(title):
    '''
    Search Epguide Url for Show via Show Title.
    Use Google to search the url, this is also done on Epguide.

    Returns the url of the first search result that starts with
    'http://epguides.com' and whose result name contains title,
    or None if no result qualifies.
    '''
    # Match the title literally: show titles routinely contain regex
    # metacharacters ("(", "+", "?", "[" ...) which would otherwise change
    # the meaning of the pattern or raise re.error.
    title_pattern = re.escape(title)
    query = 'allintitle: site:epguides.com %s' % title
    # NOTE(review): the second argument to google.find is presumably the
    # maximum number of results -- confirm against ox.web.google.
    for (name, url, desc) in google.find(query, 1):
        if url.startswith('http://epguides.com') and re.search(title_pattern, name):
            return url
    return None
2012-08-15 15:15:40 +00:00
def get_show_data(url):
    '''
    Read an Epguide show page and extract title, imdb id and episode list.

    Returns a dict with the keys:
      'title'    -- strip_tags() applied to what find_re extracts for
                    '<h1>(.*?)</h1>'
      'imdb'     -- what find_re extracts after '/title/tt' in the <h1> link
      'episodes' -- dict keyed by 'SxxEyy'; each value holds 'prod code',
                    'air date', 'url' and 'title' of one episode row

    Air dates that look like '22 Sep 04' are normalised to '2004-09-22';
    anything else is kept as the stripped raw text. Rows whose season or
    episode number is not an integer are reported with print() and skipped.
    '''
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, r'<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    # Example row:
    #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
    episode_re = re.compile(r'(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>')
    for episode in episode_re.findall(data):
        # Groups: running number, "season-episode", production code,
        # air date, episode url, episode title.
        _, season_episode, prod_code, air_date, episode_url, episode_title = episode
        air_date = air_date.strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except ValueError:
            # Not in the expected format: keep the raw text (best effort).
            pass
        parts = season_episode.split('-')
        s = parts[0].strip()
        e = parts[-1].strip()
        try:
            key = 'S%02dE%02d' % (int(s), int(e))
        except ValueError:
            # Season/episode did not parse as integers; report and move on.
            print("oxweb.epguides failed,", url)
            continue
        r['episodes'][key] = {
            'prod code': prod_code,
            'air date': air_date,
            'url': episode_url,
            'title': episode_title,
        }
    return r