Clean up pylint errors and Python 2/3 compatibility issues

This commit is contained in:
j 2016-06-08 15:32:46 +02:00
parent 4e7898ae57
commit 77f8876fca
20 changed files with 232 additions and 197 deletions

View file

@ -101,7 +101,7 @@ class API(object):
result = result.decode('utf-8') result = result.decode('utf-8')
result = json.loads(result) result = json.loads(result)
except: except:
result = {'status':{}} result = {'status': {}}
result['status']['code'] = e.code result['status']['code'] = e.code
result['status']['text'] = str(e) result['status']['text'] = str(e)
return result return result

View file

@ -131,25 +131,25 @@ def oshash(filename, cached=True):
if filesize < 65536: if filesize < 65536:
for x in range(int(filesize/bytesize)): for x in range(int(filesize/bytesize)):
buffer = f.read(bytesize) buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer) (l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
else: else:
for x in range(int(65536/bytesize)): for x in range(int(65536/bytesize)):
buffer = f.read(bytesize) buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer) (l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
f.seek(max(0,filesize-65536),0) f.seek(max(0, filesize-65536), 0)
for x in range(int(65536/bytesize)): for x in range(int(65536/bytesize)):
buffer = f.read(bytesize) buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer) (l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF hash = hash & 0xFFFFFFFFFFFFFFFF
f.close() f.close()
returnedhash = "%016x" % hash returnedhash = "%016x" % hash
return returnedhash return returnedhash
except(IOError): except IOError:
return "IOError" return "IOError"
def avinfo(filename, cached=True): def avinfo(filename, cached=True):
@ -160,23 +160,25 @@ def avinfo(filename, cached=True):
return ffprobe(filename) return ffprobe(filename)
ffmpeg2theora = cmd('ffmpeg2theora') ffmpeg2theora = cmd('ffmpeg2theora')
p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE) p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate() stdout, error = p.communicate()
version = info.split('\n')[0].split(' - ')[0].split(' ')[-1] stdout = stdout.decode('utf-8')
version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1]
if version < '0.27': if version < '0.27':
raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version) raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
p = subprocess.Popen([ffmpeg2theora, '--info', filename], p = subprocess.Popen([ffmpeg2theora, '--info', filename],
stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate() stdout, error = p.communicate()
try: try:
info = json.loads(info) info = json.loads(stdout)
except: except:
#remove metadata, can be broken # remove metadata, can be broken
stdout = stdout.decode('utf-8')
reg = re.compile('"metadata": {.*?},', re.DOTALL) reg = re.compile('"metadata": {.*?},', re.DOTALL)
info = re.sub(reg, '', info) stdout = re.sub(reg, '', stdout)
info = json.loads(info) info = json.loads(stdout)
if 'video' in info: if 'video' in info:
for v in info['video']: for v in info['video']:
if not 'display_aspect_ratio' in v and 'width' in v: if 'display_aspect_ratio' not in v and 'width' in v:
v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height']) v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1' v['pixel_aspect_ratio'] = '1:1'
if len(info.get('audio', [])) > 1: if len(info.get('audio', [])) > 1:
@ -189,6 +191,7 @@ def avinfo(filename, cached=True):
ffmpeg = cmd('ffmpeg') ffmpeg = cmd('ffmpeg')
p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate() stdout, stderr = p.communicate()
stderr = stderr.decode('utf-8')
languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l] languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
if len(languages) == len(info['audio']): if len(languages) == len(info['audio']):
for i, stream in enumerate(info['audio']): for i, stream in enumerate(info['audio']):
@ -278,16 +281,16 @@ def ffprobe(filename):
info[s['codec_type']].append(stream) info[s['codec_type']].append(stream)
else: else:
pass pass
#print s # print s
for v in info['video']: for v in info['video']:
k = 'display_aspect_ratio' k = 'display_aspect_ratio'
if not k in v and 'width' in v \ if k not in v and 'width' in v \
or (k in v and v[k] == '0:1'): or (k in v and v[k] == '0:1'):
v[k] = '%d:%d' % (v['width'], v['height']) v[k] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1' v['pixel_aspect_ratio'] = '1:1'
info['oshash'] = oshash(filename) info['oshash'] = oshash(filename)
info['path'] = filename info['path'] = filename
if not 'size' in info: if 'size' not in info:
info['size'] = os.path.getsize(filename) info['size'] = os.path.getsize(filename)
return info return info

View file

@ -6,7 +6,7 @@ from __future__ import print_function
import unicodedata import unicodedata
from six import unichr, PY2 from six import unichr, text_type
__all__ = ['fix_bad_unicode'] __all__ = ['fix_bad_unicode']
@ -151,10 +151,7 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬ - Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts - Letters in somewhat rare scripts
''' '''
if PY2: assert isinstance(text, text_type)
assert isinstance(text, unicode)
else:
assert isinstance(text, str)
errors = 0 errors = 0
very_weird_things = 0 very_weird_things = 0
weird_things = 0 weird_things = 0

View file

@ -4,6 +4,7 @@ import math
import re import re
import string import string
from six import text_type
def toAZ(num): def toAZ(num):
""" """
@ -20,7 +21,8 @@ def toAZ(num):
>>> toAZ(1234567890) >>> toAZ(1234567890)
'CYWOQVJ' 'CYWOQVJ'
""" """
if num < 1: raise ValueError("must supply a positive integer") if num < 1:
raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase digits = string.ascii_uppercase
az = '' az = ''
while num != 0: while num != 0:
@ -30,7 +32,7 @@ def toAZ(num):
az = digits[r] + az az = digits[r] + az
return az return az
encode_base26=toAZ encode_base26 = toAZ
def fromAZ(num): def fromAZ(num):
""" """
@ -45,7 +47,7 @@ def fromAZ(num):
>>> fromAZ('FOO') >>> fromAZ('FOO')
4461 4461
""" """
num = num.replace('-','') num = num.replace('-', '')
digits = string.ascii_uppercase digits = string.ascii_uppercase
r = 0 r = 0
for exp, char in enumerate(reversed(num)): for exp, char in enumerate(reversed(num)):
@ -64,7 +66,8 @@ def to26(q):
>>> to26(347485647) >>> to26(347485647)
'BDGKMAP' 'BDGKMAP'
""" """
if q < 0: raise ValueError("must supply a positive integer") if q < 0:
raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase base26 = string.ascii_uppercase
converted = [] converted = []
while q != 0: while q != 0:
@ -73,7 +76,7 @@ def to26(q):
converted.insert(0, l) converted.insert(0, l)
return "".join(converted) or 'A' return "".join(converted) or 'A'
decode_base26=toAZ decode_base26 = toAZ
def from26(q): def from26(q):
""" """
@ -82,7 +85,7 @@ def from26(q):
0 0
""" """
base26 = string.ascii_uppercase base26 = string.ascii_uppercase
q = q.replace('-','') q = q.replace('-', '')
r = 0 r = 0
for i in q: for i in q:
r = r * 26 + base26.index(i.upper()) r = r * 26 + base26.index(i.upper())
@ -123,7 +126,8 @@ def to32(q):
ValueError: must supply a positive integer ValueError: must supply a positive integer
""" """
if q < 0: raise ValueError("must supply a positive integer") if q < 0:
raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = [] converted = []
while q != 0: while q != 0:
@ -188,7 +192,7 @@ def from32(q):
'Z': 31, 'Z': 31,
} }
base32 = ('0123456789' + string.ascii_uppercase)[:32] base32 = ('0123456789' + string.ascii_uppercase)[:32]
q = q.replace('-','') q = q.replace('-', '')
q = ''.join([base32[_32map[i.upper()]] for i in q]) q = ''.join([base32[_32map[i.upper()]] for i in q])
return int(q, 32) return int(q, 32)
@ -210,7 +214,8 @@ def to36(q):
... ...
ValueError: must supply a positive integer ValueError: must supply a positive integer
""" """
if q < 0: raise ValueError("must supply a positive integer") if q < 0:
raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz" letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = [] converted = []
while q != 0: while q != 0:
@ -233,7 +238,7 @@ def int_value(strValue, default=u''):
u'' u''
""" """
try: try:
val = re.compile('(\d+)').findall(unicode(strValue).strip())[0] val = re.compile('(\d+)').findall(text_type(strValue).strip())[0]
except: except:
val = default val = default
return val return val
@ -250,7 +255,7 @@ def float_value(strValue, default=u''):
u'' u''
""" """
try: try:
val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0] val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0]
except: except:
val = default val = default
return val return val
@ -286,7 +291,7 @@ def format_number(number, longName, shortName):
n = number / math.pow(1024, i + 1) n = number / math.pow(1024, i + 1)
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName) return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
def format_thousands(number, separator = ','): def format_thousands(number, separator=','):
""" """
Return the number with separators (1,000,000) Return the number with separators (1,000,000)
@ -318,13 +323,13 @@ def format_pixels(number):
def format_currency(amount, currency="$"): def format_currency(amount, currency="$"):
if amount: if amount:
temp = "%.2f" % amount temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])") profile = re.compile(r"(\d)(\d\d\d[.,])")
while 1: while 1:
temp, count = re.subn(profile,r"\1,\2",temp) temp, count = re.subn(profile, r"\1,\2", temp)
if not count: if not count:
break break
if temp.startswith('-'): if temp.startswith('-'):
return "-"+ currency + temp[1:-3] return "-" + currency + temp[1:-3]
return currency + temp[:-3] return currency + temp[:-3]
else: else:
return "" return ""
@ -339,7 +344,8 @@ def plural(amount, unit, plural='s'):
if abs(amount) != 1: if abs(amount) != 1:
if plural == 's': if plural == 's':
unit = unit + plural unit = unit + plural
else: unit = plural else:
unit = plural
return "%s %s" % (format_thousands(amount), unit) return "%s %s" % (format_thousands(amount), unit)
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True): def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
@ -396,7 +402,7 @@ def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
if milliseconds: if milliseconds:
durations.append("%sms" % ms) durations.append("%sms" % ms)
else: else:
durations = [plural(d, 'day'), plural(h,'hour'), durations = [plural(d, 'day'), plural(h, 'hour'),
plural(m, 'minute'), plural(s, 'second')] plural(m, 'minute'), plural(s, 'second')]
if years: if years:
durations.insert(0, plural(y, 'year')) durations.insert(0, plural(y, 'year'))
@ -434,7 +440,7 @@ def parse_timecode(string):
''' '''
timecode = 0 timecode = 0
for i, v in enumerate(list(reversed(string.split(':')))[:4]): for i, v in enumerate(list(reversed(string.split(':')))[:4]):
timecode += float(v) * ( 86400 if i == 3 else pow(60, i)) timecode += float(v) * (86400 if i == 3 else pow(60, i))
return timecode return timecode
def ms2runtime(ms, shortenLong=False): def ms2runtime(ms, shortenLong=False):
@ -482,7 +488,8 @@ def time2ms(timeString):
p = timeString.split(':') p = timeString.split(':')
for i in range(len(p)): for i in range(len(p)):
_p = p[i] _p = p[i]
if _p.endswith('.'): _p =_p[:-1] if _p.endswith('.'):
_p = _p[:-1]
ms = ms * 60 + float(_p) ms = ms * 60 + float(_p)
return int(ms * 1000) return int(ms * 1000)

View file

@ -18,8 +18,8 @@ DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)') word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \ punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), '|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
@ -83,20 +83,22 @@ def add_links(text, trim_url_limit=None, nofollow=False):
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
""" """
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
words = word_split_re.split(text) words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or '' nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words): for i, word in enumerate(words):
match = punctuation_re.match(word) match = punctuation_re.match(word)
if match: if match:
lead, middle, trail = match.groups() lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \ if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and
len(middle) > 0 and middle[0] in letters + string.digits and \ len(middle) > 0 and middle[0] in letters + string.digits and
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): (middle.endswith('.org') or
middle.endswith('.net') or
middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle)) middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'): if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle)) middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \ if '@' in middle and not middle.startswith('www.') and ':' not in middle \
and simple_email_re.match(middle): and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle) middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word: if lead + middle + trail != word:
@ -127,6 +129,7 @@ def clean_html(text):
# Trim stupid HTML such as <br clear="all">. # Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text) text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists. # Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match): def replace_p_tags(match):
s = match.group().replace('</p>', '</li>') s = match.group().replace('</p>', '</li>')
for d in DOTS: for d in DOTS:
@ -153,6 +156,7 @@ def decode_html(html):
if isinstance(html, bytes): if isinstance(html, bytes):
html = html.decode('utf-8') html = html.decode('utf-8')
uchr = unichr uchr = unichr
def entitydecode(match, uchr=uchr): def entitydecode(match, uchr=uchr):
entity = match.group(1) entity = match.group(1)
if entity == '#x80': if entity == '#x80':
@ -328,15 +332,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
for tag in tags: for tag in tags:
valid_attributes[tag['name']] = tag.get('required', []) \ valid_attributes[tag['name']] = tag.get('required', []) \
+ tag.get('optional', []) \ + tag.get('optional', []) + global_attributes
+ global_attributes
required_attributes[tag['name']] = tag.get('required', []) required_attributes[tag['name']] = tag.get('required', [])
validation[tag['name']] = tag.get('validation', {}) validation[tag['name']] = tag.get('validation', {})
if '[]' in validation: if '[]' in validation:
html = re.sub( html = re.sub(
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
'<a href="\\1">\\3</a>', html); '<a href="\\1">\\3</a>', html)
parts = split_tags(html) parts = split_tags(html)
for i, part in enumerate(parts): for i, part in enumerate(parts):
@ -351,7 +354,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
a = attr_re.findall(attributes) a = attr_re.findall(attributes)
attrs = dict(a) attrs = dict(a)
if not closing and not name in non_closing_tags: if not closing and name not in non_closing_tags:
level += 1 level += 1
if not attrs and attributes or name not in valid_tags: if not attrs and attributes or name not in valid_tags:
@ -361,7 +364,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
for key in set(attrs) - set(valid_attributes[name]): for key in set(attrs) - set(valid_attributes[name]):
del attrs[key] del attrs[key]
for key in required_attributes[tag['name']]: for key in required_attributes[tag['name']]:
if not key in attrs: if key not in attrs:
valid = False valid = False
if valid: if valid:
@ -395,6 +398,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
def split_tags(string): def split_tags(string):
tags = [] tags = []
def collect(match): def collect(match):
tags.append(match.group(0)) tags.append(match.group(0))
return '\0' return '\0'

View file

@ -208,7 +208,7 @@ def langTo3Code(lang):
if lang: if lang:
lang = langEnglishName(lang) lang = langEnglishName(lang)
if lang: if lang:
lang=lang.lower() lang = lang.lower()
for l in _iso639_languages: for l in _iso639_languages:
if l[0].lower() == lang: if l[0].lower() == lang:
return l[3] return l[3]
@ -218,7 +218,7 @@ def langTo2Code(lang):
if lang: if lang:
lang = langEnglishName(lang) lang = langEnglishName(lang)
if lang: if lang:
lang=lang.lower() lang = lang.lower()
for l in _iso639_languages: for l in _iso639_languages:
if l[0].lower() == lang: if l[0].lower() == lang:
return l[2] return l[2]

View file

@ -11,9 +11,9 @@ def minify(source, comment=''):
pass pass
# python2 performance with unicode string is terrible # python2 performance with unicode string is terrible
if PY2: if PY2:
if isinstance(source, unicode): if isinstance(source, unicode): # pylint: disable=undefined-variable
source = source.encode('utf-8') source = source.encode('utf-8')
if isinstance(comment, unicode): if isinstance(comment, unicode): # pylint: disable=undefined-variable
comment = comment.encode('utf-8') comment = comment.encode('utf-8')
tokens = tokenize(source) tokens = tokenize(source)
length = len(tokens) length = len(tokens)
@ -30,20 +30,20 @@ def minify(source, comment=''):
# numbers or strings or unary operators or grouping operators # numbers or strings or unary operators or grouping operators
# with a single newline, otherwise remove it # with a single newline, otherwise remove it
if prevToken and nextToken\ if prevToken and nextToken\
and (prevToken['type'] in ['identifier', 'number', 'string']\ and (prevToken['type'] in ['identifier', 'number', 'string']
or prevToken['value'] in ['++', '--', ')', ']', '}'])\ or prevToken['value'] in ['++', '--', ')', ']', '}']) \
and (nextToken['type'] in ['identifier', 'number', 'string']\ and (nextToken['type'] in ['identifier', 'number', 'string']
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']): or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
minified += '\n' minified += '\n'
elif token['type'] == 'whitespace': elif token['type'] == 'whitespace':
# replace whitespace between two tokens that are identifiers or # replace whitespace between two tokens that are identifiers or
# numbers, or between a token that ends with "+" or "-" and one that # numbers, or between a token that ends with "+" or "-" and one that
# begins with "+" or "-", with a single space, otherwise remove it # begins with "+" or "-", with a single space, otherwise remove it
if prevToken and nextToken\ if prevToken and nextToken \
and ((prevToken['type'] in ['identifier', 'number']\ and ((prevToken['type'] in ['identifier', 'number'] and
and nextToken['type'] in ['identifier', 'number']) nextToken['type'] in ['identifier', 'number']) or
or (prevToken['value'] in ['+', '-', '++', '--'] (prevToken['value'] in ['+', '-', '++', '--'] and
and nextToken['value'] in ['+', '-', '++', '--'])): nextToken['value'] in ['+', '-', '++', '--'])):
minified += ' ' minified += ' '
elif token['type'] != 'comment': elif token['type'] != 'comment':
# remove comments and leave all other tokens untouched # remove comments and leave all other tokens untouched
@ -178,7 +178,7 @@ def tokenize(source):
'value': value 'value': value
}) })
if type == 'comment': if type == 'comment':
lines = value.split('\n'); lines = value.split('\n')
column = len(lines[-1]) column = len(lines[-1])
line += len(lines) - 1 line += len(lines) - 1
elif type == 'linebreak': elif type == 'linebreak':

View file

@ -23,11 +23,11 @@ def loads(source):
try: try:
m = re.search(r'line (\d+) column (\d+)', msg) m = re.search(r'line (\d+) column (\d+)', msg)
if m: if m:
(lineno, colno) = map(int, m.groups()) (lineno, colno) = [int(n) for n in m.groups()]
except: except:
pass pass
if lineno and colno: if lineno and colno:
s = minified.split('\n') s = minified.split('\n')
context = s[lineno-1][max(0, colno-30):colno+30] context = s[lineno-1][max(0, colno-30):colno+30]
msg += ' at:\n\n %s\n %s\033[1m^\033[0m' %(context, ' ' * (colno - max(0, colno-30) - 2)) msg += ' at:\n\n %s\n %s\033[1m^\033[0m' % (context, ' ' * (colno - max(0, colno-30) - 2))
raise ValueError(msg) raise ValueError(msg)

View file

@ -18,7 +18,8 @@ _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
_articlesDict = dict([(x, x) for x in _articles]) _articlesDict = dict([(x, x) for x in _articles])
_spArticles = [] _spArticles = []
for article in _articles: for article in _articles:
if article[-1] not in ("'", '-'): article += ' ' if article[-1] not in ("'", '-'):
article += ' '
_spArticles.append(article) _spArticles.append(article)
_noarticles = ( _noarticles = (
@ -50,8 +51,10 @@ def canonical_title(title):
'Los Angeles Plays Itself' 'Los Angeles Plays Itself'
""" """
try: try:
if _articlesDict.has_key(title.split(', ')[-1].lower()): return title if title.split(', ')[-1].lower() in _articlesDict:
except IndexError: pass return title
except IndexError:
pass
ltitle = title.lower() ltitle = title.lower()
for start in _noarticles: for start in _noarticles:
if ltitle.startswith(start): if ltitle.startswith(start):
@ -60,7 +63,8 @@ def canonical_title(title):
if ltitle.startswith(article): if ltitle.startswith(article):
lart = len(article) lart = len(article)
title = '%s, %s' % (title[lart:], title[:lart]) title = '%s, %s' % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1] if article[-1] == ' ':
title = title[:-1]
break break
## XXX: an attempt using a dictionary lookup. ## XXX: an attempt using a dictionary lookup.
##for artSeparator in (' ', "'", '-'): ##for artSeparator in (' ', "'", '-'):
@ -82,9 +86,10 @@ def normalize_title(title):
'The Movie Title' 'The Movie Title'
""" """
stitle = title.split(', ') stitle = title.split(', ')
if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()): if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
sep = ' ' sep = ' '
if stitle[-1][-1] in ("'", '-'): sep = '' if stitle[-1][-1] in ("'", '-'):
sep = ''
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1])) title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title return title
@ -139,7 +144,8 @@ def canonical_name(name):
# Don't convert names already in the canonical format. # Don't convert names already in the canonical format.
if name in ('Unknown Director', ): if name in ('Unknown Director', ):
return name return name
if name.find(', ') != -1: return name if name.find(', ') != -1:
return name
sname = name.split(' ') sname = name.split(' ')
snl = len(sname) snl = len(sname)
if snl == 2: if snl == 2:
@ -147,11 +153,14 @@ def canonical_name(name):
name = '%s, %s' % (sname[1], sname[0]) name = '%s, %s' % (sname[1], sname[0])
elif snl > 2: elif snl > 2:
lsname = [x.lower() for x in sname] lsname = [x.lower() for x in sname]
if snl == 3: _indexes = (0, snl-2) if snl == 3:
else: _indexes = (0, snl-2, snl-3) _indexes = (0, snl-2)
else:
_indexes = (0, snl-2, snl-3)
# Check for common surname prefixes at the beginning and near the end. # Check for common surname prefixes at the beginning and near the end.
for index in _indexes: for index in _indexes:
if lsname[index] not in _sname_suffixes: continue if lsname[index] not in _sname_suffixes:
continue
try: try:
# Build the surname. # Build the surname.
surn = '%s %s' % (sname[index], sname[index+1]) surn = '%s %s' % (sname[index], sname[index+1])
@ -194,11 +203,12 @@ def normalize_name(name):
def normalize_path(path): def normalize_path(path):
path = path.replace(':', '_').replace('/', '_') path = path.replace(':', '_').replace('/', '_')
if path.endswith('.'): path = path[:-1] + '_' if path.endswith('.'):
path = path[:-1] + '_'
return path return path
def strip_accents(s): def strip_accents(s):
if isinstance(s, str): if isinstance(s, str):
s = unicode(s) s = s.decode('utf-8')
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))

View file

@ -6,13 +6,16 @@ from . import cache
from .text import find_re from .text import find_re
from .utils import json, ET from .utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None): def get_embed_code(url, maxwidth=None, maxheight=None):
embed = {} embed = {}
header = cache.get_headers(url) header = cache.get_headers(url)
if header.get('content-type', '').startswith('text/html'): if header.get('content-type', '').startswith('text/html'):
html = cache.read_url(url) html = cache.read_url(url)
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html)) links = re.compile('<link.*?>').findall(html)
xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html)) json_oembed = [l for l in links if 'json+oembed' in l]
xml_oembed = [l for l in links if 'xml+oembed' in l]
if json_oembed: if json_oembed:
oembed_url = find_re(json_oembed[0], 'href="(.*?)"') oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
if maxwidth: if maxwidth:
@ -21,7 +24,7 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
oembed_url += '&maxheight=%d' % maxheight oembed_url += '&maxheight=%d' % maxheight
embed = json.loads(cache.read_url(oembed_url)) embed = json.loads(cache.read_url(oembed_url))
elif xml_oembed: elif xml_oembed:
oembed_url = find_re(json_oembed[0], 'href="(.*?)"') oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
if maxwidth: if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth oembed_url += '&maxwidth=%d' % maxwidth
if maxheight: if maxheight:

View file

@ -14,8 +14,8 @@ else:
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] __all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
def create_torrent(file, url, params = {}, flag = Event(), def create_torrent(file, url, params={}, flag=Event(),
progress = lambda x: None, progress_percent = 1): progress=lambda x: None, progress_percent=1):
"Creates a torrent for a given file, using url as tracker url" "Creates a torrent for a given file, using url as tracker url"
from .makemetafile import make_meta_file from .makemetafile import make_meta_file
return make_meta_file(file, url, params, flag, progress, progress_percent) return make_meta_file(file, url, params, flag, progress, progress_percent)

View file

@ -11,7 +11,7 @@ def _decode_int(data):
""" """
data = data[1:] data = data[1:]
end = data.index(b'e') end = data.index(b'e')
return int(data[:end],10), data[end+1:] return int(data[:end], 10), data[end+1:]
def _decode_str(data): def _decode_str(data):
""" """
@ -19,9 +19,9 @@ def _decode_str(data):
return string, remaining data return string, remaining data
""" """
start = data.index(b':') start = data.index(b':')
l = int(data[:start].decode(),10) l = int(data[:start].decode(), 10)
if l <= 0: if l <= 0:
raise Exception('invalid string size: %d'%d) raise Exception('invalid string size: %d' % l)
start += 1 start += 1
ret = bytes(data[start:start+l]) ret = bytes(data[start:start+l])
data = data[start+l:] data = data[start+l:]
@ -67,45 +67,45 @@ def _decode(data):
elif ch.isdigit(): elif ch.isdigit():
return _decode_str(data) return _decode_str(data)
else: else:
raise Exception('could not deserialize data: %s'%data) raise Exception('could not deserialize data: %s' % data)
def bdecode(data): def bdecode(data):
""" """
decode a bytearray decode a bytearray
return deserialized object return deserialized object
""" """
obj , data = _decode(data) obj, data = _decode(data)
if len(data) > 0: if len(data) > 0:
raise Exception('failed to deserialize, extra data: %s'%data) raise Exception('failed to deserialize, extra data: %s' % data)
return obj return obj
def _encode_str(s,buff): def _encode_str(s, buff):
""" """
encode string to a buffer encode string to a buffer
""" """
s = bytearray(s) s = bytearray(s)
l = len(s) l = len(s)
buff.append(bytearray(str(l)+':','utf-8')) buff.append(bytearray(str(l)+':', 'utf-8'))
buff.append(s) buff.append(s)
def _encode_int(i,buff): def _encode_int(i, buff):
""" """
encode integer to a buffer encode integer to a buffer
""" """
buff.append(b'i') buff.append(b'i')
buff.append(bytearray(str(i),'ascii')) buff.append(bytearray(str(i), 'ascii'))
buff.append(b'e') buff.append(b'e')
def _encode_list(l,buff): def _encode_list(l, buff):
""" """
encode list of elements to a buffer encode list of elements to a buffer
""" """
buff.append(b'l') buff.append(b'l')
for i in l: for i in l:
_encode(i,buff) _encode(i, buff)
buff.append(b'e') buff.append(b'e')
def _encode_dict(d,buff): def _encode_dict(d, buff):
""" """
encode dict encode dict
""" """
@ -113,30 +113,30 @@ def _encode_dict(d,buff):
l = list(d.keys()) l = list(d.keys())
l.sort() l.sort()
for k in l: for k in l:
_encode(str(k),buff) _encode(str(k), buff)
_encode(d[k],buff) _encode(d[k], buff)
buff.append(b'e') buff.append(b'e')
def _encode(obj,buff): def _encode(obj, buff):
""" """
encode element obj to a buffer buff encode element obj to a buffer buff
""" """
if isinstance(obj,str): if isinstance(obj, str):
_encode_str(bytearray(obj,'utf-8'),buff) _encode_str(bytearray(obj, 'utf-8'), buff)
elif isinstance(obj,bytes): elif isinstance(obj, bytes):
_encode_str(bytearray(obj),buff) _encode_str(bytearray(obj), buff)
elif isinstance(obj,bytearray): elif isinstance(obj, bytearray):
_encode_str(obj,buff) _encode_str(obj, buff)
elif str(obj).isdigit(): elif str(obj).isdigit():
_encode_int(obj,buff) _encode_int(obj, buff)
elif isinstance(obj,list): elif isinstance(obj, list):
_encode_list(obj,buff) _encode_list(obj, buff)
elif hasattr(obj,'keys') and hasattr(obj,'values'): elif hasattr(obj, 'keys') and hasattr(obj, 'values'):
_encode_dict(obj,buff) _encode_dict(obj, buff)
elif str(obj) in ['True','False']: elif str(obj) in ['True', 'False']:
_encode_int(int(obj and '1' or '0'),buff) _encode_int(int(obj and '1' or '0'), buff)
else: else:
raise Exception('non serializable object: %s'%obj) raise Exception('non serializable object: %s' % obj)
def bencode(obj): def bencode(obj):
@ -144,7 +144,7 @@ def bencode(obj):
bencode element, return bytearray bencode element, return bytearray
""" """
buff = [] buff = []
_encode(obj,buff) _encode(obj, buff)
ret = bytearray() ret = bytearray()
for ba in buff: for ba in buff:
ret += ba ret += ba

View file

@ -2,6 +2,7 @@ from __future__ import print_function
import json import json
import re import re
from six import text_type
from ox.cache import read_url from ox.cache import read_url
HEADERS = { HEADERS = {
@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director): def get_movie_data(title, director):
if isinstance(title, unicode): if isinstance(title, text_type):
title = title.encode('utf-8') title = title.encode('utf-8')
if isinstance(director, unicode): if isinstance(director, text_type):
director = director.encode('utf-8') director = director.encode('utf-8')
data = {} data = {}
# itunes section (preferred source for link) # itunes section (preferred source for link)
@ -45,7 +46,7 @@ def get_movie_data(title, director):
results = js['results'] results = js['results']
if results: if results:
url = host + results[0]['location'] url = host + results[0]['location']
if not 'link' in data: if 'link' not in data:
data['link'] = url data['link'] = url
headers = { headers = {
'User-Agent': USER_AGENT 'User-Agent': USER_AGENT

View file

@ -7,7 +7,7 @@ import time
from ox import strip_tags, find_re from ox import strip_tags, find_re
from ox.cache import read_url from ox.cache import read_url
import google from . import google
def get_show_url(title): def get_show_url(title):

View file

@ -28,22 +28,32 @@ def get_show_url(title):
def get_data(url): def get_data(url):
data = read_url(url, unicode=True) data = read_url(url, unicode=True)
doc = document_fromstring(data) doc = document_fromstring(data)
score = filter(lambda s: s.attrib.get('property') == 'v:average', score = [s for s in doc.xpath('//span[@class="score_value"]')
doc.xpath('//span[@class="score_value"]')) if s.attrib.get('property') == 'v:average']
if score: if score:
score = int(score[0].text) score = int(score[0].text)
else: else:
score = -1 score = -1
authors = [a.text authors = [
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')] a.text
sources = [d.text for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')] ]
reviews = [d.text sources = [
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')] d.text
scores = [int(d.text.strip()) for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')] ]
urls = [a.attrib['href'] reviews = [
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')] d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')
]
scores = [
int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')
]
urls = [
a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')
]
metacritics = [] metacritics = []
for i in range(len(authors)): for i in range(len(authors)):

View file

@ -32,7 +32,7 @@ def get_data(url):
r['summary'] = get_og(data, 'description') r['summary'] = get_og(data, 'description')
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data) meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
meter = filter(lambda m: m[1].isdigit(), meter) meter = [m for m in meter if m[1].isdigit()]
if meter: if meter:
r['tomatometer'] = meter[0][1] r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>') r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')

View file

@ -95,7 +95,7 @@ def format_subsection(string):
'ussports': 'US-Sports', 'ussports': 'US-Sports',
'wunderbar': 'wunderBAR' 'wunderbar': 'wunderBAR'
} }
if subsection.has_key(string): if string in subsection:
return subsection[string].replace(u'\xc3', 'ae') return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:] return string[:1].upper() + string[1:]
@ -219,8 +219,8 @@ def archive_news():
else: else:
dMax = days[m] dMax = days[m]
for d in range(dMax, 0, -1): for d in range(dMax, 0, -1):
print('getNews(%d, %d, %d)' % (y, m, d)) print('get_news(%d, %d, %d)' % (y, m, d))
news = getNews(y, m ,d) news = get_news(y, m, d)
for new in news: for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16] dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname): if not os.path.exists(dirname):
@ -230,7 +230,7 @@ def archive_news():
else: else:
filename = dirname + '/' + new['url'] + '.json' filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True: if not os.path.exists(filename) or True:
data = json.dumps(new, ensure_ascii = False) data = json.dumps(new, ensure_ascii=False)
f = open(filename, 'w') f = open(filename, 'w')
f.write(data) f.write(data)
f.close() f.close()
@ -253,7 +253,7 @@ def archive_news():
string = strings[3] string = strings[3]
if len(strings) == 6: if len(strings) == 6:
string += '/' + strings[4] string += '/' + strings[4]
if not count.has_key(string): if string not in count:
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))} count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else: else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])} count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
@ -269,12 +269,12 @@ if __name__ == '__main__':
# spiegel = Spiegel(2008, 8) # spiegel = Spiegel(2008, 8)
# print(spiegel.getContents()) # print(spiegel.getContents())
# news = News(2001, 9, 10) # news = News(2001, 9, 10)
# output(news.getNews()) # output(news.get_news())
''' '''
x = [] x = []
for d in range(10, 30): for d in range(10, 30):
print('2/%d' % d) print('2/%d' % d)
news = getNews(2008, 2, d) news = get_news(2008, 2, d)
for new in news: for new in news:
strings = new['url'].split('/') strings = new['url'].split('/')
string = format_section(strings[3]) string = format_section(strings[3])

View file

@ -27,15 +27,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
""" """
fmt = None fmt = None
if format == '4k': if format == '4k':
fmt=38 fmt = 38
elif format == '1080p': elif format == '1080p':
fmt=37 fmt = 37
elif format == '720p': elif format == '720p':
fmt=22 fmt = 22
elif format == 'mp4': elif format == 'mp4':
fmt=18 fmt = 18
elif format == 'high': elif format == 'high':
fmt=35 fmt = 35
elif format == 'webm': elif format == 'webm':
streams = videos(youtubeId, 'webm') streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url'] return streams[max(streams.keys())]['url']