diff --git a/ox/api.py b/ox/api.py index 9717ff2..03d877c 100644 --- a/ox/api.py +++ b/ox/api.py @@ -101,7 +101,7 @@ class API(object): result = result.decode('utf-8') result = json.loads(result) except: - result = {'status':{}} + result = {'status': {}} result['status']['code'] = e.code result['status']['text'] = str(e) return result diff --git a/ox/file.py b/ox/file.py index 1791061..c03b4ef 100644 --- a/ox/file.py +++ b/ox/file.py @@ -30,7 +30,7 @@ EXTENSIONS = { '3gp', 'avi', 'divx', 'dv', 'flv', 'm2t', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', - 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD + 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts' ], } @@ -131,25 +131,25 @@ def oshash(filename, cached=True): if filesize < 65536: for x in range(int(filesize/bytesize)): buffer = f.read(bytesize) - (l_value,)= struct.unpack(longlongformat, buffer) + (l_value,) = struct.unpack(longlongformat, buffer) hash += l_value - hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number + hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number else: for x in range(int(65536/bytesize)): buffer = f.read(bytesize) - (l_value,)= struct.unpack(longlongformat, buffer) + (l_value,) = struct.unpack(longlongformat, buffer) hash += l_value - hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number - f.seek(max(0,filesize-65536),0) + hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number + f.seek(max(0, filesize-65536), 0) for x in range(int(65536/bytesize)): buffer = f.read(bytesize) - (l_value,)= struct.unpack(longlongformat, buffer) + (l_value,) = struct.unpack(longlongformat, buffer) hash += l_value hash = hash & 0xFFFFFFFFFFFFFFFF f.close() - returnedhash = "%016x" % hash + returnedhash = "%016x" % hash return returnedhash - except(IOError): + except IOError: return "IOError" def avinfo(filename, cached=True): @@ -160,23 +160,25 @@ def avinfo(filename, cached=True): return 
ffprobe(filename) ffmpeg2theora = cmd('ffmpeg2theora') p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - info, error = p.communicate() - version = info.split('\n')[0].split(' - ')[0].split(' ')[-1] + stdout, error = p.communicate() + stdout = stdout.decode('utf-8') + version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1] if version < '0.27': raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version) p = subprocess.Popen([ffmpeg2theora, '--info', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - info, error = p.communicate() + stdout, error = p.communicate() try: - info = json.loads(info) + info = json.loads(stdout) except: - #remove metadata, can be broken + # remove metadata, can be broken + stdout = stdout.decode('utf-8') reg = re.compile('"metadata": {.*?},', re.DOTALL) - info = re.sub(reg, '', info) - info = json.loads(info) + stdout = re.sub(reg, '', stdout) + info = json.loads(stdout) if 'video' in info: for v in info['video']: - if not 'display_aspect_ratio' in v and 'width' in v: + if 'display_aspect_ratio' not in v and 'width' in v: v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height']) v['pixel_aspect_ratio'] = '1:1' if len(info.get('audio', [])) > 1: @@ -189,6 +191,7 @@ def avinfo(filename, cached=True): ffmpeg = cmd('ffmpeg') p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() + stderr = stderr.decode('utf-8') languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l] if len(languages) == len(info['audio']): for i, stream in enumerate(info['audio']): @@ -236,7 +239,7 @@ def ffprobe(filename): info['video'] = [] info['metadata'] = ffinfo['format'].get('tags', {}) for s in ffinfo['streams']: - tags = s.pop('tags', {}) + tags = s.pop('tags', {}) language = None for t in tags: if t == 'language': @@ -278,16 +281,16 @@ def 
ffprobe(filename): info[s['codec_type']].append(stream) else: pass - #print s + # print s for v in info['video']: k = 'display_aspect_ratio' - if not k in v and 'width' in v \ + if k not in v and 'width' in v \ or (k in v and v[k] == '0:1'): v[k] = '%d:%d' % (v['width'], v['height']) v['pixel_aspect_ratio'] = '1:1' info['oshash'] = oshash(filename) info['path'] = filename - if not 'size' in info: + if 'size' not in info: info['size'] = os.path.getsize(filename) return info diff --git a/ox/fixunicode.py b/ox/fixunicode.py index b649a58..d3a162d 100644 --- a/ox/fixunicode.py +++ b/ox/fixunicode.py @@ -6,7 +6,7 @@ from __future__ import print_function import unicodedata -from six import unichr, PY2 +from six import unichr, text_type __all__ = ['fix_bad_unicode'] @@ -151,10 +151,7 @@ def text_badness(text): - Improbable single-byte characters, such as ƒ or ¬ - Letters in somewhat rare scripts ''' - if PY2: - assert isinstance(text, unicode) - else: - assert isinstance(text, str) + assert isinstance(text, text_type) errors = 0 very_weird_things = 0 weird_things = 0 diff --git a/ox/form.py b/ox/form.py index d9fe66d..48e7e92 100644 --- a/ox/form.py +++ b/ox/form.py @@ -75,7 +75,7 @@ class MultiPartForm(object): # line is separated by '\r\n'. 
parts = [] part_boundary = '--' + self.boundary - + # Add the form fields parts.extend( [ part_boundary, ] for name, value in self.form_fields ) - + # Add the files to upload parts.extend( [ part_boundary, ] for field_name, filename, content_type, body in self.files ) - + # Flatten the list and add closing boundary marker, # then return CR+LF separated data flattened = list(itertools.chain(*parts)) diff --git a/ox/format.py b/ox/format.py index aafd89c..ad18c31 100644 --- a/ox/format.py +++ b/ox/format.py @@ -4,13 +4,14 @@ import math import re import string +from six import text_type def toAZ(num): """ Converts an integer to bijective base 26 string using A-Z >>> for i in range(1, 1000): assert fromAZ(toAZ(i)) == i - + >>> toAZ(1) 'A' @@ -20,7 +21,8 @@ def toAZ(num): >>> toAZ(1234567890) 'CYWOQVJ' """ - if num < 1: raise ValueError("must supply a positive integer") + if num < 1: + raise ValueError("must supply a positive integer") digits = string.ascii_uppercase az = '' while num != 0: @@ -30,7 +32,7 @@ def toAZ(num): az = digits[r] + az return az -encode_base26=toAZ +encode_base26 = toAZ def fromAZ(num): """ @@ -45,7 +47,7 @@ >>> fromAZ('FOO') 4461 """ - num = num.replace('-','') + num = num.replace('-', '') digits = string.ascii_uppercase r = 0 for exp, char in enumerate(reversed(num)): @@ -64,7 +66,8 @@ def to26(q): >>> to26(347485647) 'BDGKMAP' """ - if q < 0: raise ValueError("must supply a positive integer") + if q < 0: + raise ValueError("must supply a positive integer") base26 = string.ascii_uppercase converted = [] while q != 0: @@ -73,7 +76,7 @@ def to26(q): converted.insert(0, l) return "".join(converted) or 'A' -decode_base26=toAZ +decode_base26 = fromAZ def from26(q): """ >>> from26('A') 0 """ base26 = string.ascii_uppercase - q = q.replace('-','') + q = q.replace('-', '') r = 0 for i in q: r = r * 26 + base26.index(i.upper())
@@ -123,7 +126,8 @@ def to32(q): ValueError: must supply a positive integer """ - if q < 0: raise ValueError("must supply a positive integer") + if q < 0: + raise ValueError("must supply a positive integer") letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" converted = [] while q != 0: @@ -188,7 +192,7 @@ def from32(q): 'Z': 31, } base32 = ('0123456789' + string.ascii_uppercase)[:32] - q = q.replace('-','') + q = q.replace('-', '') q = ''.join([base32[_32map[i.upper()]] for i in q]) return int(q, 32) @@ -210,7 +214,8 @@ def to36(q): ... ValueError: must supply a positive integer """ - if q < 0: raise ValueError("must supply a positive integer") + if q < 0: + raise ValueError("must supply a positive integer") letters = "0123456789abcdefghijklmnopqrstuvwxyz" converted = [] while q != 0: @@ -233,7 +238,7 @@ def int_value(strValue, default=u''): u'' """ try: - val = re.compile('(\d+)').findall(unicode(strValue).strip())[0] + val = re.compile('(\d+)').findall(text_type(strValue).strip())[0] except: val = default return val @@ -250,7 +255,7 @@ def float_value(strValue, default=u''): u'' """ try: - val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0] + val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0] except: val = default return val @@ -286,7 +291,7 @@ def format_number(number, longName, shortName): n = number / math.pow(1024, i + 1) return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName) -def format_thousands(number, separator = ','): +def format_thousands(number, separator=','): """ Return the number with separators (1,000,000) @@ -316,18 +321,18 @@ def format_pixels(number): return format_number(number, 'pixel', 'px') def format_currency(amount, currency="$"): - if amount: - temp = "%.2f" % amount - profile=re.compile(r"(\d)(\d\d\d[.,])") - while 1: - temp, count = re.subn(profile,r"\1,\2",temp) - if not count: - break - if temp.startswith('-'): - return "-"+ currency + temp[1:-3] - return currency + temp[:-3] - else: - 
return "" + if amount: + temp = "%.2f" % amount + profile = re.compile(r"(\d)(\d\d\d[.,])") + while 1: + temp, count = re.subn(profile, r"\1,\2", temp) + if not count: + break + if temp.startswith('-'): + return "-" + currency + temp[1:-3] + return currency + temp[:-3] + else: + return "" def plural(amount, unit, plural='s'): ''' @@ -339,7 +344,8 @@ def plural(amount, unit, plural='s'): if abs(amount) != 1: if plural == 's': unit = unit + plural - else: unit = plural + else: + unit = plural return "%s %s" % (format_thousands(amount), unit) def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True): @@ -390,14 +396,14 @@ def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True): duration += ".%03d" % ms else: if verbosity == 1: - durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s] + durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s] if years: durations.insert(0, "%sy" % y) if milliseconds: durations.append("%sms" % ms) else: - durations = [plural(d, 'day'), plural(h,'hour'), - plural(m, 'minute'), plural(s, 'second')] + durations = [plural(d, 'day'), plural(h, 'hour'), + plural(m, 'minute'), plural(s, 'second')] if years: durations.insert(0, plural(y, 'year')) if milliseconds: @@ -434,7 +440,7 @@ def parse_timecode(string): ''' timecode = 0 for i, v in enumerate(list(reversed(string.split(':')))[:4]): - timecode += float(v) * ( 86400 if i == 3 else pow(60, i)) + timecode += float(v) * (86400 if i == 3 else pow(60, i)) return timecode def ms2runtime(ms, shortenLong=False): @@ -482,7 +488,8 @@ def time2ms(timeString): p = timeString.split(':') for i in range(len(p)): _p = p[i] - if _p.endswith('.'): _p =_p[:-1] + if _p.endswith('.'): + _p = _p[:-1] ms = ms * 60 + float(_p) return int(ms * 1000) diff --git a/ox/html.py b/ox/html.py index 7154e21..bd59ace 100644 --- a/ox/html.py +++ b/ox/html.py @@ -10,7 +10,7 @@ letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' # Configuration for add_links() function 
-LEADING_PUNCTUATION = ['(', '<', '<'] +LEADING_PUNCTUATION = ['(', '<', '<'] TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>', "'", '"'] # list of possible strings used for bullets in bulleted lists @@ -18,16 +18,16 @@ DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•'] unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') word_split_re = re.compile(r'(\s+)') -punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % \ - ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), - '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) +punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % ( + '|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), + '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:
|<\/i>|<\/b>|<\/em>|<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) hard_coded_bullets_re = re.compile(r'((?:

(?:%s).*?[a-zA-Z].*?

\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) trailing_empty_content_re = re.compile(r'(?:

(?: |\s|
)*?

\s*)+\Z') if PY2: - del x # Temporary variable + del x # Temporary variable def escape(html): ''' @@ -44,7 +44,7 @@ def linebreaks(value): ''' Converts newlines into

and
''' - value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines + value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines paras = re.split('\n{2,}', value) paras = ['

%s

' % p.strip().replace('\n', '
') for p in paras] return '\n\n'.join(paras) @@ -83,21 +83,23 @@ def add_links(text, trim_url_limit=None, nofollow=False): If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. """ - trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x + trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x words = word_split_re.split(text) nofollow_attr = nofollow and ' rel="nofollow"' or '' for i, word in enumerate(words): match = punctuation_re.match(word) if match: lead, middle, trail = match.groups() - if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \ - len(middle) > 0 and middle[0] in letters + string.digits and \ - (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): + if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and + len(middle) > 0 and middle[0] in letters + string.digits and + (middle.endswith('.org') or + middle.endswith('.net') or + middle.endswith('.com'))): middle = '
%s' % (middle, nofollow_attr, trim_url(middle)) if middle.startswith('http://') or middle.startswith('https://'): middle = '%s' % (middle, nofollow_attr, trim_url(middle)) - if '@' in middle and not middle.startswith('www.') and not ':' in middle \ - and simple_email_re.match(middle): + if '@' in middle and not middle.startswith('www.') and ':' not in middle \ + and simple_email_re.match(middle): middle = '%s' % (middle, middle) if lead + middle + trail != word: words[i] = lead + middle + trail @@ -127,6 +129,7 @@ def clean_html(text): # Trim stupid HTML such as
. text = html_gunk_re.sub('', text) # Convert hard-coded bullets into HTML unordered lists. + def replace_p_tags(match): s = match.group().replace('

', '') for d in DOTS: @@ -153,6 +156,7 @@ def decode_html(html): if isinstance(html, bytes): html = html.decode('utf-8') uchr = unichr + def entitydecode(match, uchr=uchr): entity = match.group(1) if entity == '#x80': @@ -282,7 +286,7 @@ def sanitize_html(html, tags=None, global_attributes=[]): {'name': 'thead'}, {'name': 'tr'}, # other - {'name': '[]'}, + {'name': '[]'}, { 'name': 'a', 'required': ['href'], @@ -328,15 +332,14 @@ def sanitize_html(html, tags=None, global_attributes=[]): for tag in tags: valid_attributes[tag['name']] = tag.get('required', []) \ - + tag.get('optional', []) \ - + global_attributes + + tag.get('optional', []) + global_attributes required_attributes[tag['name']] = tag.get('required', []) validation[tag['name']] = tag.get('validation', {}) if '[]' in validation: html = re.sub( re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), - '\\3', html); + '\\3', html) parts = split_tags(html) for i, part in enumerate(parts): @@ -351,17 +354,17 @@ def sanitize_html(html, tags=None, global_attributes=[]): a = attr_re.findall(attributes) attrs = dict(a) - if not closing and not name in non_closing_tags: + if not closing and name not in non_closing_tags: level += 1 - if not attrs and attributes or name not in valid_tags: + if not attrs and attributes or name not in valid_tags: valid = False else: valid = True for key in set(attrs) - set(valid_attributes[name]): del attrs[key] for key in required_attributes[tag['name']]: - if not key in attrs: + if key not in attrs: valid = False if valid: @@ -395,6 +398,7 @@ def sanitize_html(html, tags=None, global_attributes=[]): def split_tags(string): tags = [] + def collect(match): tags.append(match.group(0)) return '\0' diff --git a/ox/iso.py b/ox/iso.py index 6c28435..2b63e8c 100644 --- a/ox/iso.py +++ b/ox/iso.py @@ -208,7 +208,7 @@ def langTo3Code(lang): if lang: lang = langEnglishName(lang) if lang: - lang=lang.lower() + lang = lang.lower() for l in _iso639_languages: if l[0].lower() == lang: 
return l[3] @@ -218,7 +218,7 @@ def langTo2Code(lang): if lang: lang = langEnglishName(lang) if lang: - lang=lang.lower() + lang = lang.lower() for l in _iso639_languages: if l[0].lower() == lang: return l[2] diff --git a/ox/js.py b/ox/js.py index 5bc68d5..43b6fb0 100644 --- a/ox/js.py +++ b/ox/js.py @@ -11,9 +11,9 @@ def minify(source, comment=''): pass # python2 performance with unicode string is terrible if PY2: - if isinstance(source, unicode): + if isinstance(source, unicode): # pylint: disable=undefined-variable source = source.encode('utf-8') - if isinstance(comment, unicode): + if isinstance(comment, unicode): # pylint: disable=undefined-variable comment = comment.encode('utf-8') tokens = tokenize(source) length = len(tokens) @@ -30,20 +30,20 @@ def minify(source, comment=''): # numbers or strings or unary operators or grouping operators # with a single newline, otherwise remove it if prevToken and nextToken\ - and (prevToken['type'] in ['identifier', 'number', 'string']\ - or prevToken['value'] in ['++', '--', ')', ']', '}'])\ - and (nextToken['type'] in ['identifier', 'number', 'string']\ - or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']): + and (prevToken['type'] in ['identifier', 'number', 'string'] + or prevToken['value'] in ['++', '--', ')', ']', '}']) \ + and (nextToken['type'] in ['identifier', 'number', 'string'] + or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']): minified += '\n' elif token['type'] == 'whitespace': # replace whitespace between two tokens that are identifiers or # numbers, or between a token that ends with "+" or "-" and one that # begins with "+" or "-", with a single space, otherwise remove it - if prevToken and nextToken\ - and ((prevToken['type'] in ['identifier', 'number']\ - and nextToken['type'] in ['identifier', 'number']) - or (prevToken['value'] in ['+', '-', '++', '--'] - and nextToken['value'] in ['+', '-', '++', '--'])): + if prevToken and nextToken \ + and 
((prevToken['type'] in ['identifier', 'number'] and + nextToken['type'] in ['identifier', 'number']) or + (prevToken['value'] in ['+', '-', '++', '--'] and + nextToken['value'] in ['+', '-', '++', '--'])): minified += ' ' elif token['type'] != 'comment': # remove comments and leave all other tokens untouched @@ -178,7 +178,7 @@ def tokenize(source): 'value': value }) if type == 'comment': - lines = value.split('\n'); + lines = value.split('\n') column = len(lines[-1]) line += len(lines) - 1 elif type == 'linebreak': diff --git a/ox/jsonc.py b/ox/jsonc.py index 83751ea..eadcc96 100644 --- a/ox/jsonc.py +++ b/ox/jsonc.py @@ -23,11 +23,11 @@ def loads(source): try: m = re.search(r'line (\d+) column (\d+)', msg) if m: - (lineno, colno) = map(int, m.groups()) + (lineno, colno) = [int(n) for n in m.groups()] except: pass if lineno and colno: s = minified.split('\n') context = s[lineno-1][max(0, colno-30):colno+30] - msg += ' at:\n\n %s\n %s\033[1m^\033[0m' %(context, ' ' * (colno - max(0, colno-30) - 2)) + msg += ' at:\n\n %s\n %s\033[1m^\033[0m' % (context, ' ' * (colno - max(0, colno-30) - 2)) raise ValueError(msg) diff --git a/ox/normalize.py b/ox/normalize.py index 128f33c..dea40ae 100644 --- a/ox/normalize.py +++ b/ox/normalize.py @@ -18,7 +18,8 @@ _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el', _articlesDict = dict([(x, x) for x in _articles]) _spArticles = [] for article in _articles: - if article[-1] not in ("'", '-'): article += ' ' + if article[-1] not in ("'", '-'): + article += ' ' _spArticles.append(article) _noarticles = ( @@ -50,8 +51,10 @@ def canonical_title(title): 'Los Angeles Plays Itself' """ try: - if _articlesDict.has_key(title.split(', ')[-1].lower()): return title - except IndexError: pass + if title.split(', ')[-1].lower() in _articlesDict: + return title + except IndexError: + pass ltitle = title.lower() for start in _noarticles: if ltitle.startswith(start): @@ -60,7 +63,8 @@ def canonical_title(title): if ltitle.startswith(article): 
lart = len(article) title = '%s, %s' % (title[lart:], title[:lart]) - if article[-1] == ' ': title = title[:-1] + if article[-1] == ' ': + title = title[:-1] break ## XXX: an attempt using a dictionary lookup. ##for artSeparator in (' ', "'", '-'): @@ -82,9 +86,10 @@ def normalize_title(title): 'The Movie Title' """ stitle = title.split(', ') - if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()): + if len(stitle) > 1 and stitle[-1].lower() in _articlesDict: sep = ' ' - if stitle[-1][-1] in ("'", '-'): sep = '' + if stitle[-1][-1] in ("'", '-'): + sep = '' title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1])) return title @@ -139,7 +144,8 @@ def canonical_name(name): # Don't convert names already in the canonical format. if name in ('Unknown Director', ): return name - if name.find(', ') != -1: return name + if name.find(', ') != -1: + return name sname = name.split(' ') snl = len(sname) if snl == 2: @@ -147,11 +153,14 @@ def canonical_name(name): name = '%s, %s' % (sname[1], sname[0]) elif snl > 2: lsname = [x.lower() for x in sname] - if snl == 3: _indexes = (0, snl-2) - else: _indexes = (0, snl-2, snl-3) + if snl == 3: + _indexes = (0, snl-2) + else: + _indexes = (0, snl-2, snl-3) # Check for common surname prefixes at the beginning and near the end. for index in _indexes: - if lsname[index] not in _sname_suffixes: continue + if lsname[index] not in _sname_suffixes: + continue try: # Build the surname. 
surn = '%s %s' % (sname[index], sname[index+1]) @@ -194,11 +203,12 @@ def normalize_name(name): def normalize_path(path): path = path.replace(':', '_').replace('/', '_') - if path.endswith('.'): path = path[:-1] + '_' + if path.endswith('.'): + path = path[:-1] + '_' return path def strip_accents(s): if isinstance(s, str): - s = unicode(s) + s = s.decode('utf-8') if isinstance(s, bytes) else s return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) diff --git a/ox/oembed.py b/ox/oembed.py index f9f1717..b5a02dc 100644 --- a/ox/oembed.py +++ b/ox/oembed.py @@ -6,13 +6,16 @@ from . import cache from .text import find_re from .utils import json, ET + def get_embed_code(url, maxwidth=None, maxheight=None): embed = {} header = cache.get_headers(url) if header.get('content-type', '').startswith('text/html'): html = cache.read_url(url) - json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('').findall(html)) - xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('').findall(html)) + links = re.compile('').findall(html) + json_oembed = [l for l in links if 'json+oembed' in l] + xml_oembed = [l for l in links if 'xml+oembed' in l] + if json_oembed: oembed_url = find_re(json_oembed[0], 'href="(.*?)"') if maxwidth: @@ -21,7 +24,7 @@ def get_embed_code(url, maxwidth=None, maxheight=None): oembed_url += '&maxheight=%d' % maxheight embed = json.loads(cache.read_url(oembed_url)) elif xml_oembed: - oembed_url = find_re(json_oembed[0], 'href="(.*?)"') + oembed_url = find_re(xml_oembed[0], 'href="(.*?)"') if maxwidth: oembed_url += '&maxwidth=%d' % maxwidth if maxheight: diff --git a/ox/torrent/__init__.py b/ox/torrent/__init__.py index 9e96bad..7818bc7 100644 --- a/ox/torrent/__init__.py +++ b/ox/torrent/__init__.py @@ -14,8 +14,8 @@ else: __all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] -def create_torrent(file, url, params = {}, flag = Event(), - progress = lambda x: None, progress_percent = 1): +def 
create_torrent(file, url, params={}, flag=Event(), + progress=lambda x: None, progress_percent=1): "Creates a torrent for a given file, using url as tracker url" from .makemetafile import make_meta_file return make_meta_file(file, url, params, flag, progress, progress_percent) diff --git a/ox/torrent/bencode3.py b/ox/torrent/bencode3.py index d2a2906..49f5aab 100644 --- a/ox/torrent/bencode3.py +++ b/ox/torrent/bencode3.py @@ -11,7 +11,7 @@ def _decode_int(data): """ data = data[1:] end = data.index(b'e') - return int(data[:end],10), data[end+1:] + return int(data[:end], 10), data[end+1:] def _decode_str(data): """ @@ -19,9 +19,9 @@ def _decode_str(data): return string, remaining data """ start = data.index(b':') - l = int(data[:start].decode(),10) + l = int(data[:start].decode(), 10) if l <= 0: - raise Exception('invalid string size: %d'%d) + raise Exception('invalid string size: %d' % l) start += 1 ret = bytes(data[start:start+l]) data = data[start+l:] @@ -67,45 +67,45 @@ def _decode(data): elif ch.isdigit(): return _decode_str(data) else: - raise Exception('could not deserialize data: %s'%data) + raise Exception('could not deserialize data: %s' % data) def bdecode(data): """ decode a bytearray return deserialized object """ - obj , data = _decode(data) + obj, data = _decode(data) if len(data) > 0: - raise Exception('failed to deserialize, extra data: %s'%data) + raise Exception('failed to deserialize, extra data: %s' % data) return obj -def _encode_str(s,buff): +def _encode_str(s, buff): """ encode string to a buffer """ s = bytearray(s) l = len(s) - buff.append(bytearray(str(l)+':','utf-8')) + buff.append(bytearray(str(l)+':', 'utf-8')) buff.append(s) -def _encode_int(i,buff): +def _encode_int(i, buff): """ encode integer to a buffer """ buff.append(b'i') - buff.append(bytearray(str(i),'ascii')) + buff.append(bytearray(str(i), 'ascii')) buff.append(b'e') -def _encode_list(l,buff): +def _encode_list(l, buff): """ encode list of elements to a buffer """ 
buff.append(b'l') for i in l: - _encode(i,buff) + _encode(i, buff) buff.append(b'e') -def _encode_dict(d,buff): +def _encode_dict(d, buff): """ encode dict """ @@ -113,30 +113,30 @@ def _encode_dict(d,buff): l = list(d.keys()) l.sort() for k in l: - _encode(str(k),buff) - _encode(d[k],buff) + _encode(str(k), buff) + _encode(d[k], buff) buff.append(b'e') -def _encode(obj,buff): +def _encode(obj, buff): """ encode element obj to a buffer buff """ - if isinstance(obj,str): - _encode_str(bytearray(obj,'utf-8'),buff) - elif isinstance(obj,bytes): - _encode_str(bytearray(obj),buff) - elif isinstance(obj,bytearray): - _encode_str(obj,buff) + if isinstance(obj, str): + _encode_str(bytearray(obj, 'utf-8'), buff) + elif isinstance(obj, bytes): + _encode_str(bytearray(obj), buff) + elif isinstance(obj, bytearray): + _encode_str(obj, buff) elif str(obj).isdigit(): - _encode_int(obj,buff) - elif isinstance(obj,list): - _encode_list(obj,buff) - elif hasattr(obj,'keys') and hasattr(obj,'values'): - _encode_dict(obj,buff) - elif str(obj) in ['True','False']: - _encode_int(int(obj and '1' or '0'),buff) + _encode_int(obj, buff) + elif isinstance(obj, list): + _encode_list(obj, buff) + elif hasattr(obj, 'keys') and hasattr(obj, 'values'): + _encode_dict(obj, buff) + elif str(obj) in ['True', 'False']: + _encode_int(int(obj and '1' or '0'), buff) else: - raise Exception('non serializable object: %s'%obj) + raise Exception('non serializable object: %s' % obj) def bencode(obj): @@ -144,8 +144,8 @@ def bencode(obj): bencode element, return bytearray """ buff = [] - _encode(obj,buff) - ret = bytearray() + _encode(obj, buff) + ret = bytearray() for ba in buff: - ret += ba + ret += ba return bytes(ret) diff --git a/ox/web/apple.py b/ox/web/apple.py index 57093a2..099d6cb 100644 --- a/ox/web/apple.py +++ b/ox/web/apple.py @@ -2,6 +2,7 @@ from __future__ import print_function import json import re +from six import text_type from ox.cache import read_url HEADERS = { @@ -16,9 +17,9 @@ 
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) ' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' def get_movie_data(title, director): - if isinstance(title, unicode): + if isinstance(title, text_type): title = title.encode('utf-8') - if isinstance(director, unicode): + if isinstance(director, text_type): director = director.encode('utf-8') data = {} # itunes section (preferred source for link) @@ -45,7 +46,7 @@ def get_movie_data(title, director): results = js['results'] if results: url = host + results[0]['location'] - if not 'link' in data: + if 'link' not in data: data['link'] = url headers = { 'User-Agent': USER_AGENT diff --git a/ox/web/dailymotion.py b/ox/web/dailymotion.py index 90a3fff..0ec8d86 100644 --- a/ox/web/dailymotion.py +++ b/ox/web/dailymotion.py @@ -1,21 +1,21 @@ -# -*- coding: utf-8 -*- -# vi:si:et:sw=4:sts=4:ts=4 -import re -from six.moves.urllib.parse import unquote -from ox.cache import read_url - - -def get_video_url(url): - ''' - >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0] - 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv' - - >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0] - 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv' - ''' - data = read_url(url) - video = re.compile('''video", "(.*?)"''').findall(data) - for v in video: - v = unquote(v).split('@@')[0] - return v - return '' +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +from six.moves.urllib.parse import unquote +from ox.cache import read_url + + +def get_video_url(url): + ''' + >>> 
get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0] + 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv' + + >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0] + 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv' + ''' + data = read_url(url) + video = re.compile('''video", "(.*?)"''').findall(data) + for v in video: + v = unquote(v).split('@@')[0] + return v + return '' diff --git a/ox/web/epguides.py b/ox/web/epguides.py index bb0e551..65670e7 100644 --- a/ox/web/epguides.py +++ b/ox/web/epguides.py @@ -7,7 +7,7 @@ import time from ox import strip_tags, find_re from ox.cache import read_url -import google +from . import google def get_show_url(title): diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index 0e43c80..2ecded5 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -28,22 +28,32 @@ def get_show_url(title): def get_data(url): data = read_url(url, unicode=True) doc = document_fromstring(data) - score = filter(lambda s: s.attrib.get('property') == 'v:average', - doc.xpath('//span[@class="score_value"]')) + score = [s for s in doc.xpath('//span[@class="score_value"]') + if s.attrib.get('property') == 'v:average'] if score: score = int(score[0].text) else: score = -1 - authors = [a.text - for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')] - sources = [d.text - for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')] - reviews = [d.text - for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')] - scores = [int(d.text.strip()) - for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')] - urls = 
[a.attrib['href'] - for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')] + authors = [ + a.text + for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a') + ] + sources = [ + d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a') + ] + reviews = [ + d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]') + ] + scores = [ + int(d.text.strip()) + for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]') + ] + urls = [ + a.attrib['href'] + for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]') + ] metacritics = [] for i in range(len(authors)): @@ -54,7 +64,7 @@ def get_data(url): 'quote': strip_tags(reviews[i]).strip(), 'score': scores[i], }) - + return { 'critics': metacritics, 'id': get_id(url), diff --git a/ox/web/rottentomatoes.py b/ox/web/rottentomatoes.py index fd3265d..605f313 100644 --- a/ox/web/rottentomatoes.py +++ b/ox/web/rottentomatoes.py @@ -32,7 +32,7 @@ def get_data(url): r['summary'] = get_og(data, 'description') meter = re.compile('(.*?)').findall(data) - meter = filter(lambda m: m[1].isdigit(), meter) + meter = [m for m in meter if m[1].isdigit()] if meter: r['tomatometer'] = meter[0][1] r['rating'] = find_re(data, 'Average Rating: ([\d.]+)/10') diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py index 8f20b39..455aec8 100644 --- a/ox/web/spiegel.py +++ b/ox/web/spiegel.py @@ -95,7 +95,7 @@ def format_subsection(string): 'ussports': 'US-Sports', 'wunderbar': 'wunderBAR' } - if subsection.has_key(string): + if string in subsection: return subsection[string].replace(u'\xc3', 'ae') return string[:1].upper() + string[1:] @@ -219,8 +219,8 @@ def archive_news(): else: dMax = days[m] for d in range(dMax, 0, -1): - print('getNews(%d, %d, %d)' % (y, m, d)) - news = getNews(y, m ,d) + print('get_news(%d, %d, %d)' % (y, m, d)) + news = get_news(y, m, d) for new in news: dirname = 
archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16] if not os.path.exists(dirname): @@ -230,7 +230,7 @@ def archive_news(): else: filename = dirname + '/' + new['url'] + '.json' if not os.path.exists(filename) or True: - data = json.dumps(new, ensure_ascii = False) + data = json.dumps(new, ensure_ascii=False) f = open(filename, 'w') f.write(data) f.close() @@ -253,7 +253,7 @@ def archive_news(): string = strings[3] if len(strings) == 6: string += '/' + strings[4] - if not count.has_key(string): + if string not in count: count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))} else: count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])} @@ -269,12 +269,12 @@ if __name__ == '__main__': # spiegel = Spiegel(2008, 8) # print(spiegel.getContents()) # news = News(2001, 9, 10) - # output(news.getNews()) + # output(news.get_news()) ''' x = [] for d in range(10, 30): print('2/%d' % d) - news = getNews(2008, 2, d) + news = get_news(2008, 2, d) for new in news: strings = new['url'].split('/') string = format_section(strings[3]) diff --git a/ox/web/youtube.py b/ox/web/youtube.py index 7268598..db74e45 100644 --- a/ox/web/youtube.py +++ b/ox/web/youtube.py @@ -27,15 +27,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout): """ fmt = None if format == '4k': - fmt=38 + fmt = 38 elif format == '1080p': - fmt=37 + fmt = 37 elif format == '720p': - fmt=22 + fmt = 22 elif format == 'mp4': - fmt=18 + fmt = 18 elif format == 'high': - fmt=35 + fmt = 35 elif format == 'webm': streams = videos(youtubeId, 'webm') return streams[max(streams.keys())]['url']