diff --git a/ox/format.py b/ox/format.py
index 9090cf0..83756c1 100644
--- a/ox/format.py
+++ b/ox/format.py
@@ -224,16 +224,16 @@ def to36(q):
 def from36(q):
     return int(q, 36)
 
-def int_value(strValue, default=u''):
+def int_value(strValue, default=''):
     """
     >>> int_value('abc23')
-    u'23'
+    '23'
 
     >>> int_value(' abc23')
-    u'23'
+    '23'
 
     >>> int_value('ab')
-    u''
+    ''
     """
     try:
         val = re.compile('(\d+)').findall(str(strValue).strip())[0]
@@ -241,16 +241,16 @@ def int_value(strValue, default=u''):
         val = default
     return val
 
-def float_value(strValue, default=u''):
+def float_value(strValue, default=''):
     """
     >>> float_value('abc23.4')
-    u'23.4'
+    '23.4'
 
     >>> float_value(' abc23.4')
-    u'23.4'
+    '23.4'
 
     >>> float_value('ab')
-    u''
+    ''
     """
     try:
         val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
diff --git a/ox/html.py b/ox/html.py
index 5286116..8666713 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -145,11 +145,11 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
 def decode_html(html):
     """
     >>> decode_html('me & you and $&%')
-    u'me & you and $&%'
+    'me & you and $&%'
     >>> decode_html('€')
-    u'\u20ac'
+    '\u20ac'
     >>> decode_html('Anniversary of Daoud's Republic')
-    u"Anniversary of Daoud's Republic"
+    "Anniversary of Daoud's Republic"
     """
     if isinstance(html, bytes):
         html = html.decode('utf-8')
@@ -158,7 +158,7 @@ def decode_html(html):
     def entitydecode(match, uchr=uchr):
         entity = match.group(1)
         if entity == '#x80':
-            return u'€'
+            return '€'
         elif entity.startswith('#x'):
             return uchr(int(entity[2:], 16))
         elif entity.startswith('#'):
@@ -169,7 +169,7 @@ def decode_html(html):
             return "'"
         else:
             return match.group(0)
-    return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
+    return charrefpat.sub(entitydecode, html).replace('\xa0', ' ')
 
 def highlight(text, query, hlClass="hl"):
     """
@@ -187,51 +187,51 @@ def highlight(text, query, hlClass="hl"):
 
 def escape_html(value):
     '''
-    >>> escape_html(u'')
-    u'<script>alert()</script>'
+    '<script>alert()</script>'
     >>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"")
-    u'\\'foo\\' < \\'bar\\' && "foo" > "bar"'
+    '\\'foo\\' < \\'bar\\' && "foo" > "bar"'
     >>> sanitize_html('foo')
-    u'foo'
+    'foo'
     >>> sanitize_html('foo')
-    u'foo'
+    'foo'
     >>> sanitize_html('Anniversary of Daoud's Republic')
-    u"Anniversary of Daoud's Republic"
+    "Anniversary of Daoud's Republic"
     >>> sanitize_html('')
-    u''
+    ''
     >>> sanitize_html(' ')
-    u' '
-    >>> sanitize_html(u' ') # canonicalised to a space: okay, I suppose
-    u' '
-    >>> sanitize_html(u'\u00a0') # also nbsp
-    u' '
+    ' '
+    >>> sanitize_html(' ') # canonicalised to a space: okay, I suppose
+    ' '
+    >>> sanitize_html('\u00a0') # also nbsp
+    ' '
     '''
     if not tags:
         valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@@ -412,24 +412,24 @@ def sanitize_fragment(html):
     are quoted, etc. Does not strip potentially-malicious HTML: use
     sanitize_html() for that.
 
-    >>> sanitize_fragment(u'')
-    u''
-    >>> sanitize_fragment(u'')
-    u''
-    >>> sanitize_fragment(u'')
-    u''
-    >>> sanitize_fragment(u'foo')
-    u'foo'
-    >>> sanitize_fragment(u'')
-    u''
-    >>> sanitize_fragment(u' ')
-    u' '
-    >>> sanitize_fragment(u' ')
-    u'\\xa0'
-    >>> sanitize_fragment(u'\\u00a0') # nbsp
-    u'\\xa0'
-    >>> sanitize_fragment(u'\\ufeff') # zero-width no-break space
-    u'\\ufeff'
+    >>> sanitize_fragment('')
+    ''
+    >>> sanitize_fragment('')
+    ''
+    >>> sanitize_fragment('')
+    ''
+    >>> sanitize_fragment('foo')
+    'foo'
+    >>> sanitize_fragment('')
+    ''
+    >>> sanitize_fragment(' ')
+    ' '
+    >>> sanitize_fragment(' ')
+    '\\xa0'
+    >>> sanitize_fragment('\\u00a0') # nbsp
+    '\\xa0'
+    >>> sanitize_fragment('\\ufeff') # zero-width no-break space
+    '\\ufeff'
     '''
     '''
diff --git a/ox/text.py b/ox/text.py
index a967092..d650262 100644
--- a/ox/text.py
+++ b/ox/text.py
@@ -475,10 +475,10 @@ def wrap(text, width):
 
 def wrap_string(string, length=80, separator='\n', balance=False):
     '''
-    >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
-    u"Anticonstitution\\nellement, Paris \\ns'eveille"
+    >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16)
+    "Anticonstitution\\nellement, Paris \\ns'eveille"
     >>> wrap_string(u'All you can eat', 12, '\\n', True)
-    u'All you \\ncan eat'
+    'All you \\ncan eat'
     '''
     words = string.split(' ')
     if balance:
@@ -493,20 +493,20 @@ def wrap_string(string, length=80, separator='\n', balance=False):
                 break
     lines = ['']
     for word in words:
-        if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
+        if len(lines[len(lines) - 1] + word + ' ') <= length + 1:
             # word fits in current line
-            lines[len(lines) - 1] += word + u' '
+            lines[len(lines) - 1] += word + ' '
         else:
             if len(word) <= length:
                 # word fits in next line
-                lines.append(word + u' ')
+                lines.append(word + ' ')
             else:
                 # word is longer than line
                 position = length - len(lines[len(lines) - 1])
                 lines[len(lines) - 1] += word[0:position]
                 for i in range(position, len(word), length):
                     lines.append(word[i:i+length])
-                lines[len(lines) - 1] += u' '
+                lines[len(lines) - 1] += ' '
     return separator.join(lines).strip()
 
 def truncate_string(string, length, padding='...', position='right'):
@@ -578,14 +578,14 @@ def get_valid_filename(s):
 
 def get_text_list(list_, last_word='or'):
     """
-    >>> get_text_list([u'a', u'b', u'c', u'd'])
-    u'a, b, c or d'
-    >>> get_text_list([u'a', u'b', u'c'], 'and')
-    u'a, b and c'
-    >>> get_text_list([u'a', u'b'], 'and')
-    u'a and b'
-    >>> get_text_list([u'a'])
-    u'a'
+    >>> get_text_list(['a', 'b', 'c', 'd'])
+    'a, b, c or d'
+    >>> get_text_list(['a', 'b', 'c'], 'and')
+    'a, b and c'
+    >>> get_text_list(['a', 'b'], 'and')
+    'a and b'
+    >>> get_text_list(['a'])
+    'a'
     >>> get_text_list([])
     ''
     """
@@ -593,24 +593,24 @@ def get_text_list(list_, last_word='or'):
         return ''
     if len(list_) == 1:
         return list_[0]
-    return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
+    return '%s %s %s' % (', '.join([i for i in list_][:-1]), last_word, list_[-1])
 
 def get_list_text(text, last_word='or'):
     """
-    >>> get_list_text(u'a, b, c or d')
-    [u'a', u'b', u'c', u'd']
-    >>> get_list_text(u'a, b and c', u'and')
-    [u'a', u'b', u'c']
-    >>> get_list_text(u'a and b', u'and')
-    [u'a', u'b']
-    >>> get_list_text(u'a')
-    [u'a']
-    >>> get_list_text(u'')
+    >>> get_list_text('a, b, c or d')
+    ['a', 'b', 'c', 'd']
+    >>> get_list_text('a, b and c', 'and')
+    ['a', 'b', 'c']
+    >>> get_list_text('a and b', 'and')
+    ['a', 'b']
+    >>> get_list_text('a')
+    ['a']
+    >>> get_list_text('')
     []
     """
     list_ = []
     if text:
-        list_ = text.split(u', ')
+        list_ = text.split(', ')
     if list_:
         i = len(list_)-1
         last = list_[i].split(last_word)
@@ -682,7 +682,7 @@ def words(text):
     return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
 
 def sort_string(string):
-    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
+    string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
 
     # pad numbered titles
     string = re.sub('(\d),(\d{3})', '\\1\\2', string)
diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py
index fdb7a46..c94c438 100644
--- a/ox/web/allmovie.py
+++ b/ox/web/allmovie.py
@@ -13,13 +13,13 @@ def get_id(url):
 def get_data(id):
     '''
     >>> get_data('129689')['cast'][1][1]
-    u'Marianne'
+    'Marianne'
     >>> get_data('129689')['credits'][0][0]
-    u'Jean-Luc Godard'
+    'Jean-Luc Godard'
     >>> get_data('129689')['posters'][0]
-    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
+    'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
     >>> get_data('129689')['rating']
-    u'4.5'
+    '4.5'
     '''
     if id.startswith('http'):
         id = get_id(id)
diff --git a/ox/web/arsenalberlin.py b/ox/web/arsenalberlin.py
index e5a0dd2..ca77b5e 100644
--- a/ox/web/arsenalberlin.py
+++ b/ox/web/arsenalberlin.py
@@ -19,18 +19,18 @@ def get_data(id, language='en'):
     if 'Willkommen in der Datenbank des Arsenal' in html:
         return None
     data = {}
-    data[u'id'] = id
-    data[u'url'] = url
+    data['id'] = id
+    data['url'] = url
     m = re.compile('(.*?)').findall(html)
     if m:
-        data[u'title'] = m[0]
+        data['title'] = m[0]
     m = re.compile("Director: (.*?)").findall(html)
     if m:
-        data[u'director'] = m[0]
+        data['director'] = m[0]
     m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
     if m:
-        data[u'image'] = m[0]
+        data['image'] = m[0]
     units = re.compile("(.*?)", re.DOTALL).findall(html)
     for x in map(re.compile('(.*?): (.*)', re.DOTALL).findall, units):
@@ -43,7 +43,7 @@ def get_data(id, language='en'):
         else:
             data[key] = strip_tags(data[key])
     if "running time (minutes)" in data:
-        data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
+        data['runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
     for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
         if key in data and data[key].isdigit():
             data[key] = int(data[key])
diff --git a/ox/web/criterion.py b/ox/web/criterion.py
index d7914be..67d4a8a 100644
--- a/ox/web/criterion.py
+++ b/ox/web/criterion.py
@@ -19,13 +19,13 @@ def get_url(id):
 def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     '''
     >>> get_data('1333').get('imdbId')
-    u'0060304'
+    '0060304'
 
     >>> get_data('236')['posters'][0]
-    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
+    'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
 
     >>> get_data('786')['posters'][0]
-    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
+    'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
     '''
     data = {
         "id": id,
@@ -39,7 +39,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     data["number"] = find_re(html, "Spine #(\d+)")
 
     data["title"] = decode_html(find_re(html, "(.*?)"))
-    data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
+    data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip()
     results = find_re(html, '(.*?)')
     info = re.compile('(.*?)', re.DOTALL).findall(results)
     info = {k: strip_tags(v).strip() for k, v in info}
diff --git a/ox/web/flixter.py b/ox/web/flixter.py
index e6d6a0a..d713208 100644
--- a/ox/web/flixter.py
+++ b/ox/web/flixter.py
@@ -58,10 +58,10 @@ def get_data(id, timeout=-1):
 def get_id(url=None, imdb=None):
     '''
     >>> get_id(imdb='0133093')
-    u'the-matrix'
+    'the-matrix'
 
     #>>> get_id(imdb='0060304')
-    #u'2-or-3-things-i-know-about-her'
+    #'2-or-3-things-i-know-about-her'
     '''
     if imdb:
         i = ImdbCombined(imdb)