Compare commits


14 commits

Author  SHA1        Message                                                     Date
j       ae10c5c9b9  more raw regexp strings                                     2024-08-30 13:30:47 +02:00
j       29a309f15e  also consider local version                                 2024-08-30 13:07:20 +02:00
j       171059cf45  avoid distutils, no longer in python 3.12                   2024-08-29 20:22:37 +02:00
j       301babd1dd  more raw regexp strings                                     2024-07-08 13:33:07 +01:00
j       08636ba81a  update ua                                                   2024-07-08 13:26:48 +01:00
j       bb13747023  use r'' for regex strings                                   2024-06-30 09:52:59 +01:00
j       414cb00115  add force_framerate flag                                    2024-06-08 11:37:47 +01:00
j       70565970b3  use avg_frame_rate if video is 90k                          2024-06-08 11:22:25 +01:00
j       6b22e2ec29  use request.iter_content                                    2024-05-22 08:21:01 +02:00
j       e850296f68  download to tmp file and rename once download is complete   2024-03-22 08:48:48 +01:00
j       0e801f82a3  typo                                                        2024-03-21 00:08:40 +01:00
j       99554cb461  fix add_link                                                2024-03-20 12:55:14 +01:00
j       7461719e23  fix links at end of paragraph                               2024-03-20 12:50:11 +01:00
j       e9a5bcb890  fix max_votes                                               2024-01-23 23:00:22 +01:00
16 changed files with 199 additions and 179 deletions

View file

@@ -126,7 +126,7 @@ class API(object):
tmpname = filename + '.tmp'
with open(tmpname, 'wb') as fd:
r = self._requests_session.get(url)
- for chunk in iter(lambda: r.read(chunk_size), b''):
+ for chunk in r.iter_content(chunk_size=chunk_size):
fd.write(chunk)
shutil.move(tmpname, filename)
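The removed line treated the response like a file object, but a requests Response has no read() method; iter_content() is the supported way to consume a response in chunks. A minimal standalone sketch of the same pattern (the function name and chunk size are illustrative; note that stream=True, which the method above does not pass, is what keeps the body from being buffered in memory before iteration):

import shutil
import requests

def download(url, filename, chunk_size=8192):
    # write to a temp name, then move into place once complete
    tmpname = filename + '.tmp'
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(tmpname, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
    shutil.move(tmpname, filename)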
@@ -167,22 +167,22 @@ class API(object):
try:
data = self._json_request(uploadUrl, meta, files=files)
except KeyboardInterrupt:
- if not slient:
+ if not silent:
print("\ninterrupted by user.")
sys.exit(1)
except:
- if not slient:
+ if not silent:
print("uploading chunk failed, will try again in 5 seconds\r", end='')
sys.stdout.flush()
data = {'result': -1}
time.sleep(5)
if data and 'status' in data:
if data['status']['code'] == 403:
- if not slient:
+ if not silent:
print("login required")
return False
if data['status']['code'] != 200:
- if not slient:
+ if not silent:
print("request returned error, will try again in 5 seconds")
if self.DEBUG:
print(data)
@@ -190,7 +190,7 @@ class API(object):
if data and data.get('result') == 1:
done += len(chunk)
if data.get('offset') not in (None, done):
- if not slient:
+ if not silent:
print('server offset out of sync, continue from', data['offset'])
done = data['offset']
f.seek(done)

View file

@@ -2,7 +2,6 @@
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, print_function
- from distutils.spawn import find_executable
from glob import glob
import hashlib
import os
@@ -39,6 +38,24 @@ EXTENSIONS = {
],
}
+ def is_exe(fpath):
+ return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+ def which(program):
+ local = os.path.expanduser('~/.ox/bin/%s' % program)
+ if os.path.exists(local):
+ return local
+ fpath, fname = os.path.split(program)
+ if fpath:
+ if is_exe(program):
+ return program
+ else:
+ for path in os.environ.get("PATH", "").split(os.pathsep):
+ exe_file = os.path.join(path, program)
+ if is_exe(exe_file):
+ return exe_file
+ return None
def cmd(program):
local = os.path.expanduser('~/.ox/bin/%s' % program)
if os.path.exists(local):
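The new which() replaces distutils.spawn.find_executable; distutils was removed from the standard library in Python 3.12 (PEP 632). Since Python 3.3 the stdlib also offers shutil.which, so an equivalent sketch keeping the per-user ~/.ox/bin override could be:

import os
import shutil

def which(program):
    # prefer the per-user override, then fall back to a PATH lookup
    local = os.path.expanduser('~/.ox/bin/%s' % program)
    if os.path.exists(local):
        return local
    return shutil.which(program)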
@@ -160,12 +177,11 @@ def avinfo(filename, cached=True):
if cached:
return cache(filename, 'info')
if os.path.getsize(filename):
- if find_executable('ffprobe'):
+ if which('ffprobe'):
return ffprobe(filename)
raise EnvironmentError('could to find ffprobe. please install ffmpeg')
return {'path': filename, 'size': 0}
def ffprobe(filename):
p = subprocess.Popen([
cmd('ffprobe'),
@@ -258,6 +274,9 @@ def ffprobe(filename):
'pix_fmt': 'pixel_format',
'sample_aspect_ratio': 'pixel_aspect_ratio',
}.get(key, key)] = fix_value(key, s[key])
+ if 'avg_frame_rate' in s and stream.get('framerate') == "90000:1":
+ stream['framerate'] = fix_value('r_frame_rate', s['avg_frame_rate'])
+ stream['force_framerate'] = True
info[s['codec_type']].append(stream)
elif s.get('codec_type') == 'subtitle':
info['subtitles'] = info.get('subtitles', [])
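Background for the force_framerate change: MPEG transport streams use a 90 kHz clock, and for some files ffprobe reports r_frame_rate as 90000/1 rather than a real frame rate, while avg_frame_rate (frames counted over duration) stays sane. A small sketch for inspecting both fields (the ffprobe flags and JSON keys are real; the helper itself is illustrative):

import json
import subprocess

def probe_framerates(path):
    # print r_frame_rate vs avg_frame_rate for each video stream
    out = subprocess.check_output([
        'ffprobe', '-print_format', 'json', '-show_streams', path
    ], stderr=subprocess.DEVNULL)
    for s in json.loads(out).get('streams', []):
        if s.get('codec_type') == 'video':
            print(s.get('r_frame_rate'), s.get('avg_frame_rate'))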

View file

@@ -236,7 +236,7 @@ def int_value(strValue, default=''):
''
"""
try:
- val = re.compile('(\d+)').findall(str(strValue).strip())[0]
+ val = re.compile(r'(\d+)').findall(str(strValue).strip())[0]
except:
val = default
return val
@@ -253,7 +253,7 @@ def float_value(strValue, default=''):
''
"""
try:
- val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
+ val = re.compile(r'([\d.]+)').findall(str(strValue).strip())[0]
except:
val = default
return val
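The r'' conversions across these commits share one motivation: in a plain string literal, '\d' is an invalid escape sequence. Python 3.6 deprecated such escapes, and Python 3.12 upgraded the warning to a SyntaxWarning that is visible by default (with a SyntaxError planned); raw strings pass the backslash through unchanged, e.g.:

import re

pattern = re.compile(r'(\d+)')   # raw string: no escape processing
assert pattern.findall('page 42') == ['42']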

View file

@@ -16,7 +16,7 @@ TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>', "'", '"']
DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
- word_split_re = re.compile(r'(\s+)')
+ word_split_re = re.compile(r'(\s+|<br>)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
'|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
@@ -178,10 +178,10 @@ def highlight(text, query, hlClass="hl"):
"""
if query:
text = text.replace('<br />', '|')
- query = re.escape(query).replace('\ ', '.')
+ query = re.escape(query).replace(r'\ ', '.')
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
for i in m:
- text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
+ text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />')
return text
@@ -234,7 +234,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
' '
'''
if not tags:
- valid_url = '^((https?:\/\/|\/|mailto:).*?)'
+ valid_url = r'^((https?:\/\/|\/|mailto:).*?)'
tags = [
# inline formatting
{'name': 'b'},
@@ -300,8 +300,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
'optional': ['width', 'height'],
'required': ['src'],
'validation': {
- 'width': '^\d+$',
- 'height': '^\d+$',
+ 'width': r'^\d+$',
+ 'height': r'^\d+$',
'src': valid_url
}
},
@@ -310,8 +310,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
'optional': ['width', 'height'],
'required': ['src'],
'validation': {
- 'width': '^\d+$',
- 'height': '^\d+$',
+ 'width': r'^\d+$',
+ 'height': r'^\d+$',
'src': valid_url
},
},
@@ -319,8 +319,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
{'name': 'figcaption'}
]
- tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>')
- attr_re = re.compile('([^=\ ]+)="([^"]+)"')
+ tag_re = re.compile(r'<(/)?([^\ /]+)(.*?)(/)?>')
+ attr_re = re.compile(r'([^=\ ]+)="([^"]+)"')
escaped = {}
level = 0
@@ -338,7 +338,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
if '[]' in validation:
html = re.sub(
- re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
+ re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
'<a href="\\1">\\3</a>', html)
parts = split_tags(html)
@@ -392,8 +392,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
else:
parts[i] = escape_html(decode_html(part))
html = ''.join(parts)
- html = html.replace('\n\n', '<br/><br/>')
html = add_links(html)
+ html = html.replace('\n\n', '<br><br>')
return sanitize_fragment(html)
def split_tags(string):
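Two coordinated fixes for "fix links at end of paragraph": word_split_re now also splits on <br>, so a URL immediately followed by a line break becomes its own token, and add_links runs before blank lines are converted to <br> tags, so the freshly inserted markup is never glued onto a URL. A toy linkifier showing the splitting behaviour (a simplified stand-in for the module's actual add_links):

import re

word_split_re = re.compile(r'(\s+|<br>)')

def linkify(text):
    # split on whitespace *and* <br>, link tokens that look like URLs
    out = []
    for token in word_split_re.split(text):
        if token.startswith(('http://', 'https://')):
            token = '<a href="%s">%s</a>' % (token, token)
        out.append(token)
    return ''.join(out)

print(linkify('see http://example.com<br>next paragraph'))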

View file

@@ -25,7 +25,7 @@ The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title
def format_path(data, directory_key='director'):
def format_underscores(string):
- return re.sub('^\.|\.$|:|/|\?|<|>', '_', string)
+ return re.sub(r'^\.|\.$|:|/|\?|<|>', '_', string)
director = data['directorSort'] or ['Unknown Director']
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
@@ -199,14 +199,14 @@ def parse_path(path, directory_key='director'):
string = re.sub('^_', '.', string)
string = re.sub('_$', '.', string)
# '_.foo$' or '_ (' is '?'
- string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string)
+ string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string)
# ' _..._ ' is '<...>'
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
# 'foo_bar' or 'foo _ bar' is '/'
- string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string)
+ string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string)
string = re.sub(' _ ', ' / ', string)
# 'foo_ ' is ':'
- string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string)
+ string = re.sub(re.compile(r'(?<=[\w\)\]])_ ', re.U), ': ', string)
string = unicodedata.normalize('NFD', string)
return string
@@ -238,14 +238,14 @@ def parse_path(path, directory_key='director'):
# title, year
data['title'] = data['year'] = None
if title:
- match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title)
+ match = re.search(r' \(\d{4}(-(\d{4})?)?\)$', title)
data['title'] = title[:-len(match.group(0))] if match else title
data['year'] = match.group(0)[2:-1] if match else None
file_title = re.sub('[/:]', '_', data['title'])
# (remove title from beginning of filename if the rest contains a dot)
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
# (split by nospace+dot+word, but remove spaces preceding extension)
- parts = re.split('(?<!\s)\.(?=\w)', re.sub('\s+(?=.\w+$)', '', file))
+ parts = re.split(r'(?<!\s)\.(?=\w)', re.sub(r'\s+(?=.\w+$)', '', file))
title, parts, extension = [
parts[0],
parts[1:-1],
@@ -256,7 +256,7 @@ def parse_path(path, directory_key='director'):
# season, episode, episodes, episodeTitle
data['season'] = data['episode'] = data['episodeTitle'] = None
data['episodes'] = []
- match = re.search(' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
+ match = re.search(r' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
if match:
if match.group(1):
data['season'] = int(match.group(1)[1:])
@@ -267,7 +267,7 @@ def parse_path(path, directory_key='director'):
data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1)
if match.group(4):
data['episodeTitle'] = match.group(4)[1:]
- while data['episodeTitle'] and len(parts) and re.search('^\w+\.*$', parts[0]) and not re.search('^[a-z]{2}$', parts[0]):
+ while data['episodeTitle'] and len(parts) and re.search(r'^\w+\.*$', parts[0]) and not re.search(r'^[a-z]{2}$', parts[0]):
data['episodeTitle'] += '.%s' % parts.pop(0)
# isEpisode, seriesTitle, seriesYear
data['isEpisode'] = False
@@ -343,14 +343,14 @@ def parse_movie_path(path):
if title.startswith('_'):
title = '.' + title[1:]
- year = find_re(title, '(\(\d{4}\))')
+ year = find_re(title, r'(\(\d{4}\))')
if not year:
- year = find_re(title, '(\(\d{4}-\d*\))')
+ year = find_re(title, r'(\(\d{4}-\d*\))')
if year and title.endswith(year):
title = title[:-len(year)].strip()
year = year[1:-1]
if '-' in year:
- year = find_re(year, '\d{4}')
+ year = find_re(year, r'\d{4}')
#director
if len(parts) == 4:
@@ -373,7 +373,7 @@ def parse_movie_path(path):
language = ''
#season/episode/episodeTitle
- match = re.compile('(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
+ match = re.compile(r'(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
if match:
seriesTitle = match.group(1)
season = match.group(3)
@@ -386,13 +386,13 @@ def parse_movie_path(path):
if episode and not season:
season = 1
else:
- season = find_re(parts[-1], '\.Season (\d+)\.')
+ season = find_re(parts[-1], r'\.Season (\d+)\.')
if season:
season = int(season)
else:
season = None
- episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
+ episode = find_re(parts[-1], r'\.Episode[s]* ([\d+]+)\.')
if episode:
episode = episode.split('+')[0]
episode = int(episode)
@@ -422,7 +422,7 @@ def parse_movie_path(path):
title = u'%s %s' % (title, episodeTitle)
#part
- part = find_re(parts[-1], '\.Part (\d+)\.')
+ part = find_re(parts[-1], r'\.Part (\d+)\.')
if part:
part = int(part)
else:

View file

@@ -18,7 +18,7 @@ from chardet.universaldetector import UniversalDetector
DEBUG = False
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
@@ -113,14 +113,15 @@ get_url = read_url
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
- if dirname and not os.path.exists(dirname):
- os.makedirs(dirname)
+ os.makedirs(dirname, exist_ok=True)
headers = DEFAULT_HEADERS.copy()
r = requests.get(url, headers=headers, stream=True)
- with open(filename, 'wb') as f:
+ filename_tmp = filename + '~'
+ with open(filename_tmp, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
+ os.rename(filename_tmp, filename)
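Downloading to filename + '~' and renaming only on completion means an interrupted download can never leave a truncated file under the final name; os.rename is atomic within a single POSIX filesystem. The same pattern in isolation (the sketch keeps a guard for bare filenames, since os.makedirs('') raises FileNotFoundError):

import os

def atomic_write(filename, data):
    dirname = os.path.dirname(filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    tmp = filename + '~'
    with open(tmp, 'wb') as f:
        f.write(data)
    os.rename(tmp, filename)   # readers see old or new file, never partial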
def _get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())

View file

@@ -102,7 +102,7 @@ def normalize_imdbid(imdbId):
'0159206'
"""
if isinstance(imdbId, str):
- imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
+ imdbId = re.sub(r'.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId
return imdbId

View file

@@ -133,86 +133,86 @@ UA_NAMES = {
}
UA_REGEXPS = {
'browser': [
- '(Camino)\/(\d+)',
- '(Chimera)\/(\d+)',
- '(chromeframe)\/(\d+)',
- '(Edge)\/(\d+)',
- '(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
- '(Chromium)\/(\d+)', # before Chrome
- '(Chrome)\/(\d+)',
- '(FBForIPhone)',
- '(Firefox)\/(\d+)',
- '(Galeon)\/(\d+)',
- '(IEMobile)\/(\d+)',
- '(iCab) (\d+)',
- '(iCab)\/(\d+)',
- '(konqueror)\/(\d+)',
- '(Konqueror)\/(\d+)',
- '(Lynx)\/(\d+)',
- '(Netscape)\d?\/(\d+)',
- '(NokiaBrowser)\/(\d+)',
- '(OmniWeb)\/(\d+)',
- '(Opera)\/.+Version\/(\d+)',
- '(OviBrowser)\/(\d+)',
- 'Version\/(\d+).+(Safari)',
- '(WebKit)\/(\d+)',
- '(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
- '(Trident)\/.*?rv:(\d+)',
- '(Gecko)',
- '(Mozilla)\/(3|4)'
+ r'(Camino)\/(\d+)',
+ r'(Chimera)\/(\d+)',
+ r'(chromeframe)\/(\d+)',
+ r'(Edge)\/(\d+)',
+ r'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
+ r'(Chromium)\/(\d+)', # before Chrome
+ r'(Chrome)\/(\d+)',
+ r'(FBForIPhone)',
+ r'(Firefox)\/(\d+)',
+ r'(Galeon)\/(\d+)',
+ r'(IEMobile)\/(\d+)',
+ r'(iCab) (\d+)',
+ r'(iCab)\/(\d+)',
+ r'(konqueror)\/(\d+)',
+ r'(Konqueror)\/(\d+)',
+ r'(Lynx)\/(\d+)',
+ r'(Netscape)\d?\/(\d+)',
+ r'(NokiaBrowser)\/(\d+)',
+ r'(OmniWeb)\/(\d+)',
+ r'(Opera)\/.+Version\/(\d+)',
+ r'(OviBrowser)\/(\d+)',
+ r'Version\/(\d+).+(Safari)',
+ r'(WebKit)\/(\d+)',
+ r'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
+ r'(Trident)\/.*?rv:(\d+)',
+ r'(Gecko)',
+ r'(Mozilla)\/(3|4)'
],
'robot': [
- '(BingPreview)\/(\d+)',
- '(Google Web Preview).+Chrome\/(\d+)',
- '(Googlebot)\/(\d+)',
- '(WebCrawler)\/(\d+)',
- '(Yahoo! Slurp)\/(\d+)',
- '(YandexBot)\/([\d\.]+)',
- '(YandexMobileBot)\/([\d\.]+)',
+ r'(BingPreview)\/(\d+)',
+ r'(Google Web Preview).+Chrome\/(\d+)',
+ r'(Googlebot)\/(\d+)',
+ r'(WebCrawler)\/(\d+)',
+ r'(Yahoo! Slurp)\/(\d+)',
+ r'(YandexBot)\/([\d\.]+)',
+ r'(YandexMobileBot)\/([\d\.]+)',
],
'system': [
- '(Android) (\d+)',
- '(Android)',
- '(BB)(\d+)',
- '(BeOS)',
- '(BlackBerry) (\d+)',
- '(BlackBerry)',
- '(Darwin)',
- '(BSD) (FreeBSD|NetBSD|OpenBSD)',
- '(CPU OS) (\d+)',
- '(iPhone OS) (\d+)',
- '(iPhone)', # Opera
- '(J2ME\/MIDP)',
- '(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
- '(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
- '(Linux)',
- '(Mac OS X) (10.\d+)',
- '(Mac OS X)',
- '(Mac_PowerPC)',
- '(Mac_PPC)',
- '(Macintosh)',
- 'Nintendo (Wii).+NX\/(\d+)',
- '(PLAYSTATION) (\d+)',
- '(PlayStation) Vita (\d+)',
- '(RIM Tablet OS) (\d+)',
- '(S)(60);',
- '(Series) ?(40|60)',
- '(Symbian OS)',
- '(SymbianOS)\/(\d+)',
- '(SymbOS)',
- '(OS\/2)',
- '(Unix) (AIX|HP-UX|IRIX|SunOS)',
- '(Unix)',
- '(Windows) (NT \d\.\d)',
- '(Windows Phone) (\d+)',
- '(Windows Phone OS) (\d+)',
- '(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
- '(Win) (9x 4\.90)', # Firefox
- '(Win)(16)', # Firefox
- '(Win)(9\d)', # Firefox
- '(Win)(NT)', # Firefox
- '(Win)(NT4\.0)', # Firefox
- '(X11)'
+ r'(Android) (\d+)',
+ r'(Android)',
+ r'(BB)(\d+)',
+ r'(BeOS)',
+ r'(BlackBerry) (\d+)',
+ r'(BlackBerry)',
+ r'(Darwin)',
+ r'(BSD) (FreeBSD|NetBSD|OpenBSD)',
+ r'(CPU OS) (\d+)',
+ r'(iPhone OS) (\d+)',
+ r'(iPhone)', # Opera
+ r'(J2ME\/MIDP)',
+ r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
+ r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
+ r'(Linux)',
+ r'(Mac OS X) (10.\d+)',
+ r'(Mac OS X)',
+ r'(Mac_PowerPC)',
+ r'(Mac_PPC)',
+ r'(Macintosh)',
+ r'Nintendo (Wii).+NX\/(\d+)',
+ r'(PLAYSTATION) (\d+)',
+ r'(PlayStation) Vita (\d+)',
+ r'(RIM Tablet OS) (\d+)',
+ r'(S)(60);',
+ r'(Series) ?(40|60)',
+ r'(Symbian OS)',
+ r'(SymbianOS)\/(\d+)',
+ r'(SymbOS)',
+ r'(OS\/2)',
+ r'(Unix) (AIX|HP-UX|IRIX|SunOS)',
+ r'(Unix)',
+ r'(Windows) (NT \d\.\d)',
+ r'(Windows Phone) (\d+)',
+ r'(Windows Phone OS) (\d+)',
+ r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
+ r'(Win) (9x 4\.90)', # Firefox
+ r'(Win)(16)', # Firefox
+ r'(Win)(9\d)', # Firefox
+ r'(Win)(NT)', # Firefox
+ r'(Win)(NT4\.0)', # Firefox
+ r'(X11)'
]
}
UA_VERSIONS = {
@@ -332,9 +332,9 @@ def get_sort_name(name):
first_names = name.split(' ')
last_names = []
- if re.search('^[0-9]+$', first_names[-1]):
+ if re.search(r'^[0-9]+$', first_names[-1]):
add_name()
- if re.search('[(\[].+?[)\]]$', first_names[-1]):
+ if re.search(r'[(\[].+?[)\]]$', first_names[-1]):
add_name()
if find_name(SUFFIXES):
add_name()
@@ -425,7 +425,7 @@ def parse_useragent(useragent):
matches = list(match.groups())
if len(matches) == 1:
matches.append('')
- swap = re.match('^\d', matches[0]) or matches[1] == 'Linux'
+ swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux'
name = matches[1 if swap else 0]
version = matches[0 if swap else 1].replace('_', '.')
name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
@@ -685,8 +685,8 @@ def sort_string(string):
string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
# pad numbered titles
- string = re.sub('(\d),(\d{3})', '\\1\\2', string)
- string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
+ string = re.sub(r'(\d),(\d{3})', '\\1\\2', string)
+ string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string)
return unicodedata.normalize('NFKD', string)
def sorted_strings(strings, key=None):
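The padding in sort_string makes plain lexicographic comparison agree with numeric order for embedded numbers, after first dropping thousands separators. A self-contained sketch of the trick:

import re

def sort_key(s):
    s = re.sub(r'(\d),(\d{3})', r'\1\2', s)        # '1,000' -> '1000'
    return re.sub(r'(\d+)', lambda m: '%010d' % int(m.group(0)), s)

assert sorted(['Part 10', 'Part 2'], key=sort_key) == ['Part 2', 'Part 10']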

View file

@@ -43,7 +43,7 @@ def get_data(id):
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parse_list(html, 'themes')
data['types'] = parse_list(html, 'types')
- data['year'] = find_re(html, '<span class="year">.*?(\d+)')
+ data['year'] = find_re(html, r'<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)

View file

@@ -51,11 +51,11 @@ def get_movie_data(title, director):
'User-Agent': USER_AGENT
}
html = read_url(url, headers=headers, unicode=True)
- results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
+ results = re.compile(r'"(' + host + r'.*?poster\.jpg)"').findall(html)
if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
- results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
+ results = re.compile(r'"(' + host + r'\S+\.mov)"').findall(html)
if results:
data['trailer'] = results[-1]
return data

View file

@@ -28,7 +28,7 @@ def get_data(id, language='en'):
if m:
data['director'] = m[0]
- m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
+ m = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m:
data['image'] = m[0]

View file

@@ -60,7 +60,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url += '&start=%d' % offset
data = read_url(url, timeout=timeout)
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
- for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
+ for a in re.compile(r'<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break

View file

@@ -43,8 +43,8 @@ def reference_section(id):
return {
'page': 'reference',
're': [
- '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
- '<a href="/name/.*?>(.*?)</a>'
+ r'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
+ r'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
}
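For readers unfamiliar with these parser tables: a list under 're' is applied stepwise, the first pattern isolating a section of the page and the second extracting repeated items from it. Roughly (a simplified stand-in for ox's SiteParser, not its exact code):

import re

def chain(html, patterns):
    # first pattern narrows to a section, second collects items within it
    section = re.compile(patterns[0], re.DOTALL).findall(html)
    if not section:
        return []
    return re.compile(patterns[1], re.DOTALL).findall(section[0])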
@@ -54,8 +54,8 @@ def zebra_list(label, more=None):
conditions = {
'page': 'reference',
're': [
- '_label">' + label + '</td>.*?<ul(.*?)</ul>',
- '<li.*?>(.*?)</li>'
+ r'_label">' + label + '</td>.*?<ul(.*?)</ul>',
+ r'<li.*?>(.*?)</li>'
],
'type': 'list',
}
@@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'):
conditions = {
'page': 'reference',
're': [
- '_label">' + label + '</td>.*?<td>(.*?)</td>',
+ r'_label">' + label + '</td>.*?<td>(.*?)</td>',
],
'type': type,
}
@@ -97,9 +97,9 @@ def technical(label):
return {
'page': 'technical',
're': [
- '<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
+ r'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
lambda data: [
- re.sub('\s+', ' ', d.strip()) for d in data.strip().split('<br>')
+ re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('<br>')
] if data else []
],
'type': 'list'
@@ -258,13 +258,13 @@ class Imdb(SiteParser):
'aspectratio': {
'page': 'reference',
're': [
- 'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
+ r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
parse_aspectratio,
],
'type': 'float',
},
'budget': zebra_table('Budget', more=[
- lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
+ lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
], type='int'),
'cast': {
'page': 'reference',
@@ -287,12 +287,12 @@ class Imdb(SiteParser):
},
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
'gross': zebra_table('Cumulative Worldwide Gross', more=[
- lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
+ lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
], type='int'),
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': {
'page': 'releaseinfo',
- 're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
+ 're': r'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
'type': 'string'
},
'summary': zebra_table('Plot Summary', more=[
@@ -300,7 +300,7 @@ class Imdb(SiteParser):
]),
'storyline': {
'page': '',
- 're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
+ 're': r'<h2>Storyline</h2>.*?<p>(.*?)</p>',
'type': 'string'
},
'posterId': {
@@ -312,16 +312,16 @@ class Imdb(SiteParser):
'productionCompany': {
'page': 'reference',
're': [
- 'Production Companies.*?<ul(.*?)</ul>',
- '<a href="/company/.*?/">(.*?)</a>'
+ r'Production Companies.*?<ul(.*?)</ul>',
+ r'<a href="/company/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'reference',
're': [
- '<div class="ipl-rating-star ">(.*?)</div>',
- 'ipl-rating-star__rating">([\d,.]+?)</span>',
+ r'<div class="ipl-rating-star ">(.*?)</div>',
+ r'ipl-rating-star__rating">([\d,.]+?)</span>',
],
'type': 'float'
},
@@ -343,38 +343,38 @@ class Imdb(SiteParser):
'season': {
'page': 'reference',
're': [
- '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
- 'Season (\d+)',
+ r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+ r'Season (\d+)',
],
'type': 'int'
},
'episode': {
'page': 'reference',
're': [
- '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
- 'Episode (\d+)',
+ r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+ r'Episode (\d+)',
],
'type': 'int'
},
'series': {
'page': 'reference',
- 're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
+ 're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
'type': 'string'
},
'isSeries': {
'page': 'reference',
- 're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
+ 're': r'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
'type': 'string'
},
'title': {
'page': 'releaseinfo',
- 're': '<h2.*?>(.*?)</h2>',
+ 're': r'<h2.*?>(.*?)</h2>',
'type': 'string'
},
'trivia': {
'page': 'trivia',
're': [
- '<div class="sodatext">(.*?)<(br|/div)',
+ r'<div class="sodatext">(.*?)<(br|/div)',
lambda data: data[0]
],
'type': 'list',
@@ -382,7 +382,7 @@ class Imdb(SiteParser):
'votes': {
'page': 'reference',
're': [
- 'class="ipl-rating-star__total-votes">\((.*?)\)',
+ r'class="ipl-rating-star__total-votes">\((.*?)\)',
lambda r: r.replace(',', '')
],
'type': 'string'
@@ -391,8 +391,8 @@ class Imdb(SiteParser):
'year': {
'page': 'reference',
're': [
- '<span class="titlereference-title-year">(.*?)</span>',
- '<a.*?>(\d+)',
+ r'<span class="titlereference-title-year">(.*?)</span>',
+ r'<a.*?>(\d+)',
],
'type': 'int'
},
@@ -400,7 +400,7 @@ class Imdb(SiteParser):
'page': 'fullcredits',
're': [
lambda data: data.split('<h4'),
- '>(.*?)</h4>.*?(<table.*?</table>)',
+ r'>(.*?)</h4>.*?(<table.*?</table>)',
lambda data: [d for d in data if d]
],
'type': 'list'
@@ -468,7 +468,7 @@ class Imdb(SiteParser):
title = title[1:-1]
if title.startswith("'") and title.endswith("'"):
title = title[1:-1]
- title = re.sub('\(\#[.\d]+\)', '', title)
+ title = re.sub(r'\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'originalTitle'):
@@ -518,7 +518,7 @@ class Imdb(SiteParser):
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
- c = re.sub('\s+', ' ', c)
+ c = re.sub(r'\s+', ' ', c)
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
@@ -528,7 +528,7 @@ class Imdb(SiteParser):
del self['isSeries']
self['isSeries'] = True
if 'episodeTitle' in self:
- self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
+ self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle'])
#make lists unique but keep order
@@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1):
url = "http://www.imdb.com/find?" + params
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
- r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
+ r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
@@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
- r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
+ r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
#otherwise get first result
- r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
+ r = r'<td valign="top">.*?<a href="/title/tt(\d+)/"'
results = re.compile(r).findall(data)
if results:
return results[0]
@@ -885,7 +885,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
results = duckduckgo.find(google_query, timeout=timeout)
if results:
for r in results[:2]:
- imdbId = find_re(r[1], 'title/tt(\d+)')
+ imdbId = find_re(r[1], r'title/tt(\d+)')
if imdbId:
return imdbId
#or nothing
@@ -912,11 +912,11 @@ def get_episodes(imdbId, season=None):
if season:
url += '?season=%d' % season
data = cache.read_url(url).decode()
- for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
+ for e in re.compile(r'<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
else:
data = cache.read_url(url)
- match = re.compile('<strong>Season (\d+)</strong>').findall(data)
+ match = re.compile(r'<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
episodes.update(get_episodes(imdbId, season))
@@ -927,7 +927,7 @@ def max_votes():
data = cache.read_url(url).decode('utf-8', 'ignore')
votes = max([
int(v.replace(',', ''))
- for v in re.compile('<span name="nv" data-value="(\d+)"').findall(data)
+ for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
])
return votes
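"fix max_votes" replaces a pattern keyed to markup IMDb no longer serves (the <span name="nv" data-value="..."> attribute); the new expression anchors on the visible "Votes" label and accepts comma-grouped digits, which is why commas are stripped before int(). A toy check against an illustrative snippet (not actual IMDb markup):

import re

snippet = '<span>Votes</span> <span>2,034,567</span>'   # illustrative only
votes = [int(v.replace(',', ''))
         for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(snippet)]
assert max(votes) == 2034567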

View file

@@ -8,7 +8,7 @@ from ox.net import read_url
def get_poster_url(id):
url = 'http://piratecinema.org/posters/'
html = read_url(url).decode('utf-8')
- results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
+ results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
for result in results:
if result[1] == id:
return url + result[0]

View file

@@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url):
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
- filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
+ filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
- filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
+ filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
- filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
+ filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip()
- r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
+ r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
else:
- r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+ r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
- r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
+ r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['archiveorg_id'] = r[0]
- r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
+ r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['mojo_id'] = r[0].replace('id=', '')
- r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
+ r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
- filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
+ filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
- filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+ filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def get_image_url(name):

View file

@@ -32,7 +32,7 @@ def get_version():
f = open(changelog)
head = f.read().strip().split('\n')[0]
f.close()
- rev = re.compile('\d+\.\d+\.(\d+)').findall(head)
+ rev = re.compile(r'\d+\.\d+\.(\d+)').findall(head)
if rev:
return '3.0.%s' % rev[0]
return '3.0.x'