From 301babd1dd436a5557374076eaa6a50a46b2748b Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Mon, 8 Jul 2024 13:33:07 +0100
Subject: [PATCH] more raw regexp strings

---
 ox/web/imdb.py | 76 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 1b93459..7698dbf 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -43,8 +43,8 @@ def reference_section(id):
     return {
         'page': 'reference',
         're': [
-            '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
-            '<a href="/name/.*?>(.*?)</a>'
+            r'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
+            r'<a href="/name/.*?>(.*?)</a>'
         ],
         'type': 'list'
     }
@@ -54,8 +54,8 @@ def zebra_list(label, more=None):
     conditions = {
         'page': 'reference',
         're': [
-            '_label">' + label + '</td>.*?<ul(.*?)</ul>',
-            '<li.*?>(.*?)</li>'
+            r'_label">' + label + '</td>.*?<ul(.*?)</ul>',
+            r'<li.*?>(.*?)</li>'
         ],
         'type': 'list',
     }
@@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'):
     conditions = {
         'page': 'reference',
         're': [
-            '_label">' + label + '</td>.*?<td>(.*?)</td>',
+            r'_label">' + label + '</td>.*?<td>(.*?)</td>',
         ],
         'type': type,
     }
@@ -97,9 +97,9 @@ def technical(label):
     return {
         'page': 'technical',
         're': [
-            '<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
+            r'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
             lambda data: [
-                re.sub('\s+', ' ', d.strip()) for d in data.strip().split('<br>')
+                re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('<br>')
             ] if data else []
         ],
         'type': 'list'
@@ -258,13 +258,13 @@ class Imdb(SiteParser):
         'aspectratio': {
             'page': 'reference',
             're': [
-                'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
+                r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
                 parse_aspectratio,
             ],
             'type': 'float',
         },
         'budget': zebra_table('Budget', more=[
-            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
+            lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
         ], type='int'),
         'cast': {
             'page': 'reference',
@@ -287,12 +287,12 @@ class Imdb(SiteParser):
         },
         'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
         'gross': zebra_table('Cumulative Worldwide Gross', more=[
-            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
+            lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
         ], type='int'),
         'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
         'originalTitle': {
             'page': 'releaseinfo',
-            're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
+            're': r'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
             'type': 'string'
         },
         'summary': zebra_table('Plot Summary', more=[
@@ -300,7 +300,7 @@ class Imdb(SiteParser):
         ]),
         'storyline': {
             'page': '',
-            're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
+            're': r'<h2>Storyline</h2>.*?<p>(.*?)</p>',
             'type': 'string'
         },
         'posterId': {
@@ -312,16 +312,16 @@ class Imdb(SiteParser):
         'productionCompany': {
             'page': 'reference',
             're': [
-                'Production Companies.*?<ul(.*?)</ul>',
-                '<a href="/company/.*?/">(.*?)</a>'
+                r'Production Companies.*?<ul(.*?)</ul>',
+                r'<a href="/company/.*?/">(.*?)</a>'
             ],
             'type': 'list'
         },
         'rating': {
             'page': 'reference',
             're': [
-                '<div class="ipl-rating-star ">(.*?)</div>',
-                'ipl-rating-star__rating">([\d,.]+?)</span>',
+                r'<div class="ipl-rating-star ">(.*?)</div>',
+                r'ipl-rating-star__rating">([\d,.]+?)</span>',
             ],
             'type': 'float'
         },
@@ -343,38 +343,38 @@ class Imdb(SiteParser):
         'season': {
             'page': 'reference',
             're': [
-                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
-                'Season (\d+)',
+                r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                r'Season (\d+)',
              ],
             'type': 'int'
         },
         'episode': {
             'page': 'reference',
             're': [
-                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
-                'Episode (\d+)',
+                r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                r'Episode (\d+)',
              ],
             'type': 'int'
         },
         'series': {
             'page': 'reference',
-            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
+            're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
             'type': 'string'
         },
         'isSeries': {
             'page': 'reference',
-            're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
+            're': r'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
             'type': 'string'
         },
         'title': {
             'page': 'releaseinfo',
-            're': '<h2.*?>(.*?)</h2>',
+            're': r'<h2.*?>(.*?)</h2>',
             'type': 'string'
         },
         'trivia': {
             'page': 'trivia',
             're': [
-                '<div class="sodatext">(.*?)<(br|/div)',
+                r'<div class="sodatext">(.*?)<(br|/div)',
                 lambda data: data[0]
             ],
             'type': 'list',
@@ -382,7 +382,7 @@ class Imdb(SiteParser):
         'votes': {
             'page': 'reference',
             're': [
-                'class="ipl-rating-star__total-votes">\((.*?)\)',
+                r'class="ipl-rating-star__total-votes">\((.*?)\)',
                 lambda r: r.replace(',', '')
             ],
             'type': 'string'
@@ -391,8 +391,8 @@ class Imdb(SiteParser):
         'year': {
             'page': 'reference',
             're': [
-                '<span class="titlereference-title-year">(.*?)</span>',
-                '<a.*?>(\d+)',
+                r'<span class="titlereference-title-year">(.*?)</span>',
+                r'<a.*?>(\d+)',
             ],
             'type': 'int'
         },
@@ -400,7 +400,7 @@ class Imdb(SiteParser):
             'page': 'fullcredits',
             're': [
                 lambda data: data.split('<h4'),
-                '>(.*?)</h4>.*?(<table.*?</table>)',
+                r'>(.*?)</h4>.*?(<table.*?</table>)',
                 lambda data: [d for d in data if d]
             ],
             'type': 'list'
@@ -468,7 +468,7 @@ class Imdb(SiteParser):
                 title = title[1:-1]
             if title.startswith("'") and title.endswith("'"):
                 title = title[1:-1]
-            title = re.sub('\(\#[.\d]+\)', '', title)
+            title = re.sub(r'\(\#[.\d]+\)', '', title)
             return title.strip()
 
         for t in ('title', 'originalTitle'):
@@ -518,7 +518,7 @@ class Imdb(SiteParser):
             self['actor'] = [c[0] for c in self['cast']]
             def cleanup_character(c):
                 c = c.replace('(uncredited)', '').strip()
-                c = re.sub('\s+', ' ', c)
+                c = re.sub(r'\s+', ' ', c)
                 return c
             self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                             for x in self['cast']]
@@ -528,7 +528,7 @@ class Imdb(SiteParser):
             del self['isSeries']
             self['isSeries'] = True
         if 'episodeTitle' in self:
-            self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
+            self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle'])
 
 
         #make lists unique but keep order
@@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1):
     url = "http://www.imdb.com/find?" + params
     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
+    r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)    
     if results:
         return results[0]
@@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
 
     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
+    r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)    
     if results:
         return results[0]
     #otherwise get first result
-    r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
+    r = r'<td valign="top">.*?<a href="/title/tt(\d+)/"'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
@@ -885,7 +885,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
     results = duckduckgo.find(google_query, timeout=timeout)
     if results:
         for r in results[:2]:
-            imdbId = find_re(r[1], 'title/tt(\d+)')
+            imdbId = find_re(r[1], r'title/tt(\d+)')
             if imdbId:
                 return imdbId
     #or nothing
@@ -912,11 +912,11 @@ def get_episodes(imdbId, season=None):
     if season:
         url += '?season=%d' % season
         data = cache.read_url(url).decode()
-        for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
+        for e in re.compile(r'<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
             episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
     else:
         data = cache.read_url(url)
-        match = re.compile('<strong>Season (\d+)</strong>').findall(data)
+        match = re.compile(r'<strong>Season (\d+)</strong>').findall(data)
         if match:
             for season in range(1, int(match[0]) + 1):
                episodes.update(get_episodes(imdbId, season))
@@ -927,7 +927,7 @@ def max_votes():
     data = cache.read_url(url).decode('utf-8', 'ignore')
     votes = max([
         int(v.replace(',', ''))
-        for v in re.compile('Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
+        for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
     ])
     return votes