1 IMDb
j edited this page 2023-07-02 12:57:13 +05:30

IMDb Parser

[[TOC(heading=Design Documents, pandora*, 0xdb2*)]]

Open Issues:

  • some fields need HTML stripped, others do not
  • TV shows have quotation marks (") around the title
  • list-valued types, e.g. release_date can be a list of dates with the current parser

This is now implemented in python-ox; see here.

{
    'cast': {
        'page': 'combined',
        're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
        'type': 'list'
    },
    'cinematographers': {
        'page': 'combined',
        're': [
            'Cinematography by</a>(.*?)</table>',
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    },
    'countries': {
        'page': 'combined',
        're': '<a href="/Sections/Countries/.*?/">(.*?)</a>',
        'type': 'list'
    },
    'directors': {
        'page': 'combined',
        're': [
            'Directed by</a>(.*?)</table>',
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    },
    'editors': {
        'page': 'combined',
        're': [
            'Film Editing by</a>(.*?)</table>',
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    },
    'filming_locations': {
        'page': 'locations',
        're': '<a href="/search/title\?locations=.*?">(.*?)</a>',
        'type': 'list'
    },
    'genres': {
        'page': 'combined',
        're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
        'type': 'list'
    },
    'keywords': {
        'page': 'keywords',
        're': '<a href="/keyword/.*?/">(.*?)</a>',
        'type': 'list'
    },
    'languages': {
        'page': 'combined',
        're': '<a href="/Sections/Languages/.*?/">(.*?)</a>',
        'type': 'list'
    },
    'poster_id': {
        'page': 'combined',
        're': '/primary-photo/media/rm(.*?)/tt',
        'type': 'list'
    },
    'poster_ids': {
        'page': 'posters',
        're': '/unknown-thumbnail/media/rm(.*?)/tt',
        'type': 'list'
    },
    'producers': {
        'page': 'combined',
        're': [
            'Produced by</a>(.*?)</table>',
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    },
    'rating': {
        'page': 'combined',
        're': '<div class="starbar-meta">.*?<b>(.*?)/10</b>',
        'type': 'float'
    },
    'release_date': {
        'page': 'releaseinfo',
        're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
        'type': 'date'
    },
    'title': {
        'page': 'combined',
        're': '<h1>(.*?) <span>',
        'type': 'list'
    },
    'trivia': {
        'page': 'trivia',
        're': '<div class="sodatext">(.*?)<br>',
        'type': 'list',
    },
    'votes': {
        'page': 'combined',
        're': '<a href="ratings" class="tn15more">(.*?) votes</a>',
        'type': 'int'
    },
    'writers': {
        'page': 'combined',
        're': [
            'Writing credits</a>(.*?)</table>',
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    },
    'year': {
        'page': 'combined',
        're': '<a href="/year/(\d{4})/">',
        'type': 'int'
    }
}