# scrapeit/scrapeit/utils.py
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
screape tools
"""
import re
import time
import urllib
import urllib2
import djangohtml
# Default headers for HTTP requests: presents a desktop Firefox 2 / Ubuntu
# user-agent string instead of the urllib2 default.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20061201 Firefox/2.0.0.6 (Ubuntu-feisty)'}
# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------
def quote_plus(s):
  """
  Percent-encode *s* for use in a query string, like urllib.quote_plus,
  but accepting unicode as well as ASCII str (encoded as UTF-8 first).
  """
  utf8_bytes = s.encode('utf-8')
  return urllib.quote_plus(utf8_bytes)
def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the contents of the given str URL and return them as unicode.

  headers is a map of str -> str for HTTP request headers. The charset is
  taken from the response's Content-Type header, falling back to latin-1
  when the server declares none. NOTE: despite the name, the decode uses
  whatever charset the server declares, not necessarily UTF-8.

  The blocking parameter is accepted for interface compatibility only;
  a non-blocking mode is not implemented.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  try:
    data = f.read()
    # Read the header while the response is definitely valid; guard
    # against a missing Content-Type (getheader returns None).
    ctype = f.headers.getheader('content-type') or ''
  finally:
    # Always release the connection, even if read() raises.
    f.close()
  parts = ctype.split('charset=')
  if len(parts) > 1:
    # Trim any trailing parameters ("utf-8; foo=bar" -> "utf-8") so the
    # codec lookup below does not fail on a perfectly valid header.
    charset = parts[1].split(';')[0].strip()
  else:
    charset = 'latin-1'
  return unicode(data, charset)
def open_url(url, headers=DEFAULT_HEADERS):
  """
  Open *url* with the given request headers and return the response
  file-like object. Spaces in the URL are escaped as %20 first.
  """
  escaped = url.replace(' ', '%20')
  request = urllib2.Request(escaped, None, headers)
  return urllib2.urlopen(request)
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read and return the str contents of the given str URL.

  headers is a map of str -> str for HTTP request headers. The blocking
  parameter is accepted for interface compatibility only; a non-blocking
  mode is not implemented.
  """
  # open_url already escapes spaces, so no need to pre-escape here.
  f = open_url(url, headers)
  try:
    data = f.read()
  finally:
    # Always release the connection, even if read() raises.
    f.close()
  return data
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Open the given str URL and return the final URL after redirection,
  with any '&src=rss' tracking suffix removed.

  Best-effort: on any error the original URL is returned unchanged.
  """
  rurl = url
  try:
    req = urllib2.Request(url, None, headers)
    rurl = urllib2.urlopen(req).url
    rurl = rurl.replace('&src=rss', '')
  except Exception:
    # Narrowed from a bare except: keep the best-effort fallback but do
    # not swallow KeyboardInterrupt/SystemExit.
    rurl = url
  return rurl
def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently a no-op: the historical redirect-stripping logic is kept
  below, disabled, for reference.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url
# Matches one HTML entity reference: group 1 = named entity ("amp"),
# group 2 = decimal character reference, group 3 = hexadecimal character
# reference. The trailing ';' is optional.
_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs
def html_entity_decode(s, encoding = 'utf-8'):
  """
  Replace HTML entity references in byte string *s* with the characters
  they name and return the result as unicode.

  Handles named entities (&amp;), decimal (&#38;) and hexadecimal
  (&#x26;) references; the text between entities is decoded from
  *encoding*. Unknown named entities are left in the output verbatim.
  """
  r = []  # accumulated unicode fragments
  p = 0   # index of the first byte not yet consumed
  mo = _html_entity_re.search(s, p)
  while mo:
    # Literal text preceding the entity, decoded from *encoding*.
    r.append(s[p:mo.start()].decode(encoding))
    # lastindex tells us which alternative of _html_entity_re matched.
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        # Named entity, e.g. 'amp' -> 38; may raise KeyError.
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        # Decimal character reference.
        c = int(e)
      elif i == 3:
        # Hexadecimal character reference.
        c = int(e, 16)
      else:
        assert 0
      r.append(unichr(c))
    except KeyError:
      # Unknown entity name: keep the original '&...' text verbatim.
      r.append(mo.group(0))
    p = mo.end()
    mo = _html_entity_re.search(s, p)
  # Trailing literal text after the last entity.
  r.append(s[p:].decode(encoding))
  return u''.join(r)
def stripTags(s):
  """
  Return *s* with HTML entities decoded and all tags removed, stripped
  of surrounding whitespace. Empty or None input yields u''.
  """
  if not s:
    return u''
  decoded = htmldecode(s)
  return djangohtml.strip_tags(decoded).strip()
# snake_case alias for PEP 8-style callers.
strip_tags = stripTags
from htmlentitydefs import name2codepoint
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
# NOTE(review): only the lowercase '&#x..' hex form is matched here,
# unlike _html_entity_re above which also accepts '&#X..' — confirm
# whether that asymmetry is intentional.
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
  """
  Decode HTML character references (&name;, &#nnn;, &#xhh;) in *text*
  and return the result as unicode.

  Unknown entity names are left in the output untouched.
  """
  # Coerce to unicode up front. After this, text is always an exact
  # unicode instance, so the original's `type(text) is unicode` branch
  # was always true and its bytes-path `uchr` lambda was dead code —
  # both removed, along with a pointless `[:]` copy.
  if type(text) != unicode:
    text = unicode(text)
  def entitydecode(match):
    entity = match.group(1)
    if entity.startswith('#x'):
      # Hexadecimal character reference.
      return unichr(int(entity[2:], 16))
    elif entity.startswith('#'):
      # Decimal character reference.
      return unichr(int(entity[1:]))
    elif entity in name2codepoint:
      # Named entity, e.g. 'amp'.
      return unichr(name2codepoint[entity])
    else:
      # Not a recognized entity: keep the original text verbatim.
      return match.group(0)
  return charrefpat.sub(entitydecode, text)