Python: Analyze web page with HTMLParser

This article walks through example code that uses HTMLParser, which is useful when writing a crawler.

1 Get header and body with urllib.request

Fetch the response with urllib.request and decode the response body using the charset written in the response header.
If the response header does not specify a charset, try to decode the response body with each charset listed in DECODE_CHARSET in turn.

import urllib.request

# Candidate encodings, tried in order when the response declares no charset.
# 'iso-8859-1' must stay last: it maps every possible byte to a character and
# so never fails, which means any codec listed after it would never be tried.
DECODE_CHARSET = ['utf-8', 'shift-jis', 'euc-jp', 'iso-8859-1']


def example_decode(html):
    """Decode a raw response body by trying each codec in DECODE_CHARSET.

    Args:
        html: Response body as bytes. A ``str`` is returned unchanged.

    Returns:
        The text produced by the first codec that decodes ``html`` without
        error. If every codec fails, the body is decoded as UTF-8 with
        undecodable bytes replaced, so the result is always a ``str``.
    """
    if isinstance(html, str):
        # Already text; nothing to decode.
        return html
    for charset in DECODE_CHARSET:
        try:
            return html.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name this Python does not know: try next.
            continue
    return html.decode('utf-8', errors='replace')


def example_http_get(url):
    """Fetch ``url`` and return the response body decoded to text.

    The charset declared in the Content-Type response header is tried first.
    When the header names no charset, or names one that is unknown or does
    not match the actual bytes, decoding falls back to example_decode().

    Args:
        url: URL passed to urllib.request.urlopen().

    Returns:
        The body decoded with the declared charset, or whatever
        example_decode() returns for it.
    """
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset()
        # Read once so the same bytes are available to the fallback below.
        body = response.read()
    if charset:
        try:
            return body.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # The server advertised a wrong or unsupported charset.
            pass
    return example_decode(body)

2 Analyze web page with HTMLParser

Define ExampleParser, a subclass of HTMLParser.
When the feed method of ExampleParser is called, the handle_starttag method is called for each start tag.
The arguments of handle_starttag are shown below. When the tag and attribute match those given to the constructor, this code prints the attribute's value.

<a href=http://example.com>
tag = a
attrs = [ ( 'href', 'http://example.com' ) ]

from html.parser import HTMLParser

class ExampleParser(HTMLParser):
    """HTML parser that prints one attribute of every matching start tag.

    Args:
        tag: Tag name to look for, e.g. ``'a'``.
        attr: Attribute name whose value is printed, e.g. ``'href'``.
    """

    def __init__(self, tag, attr):
        super().__init__()
        # HTMLParser reports tag and attribute names in lower case, so
        # normalise the targets; otherwise 'A' / 'HREF' would never match.
        self.tag = tag.lower()
        self.attr = attr.lower()

    def handle_starttag(self, tag, attrs):
        """Print the value of ``self.attr`` for each ``self.tag`` start tag.

        ``attrs`` is a list of ``(name, value)`` pairs. An attribute written
        without a value has ``None`` as its value and is printed as such.
        """
        if tag != self.tag:
            return
        for name, value in attrs:
            if name == self.attr:
                print(value)


def example_print(tag, attr, html):
    """Print the ``attr`` value of every ``tag`` start tag found in ``html``.

    Args:
        tag: Tag name to match.
        attr: Attribute name whose value is printed.
        html: Document text to parse.
    """
    parser = ExampleParser(tag, attr)
    parser.feed(html)
    # feed() may keep incomplete trailing input buffered; close() tells the
    # parser the document has ended so that input is processed too.
    parser.close()

3 Example code

Example code is as below.

#!/usr/bin/env python3

import sys
import urllib.request

from html.parser import HTMLParser

# Candidate encodings, tried in order when the response declares no charset.
# 'iso-8859-1' must stay last: it maps every possible byte to a character and
# so never fails, which means any codec listed after it would never be tried.
DECODE_CHARSET = ['utf-8', 'shift-jis', 'euc-jp', 'iso-8859-1']


def example_decode(html):
    """Decode a raw response body by trying each codec in DECODE_CHARSET.

    Args:
        html: Response body as bytes. A ``str`` is returned unchanged.

    Returns:
        The text produced by the first codec that decodes ``html`` without
        error. If every codec fails, the body is decoded as UTF-8 with
        undecodable bytes replaced, so the result is always a ``str``.
    """
    if isinstance(html, str):
        # Already text; nothing to decode.
        return html
    for charset in DECODE_CHARSET:
        try:
            return html.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name this Python does not know: try next.
            continue
    return html.decode('utf-8', errors='replace')


def example_http_get(url):
    """Fetch ``url`` and return the response body decoded to text.

    The charset declared in the Content-Type response header is tried first.
    When the header names no charset, or names one that is unknown or does
    not match the actual bytes, decoding falls back to example_decode().

    Args:
        url: URL passed to urllib.request.urlopen().

    Returns:
        The body decoded with the declared charset, or whatever
        example_decode() returns for it.
    """
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset()
        # Read once so the same bytes are available to the fallback below.
        body = response.read()
    if charset:
        try:
            return body.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # The server advertised a wrong or unsupported charset.
            pass
    return example_decode(body)


class ExampleParser(HTMLParser):
    """HTML parser that prints one attribute of every matching start tag.

    Args:
        tag: Tag name to look for, e.g. ``'a'``.
        attr: Attribute name whose value is printed, e.g. ``'href'``.
    """

    def __init__(self, tag, attr):
        super().__init__()
        # HTMLParser reports tag and attribute names in lower case, so
        # normalise the targets; otherwise 'A' / 'HREF' would never match.
        self.tag = tag.lower()
        self.attr = attr.lower()
        # Not read anywhere in this class; kept so existing code that
        # accesses ``parser.attrs`` keeps working.
        self.attrs = []

    def handle_starttag(self, tag, attrs):
        """Print the value of ``self.attr`` for each ``self.tag`` start tag.

        ``attrs`` is a list of ``(name, value)`` pairs. An attribute written
        without a value has ``None`` as its value and is printed as such.
        """
        if tag != self.tag:
            return
        for name, value in attrs:
            if name == self.attr:
                print(value)


def example_print(tag, attr, html):
    """Print the ``attr`` value of every ``tag`` start tag found in ``html``.

    Args:
        tag: Tag name to match.
        attr: Attribute name whose value is printed.
        html: Document text to parse.
    """
    parser = ExampleParser(tag, attr)
    parser.feed(html)
    # feed() may keep incomplete trailing input buffered; close() tells the
    # parser the document has ended so that input is processed too.
    parser.close()


def main(argv):
    """Run the command-line tool.

    Args:
        argv: Full argument vector: program name, tag, attribute name, URL.

    Returns:
        Process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) != 4:
        # Usage errors go to stderr so they never mix with the printed values.
        print('usage: %s <tag> <attr> <url>' % argv[0], file=sys.stderr)
        return 1
    example_print(argv[1], argv[2], example_http_get(argv[3]))
    return 0


if __name__ == "__main__":
    # sys.exit() rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is missing under ``python -S``.
    sys.exit(main(sys.argv))

The execution result is shown below.

$ ./example-html-parser.py a href http://yahoo.co.jp
http://...
http://...
http://...
<snip>

4 Weak point of HTMLParser

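When the HTML contains an incomplete tag, HTMLParser may raise an error. For example, when the HTML has <img> but no </img>, the parser may fail. (This applies to the strict parsing mode of older Python versions; since Python 3.5, HTMLParser always parses invalid markup leniently, so it is worth checking the behavior on the Python version you use.)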

Python: Analyze web page with HTMLParser

Table of Contents

1 Get header and body with urllib.request

2 Analyze web page with HTMLParser

3 Example code

4 Weak point of HTMLParser