#!/bin/python
# Parse HTML tables
# Copyright (c) 2004 Randy Gamage
# This script parses a web page with tables, and returns a dictionary/list
# structure that contains the contents of the table. It ignores outer levels
# of tables, and only fetches data in the inner-most table cells (in the case
# of nested tables). As of July 2004, it is still a work in progress, and
# is by no means robust.
# standard modules
import htmllib
import formatter
import string
import urllib
import os
class TableParser(htmllib.HTMLParser):
"""Table Parser. Puts all table data
into dictionaries in the following format:
tables[table1, table2, etc...] = list of all tables
table: {col : cells} = dictionary of all table data
col: integer column number
cells: [cell1, cell2, cell3...] = list of all cell data for a given heading
"""
def __init__(self, formatter):
htmllib.HTMLParser.__init__(self, formatter)
self.tables=[{}]
self.current_table = {}
self.current_table = 0
self.current_row = 0
self.current_col = 0
self.in_row = 0
self.in_cell = 0
self.in_table = 0
self.in_innermost_table = 0
def start_table(self, attrs):
# Add a new table to the tables list
self.in_table = 1
def end_table(self):
if self.in_table:
self.current_table += 1
self.in_table = 0
def start_tr(self, attrs):
# Signal when we get to the beginning of a new row.
self.current_col = 0
if self.in_table:
self.in_row = 1
def end_tr(self):
self.in_row = 0
self.current_row += 1
def start_td(self, attrs):
"""Keep track of the number of td elements encountered.
This increments a counter and sets some boolean variables to help
us keep track of what column we are in, in a row.
"""
if self.in_row & self.in_table:
self.in_cell = 1
def end_td(self):
# Unset flags which are not unset by other code.
self.in_cell = 0
self.current_col += 1
def start_a(self, attrs):
pass
def handle_data(self, text):
"""Actually extract the data we want.
Based on the flag set, this will put the text information into
the current_row record class as the appropriate attribute.
"""
if self.in_cell:
#print text
print self.current_col
if not (len(self.tables) > self.current_table):
self.tables.append({})
if not self.tables[self.current_table].has_key(self.current_col):
self.tables[self.current_table][self.current_col]=[]
self.tables[self.current_table][self.current_col].append(string.strip(text))
def end_html(self):
"""At the end of a parse we need to add the last row info."""
pass
def parse_table(file_handle):
    """Parse the HTML read from file_handle and return its table data.

    file_handle -- object whose read() method returns the whole document.

    Returns the ``tables`` list built by TableParser: one dictionary per
    table, mapping a column number to the list of cell texts in that column.
    """
    parser = TableParser(formatter.NullFormatter())
    parser.feed(file_handle.read())
    # feed() may keep trailing input buffered; close() forces it to be
    # processed so text at the very end of the document is not lost.
    parser.close()
    return parser.tables
# Start of Main Program


def _table_lines(tbl):
    """Return one formatted text line per row of the table dictionary tbl.

    tbl -- {column number: [cell text, ...]} as produced by parse_table().

    Columns are laid out in ascending column number, each cell centred in a
    field of 10 characters.  The number of rows is that of the longest
    column; a column with no entry for a row contributes a blank cell.
    """
    columns = sorted(tbl)
    if not columns:
        # A table without any recorded cell has nothing to print.
        return []
    row_count = max([len(tbl[col]) for col in columns])
    lines = []
    # Rows are addressed by position, not by looking up the value of the
    # first column, so repeated values in column 0 still give distinct rows.
    for row_index in range(row_count):
        cells = []
        for col in columns:
            entries = tbl[col]
            if row_index < len(entries):
                cells.append(entries[row_index])
            else:
                cells.append('')
        lines.append(''.join([('%s' % cell).center(10) for cell in cells]))
    return lines


# Enter web site address here:
#f = urllib.urlopen("http://www.tablewebsite.com")
# NOTE: hard-coded test location; raises KeyError when the 'userprofile'
# environment variable is not set.
os.chdir(os.environ['userprofile'] + '\\My Documents\\python source')
# This file is just for testing
f = urllib.urlopen("tables3.htm")
try:
    t = parse_table(f)
finally:
    # Release the handle even when parsing fails.
    f.close()
urllib.urlcleanup()
# Print table in tabular format
for tbl in t:
    for line in _table_lines(tbl):
        print(line)