#!/bin/python
# Parse HTML tables
# Copyright (c) 2004 Randy Gamage
# This script parses a web page with tables, and returns a dictionary/list
# structure that contains the contents of the table.  It ignores outer levels
# of tables, and only fetches data in the inner-most table cells (in the case
# of nested tables).  As of July 2004, it is still a work in progress, and
# is by no means robust.

# standard modules
import htmllib
import formatter
import string
import urllib
import os

class TableParser(htmllib.HTMLParser):
    """Table Parser.  Puts all table data
    into dictionaries in the following format:
    tables[table1, table2, etc...] = list of all tables
    table: {col : cells} = dictionary of all table data
    col: integer column number
    cells: [cell1, cell2, cell3...] = list of all cell data for a given heading
    """
    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)

        self.tables=[{}]
        self.current_table = {}
        self.current_table = 0
        self.current_row = 0
        self.current_col = 0
        self.in_row = 0
        self.in_cell = 0
        self.in_table = 0
        self.in_innermost_table = 0
        
    def start_table(self, attrs):
        # Add a new table to the tables list
        self.in_table = 1

    def end_table(self):
        if self.in_table:
            self.current_table += 1
        self.in_table = 0

    def start_tr(self, attrs):
        # Signal when we get to the beginning of a new row.
        self.current_col = 0
        if self.in_table:
            self.in_row = 1

    def end_tr(self):
        self.in_row = 0
        self.current_row += 1

    def start_td(self, attrs):
        """Keep track of the number of td elements encountered.
        This increments a counter and sets some boolean variables to help
        us keep track of what column  we are in, in a row.
        """
        if self.in_row & self.in_table:
            self.in_cell = 1
            
    def end_td(self):
        # Unset flags which are not unset by other code.
        self.in_cell = 0
        self.current_col += 1
                
    def start_a(self, attrs):
        pass
    
    def handle_data(self, text):
        """Actually extract the data we want.
        Based on the flag set, this will put the text information into
        the current_row record class as the appropriate attribute.
        """

        if self.in_cell:
            #print text
            print self.current_col
            if not (len(self.tables) > self.current_table):
                self.tables.append({})
            if not self.tables[self.current_table].has_key(self.current_col):
                self.tables[self.current_table][self.current_col]=[]
            self.tables[self.current_table][self.current_col].append(string.strip(text))

    def end_html(self):
        """At the end of a parse we need to add the last row info."""
        pass
    
def parse_table(file_handle):
    """Parse the HTML read from file_handle and return the table data.

    file_handle: any object with a read() method returning the HTML text.
    Returns the parser's ``tables`` list (one {column: [cells]} dictionary
    per table slot, see TableParser).
    """
    null_formatter = formatter.NullFormatter()
    parser = TableParser(null_formatter)
    parser.feed(file_handle.read())
    # Force the parser to process anything it is still buffering, so data
    # at the very end of the document is not silently dropped.
    parser.close()
    return parser.tables

# Start of Main Program
# Enter web site address here:
#f = urllib.urlopen("http://www.tablewebsite.com")
os.chdir(os.environ['userprofile'] + '\\My Documents\\python source')
# This file is just for testing
f = urllib.urlopen("tables3.htm")
t = parse_table(f)
urllib.urlcleanup()

# Print table in tabular format
for tbl in t:
    for row in tbl[0]:
        rowlist = []
        for colum in tbl:
            rowindex = tbl[0].index(row)
            rowlist.append(tbl[colum][rowindex])
        rowprint = ''
        for cell in rowlist:
            rowprint += ('%s' % cell).center(10)
        print rowprint