#!/usr/bin/env python
#
# HtmlTableOfContents.py
# (c) 2006-1007 Gene Thomas
# http://genethomas.com
# http://genethomas.com/software/HtmlTableOfContents.py 
#
# v1.3
#
# o Updates a html file inserting a table of contents between the comments
#   <!--toc--> and <!--end toc-->
# o Adds numbering to the headers. Removing number put there previously be this program.
# o Can be re-run on html file again.
# o Saves original as <name>.bak
# o If fails, e.g. no <!--toc--> does not overwrite file
# o Paranoidly checks all HTML tags match up and will not overwrite an invlid html file
#
# Version History:
#
# 1.3 
#     o Add <title> as non closing tag
#     o Handle <a href> and manually inserted <a name> in headings
#       Stripped in heading in table of contents

import sys
import HTMLParser
import os
import re
import cgi # for escape()
import string

numbering = True

debugFlag = os.getenv("DEBUG") != None
def debug(text):
    if debugFlag:
	print text
debug("DEBUG set, writing debug information")

def comment(text):
    return "<!--" + text + "-->"

def quote(text):
    return cgi.escape(text, 1)

def decl(text):
    return "<!" + text + ">"

def indent(num):
    return "    " * num

def entity(text):
    return "&" + text + ";"

def charref(text):
    return "&#" + text + ";"

def stripATags(text):
    debug("stripATags(" + text + ")")
    text = re.sub("<a [^>]*>", "", text)
    text = re.sub("</a>", "", text)
    debug(" " + text)
    return text
    
class TOCParser(HTMLParser.HTMLParser):
    def __init__(self, filename):
	# parsing constants
	self.HEADERS = ["", "h1", "h2", "h3", "h4", "h5", "h6"] # index is number
	self.START_ENDS = ["hr", "br", "meta", "link"] # tags that don't has close tags
	self.OPTIONAL_ENDS = ["p", "img", "frame"] # tags that don't have to have close tags

	HTMLParser.HTMLParser.__init__(self)
	self.filename = filename

	self.prelude = ""
	self.toc = ""
	self.rest = ""

	self.TOC = "toc"
	self.END_TOC = "end toc"

	self.path = []  # list of tags, e.g. ["html", "body"]
	self.tagLines = [] # list of (int line, int column) positionsself.path[x] tags appeared on

	# STATES
	self.sPRELUDE = "prelude"
	self.sTOC = "toc"	
	self.sREST = "rest"
	self.sHEADER = "header"
    
	self.state = self.sPRELUDE

	self.topHeader = None # int level
	self.lastHeader = None	# last header written to TOC, int level form <hx> tag
	self.currentHeaderText = ""	# in sHEADER:  include <tags>
	self.lastHeaderNumber = [] # list<int>, e.g. [1, 2, 1] for 1.2.1

    def getHeaderAnchor(self):
	# [1, 2, 3] -> "1.2.3"
	return string.join([str(n) for n in self.lastHeaderNumber], ".")
	
    def close(self):
	if self.lastHeader != None:
	    while self.topHeader <= self.lastHeader:
		self.lastHeader = self.lastHeader - 1
		self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "</ul>\n"
	HTMLParser.HTMLParser.close(self)

    def debug(self, text):
	line, col = self.getpos()	
	debug(self.state + " " + str(line) + " " + str(self.path) + " " + text)
	
    def read(self):
	self.close()
	if self.state == self.sPRELUDE:
	    self.fail("Missing <!--toc--><!--end toc--> tags")
	elif self.state == self.sTOC:
	    self.fail("Missing <!--end toc--> tag")
	elif self.state == self.sHEADER:
	    self.fail("Missing <!--end toc--> tag")
	elif self.state == self.sREST:
	    if self.path != []:
		self.fail("Missing close tags for open tags: " + str(self.path));
	else:
	    self.fail("bad state " + self.state)	
	# don't add \ns
	return self.prelude + comment(self.TOC)  + self.toc + comment(self.END_TOC) + self.rest

    def fail(self, reason):
	line, col = self.getpos()
	raise Exception(filename + ":" + str(line) + ":" + str(col)  + " " + reason)

    def handle_starttag(self, tag, attrs):  
	self.debug("handle_starttag(" + tag + ")")
	if tag in self.START_ENDS:
	    # tags that aren't ended as per XML
	    self.handle_startendtag(tag, attrs)
	    return
	self.path.append(tag)	
	self.tagLines.append(self.getpos())
	if self.state == self.sPRELUDE:
	    self.prelude = self.prelude + self.get_starttag_text()
	elif self.state == self.sTOC:
	    pass
	elif self.state == self.sREST:
	    if  tag in self.HEADERS:
		while self.path[:-1] != ["html", "body"] :
		    if len(self.path) < 2:
			  self.fail("<" + tag + "> outside <html><body>")
		    # e.g. ["html", "body", "p", "h3"]
		    if self.path[-2] in self.OPTIONAL_ENDS:
			del self.path[-2]
			del self.tagLines[-2]
		    else:
		        self.fail("<" + tag + "> outside <html><body>, is in " + string.join(["<" + t + ">" for t in self.path[:-1]], "") + " from lines " + \
			    string.join([str(line) + ":" + str(col) for (line, col) in self.tagLines[:-1]], ", "))
		self.state = self.sHEADER
		num = self.HEADERS.index(tag)
		if self.topHeader != None and num < self.topHeader:
		    self.fail("first header is not to level header, e.g. <h4> followed by <h3>")
		self.currentHeaderText = ""
	    else:
		self.rest = self.rest + self.get_starttag_text()
	elif self.state == self.sHEADER:
	    if tag == "a":
		for (at, av) in attrs:
		    if at == "name" and av.startswith("part"):
			# ignore <a name="part-"> we put in titles
			return
	    if tag in self.HEADERS:
		self.fail("nested headers")	
	    self.currentHeaderText = self.currentHeaderText + self.get_starttag_text()
	else:
	    self.fail("bad state " + self.state)	

    def handle_startendtag(self, tag, attrs):
	self.debug("handle_startendtag(" + tag + ")")
	if self.state == self.sPRELUDE:
	    self.prelude = self.prelude + self.get_starttag_text()
	elif self.state == self.sTOC:
	    pass
	elif self.state == self.sREST:
	    self.rest = self.rest + self.get_starttag_text()
	elif self.state == self.sHEADER:
	    self.currentHeaderText = self.currentHeaderText + self.get_starttag_text()
	else:
	    self.fail("bad state " + self.state)	

    def handle_endtag(self, tag):
	self.debug("handle_endtag(" + tag + ")")
	if self.path == []:
	    self.fail("<" + tag  + "> missing")
	while self.path[-1] != tag:
	    if self.path[-1] in self.OPTIONAL_ENDS:
		del self.path[-1]
		del self.tagLines[-1]
		if self.path == []:
		    self.fail("<" + tag  + "> missing")
	    else:
		self.fail("</" + tag  + "> does not match <" + self.path[-1] + ">")
	del self.path[-1]
	del self.tagLines[-1]
	if self.state == self.sPRELUDE:
	    self.prelude = self.prelude + "</" + tag + ">"
	elif self.state == self.sTOC:
	    pass
	elif self.state == self.sHEADER:
	    if tag == "a":
	        if self.currentHeaderText.count("<a")  > self.currentHeaderText.count("</a"):
	            # has another a name we want to keep
	            pass
	        else:
		    # ignore <a name="part-"> we put in titles
		    return
	    if  tag in self.HEADERS:
		self.debug("END HEADER " + self.currentHeaderText)
		if self.path != ["html", "body"] :
		    self.fail("</" + tag + "> outside <html><body>, is in " + string.join(["<" + t + ">" for t in self.path[:-1]], "") + " from lines " + \
			string.join([str(line) + ":" + str(col) for (line, col) in self.tagLines[:-1]], ", "))
		self.state = self.sREST
		num = self.HEADERS.index(tag)		
		if self.lastHeader == None:
		    self.topHeader = num
		    self.toc = "\n<h" + str(num) + "><a name=\"toc\">Contents</a></h" + str(num) + ">\n" + \
			"<!--\n" + \
			"Table of contents inserted by HtmlTableOfContents.py by Gene Thomas\n" + \
		        "see http://genethomas.com/software/HtmlTableOfContents.py and\n" + \
                        "http://genethomas.com\n" + \
			"-->\n" + \
			"<ul>\n"
		    self.lastHeaderNumber = [1]
		else:
		    while num > self.lastHeader:
			self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "<ul>\n"
			self.lastHeader = self.lastHeader + 1
			self.lastHeaderNumber.append(0)
		    while num < self.lastHeader:
			self.lastHeader = self.lastHeader - 1
			self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "</ul>\n"
			del self.lastHeaderNumber[-1]
		    self.lastHeaderNumber[-1] = self.lastHeaderNumber[-1] + 1
		if numbering:
		    id = self.getHeaderAnchor() + ". "
		else:
		    id = ""
        	# HTML names must begin with a letter
		self.toc = self.toc + indent(num-self.topHeader+1) + "<a href=\"#part-"+ self.getHeaderAnchor() + "\">" + id + stripATags(self.currentHeaderText) + "</a><br/>\n"
		self.rest = self.rest + "<h" + str(num) + ">" + id + "<a name=\"part-" + self.getHeaderAnchor() + "\">" + self.currentHeaderText + "</a></h" + str(num) + ">\n"
		self.lastHeader = num
	    else:
		self.currentHeaderText = self.currentHeaderText + "</" + tag + ">"

	elif self.state == self.sREST:
	    self.rest = self.rest + "</" + tag + ">"
	else:
	    self.fail("bad state " + self.state)	

    def handle_data(self, text):
	self.debug("handle_data(\"" + text.replace("\n", "\\n") + "\")")
	if self.state == self.sPRELUDE:
	    self.prelude = self.prelude + text
	elif self.state == self.sTOC:
	    pass
	elif self.state == self.sHEADER:
	    if self.currentHeaderText == "" and numbering:
		# remove numbering, e.g. "1.2.3 "
		# we write "1.2.3 <a name="1.2.3">title</a>
		pos = text.find(" ")
		if pos == len(text)-1 and pos > 0:
		    id = text[0:pos]
		    isId = True
		    for c in id:
			if not c.isdigit() and c != ".":
			    # not id
			    isId = False
			    break
		    if isId:
			# ignore numbering we put in last time
			return
	    self.currentHeaderText = self.currentHeaderText + text
	elif self.state == self.sREST:
	    self.rest = self.rest + text
	else:
	    self.fail("bad state " + self.state)	
 
    def handle_comment(self, text):
	self.debug("handle_comment(\"" + text + "\")")
	if self.state == self.sPRELUDE:
	    if text == self.TOC:
		self.state = self.sTOC
	    else:
		self.prelude = self.prelude + comment(text)
	elif self.state == self.sTOC:
	    if text == self.END_TOC:
		self.state = self.sREST
	elif self.state == self.sHEADER:
	    self.currentHeaderText = self.currentHeaderText + comment(text)
	elif self.state == self.sREST:
	    self.rest = self.rest + comment(text)
	else:
	    self.fail("bad state " + self.state)	

    def handle_charref(self, text):
	self.debug("handle_charref(" + text + ")")
	self.handle_data(charref(text))

    def handle_entityref(self, text):
	self.debug("handle_entiryref(" + text + ")")
	self.handle_data(entity(text))

    def handle_decl(self, text):
	self.debug("handle_decl(" + text + ")")
	if self.state == self.sPRELUDE:
	    self.prelude = self.prelude + decl(text)
	elif self.state == self.sTOC:
	    pass
	elif self.state == self.sHEADER:
	    self.fail("SGML declaratin inside header")
	elif self.state == self.sREST:
	    self.rest = self.rest + decl(text)
	else:
	    self.fail("bad state " + self.state)	

filename = sys.argv[1]
output = filename + ".out"

parser = TOCParser(output)
data = file(filename).read()
#print "data = "+ data
parser.feed(data)

out = file(output, "w")
out.write(parser.read())
out.close()

backup = filename + ".bak"
try:
    os.remove(backup)
except:
    pass
os.rename(filename, backup)
os.rename(output, filename)
