#!/usr/bin/python

# script accepting as input reasonably-formatted HTML/XHTML BODY, and
# producing moinmoin markup as output. the markup will need some manual touch
# up editing afterwards. things to look for:
#   * H1 and H2 both map to top-level headings, with the expectation that
#     H1 should be used only once for the page title (and hence should be edited
#     out of the moin content). this will probably need manual adjustment
#   * markup like '' or ''' might have gratuitous spaces around it
#   * nested <B><STRONG> etc. won't work as expected (''' ''' foo ''' ''')
#   * tables with empty cells, might produce |||| (needs a space)
#   * inline comments don't work too well, best to move them out of the para
#   * anchor tags are delayed until after a heading, might be better moved just
#     before it

# andrewb@cse.unsw.edu.au disclaims all responsibility for this ugly hack

import sys, os.path
from HTMLParser import HTMLParser, HTMLParseError

# HTML tag name -> (markup written for the start tag, markup written for the
# end tag). h1 and h2 deliberately share the top-level moin heading.
TAG_MAP = dict(
    h1=('\n\n= ', ' =\n'),
    h2=('\n\n= ', ' =\n'),
    h3=('\n\n== ', ' ==\n'),
    h4=('\n\n=== ', ' ===\n'),
    h5=('\n\n==== ', ' ====\n'),
    h6=('\n\n===== ', ' =====\n'),
    em=(" ''", "'' "),
    i=(" ''", "'' "),
    tt=(" `", "` "),
    strong=(" '''", "''' "),
    bold=(" '''", "''' "),
    b=(" '''", "''' "),
    p=('\n', '\n'),
    code=('{{{', '}}}'),
)

# named HTML entity -> plain-text replacement written to the output
ENTITY_MAP = dict(
    nbsp=' ',
    lt='<',
    gt='>',
    amp='&',
    ndash='-',
    mdash='--',
    quot='"',
)

# list kinds; plain booleans stand in for an enumeration
ORDERED = True
UNORDERED = False

# longest output line (in characters) before the converter wraps
MAXLINELEN = 76

class MoinConverter(HTMLParser):
    """HTMLParser subclass that writes MoinMoin markup for the HTML it is fed.

    Markup is written incrementally to the file-like object given to the
    constructor as the parser callbacks fire. Tags that are not understood
    are reported on stderr and otherwise skipped.
    """

    def __init__(self, out):
        """Create a converter writing to `out` (any object with write())."""
        HTMLParser.__init__(self)
        self.out = out
        # stack of ORDERED/UNORDERED, one entry per currently open <ol>/<ul>
        self.listtype = []
        # number of spaces written after each line break (list nesting depth)
        self.indent = 0
        # column position on the current output line, used for wrapping
        self.linepos = 0
        # True between <pre> and </pre>: data is copied through untouched
        self.preformatted = False
        # True when the previous call to output() was made with space=True
        self.last_word_space = False
        # True between a heading start tag and its end tag
        self.in_heading = False
        # while > 0, newline() does nothing (inside a link or a table row)
        self.no_newline = 0
        # True while inside an <a href=...> whose closing ']' is pending
        self.in_a = False
        # markup held back while in a heading, flushed after the heading ends
        self.queued_tags = []

    def newline(self):
        """Start a new, indented output line unless line breaks are blocked."""
        if self.no_newline == 0:
            space = " " * self.indent
            self.out.write('\n' + space)
            self.linepos = len(space)
            self.last_word_space = False

    def output(self, word, space=False):
        """Write `word`, wrapping at MAXLINELEN.

        word  -- text to write; embedded '\\n' characters force line breaks
        space -- when both this call and the previous one pass True, the two
                 words are separated by a space (or by a line break when the
                 line would reach MAXLINELEN)

        While inside a heading, anything passed with space=False is queued
        and only written once the heading has been closed.
        """
        if self.in_heading and not space:
            self.queued_tags.append(word)
            return
        # emit everything up to each embedded newline, then break the line
        while '\n' in word:
            (head, word) = word.split('\n', 1)
            if self.linepos + len(head) > MAXLINELEN:
                self.out.write('\n')
            self.out.write(head)
            self.newline()
        if word == '':
            return
        if (self.last_word_space and space
            and self.linepos + len(word) < MAXLINELEN):
            self.out.write(" " + word)
            self.linepos += len(word) + 1
        else:
            if self.last_word_space and space:
                # a separator is needed but the word does not fit
                if self.no_newline == 0:
                    self.newline()
                else:
                    self.out.write(" ")
            elif self.linepos + len(word) > MAXLINELEN:
                self.newline()
            self.out.write(word)
            self.linepos += len(word)
        self.last_word_space = space

    def handle_starttag(self, tag, attrlist):
        """Write the moin markup that opens `tag`.

        tag      -- tag name as passed by HTMLParser
        attrlist -- list of (name, value) attribute pairs
        """
        attrs = dict(attrlist)

        if tag in TAG_MAP:
            self.output(TAG_MAP[tag][0])
            # the only TAG_MAP keys starting with 'h' are h1..h6
            if tag[0] == 'h':
                self.in_heading = True
        elif tag == "ol":
            self.listtype.append(ORDERED)
            self.indent += 1
        elif tag == "ul":
            self.listtype.append(UNORDERED)
            self.indent += 1
        elif tag == "li":
            if self.listtype and self.listtype[-1] == ORDERED:
                self.output("\n1.", True)
            else:
                if not self.listtype:
                    # malformed input: keep converting rather than aborting
                    sys.stderr.write("Warning: <li> outside of a list\n")
                self.output("\n*", True)
        elif tag == "a":
            url = attrs.get('href')
            if url is not None:
                if url.startswith(('http://', 'https://', 'ftp://',
                                   'mailto:', '#')):
                    self.output('[%s' % url, True)
                else:
                    # guess it's a relative URL, and make an attachment for it
                    attachname = os.path.basename(url)
                    self.output('[attachment:%s' % attachname, True)
                self.in_a = True
                # keep the whole link on one line
                self.no_newline += 1
            elif 'name' in attrs:
                self.output('[[Anchor(%s)]]' % attrs['name'])
        elif tag == "tr":
            self.newline()
            # keep the whole table row on one line
            self.no_newline += 1
        elif tag == "th":
            self.output("||")
            if "style" in attrs:
                self.output('<style="%s">' % attrs["style"])
            self.output("'''")
        elif tag == "hr":
            self.output("\n\n----\n")
        elif tag == "br":
            self.output("[[BR]]\n")
        elif tag == "td":
            self.output("||")
            if "style" in attrs:
                self.output('<style="%s">' % attrs["style"])
        elif tag == "pre":
            self.output('{{{\n')
            self.preformatted = True
        elif tag in ("table",):
            pass
        else:
            sys.stderr.write("Warning: ignoring <%s %s>\n" % (tag, attrlist))

    def handle_startendtag(self, tag, attrlist):
        """Handle self-closing tags; only hr, br and named anchors are used."""
        attrs = dict(attrlist)

        if tag == "hr":
            self.output("\n\n----\n")
        elif tag == "br":
            self.output("[[BR]]\n")
        elif tag == "a" and 'name' in attrs:
            self.output('[[Anchor(%s)]]' % attrs['name'])
        else:
            sys.stderr.write("Warning: ignoring <%s />\n" % tag)

    def handle_endtag(self, tag):
        """Write the moin markup that closes `tag`."""
        if tag in TAG_MAP:
            is_heading = (tag[0] == 'h')
            if is_heading:
                # cleared first so the closing markup is written, not queued
                self.in_heading = False
            self.output(TAG_MAP[tag][1])
            if is_heading:
                # now write the markup that was held back inside the heading
                for word in self.queued_tags:
                    self.output(word, False)
                self.queued_tags = []
        elif tag in ("ol", "ul"):
            if self.listtype:
                self.listtype.pop()
                self.indent -= 1
            else:
                # unmatched close tag: nothing to undo
                sys.stderr.write("Warning: ignoring unmatched </%s>\n" % tag)
        elif tag == "a":
            if self.in_a:
                self.output('] ')
                self.no_newline -= 1
                self.in_a = False
        elif tag == "tr":
            self.output("||")
            # never go below zero, or newline() would stay disabled for good
            if self.no_newline > 0:
                self.no_newline -= 1
            # no line break here: the next <tr> starts its own line
        elif tag == "th":
            self.output("'''")
        elif tag == "pre":
            self.output('}}}')
            self.preformatted = False
        elif tag in ("li", "table", "td"):
            pass

    def handle_data(self, data):
        """Write text content: verbatim inside <pre>, else word by word."""
        if self.preformatted:
            self.out.write(data)
        else:
            for word in data.split():
                self.output(word, True)

    def handle_charref(self, name):
        """Write a numeric character reference if it is printable ASCII.

        `name` is the text between '&#' and ';', either decimal ("39") or
        hexadecimal with a leading x ("x27"). Anything else is reported on
        stderr and skipped.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            code = None
        if code is not None and 32 <= code < 127:
            self.output(chr(code), False)
        else:
            sys.stderr.write("Warning: ignoring &#%s;\n" % name)

    def handle_entityref(self, name):
        """Write the ENTITY_MAP replacement for a named entity, if known."""
        if name in ENTITY_MAP:
            self.output(ENTITY_MAP[name], False)
        else:
            sys.stderr.write("Warning: ignoring &%s;\n" % name)

    def handle_comment(self, text):
        """Write an HTML comment as moin '##' comment lines."""
        for line in text.splitlines():
            self.out.write("\n## %s" % line)
        self.newline()

def main(argv):
    """Convert HTML to moin markup on stdout.

    argv -- command-line argument list; argv[1], when present, names the
            HTML file to read, otherwise the HTML is read from stdin.

    Exits with status 1 (after a message on stderr) if the file cannot be
    opened or the HTML cannot be parsed.
    """
    if len(argv) <= 1:
        data = sys.stdin.read()
    else:
        try:
            f = open(argv[1])
        except IOError as e:
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # the input is fully read up front, so release the handle now
            f.close()
    parser = MoinConverter(sys.stdout)
    try:
        parser.feed(data)
        # process anything feed() left buffered at end of input
        parser.close()
    except HTMLParseError as e:
        sys.stderr.write("Parse error: %d: %s\n" % (e.lineno, e.msg))
        sys.exit(1)

# run the converter only when executed as a script, not when imported
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)
