Quick and dirty script to convert reasonably clean and well-formatted HTML (body only) to MoinMoin markup. The output will almost certainly need touch-up editing afterwards, see the main comment for some details.
1 #!/usr/bin/python
2
3 # script accepting as input reasonably-formatted HTML/XHTML BODY, and
4 # producing moinmoin markup as output. the markup will need some manual touch
5 # up editing afterwards. things to look for:
6 # * H1 and H2 both map to top-level headings, with the expectation that
7 # H1 should be used only once for the page title (and hence should be edited
8 # out of the moin content). this will probably need manual adjustment
9 # * markup like '' or ''' might have gratuitous spaces around it
10 # * nested <B><STRONG> etc. won't work as expected (''' ''' foo ''' ''')
11 # * tables with empty cells, might produce |||| (needs a space)
12 # * inline comments don't work too well, best to move them out of the para
13 # * anchor tags are delayed until after a heading, might be better moved just
14 # before it
15
16 # andrewb@cse.unsw.edu.au disclaims all responsibility for this ugly hack
17
18 import sys, os.path
19 from HTMLParser import HTMLParser, HTMLParseError
20
# HTML tag name -> (markup emitted at the start tag, markup emitted at the
# end tag) for tags that translate directly into paired MoinMoin markup.
# H1 and H2 deliberately share the top heading level.
TAG_MAP = {
    'h1'    : ('\n\n= ', ' =\n'),
    'h2'    : ('\n\n= ', ' =\n'),
    'h3'    : ('\n\n== ', ' ==\n'),
    'h4'    : ('\n\n=== ', ' ===\n'),
    'h5'    : ('\n\n==== ', ' ====\n'),
    'h6'    : ('\n\n===== ', ' =====\n'),
    'em'    : (" ''", "'' "),
    'i'     : (" ''", "'' "),
    'tt'    : (" `", "` "),
    'strong': (" '''", "''' "),
    'bold'  : (" '''", "''' "),
    'b'     : (" '''", "''' "),
    'p'     : ('\n', '\n'),
    'code'  : ('{{{', '}}}'),
}

# Named entity (without the surrounding '&' and ';') -> replacement text.
# Entities missing from this table are reported on stderr and dropped.
ENTITY_MAP = {
    'nbsp'  : ' ',
    'lt'    : '<',
    'gt'    : '>',
    'amp'   : '&',
    'ndash' : '-',
    'mdash' : '--',
    'quot'  : '"',
}

# list types, pushed on the converter's list stack for <ol> and <ul>
ORDERED = True
UNORDERED = False

# max line length to output
MAXLINELEN = 76
54
class MoinConverter(HTMLParser):
    """Streaming converter from HTML body markup to MoinMoin wiki markup.

    Feed HTML with feed(); the converted markup is written incrementally to
    the file-like object passed to the constructor.  Start tags, self-closing
    tags and entities that are not understood are reported on stderr and
    skipped; unknown end tags are ignored silently.

    Known limitation: while inside a heading, every piece of markup that is
    not a plain text word (inline markup, entities, anchors) is held back
    and only written after the heading has been closed.
    """

    # href prefixes that are emitted as-is; any other href is treated as a
    # relative URL and turned into an attachment reference.
    _DIRECT_URL_PREFIXES = ('http://', 'https://', 'ftp://', 'mailto:', '#')

    def __init__(self, out):
        """Create a converter that writes to *out* (any object with write())."""
        HTMLParser.__init__(self)
        # Python 3's parser folds character/entity references into
        # handle_data() by default; switch that off so handle_entityref()
        # and handle_charref() are still called.  The Python 2 parser simply
        # never looks at this attribute.
        self.convert_charrefs = False
        self.out = out
        self.listtype = []            # stack of ORDERED/UNORDERED, innermost last
        self.indent = 0               # number of spaces written after a newline
        self.linepos = 0              # tracked column of the output cursor
        self.preformatted = False     # inside <pre>: data is copied verbatim
        self.last_word_space = False  # 'space' flag of the previous output()
        self.in_heading = False       # between a heading's start and end tag
        self.no_newline = 0           # > 0 while line breaks are suppressed
        self.in_a = False             # inside an <a href=...> element
        self.queued_tags = []         # output held back until the heading ends

    def newline(self):
        """Start a new output line, indented for the current list depth.

        Does nothing while newlines are suppressed (no_newline > 0), which
        is the case inside links and table rows.
        """
        if self.no_newline != 0:
            return
        pad = " " * self.indent
        self.out.write('\n' + pad)
        self.linepos = len(pad)
        self.last_word_space = False

    def output(self, word, space=False):
        """Write one piece of output, wrapping lines near MAXLINELEN.

        word  -- text to write; embedded '\\n' characters force line breaks
        space -- True for ordinary text words: two consecutive space=True
                 pieces are separated by a single space or a line break.
                 False for markup that must attach directly to its
                 neighbour.

        Inside a heading, space=False pieces are queued instead of written
        (see the class docstring).
        """
        if self.in_heading and not space:
            self.queued_tags.append(word)
            return
        # Emit everything up to each explicit newline first.
        while '\n' in word:
            (head, word) = word.split('\n', 1)
            if self.linepos + len(head) > MAXLINELEN:
                self.out.write('\n')
            self.out.write(head)
            self.newline()
        if word == '':
            return
        if (self.last_word_space and space
                and self.linepos + len(word) < MAXLINELEN):
            # Two text words in a row that still fit on the current line.
            self.out.write(" " + word)
            self.linepos += len(word) + 1
        else:
            if self.last_word_space and space:
                # Two text words in a row that do not fit: wrap, or fall
                # back to a plain space where wrapping is not allowed.
                if self.no_newline == 0:
                    self.newline()
                else:
                    self.out.write(" ")
            elif self.linepos + len(word) > MAXLINELEN:
                self.newline()
            self.out.write(word)
            self.linepos += len(word)
        self.last_word_space = space

    def handle_starttag(self, tag, attrlist):
        """Translate an opening tag; warn on stderr about unsupported tags."""
        attrs = dict(attrlist)

        if tag in TAG_MAP:
            (start, end) = TAG_MAP[tag]
            self.output(start)
            # The only TAG_MAP keys starting with 'h' are h1..h6.
            if tag[0] == 'h':
                self.in_heading = True
        elif tag == "ol":
            self.listtype.append(ORDERED)
            self.indent += 1
        elif tag == "ul":
            self.listtype.append(UNORDERED)
            self.indent += 1
        elif tag == "li":
            if self.listtype and self.listtype[-1] == ORDERED:
                self.output("\n1.", True)
            else:
                if not self.listtype:
                    # Malformed input: keep going and treat it as a bullet.
                    sys.stderr.write(
                        "Warning: <li> outside of a list, using a bullet\n")
                self.output("\n*", True)
        elif tag == "a":
            # A valueless attribute (<a href>) arrives as None; treat it as
            # absent rather than crashing on it.
            url = attrs.get('href')
            if url is not None:
                if url.startswith(self._DIRECT_URL_PREFIXES):
                    self.output('[%s' % url, True)
                else:
                    # guess it's a relative URL, and make an attachment for it
                    attachname = os.path.basename(url)
                    self.output('[attachment:%s' % attachname, True)
                self.in_a = True
                # The link markup must stay on a single line.
                self.no_newline += 1
            elif attrs.get('name') is not None:
                self.output('[[Anchor(%s)]]' % attrs['name'])
        elif tag == "tr":
            self.newline()
            # A table row must stay on a single line.
            self.no_newline += 1
        elif tag == "th":
            self.output("||")
            if attrs.get("style") is not None:
                self.output('<style="%s">' % attrs["style"])
            # Header cells are rendered as bold text.
            self.output("'''")
        elif tag == "hr":
            self.output("\n\n----\n")
        elif tag == "br":
            self.output("[[BR]]\n")
        elif tag == "td":
            self.output("||")
            if attrs.get("style") is not None:
                self.output('<style="%s">' % attrs["style"])
        elif tag == "pre":
            self.output('{{{\n')
            self.preformatted = True
        elif tag == "table":
            pass
        else:
            sys.stderr.write("Warning: ignoring <%s %s>\n" % (tag, attrlist))

    def handle_startendtag(self, tag, attrlist):
        """Translate a self-closing tag (<hr/>, <br/>, <a name="..."/>)."""
        attrs = dict(attrlist)

        if tag == "hr":
            self.output("\n\n----\n")
        elif tag == "br":
            self.output("[[BR]]\n")
        elif tag == "a" and attrs.get('name') is not None:
            self.output('[[Anchor(%s)]]' % attrs['name'])
        else:
            sys.stderr.write("Warning: ignoring <%s />\n" % tag)

    def handle_endtag(self, tag):
        """Translate a closing tag and undo the state set by its start tag."""
        if tag in TAG_MAP:
            (start, end) = TAG_MAP[tag]
            is_heading = tag[0] == 'h'
            if is_heading:
                # Clear the flag first so the closing markup is written
                # rather than queued.
                self.in_heading = False
            self.output(end)
            if is_heading:
                # Flush everything that was held back during the heading.
                for word in self.queued_tags:
                    self.output(word, False)
                self.queued_tags = []
        elif tag in ("ol", "ul"):
            if self.listtype:
                self.listtype.pop()
                self.indent -= 1
            else:
                # Malformed input: nothing to close, so leave state alone.
                sys.stderr.write("Warning: ignoring unmatched </%s>\n" % tag)
        elif tag == "a":
            # Only links opened with an href produced an opening bracket.
            if self.in_a:
                self.output('] ')
                self.no_newline -= 1
            self.in_a = False
        elif tag == "tr":
            self.output("||")
            self.no_newline -= 1
        elif tag == "th":
            self.output("'''")
        elif tag == "pre":
            self.output('}}}')
            self.preformatted = False
        elif tag in ("li", "table", "td"):
            pass

    def handle_data(self, data):
        """Write text: verbatim inside <pre>, otherwise word by word."""
        if self.preformatted:
            self.out.write(data)
        else:
            for word in data.split():
                self.output(word, True)

    def handle_charref(self, name):
        """Emit printable-ASCII numeric references; warn about the rest.

        *name* is the reference without '&#' and ';', e.g. '39' or 'x27'.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            code = -1
        if 32 <= code < 127:
            self.output(chr(code), False)
        else:
            sys.stderr.write("Warning: ignoring &#%s;\n" % name)

    def handle_entityref(self, name):
        """Emit the ENTITY_MAP replacement for a named entity, else warn."""
        if name in ENTITY_MAP:
            self.output(ENTITY_MAP[name], False)
        else:
            sys.stderr.write("Warning: ignoring &%s;\n" % name)

    def handle_comment(self, text):
        """Write an HTML comment as '##' comment lines, one per source line."""
        for line in text.splitlines():
            self.out.write("\n## %s" % line)
        self.newline()
232
def main(argv):
    """Convert HTML from the file named in argv[1], or from stdin, to stdout.

    argv -- command-line argument list including the program name.

    Exits with status 1 (via sys.exit) if the input file cannot be read or
    the HTML parser reports a parse error.
    """
    if len(argv) <= 1:
        html = sys.stdin.read()
    else:
        try:
            # 'with' closes the input file even if reading fails.
            with open(argv[1]) as f:
                html = f.read()
        except IOError as e:
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(1)
    parser = MoinConverter(sys.stdout)
    try:
        parser.feed(html)
        # Force the parser to process any input it is still buffering.
        parser.close()
    except HTMLParseError as e:
        sys.stderr.write("Parse error: %d: %s\n" % (e.lineno, e.msg))
        sys.exit(1)
248
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main(sys.argv))