# First prototype
"""
program to convert files of wiki markup language (moinmoin 1.21) to xml and pdf
@copyright: 2004 Uwe Fechner (ufechner AT sk28 DOT de)
@license: GNU GPL
printParagraph uses some code from Chris Arndt,
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/134571
version: 0.07
"""
from reportlab.pdfgen import canvas
import libxml2,sys,string
from os import getcwd
from string import strip
import re
# Encoding used to decode the text read from the wiki files; change as needed.
wikiencoding = "iso8859-1"
# Encoding declared in the prolog of the generated XML document; change as needed.
xmlencoding = "iso8859-1"
# Encoding of the strings that are handed to the PDF canvas; change as needed.
pdfencoding = "iso8859-1"
# Page geometry: the PDF text cursor starts at x = pageleft and
# y = pageheight - pagetop; change as needed.
pageheight = 1000
pagetop = 200
pageleft = 50
# Empty default document (prolog plus an empty <pdfreport/> root element).
docstr = "".join(['<?xml version="1.0" encoding="', xmlencoding, '"?><pdfreport/>'])
# The libxml functions expect UTF-8 encoded parameters, so wiki text has to be
# re-encoded before it is passed to them.
def toUTF8(wikistring):
    """Return wikistring (encoded in wikiencoding) re-encoded as UTF-8."""
    decoded = unicode(wikistring, wikiencoding)
    return decoded.encode('UTF-8')
# If the PDF library needs something other than UTF-8, the XML content has to
# be re-encoded before it is drawn.
def toPdf(xmlcontent):
    """Return xmlcontent (UTF-8 encoded) re-encoded in pdfencoding."""
    decoded = unicode(xmlcontent, 'UTF-8')
    return decoded.encode(pdfencoding)
class XmlReport:
    """Convert wiki pages to an XML document held in a libxml2 DOM.

    Every wiki page becomes a <page> element below the document root.
    Paragraphs are stored as <p>, headings as <h1>..<h6> and unordered
    lists as (nested) <ul>/<li> elements.
    """

    def __init__(self, name):
        """Create the empty report document.

        name -- report name; stored as the "name" property of the root
                element and used as base name of the output file.
        """
        self.name = name
        self.path = getcwd() + "/"
        # remove indentation when parsing xml files
        libxml2.keepBlanksDefault(0)
        # parse the empty default document
        self.doc = libxml2.parseDoc(docstr)
        # save the document root node
        self.root = self.doc.children
        # set the name of the document
        self.root.setProp("name", toUTF8(name))
        # parser state; the <page> node is created by appendWikiPage
        self.page = None
        self.clearSection()

    def clearSection(self):
        """Reset the pending paragraph text and the open list nodes."""
        self.section = ""
        # Slot 0 holds the page node, slots 1..6 the open <ul> node of each
        # nesting level.  Seven slots are needed because list detection
        # addresses levels 1 to 6.
        self.enumNodes = [None] * 7

    def saveAsXml(self):
        """Serialize the document to <path><name>.xml."""
        xmltext = self.doc.serialize(format=1)
        filename = self.path + self.name + '.xml'
        print("xml-file: " + filename)
        newfile = open(filename, "w")
        try:
            newfile.write(xmltext)
        finally:
            # close the file even when writing fails
            newfile.close()

    def appendParagraph(self):
        """Append the pending section to the page as paragraph or heading.

        Each leading "=" (at most six) raises the heading level by one and
        is removed together with the last character of the section.  A
        section without leading "=" becomes a <p> element; this includes an
        empty section, which yields an empty <p>.
        """
        # strip trailing newlines
        text = self.section.rstrip("\n")
        level = 0
        while level < 6 and text.startswith("="):
            level += 1
            text = text[1:-1].strip()
        if level:
            childname = "h" + str(level)
        else:
            childname = "p"
        # append paragraph node
        self.page.newTextChild(None, childname, toUTF8(text))
        self.clearSection()

    def _appendListItem(self, line):
        """Append line as <li> if it is an unordered list item.

        A list item is a line whose first non-blank character is "*",
        preceded by one to six whitespace characters; the amount of leading
        whitespace is the nesting level.  Returns True when the line was
        consumed as a list item, False otherwise.
        """
        self.enumNodes[0] = self.page
        depth = len(line) - len(line.lstrip())
        if depth < 1 or depth > 6 or line[depth] != "*":
            return False
        if self.enumNodes[depth] is None:
            # Attach the new list to the nearest open ancestor, so that a
            # skipped nesting level does not dereference None.
            parent = depth - 1
            while self.enumNodes[parent] is None:
                parent -= 1
            self.enumNodes[depth] = self.enumNodes[parent].newChild(None, "ul", "")
        # Lists deeper than the current item are finished; a later deeper
        # item has to open a new <ul> after this item.
        for deeper in range(depth + 1, len(self.enumNodes)):
            self.enumNodes[deeper] = None
        self.enumNodes[depth].newTextChild(None, "li", toUTF8(line[depth + 1:].strip()))
        return True

    def appendWikiPage(self, pagename):
        """Parse the wiki file pagename and append it to the XML DOM.

        The file is read from self.path.  An empty line ends the current
        paragraph; list items are appended to the page immediately, all
        other lines are collected in self.section.
        """
        filename = self.path + '/' + pagename
        print("wiki-file: " + filename)
        # open input file, read it, and close it (also on read errors)
        wikifile = open(filename, "r")
        try:
            text = wikifile.readlines()
        finally:
            wikifile.close()
        # create a page element and append it to the root node
        self.page = self.root.newTextChild(None, "page", None)
        # set the name of the page
        self.page.setProp("name", toUTF8(pagename))
        self.clearSection()
        # parse the wiki page
        for line in text:
            if line == "\n":
                # an empty line ends the paragraph
                self.appendParagraph()
            elif not self._appendListItem(line):
                self.section = self.section + line
        # make sure that the text collected up to the end of the file is
        # not lost when the file does not end with an empty line
        if text and text[-1] != "\n":
            self.appendParagraph()
        # forget the open list nodes (the previous loop over enumNodes only
        # rebound its loop variable and left the list untouched)
        self.enumNodes = [None] * len(self.enumNodes)

    def free(self):
        """Free the document; needed for the libxml2 C library."""
        self.doc.freeDoc()
class PdfReport(XmlReport):
    """Convert the wiki DOM built by XmlReport to a PDF file.

    Text is written line by line at a cursor (self.x, self.y) that moves
    down by 20 after every line.
    """

    # lowest cursor position at which a line is still drawn on the current
    # PDF page; below it pdfPrint starts a new page
    pagebottom = 50

    def __init__(self, name):
        """Create the XML document and an empty canvas for <path><name>.pdf."""
        XmlReport.__init__(self, name)
        self.filename = self.path + self.name + ".pdf"
        self.can = canvas.Canvas(self.filename)
        # the cursor
        self.x = pageleft
        self.y = pageheight - pagetop
        # indent level of lists
        self.indentLevel = 0

    def pdfPrint(self, text, size=10):
        """Print one line of text to the canvas and advance the cursor.

        text -- the line to draw (already encoded for the PDF library)
        size -- currently unused; kept so that existing callers keep working
        """
        # for debugging
        print(text)
        if self.y < self.pagebottom:
            # the PDF page is full: continue at the top of a new one instead
            # of drawing below the page
            self.can.showPage()
            self.y = pageheight - pagetop
        self.can.drawString(self.x, self.y, text)
        self.y -= 20

    def printHeader(self, headerNode):
        """Add a heading to the canvas, prefixed with "H<level>: "."""
        # element names are h1..h6; the second character is the level
        level = headerNode.name[1]
        self.pdfPrint("H" + str(level) + ": " + toPdf(headerNode.content))

    def printParagraph(self, paraNode, width=80):
        """Wrap the paragraph to lines of at most width characters and print it.

        Inter-word space is reduced to one space character.
        Indentation is currently also lost.  A single word longer than
        width is printed on a line of its own; an empty paragraph prints
        one empty line.
        """
        words = toPdf(paraNode.content).strip().split()
        lines = []
        current = []
        for word in words:
            if current and len(' '.join(current + [word])) > width:
                # the line is already long enough -> add it to the paragraph
                lines.append(' '.join(current))
                current = []
            current.append(word)
        # last line in paragraph
        lines.append(' '.join(current))
        # print the lines of the paragraph
        for textline in lines:
            self.pdfPrint(textline)

    def printList(self, listNode):
        """Add a (possibly nested) list to the canvas.

        Every <li> child is printed as "LI<indent level>: <text>"; nested
        <ul> children are printed recursively with an increased level.
        """
        self.indentLevel += 1
        child = listNode.children
        while child is not None:
            if child.name[0] == "u":
                self.printList(child)
            elif child.name[0] == "l":
                # print list element
                self.pdfPrint("LI" + str(self.indentLevel) + ": " + toPdf(child.content))
            # anything else (e.g. text nodes from xml indentation) is skipped
            child = child.next
        self.indentLevel -= 1

    def _printPage(self, pageNode):
        """Print all headings, paragraphs and lists of one <page> element."""
        child = pageNode.children
        while child is not None:
            kind = child.name[0]
            if kind == "h":
                self.printHeader(child)
            elif kind == "p":
                self.printParagraph(child)
            elif kind == "u":
                self.printList(child)
            child = child.next

    def convertToPdf(self):
        """Convert the XML DOM to PDF and write it to self.filename.

        Every <page> element below the root is printed and followed by a
        page break, so a report built from several wiki pages is converted
        completely.
        """
        printed = 0
        page = self.root.children
        while page is not None:
            if page.name == "page":
                self._printPage(page)
                # create pdf page and restart at the top for the next one
                self.can.showPage()
                self.y = pageheight - pagetop
                printed += 1
            page = page.next
        if not printed:
            # no wiki page was appended: still emit one (empty) pdf page
            self.can.showPage()
        # save pdf file
        self.can.save()
# Convert the wiki page "WikiSchulung" from the current directory to
# WikiSchulung.xml and WikiSchulung.pdf.
report = PdfReport("WikiSchulung")
try:
    report.appendWikiPage("WikiSchulung")
    report.saveAsXml()
    report.convertToPdf()
finally:
    # release the libxml2 document even when one of the steps above fails
    report.free()