# First prototype

"""
    program to convert files of wiki markup language (moinmoin 1.21) to xml and pdf
    @copyright: 2004 Uwe Fechner (ufechner AT sk28 DOT de) 
    @license:   GNU GPL
    printParagraph uses some code from Chris Arndt, 
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/134571
    version:    0.07
"""

from reportlab.pdfgen import canvas
import libxml2,sys,string
from os import getcwd
from string import strip
import re

# Encoding used to decode the text read from the wiki files (change as needed).
wikiencoding = "iso8859-1"

# Encoding announced in the xml declaration of the report (change as needed).
xmlencoding = "iso8859-1"

# Encoding of the strings that are handed to the pdf canvas (change as needed).
pdfencoding = "iso8859-1"

# Start position of the text cursor (change as needed): the cursor starts
# at x = pageleft and y = pageheight - pagetop.
pageheight = 1000
pagetop = 200
pageleft = 50

# Empty default document that every new report is parsed from.
docstr = '<?xml version="1.0" encoding="%s"?><pdfreport/>' % xmlencoding

def toUTF8(wikistring):
    """Return wikistring re-encoded from the wiki encoding to UTF-8.

    The parameters of the libxml functions must be UTF-8 encoded, so every
    string taken from a wiki file is passed through this function first.
    """
    decoded = unicode(wikistring, wikiencoding)
    return decoded.encode('UTF-8')
   
def toPdf(xmlcontent):
    """Return xmlcontent re-encoded from UTF-8 to the pdf encoding.

    Needed if the pdf library expects something other than UTF-8.
    """
    decoded = unicode(xmlcontent, 'UTF-8')
    return decoded.encode(pdfencoding)
      
class XmlReport:
    """Convert wiki pages to an xml document held in a libxml2 dom.

    Every wiki page added with appendWikiPage becomes a <page> element
    below the document root.  Paragraphs are stored as <p>, headings as
    <h1>..<h6> and unordered lists as (nested) <ul>/<li> elements.
    """

    # deepest nesting level of unordered lists that is recognised
    maxListDepth = 6

    def __init__(self, name):
        """Create the empty xml document.

        name -- name of the report; stored in the "name" attribute of the
                root element and used as base name of the xml file
        """
        self.name = name
        # all files are read from and written to the current directory
        self.path = getcwd() + "/"

        # remove indentation when parsing xml files
        libxml2.keepBlanksDefault(0)
        # parse the empty default document
        self.doc = libxml2.parseDoc(docstr)
        # save the document root node
        self.root = self.doc.children
        # set the name of the document
        self.root.setProp("name", toUTF8(name))

        # no page has been added yet
        self.page = None
        self.clearSection()

    def clearSection(self):
        """Forget the collected paragraph text and all open lists."""
        self.section = ""
        # enumNodes[0] is the page element, enumNodes[n] the currently
        # open <ul> of nesting level n (None if there is none)
        self.enumNodes = [None] * (self.maxListDepth + 1)

    def saveAsXml(self):
        """Write self.doc to the file <path><name>.xml."""
        xmltext = self.doc.serialize(format=1)
        filename = self.path + self.name + '.xml'
        print("xml-file:  " + filename)
        newfile = open(filename, "w")
        try:
            newfile.write(xmltext)
        finally:
            # close the file even if writing fails
            newfile.close()

    def _parseHeading(self, text):
        """Return (element name, text) for one section of wiki text.

        Each leading "=" raises the heading level by one (at most 6) and
        is removed together with a matching trailing "=".  Text without a
        leading "=" is returned unchanged as a paragraph ("p").
        """
        level = 0
        while level < 6 and text[:1] == "=":
            text = text[1:]
            # only drop the last character if it really is the closing
            # "=", so that a heading without one keeps its last letter
            if text[-1:] == "=":
                text = text[:-1]
            text = text.strip()
            level += 1
        if level:
            return "h" + str(level), text
        return "p", text

    def appendParagraph(self):
        """Append the collected section as <p> or <h1>..<h6> to the page.

        The section buffer and the list state are cleared afterwards.
        """
        # strip trailing newline, then check whether it is a header
        childname, text = self._parseHeading(self.section.rstrip("\n"))
        self.page.newTextChild(None, childname, toUTF8(text))
        self.clearSection()

    def _listLevel(self, line):
        """Return the nesting level of a list item line, 0 for other lines.

        A list item consists of 1 to maxListDepth leading spaces followed
        by "*"; the number of spaces is the nesting level.
        """
        stripped = line.lstrip(" ")
        level = len(line) - len(stripped)
        if 1 <= level <= self.maxListDepth and stripped[:1] == "*":
            return level
        return 0

    def _appendListItem(self, level, text):
        """Append text as <li> to the <ul> of the given nesting level."""
        self.enumNodes[0] = self.page
        # open every list between the page and the requested level that
        # is not open yet, so a list may start at a nested level
        for depth in range(1, level + 1):
            if self.enumNodes[depth] is None:
                parent = self.enumNodes[depth - 1]
                self.enumNodes[depth] = parent.newChild(None, "ul", "")
        # an item on this level closes all deeper lists
        for depth in range(level + 1, len(self.enumNodes)):
            self.enumNodes[depth] = None
        self.enumNodes[level].newTextChild(None, "li", toUTF8(text))

    def appendWikiPage(self, pagename):
        """Parse the wiki file pagename and append it as <page> element.

        The file is read from self.path.  An empty line ends a paragraph,
        lines of the form "<spaces>* text" become list items, all other
        lines are collected as paragraph (or header) text.
        """
        filename = self.path + '/' + pagename
        print("wiki-file: " + filename)
        # open input file, read it, and close it
        wikifile = open(filename, "r")
        try:
            text = wikifile.readlines()
        finally:
            wikifile.close()
        # create a page element and append it to the root node
        self.page = self.root.newTextChild(None, "page", None)
        # set the name of the page
        self.page.setProp("name", toUTF8(pagename))
        self.clearSection()
        # parse the wiki page
        for line in text:
            if line == "\n":
                # an empty line ends the current paragraph
                self.appendParagraph()
                continue
            level = self._listLevel(line)
            if level:
                # text collected so far precedes the list in the file,
                # so it has to precede it in the document as well
                if self.section:
                    self.appendParagraph()
                self._appendListItem(level, line[level + 1:].strip())
            else:
                self.section = self.section + line
                # ordinary text ends all open lists
                self.enumNodes = [None] * len(self.enumNodes)
        # make sure that we don't lose text at the end of the file
        if self.section:
            self.appendParagraph()

    def free(self):
        """Free the xml document (needed for the libxml2 C library)."""
        self.doc.freeDoc()
        
class PdfReport(XmlReport):
    """Convert the wiki dom built by XmlReport to a pdf file.

    The text is drawn line by line onto a reportlab canvas; self.x and
    self.y hold the position of the next line.
    """

    # lowest y position at which a line is still drawn; below it a new
    # pdf page is started
    pagebottom = 50

    def __init__(self, name):
        """Create the empty xml document and the empty pdf canvas.

        name -- name of the report; the pdf is written to <path><name>.pdf
        """
        XmlReport.__init__(self, name)
        self.filename = self.path + self.name + ".pdf"
        self.can = canvas.Canvas(self.filename)
        # the cursor
        self.x = pageleft
        self.y = pageheight - pagetop
        # indent level of lists
        self.indentLevel = 0

    def pdfPrint(self, text, size=10):
        """Draw one line of text at the cursor and move the cursor down.

        text -- the line to draw, already encoded for the pdf library
        size -- accepted for compatibility; currently not used
        """
        # for debugging
        print(text)
        # continue on a fresh pdf page instead of drawing below the page
        if self.y < self.pagebottom:
            self.can.showPage()
            self.y = pageheight - pagetop
        self.can.drawString(self.x, self.y, text)
        self.y -= 20

    def printHeader(self, headerNode):
        """Print a header node (h1..h6) as "H<level>: <text>"."""
        # the second character of the element name is the header level
        size = headerNode.name[1]
        text = toPdf(headerNode.content)
        self.pdfPrint("H" + str(size) + ": " + text)

    def printParagraph(self, paraNode, width=80):
        """Print a paragraph node, wrapped to lines of at most width chars.

        Inter-word space is reduced to one space character.
        Indentation is currently also lost.
        A single word longer than width is printed on a line of its own;
        an empty paragraph is printed as one empty line.
        """
        text = toPdf(paraNode.content)
        lines = []
        current = []
        for word in text.split():
            # the line is already long enough -> add it to the paragraph
            if current and len(' '.join(current + [word])) > width:
                lines.append(' '.join(current))
                current = []
            current.append(word)
        # last line of the paragraph
        lines.append(' '.join(current))
        # print the lines of the paragraph
        for line in lines:
            self.pdfPrint(line)

    def printList(self, listNode):
        """Print a list node; items are printed as "LI<level>: <text>".

        Nested lists are printed recursively with an increased level.
        """
        self.indentLevel += 1
        child = listNode.children
        while child is not None:
            if child.name[0] == "u":
                # nested list
                self.printList(child)
            elif child.name[0] == "l":
                # list element; other nodes (text nodes from xml
                # indentation) are skipped
                text = toPdf(child.content)
                self.pdfPrint("LI" + str(self.indentLevel) + ": " + text)
            child = child.next
        self.indentLevel -= 1

    def convertToPdf(self):
        """Convert the xml dom to pdf and write it to self.filename.

        The children of all <page> elements are printed in document
        order; a report without any page results in an empty pdf page.
        """
        page = self.root.children
        while page is not None:
            if page.name == "page":
                # loop through all children of the page element
                child = page.children
                while child is not None:
                    kind = child.name[0]
                    if kind == "h":
                        self.printHeader(child)
                    elif kind == "p":
                        self.printParagraph(child)
                    elif kind == "u":
                        self.printList(child)
                    child = child.next
            page = page.next
        # create pdf page
        self.can.showPage()
        # save pdf file
        self.can.save()

# Convert the wiki page "WikiSchulung" from the current directory to
# WikiSchulung.xml and WikiSchulung.pdf.  The guard keeps the conversion
# from running when this file is merely imported.
if __name__ == "__main__":
    report = PdfReport("WikiSchulung")
    try:
        report.appendWikiPage("WikiSchulung")
        report.saveAsXml()
        report.convertToPdf()
    finally:
        # release the libxml2 document even if a step above failed
        report.free()

# MoinMoin: PdfReport/FirstPrototyp (last edited 2007-10-29 19:21:15 by localhost)