Attachment 'oo2txt.py'
Download 1 #!/usr/bin/python
2 #
3 # Simple OpenOffice (OO) Writer to text converter: it just strips all markup, which makes the output usable for search engines.
4 # This is an example with only minimal error handling.
5 #
6 # Copyright (c) 2004 by Oliver Graf <ograf@bitart.de>
7 #
8
import re
import sys
import xml.parsers.expat
import zipfile

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3
12
try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


class OOparser:
    """Reduce an XML document to its character data.

    All markup events are discarded.  Character data is collected, and
    entity references that expat passes through unexpanded (for example
    ``&nbsp;``) are replaced by the matching character when their name is
    known.

    entities -- optional mapping of entity name to Unicode code point; it is
                merged over the HTML entity table (name2codepoint).
    """

    # Expat events that carry no text and are therefore dropped.
    _IGNORED_HANDLERS = (
        'StartElementHandler',
        'EndElementHandler',
        'XmlDeclHandler',
        'StartDoctypeDeclHandler',
        'EndDoctypeDeclHandler',
        'ElementDeclHandler',
        'AttlistDeclHandler',
        'ProcessingInstructionHandler',
        'UnparsedEntityDeclHandler',
        'EntityDeclHandler',
        'NotationDeclHandler',
        'StartNamespaceDeclHandler',
        'EndNamespaceDeclHandler',
        'CommentHandler',
        'StartCdataSectionHandler',
        'EndCdataSectionHandler',
    )

    def __init__(self, entities=None):
        self.entities = {}
        self.entities.update(name2codepoint)
        if entities is not None:
            self.entities.update(entities)

    def reset(self):
        """Discard collected text and return a freshly configured parser.

        The new expat parser is also stored in ``self.parser``.
        """
        self.data = u''
        self.parser = p = xml.parsers.expat.ParserCreate()
        p.CharacterDataHandler = self._char_data
        p.DefaultHandler = self._default_data
        for handler_name in self._IGNORED_HANDLERS:
            setattr(p, handler_name, self._ignore)
        p.ExternalEntityRefHandler = self._skip_external_entity
        return p

    def parse(self, data):
        """Parse the complete document *data* and return its text."""
        p = self.reset()
        p.Parse(data, 1)
        return self.data

    def _char_data(self, data):
        self.data += data

    def _default_data(self, data):
        # Receives whatever no other handler claimed, including entity
        # references expat did not expand.  Known names are translated to
        # their character; everything else is kept verbatim.
        # startswith/endswith also cope with an empty string.
        if data.startswith('&') and data.endswith(';'):
            name = data[1:-1]
            if name in self.entities:
                self.data += _unichr(self.entities[name])
                return
        self.data += data

    def _ignore(self, *args, **kwargs):
        pass

    def _skip_external_entity(self, *args, **kwargs):
        # pyexpat expects an integer from this handler and aborts the parse
        # on 0 (None is not a valid result either).  A non-zero value means
        # "continue", which drops the external entity as intended.
        return 1
63
def oo2txt(filename):
    """Print the words of an OpenOffice document, one per line.

    Reads ``content.xml`` from the zip archive *filename*, strips the markup
    with OOparser and writes each word to standard output encoded as
    ISO-8859-1.  Characters that cannot be represented in ISO-8859-1 are
    written as ``?`` rather than aborting with a UnicodeEncodeError.

    Errors from zipfile (unreadable archive, missing ``content.xml``) and
    from the XML parser are not caught here.
    """
    archive = zipfile.ZipFile(filename, 'r')
    try:
        content = archive.read('content.xml')
    finally:
        # Release the file handle even when the member cannot be read.
        archive.close()

    text = OOparser().parse(content)
    # Match on the decoded text, not on encoded bytes: with re.UNICODE
    # accented letters count as word characters, so words are not cut apart,
    # and findall never yields empty "words" at the text boundaries.
    words = re.findall(r'\w+', text, re.UNICODE)
    output = (u'\n'.join(words) + u'\n').encode('ISO-8859-1', 'replace')
    # Python 3 text streams reject bytes; their .buffer accepts them as-is.
    # Python 2 file objects have no .buffer and take the bytes directly.
    getattr(sys.stdout, 'buffer', sys.stdout).write(output)
70
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the document to convert.
    arguments = sys.argv[1:]
    if len(arguments) == 1:
        oo2txt(arguments[0])
    else:
        sys.stderr.write('Usage: %s FILENAME\n' % (sys.argv[0]))
        sys.exit(1)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.