#!/usr/bin/python
#
# Simple OpenOffice Writer to plain-text converter. It just strips all markup,
# which makes the output usable for search engines.
# This is an example with only minimal error handling.
#
# Copyright (c) 2004 by Oliver Graf <ograf@bitart.de>
#

import sys, zipfile, re
import xml.parsers.expat
from htmlentitydefs import name2codepoint

class OOparser:
	"""Reduce an XML document to its plain text by discarding all markup.

	Character data is kept, named entity references that show up in the
	default handler are replaced by the character they stand for, and every
	other parser event is ignored.

	NOTE(review): element boundaries contribute no whitespace, so the text
	of adjacent elements is concatenated directly.
	"""

	# expat callbacks that are deliberately swallowed so that the
	# corresponding markup never reaches the default handler
	_IGNORED_HANDLERS=(
		'StartElementHandler',
		'EndElementHandler',
		'XmlDeclHandler',
		'StartDoctypeDeclHandler',
		'EndDoctypeDeclHandler',
		'ElementDeclHandler',
		'AttlistDeclHandler',
		'ProcessingInstructionHandler',
		'UnparsedEntityDeclHandler',
		'EntityDeclHandler',
		'NotationDeclHandler',
		'StartNamespaceDeclHandler',
		'EndNamespaceDeclHandler',
		'CommentHandler',
		'StartCdataSectionHandler',
		'EndCdataSectionHandler',
		'ExternalEntityRefHandler',
	)

	def __init__(self, entities=None):
		"""Set up the entity table.

		entities -- optional mapping of entity name (without '&' and ';')
		            to code point; it extends and overrides the HTML
		            entity table taken from name2codepoint.
		"""
		self.entities={}
		self.entities.update(name2codepoint)
		if entities is not None:
			self.entities.update(entities)

	def reset(self):
		"""Drop collected text, create a fresh expat parser and return it."""
		self.data=u''
		# text is gathered piecewise and joined once in parse(); repeated
		# "+=" on a growing string would copy it for every callback
		self._chunks=[]
		self.parser=p=xml.parsers.expat.ParserCreate()
		p.CharacterDataHandler=self._char_data
		p.DefaultHandler=self._default_data
		for handler_name in self._IGNORED_HANDLERS:
			setattr(p,handler_name,self._ignore)
		return p

	def parse(self, data):
		"""Parse the complete document `data` and return its text.

		The result is also stored in self.data. Parser errors propagate to
		the caller; the text collected up to that point is still stored in
		self.data.
		"""
		p=self.reset()
		try:
			p.Parse(data,1)
		finally:
			self.data=u''.join(self._chunks)
		return self.data

	def _char_data(self, data):
		"""Collect ordinary character data."""
		self._chunks.append(data)

	def _default_data(self, data):
		"""Collect data that no other handler claimed.

		Something of the form '&name;' with a known name is replaced by the
		character from the entity table; anything else is kept verbatim.
		"""
		# handle entities! everything inside should be unicode
		if data.startswith('&') and data.endswith(';'):
			name=data[1:-1]
			if name in self.entities:
				self._chunks.append(self._to_char(self.entities[name]))
				return
		self._chunks.append(data)

	def _to_char(self, codepoint):
		"""Return the unicode character for `codepoint`."""
		try:
			return unichr(codepoint)
		except NameError:
			# no unichr builtin available; chr() yields unicode there
			return chr(codepoint)

	def _ignore(self, *args, **kwargs):
		"""Accept any parser event and do nothing with it."""
		pass

def oo2txt(filename):
	"""Print every word of the document `filename`, one per line.

	`filename` must be a zip archive with a 'content.xml' member. The
	member is stripped of its markup by OOparser, the remaining text is
	split at runs of non-word characters, and the words are written to
	stdout encoded as ISO-8859-1. Characters that cannot be represented in
	that encoding are written as '?'. Errors from zipfile and from the XML
	parser propagate unchanged.
	"""
	archive=zipfile.ZipFile(filename,'r')
	try:
		content=archive.read('content.xml')
	finally:
		# release the file handle even when the member cannot be read
		archive.close()
	text=OOparser().parse(content)
	# Split the unicode text, not its encoded form: on a byte string \W
	# treats every non-ASCII letter as a separator and would cut words
	# like "Maedchen" written with an umlaut into pieces.
	splitter=re.compile(r'\W+',re.UNICODE)
	# re.split yields empty strings when the text starts or ends with a
	# separator; those are not words
	words=[word for word in splitter.split(text) if word]
	output=(u'\n'.join(words)+u'\n').encode('ISO-8859-1','replace')
	# write the encoded bytes as they are; use the binary layer of stdout
	# where the stream object provides one
	getattr(sys.stdout,'buffer',sys.stdout).write(output)

if __name__=='__main__':
	# command line entry point: exactly one argument, the document to convert
	if len(sys.argv)==2:
		oo2txt(sys.argv[1])
	else:
		sys.stderr.write('Usage: %s FILENAME\n'%(sys.argv[0]))
		sys.exit(1)
