Details
- Applies to: moin export dump
- Purpose: Reduce the time it takes to do a dump.
- Description: The patch looks at the edit-log and restricts the list of pages to dump to those changed since the last dump (a simplified sketch of the idea follows below).
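The idea can be illustrated without any MoinMoin internals. The following is a minimal, hypothetical sketch (the file name last-dump and the (timestamp, pagename) pairs are assumptions for illustration; the actual patch reads MoinMoin's edit-log and a moin-last-update file): read the timestamp left by the previous dump, then keep only the pages whose edit-log entries are newer than it.

def read_last_dump(timestamp_file):
    """Return the timestamp of the previous dump, or 0 if there was none."""
    try:
        with open(timestamp_file) as f:
            # the timestamp is stored on the last line of the file
            return int(f.read().splitlines()[-1])
    except (IOError, OSError, ValueError, IndexError):
        return 0

def pages_to_dump(edit_log_entries, last_dump):
    """Collect the pages touched since the last dump.

    edit_log_entries is an iterable of (timestamp, pagename) pairs,
    a stand-in for MoinMoin's edit-log lines.
    """
    changed = set()  # a set avoids duplicate page names
    for timestamp, pagename in edit_log_entries:
        if timestamp > last_dump:
            changed.add(pagename)
    return sorted(changed)

if __name__ == '__main__':
    log = [(100, 'FrontPage'), (250, 'HelpContents'), (300, 'FrontPage')]
    # with no last-dump file present this falls back to 0 and dumps everything
    print(pages_to_dump(log, read_last_dump('last-dump')))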
Patch
From f0b065d977ba5cdf4202b8de6df19cebf2f3ea82 Mon Sep 17 00:00:00 2001
From: Paul Wise <pabs3@bonedaddy.net>
Date: Wed, 1 Jan 2014 19:30:20 +0800
Subject: [PATCH] Implement an incremental dump process.

This also fixes dumping of the attachments.

This also allows the dump script to be interrupted.
---
 MoinMoin/action/AttachFile.py  |   9 ++-
 MoinMoin/script/export/dump.py | 172 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 149 insertions(+), 32 deletions(-)

diff --git a/MoinMoin/action/AttachFile.py b/MoinMoin/action/AttachFile.py
index 9081c3a..0bfc865 100644
--- a/MoinMoin/action/AttachFile.py
+++ b/MoinMoin/action/AttachFile.py
@@ -310,7 +310,7 @@ def _access_file(pagename, request):
return (pagename, None, None)


-def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filterfn=None):
+def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filterfn=None, downloadonly=False):
_ = request.getText
fmt = request.html_formatter

@@ -372,9 +372,10 @@ def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filt
fmt.text(label_get) +
fmt.url(0))

- links.append(fmt.url(1, getAttachUrl(pagename, file, request, do='view')) +
- fmt.text(label_view) +
- fmt.url(0))
+ if may_read and not downloadonly:
+ links.append(fmt.url(1, getAttachUrl(pagename, file, request, do='view')) +
+ fmt.text(label_view) +
+ fmt.url(0))

if may_write and not readonly:
edit_url = getAttachUrl(pagename, file, request, do='modify')
diff --git a/MoinMoin/script/export/dump.py b/MoinMoin/script/export/dump.py
index d770e5b..b7b72e6 100644
--- a/MoinMoin/script/export/dump.py
+++ b/MoinMoin/script/export/dump.py
@@ -3,7 +3,8 @@
MoinMoin - Dump a MoinMoin wiki to static pages

@copyright: 2002-2004 Juergen Hermann <jh@web.de>,
- 2005-2006 MoinMoin:ThomasWaldmann
+ 2005-2006 MoinMoin:ThomasWaldmann,
+ 2013-2014 Paul Wise <pabs3@bonedaddy.net>
@license: GNU GPL, see COPYING for details.
"""

@@ -12,11 +13,17 @@ import sys, os, time, codecs, shutil, re, errno
from MoinMoin import config, wikiutil, Page, user
from MoinMoin import script
from MoinMoin.action import AttachFile
+from MoinMoin.logfile import editlog, LogMissing

url_prefix_static = "."
logo_html = '<img src="logo.png">'
HTML_SUFFIX = ".html"

+timestamp_text = u"""This is a MoinMoin timestamp file.
+Please delete it to rebuild all pages.
+This page dump was last created at:
+%s
+"""
page_template = u'''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
@@ -60,14 +67,19 @@ td.noborder {
<div id="page">
%(pagehtml)s
</div>
+<div id="attachments">
+%(attachments_html)s
+</div>
<hr>
%(timestamp)s
</body>
</html>
'''

+def _attachment_fn(outputdir, pagename, filename=''):
+ return os.path.join(outputdir, "attachments", wikiutil.quoteWikinameFS(pagename), filename.encode(config.charset))

-def _attachment(request, pagename, filename, outputdir, **kw):
+def _attachment(request, pagename, filename, outputdir, copy=False, **kw):
filename = filename.encode(config.charset)
source_dir = AttachFile.getAttachDir(request, pagename)
source_file = os.path.join(source_dir, filename)
@@ -75,20 +87,25 @@ def _attachment(request, pagename, filename, outputdir, **kw):
dest_file = os.path.join(dest_dir, filename)
dest_url = "attachments/%s/%s" % (wikiutil.quoteWikinameFS(pagename), wikiutil.url_quote(filename))
if os.access(source_file, os.R_OK):
- if not os.access(dest_dir, os.F_OK):
- try:
- os.makedirs(dest_dir)
- except:
- script.fatal("Cannot create attachment directory '%s'" % dest_dir)
- elif not os.path.isdir(dest_dir):
- script.fatal("'%s' is not a directory" % dest_dir)
-
- shutil.copyfile(source_file, dest_file)
- script.log('Writing "%s"...' % dest_url)
+ if copy:
+ if not os.access(dest_dir, os.F_OK):
+ try:
+ os.makedirs(dest_dir)
+ except OSError, err:
+ if err.errno != errno.EEXIST:
+ script.fatal("Cannot create attachment directory '%s'" % dest_dir)
+ elif not os.path.isdir(dest_dir):
+ script.fatal("'%s' is not a directory" % dest_dir)
+
+ script.log('Writing "%s" attachment "%s"...' % (pagename, filename))
+ shutil.copyfile(source_file, dest_file)
return dest_url
else:
return ""

+def fatal_hook(filename, fatal, msgtext):
+ os.remove(filename)
+ fatal(msgtext)

class PluginScript(script.MoinScript):
"""\
@@ -157,16 +174,54 @@ General syntax: moin [options] export dump [dump-options]
# use this user for permissions checks
request.user = user.User(request, name=self.options.dump_user)

- pages = request.rootpage.getPageList(user='') # get list of all pages in wiki
- pages.sort()
- if self.options.page: # did user request a particular page or group of pages?
- try:
- namematch = re.compile(self.options.page)
- pages = [page for page in pages if namematch.match(page)]
- if not pages:
- pages = [self.options.page]
- except:
- pages = [self.options.page]
+ pages = request.rootpage.getPageList(user='', exists=0) # get list of all pages in wiki
+
+ # Check the last update timestamp
+ timestamp_file = os.path.join(outputdir, 'moin-last-update')
+ try:
+ with open(timestamp_file) as f:
+ timestamp_value = long(f.read().splitlines()[-1])
+ except IOError, err:
+ timestamp_value = 0
+ if err.errno != errno.ENOENT:
+ script.fatal("Cannot check last update time of '%s' (%s)!" % (timestamp_file, str(err)))
+
+ # Create a new timestamp to use if successful
+ log = editlog.EditLog(request)
+ try: new_timestamp_value = log.date()
+ except LogMissing: new_timestamp_value = 0
+ new_timestamp_file = timestamp_file + '.new'
+ with open(new_timestamp_file, 'w') as f:
+ f.write(timestamp_text % new_timestamp_value)
+
+ # Fatal errors should delete the new timestamp file
+ script_fatal = script.fatal
+ script.fatal = lambda msgtext: fatal_hook(new_timestamp_file, script_fatal, msgtext)
+
+ # Get a list of pages that need actions
+ attachments = dict()
+ if timestamp_value:
+ pages = set()
+ for line in log:
+ if line.ed_time_usecs <= timestamp_value:
+ continue
+ elif line.action in ('ATTNEW', 'ATTDEL'):
+ if line.pagename not in attachments:
+ attachments[line.pagename] = {}
+ attachments[line.pagename][line.extra] = line.action
+ elif line.action == 'SAVE/RENAME':
+ attachment_from = _attachment_fn(outputdir, line.extra)
+ attachment_to = _attachment_fn(outputdir, line.pagename)
+ try:
+ os.rename(attachment_from, attachment_to)
+ except OSError, err:
+ if err.errno != errno.ENOENT:
+ script.fatal('Cannot move attachments from "%s" to "%s" (%s)!' % (line.extra, line.pagename, str(err)))
+ else:
+ script.log('Moving attachments from "%s" to "%s"' % (line.extra, line.pagename))
+ pages.add(line.extra)
+ pages.add(line.pagename)
+ pages = list(pages)

wikiutil.quoteWikinameURL = lambda pagename, qfn=wikiutil.quoteWikinameFS: (qfn(pagename) + HTML_SUFFIX)

@@ -184,34 +239,90 @@ General syntax: moin [options] export dump [dump-options]
for p in [page_front_page, page_title_index, page_word_index]:
navibar_html += '[<a href="%s">%s</a>] ' % (wikiutil.quoteWikinameURL(p), wikiutil.escape(p))

+ # Re-render the title and word indicies if anything changed
+ if new_timestamp_value > timestamp_value:
+ pages = list(set(pages+[page_title_index, page_word_index]))
+
+ if self.options.page: # did user request a particular page or group of pages?
+ try:
+ namematch = re.compile(self.options.page)
+ pages = [page for page in pages if namematch.match(page)]
+ if not pages:
+ pages = [self.options.page]
+ except:
+ pages = [self.options.page]
+
+ # Render the pages in alphabetical order
+ pages.sort()
+
urlbase = request.url # save wiki base url
for pagename in pages:
+ # Process attachments for this page
+ copy_attachments = []
+ delete_attachments = []
+ if pagename in attachments:
+ for filename, action in attachments[pagename].items():
+ if action == 'ATTNEW':
+ copy_attachments.append(filename)
+ elif action == 'ATTDEL':
+ delete_attachments.append(filename)
+ elif not timestamp_value:
+ copy_attachments = AttachFile._get_files(request, pagename)
+ for filename in copy_attachments:
+ _attachment(request, pagename, filename, outputdir, copy=True)
+ for filename in delete_attachments:
+ try:
+ os.remove(_attachment_fn(outputdir, pagename, filename))
+ except OSError, err:
+ if err.errno != errno.ENOENT:
+ script.fatal('Cannot remove "%s" attachment "%s" (%s)!' % (pagename, filename, str(err)))
+ else:
+ script.log('Removed "%s" attachment "%s"...' % (pagename, filename))
+
# we have the same name in URL and FS
file = wikiutil.quoteWikinameURL(pagename)
- script.log('Writing "%s"...' % file)
+ filepath = os.path.join(outputdir, file)
+ exists = os.path.exists(filepath)
+ request.url = urlbase + pagename # add current pagename to url base
+ page = Page.Page(request, pagename)
+ missing = not page.exists()
+ unreadable = not request.user.may.read(pagename)
+ if missing or unreadable:
+ try:
+ os.remove(filepath)
+ except OSError, err:
+ if err.errno != errno.ENOENT:
+ script.fatal("Cannot remove '%s' (%s)!" % (file, str(err)))
+ else:
+ script.log('Removed "%s"...' % pagename)
+ continue
try:
+ script.log('Writing "%s"...' % pagename)
pagehtml = ''
- request.url = urlbase + pagename # add current pagename to url base
- page = Page.Page(request, pagename)
request.page = page
try:
request.reset()
pagehtml = request.redirectedOutput(page.send_page, count_hit=0, content_only=1)
- except:
+ attachments_html = AttachFile._build_filelist(request, pagename, 0, 1, downloadonly=True)
+ if attachments_html: attachments_html = '<h2>Attached Files</h2>' + attachments_html
+ except Exception:
errcnt = errcnt + 1
print >> sys.stderr, "*** Caught exception while writing page!"
print >> errlog, "~" * 78
- print >> errlog, file # page filename
+ print >> errlog, pagename
import traceback
traceback.print_exc(None, errlog)
+ except:
+ os.remove(new_timestamp_file)
+ raise
finally:
timestamp = time.strftime("%Y-%m-%d %H:%M")
- filepath = os.path.join(outputdir, file)
fileout = codecs.open(filepath, 'w', config.charset)
fileout.write(page_template % {
'charset': config.charset,
'pagename': pagename,
'pagehtml': pagehtml,
+ 'attachments_html': attachments_html,
'logo_html': logo_html,
'navibar_html': navibar_html,
'timestamp': timestamp,
@@ -231,4 +342,9 @@ General syntax: moin [options] export dump [dump-options]
errlog.close()
if errcnt:
print >> sys.stderr, "*** %d error(s) occurred, see '%s'!" % (errcnt, errfile)
+ os.remove(new_timestamp_file)
+ else:
+ os.rename(new_timestamp_file, timestamp_file)

+ # Restore the script.fatal hook
+ script.fatal = script_fatal
--
1.8.5.2
0001-Implement-an-incremental-dump-process-9.patch
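For readers who do not want to trace the diff, the timestamp handling in patch 9 boils down to a write-then-rename pattern: a ".new" marker is written before the run, removed on failure or interruption, and renamed over the real timestamp file only on success. Below is a standalone, hypothetical sketch of that shape (the file name last-update and the do_work callback are assumptions for illustration, not the actual MoinMoin script):

import os

def run_incremental_job(output_dir, do_work, new_value):
    """Write-then-rename timestamp handling.

    An interrupted or failed run leaves the old timestamp in place,
    so the next run redoes the affected work instead of skipping it.
    """
    timestamp_file = os.path.join(output_dir, 'last-update')  # assumed name
    new_file = timestamp_file + '.new'
    with open(new_file, 'w') as f:
        f.write('%d\n' % new_value)
    try:
        error_count = do_work()
    except BaseException:
        os.remove(new_file)  # interrupted or crashed: keep the old timestamp
        raise
    if error_count:
        os.remove(new_file)  # some pages failed: keep the old timestamp
    else:
        os.rename(new_file, timestamp_file)  # success: commit the new timestamp

if __name__ == '__main__':
    # toy run: the job succeeds, so last-update is (re)written
    run_incremental_job('.', lambda: 0, 1234567890)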
Discussion
The exception handler at the bottom of the newly added code looks like it has far too wide a scope.
Fixed in patch 2.
I guess you want to use a set. Otherwise you can end up with duplicate items in your lists.
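(Toy illustration, not MoinMoin code: a page that shows up in several edit-log lines ends up only once in a set, but repeatedly in a list.)

edits = ['FrontPage', 'HelpContents', 'FrontPage']

as_list = []
for page in edits:
    as_list.append(page)   # keeps the duplicate 'FrontPage'

as_set = set()
for page in edits:
    as_set.add(page)       # 'FrontPage' is stored only once

print(sorted(as_set))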
Fixed in patch 6, which is much more comprehensive: it also deletes pages and attachments.
Patch 7 is even more comprehensive; it copes with wiki configuration changes that make pages inaccessible or accessible.
Patch 8 fixes attachment dumping, improves the output, and allows the script to be interrupted.
Patch 9 handles timestamps and fatal errors better, and also ensures the WordIndex and TitleIndex pages are kept up-to-date.
Plan
- Priority: low
- Assigned to:
- Status: