Details

Applies to: moin export dump
Purpose: Reduce the time it takes to do a dump.
Description: The patch reads the edit-log and restricts the list of pages to dump to those that have changed since the last dump.
Patch

   1 From f0b065d977ba5cdf4202b8de6df19cebf2f3ea82 Mon Sep 17 00:00:00 2001
   2 From: Paul Wise <pabs3@bonedaddy.net>
   3 Date: Wed, 1 Jan 2014 19:30:20 +0800
   4 Subject: [PATCH] Implement an incremental dump process.
   5 
   6 This also fixes dumping of the attachments.
   7 
   8 This also allows the dump script to be interrupted.
   9 ---
  10  MoinMoin/action/AttachFile.py  |   9 ++-
  11  MoinMoin/script/export/dump.py | 172 ++++++++++++++++++++++++++++++++++-------
  12  2 files changed, 149 insertions(+), 32 deletions(-)
  13 
  14 diff --git a/MoinMoin/action/AttachFile.py b/MoinMoin/action/AttachFile.py
  15 index 9081c3a..0bfc865 100644
  16 --- a/MoinMoin/action/AttachFile.py
  17 +++ b/MoinMoin/action/AttachFile.py
  18 @@ -310,7 +310,7 @@ def _access_file(pagename, request):
  19      return (pagename, None, None)
  20  
  21  
  22 -def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filterfn=None):
  23 +def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filterfn=None, downloadonly=False):
  24      _ = request.getText
  25      fmt = request.html_formatter
  26  
  27 @@ -372,9 +372,10 @@ def _build_filelist(request, pagename, showheader, readonly, mime_type='*', filt
  28                           fmt.text(label_get) +
  29                           fmt.url(0))
  30  
  31 -            links.append(fmt.url(1, getAttachUrl(pagename, file, request, do='view')) +
  32 -                         fmt.text(label_view) +
  33 -                         fmt.url(0))
  34 +            if may_read and not downloadonly:
  35 +                links.append(fmt.url(1, getAttachUrl(pagename, file, request, do='view')) +
  36 +                             fmt.text(label_view) +
  37 +                             fmt.url(0))
  38  
  39              if may_write and not readonly:
  40                  edit_url = getAttachUrl(pagename, file, request, do='modify')
  41 diff --git a/MoinMoin/script/export/dump.py b/MoinMoin/script/export/dump.py
  42 index d770e5b..b7b72e6 100644
  43 --- a/MoinMoin/script/export/dump.py
  44 +++ b/MoinMoin/script/export/dump.py
  45 @@ -3,7 +3,8 @@
  46  MoinMoin - Dump a MoinMoin wiki to static pages
  47  
  48  @copyright: 2002-2004 Juergen Hermann <jh@web.de>,
  49 -            2005-2006 MoinMoin:ThomasWaldmann
  50 +            2005-2006 MoinMoin:ThomasWaldmann,
  51 +            2013-2014 Paul Wise <pabs3@bonedaddy.net>
  52  @license: GNU GPL, see COPYING for details.
  53  """
  54  
  55 @@ -12,11 +13,17 @@ import sys, os, time, codecs, shutil, re, errno
  56  from MoinMoin import config, wikiutil, Page, user
  57  from MoinMoin import script
  58  from MoinMoin.action import AttachFile
  59 +from MoinMoin.logfile import editlog, LogMissing
  60  
  61  url_prefix_static = "."
  62  logo_html = '<img src="logo.png">'
  63  HTML_SUFFIX = ".html"
  64  
  65 +timestamp_text = u"""This is a MoinMoin timestamp file.
  66 +Please delete it to rebuild all pages.
  67 +This page dump was last created at:
  68 +%s
  69 +"""
  70  page_template = u'''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
  71  <html>
  72  <head>
  73 @@ -60,14 +67,19 @@ td.noborder {
  74  <div id="page">
  75  %(pagehtml)s
  76  </div>
  77 +<div id="attachments">
  78 +%(attachments_html)s
  79 +</div>
  80  <hr>
  81  %(timestamp)s
  82  </body>
  83  </html>
  84  '''
  85  
  86 +def _attachment_fn(outputdir, pagename, filename=''):
  87 +    return os.path.join(outputdir, "attachments", wikiutil.quoteWikinameFS(pagename), filename.encode(config.charset))
  88  
  89 -def _attachment(request, pagename, filename, outputdir, **kw):
  90 +def _attachment(request, pagename, filename, outputdir, copy=False, **kw):
  91      filename = filename.encode(config.charset)
  92      source_dir = AttachFile.getAttachDir(request, pagename)
  93      source_file = os.path.join(source_dir, filename)
  94 @@ -75,20 +87,25 @@ def _attachment(request, pagename, filename, outputdir, **kw):
  95      dest_file = os.path.join(dest_dir, filename)
  96      dest_url = "attachments/%s/%s" % (wikiutil.quoteWikinameFS(pagename), wikiutil.url_quote(filename))
  97      if os.access(source_file, os.R_OK):
  98 -        if not os.access(dest_dir, os.F_OK):
  99 -            try:
 100 -                os.makedirs(dest_dir)
 101 -            except:
 102 -                script.fatal("Cannot create attachment directory '%s'" % dest_dir)
 103 -        elif not os.path.isdir(dest_dir):
 104 -            script.fatal("'%s' is not a directory" % dest_dir)
 105 -
 106 -        shutil.copyfile(source_file, dest_file)
 107 -        script.log('Writing "%s"...' % dest_url)
 108 +        if copy:
 109 +            if not os.access(dest_dir, os.F_OK):
 110 +                try:
 111 +                    os.makedirs(dest_dir)
 112 +                except OSError, err:
 113 +                    if err.errno != errno.EEXIST:
 114 +                        script.fatal("Cannot create attachment directory '%s'" % dest_dir)
 115 +            elif not os.path.isdir(dest_dir):
 116 +                script.fatal("'%s' is not a directory" % dest_dir)
 117 +
 118 +            script.log('Writing "%s" attachment "%s"...' % (pagename, filename))
 119 +            shutil.copyfile(source_file, dest_file)
 120          return dest_url
 121      else:
 122          return ""
 123  
 124 +def fatal_hook(filename, fatal, msgtext):
 125 +    os.remove(filename)
 126 +    fatal(msgtext)
 127  
 128  class PluginScript(script.MoinScript):
 129      """\
 130 @@ -157,16 +174,54 @@ General syntax: moin [options] export dump [dump-options]
 131          # use this user for permissions checks
 132          request.user = user.User(request, name=self.options.dump_user)
 133  
 134 -        pages = request.rootpage.getPageList(user='') # get list of all pages in wiki
 135 -        pages.sort()
 136 -        if self.options.page: # did user request a particular page or group of pages?
 137 -            try:
 138 -                namematch = re.compile(self.options.page)
 139 -                pages = [page for page in pages if namematch.match(page)]
 140 -                if not pages:
 141 -                    pages = [self.options.page]
 142 -            except:
 143 -                pages = [self.options.page]
 144 +        pages = request.rootpage.getPageList(user='', exists=0) # get list of all pages in wiki
 145 +
 146 +        # Check the last update timestamp
 147 +        timestamp_file = os.path.join(outputdir, 'moin-last-update')
 148 +        try:
 149 +            with open(timestamp_file) as f:
 150 +                timestamp_value = long(f.read().splitlines()[-1])
 151 +        except IOError, err:
 152 +            timestamp_value = 0
 153 +            if err.errno != errno.ENOENT:
 154 +                script.fatal("Cannot check last update time of '%s' (%s)!" % (timestamp_file, str(err)))
 155 +
 156 +        # Create a new timestamp to use if successful
 157 +        log = editlog.EditLog(request)
 158 +        try: new_timestamp_value = log.date()
 159 +        except LogMissing: new_timestamp_value = 0
 160 +        new_timestamp_file = timestamp_file + '.new'
 161 +        with open(new_timestamp_file, 'w') as f:
 162 +            f.write(timestamp_text % new_timestamp_value)
 163 +
 164 +        # Fatal errors should delete the new timestamp file
 165 +        script_fatal = script.fatal
 166 +        script.fatal = lambda msgtext: fatal_hook(new_timestamp_file, script_fatal, msgtext)
 167 +
 168 +        # Get a list of pages that need actions
 169 +        attachments = dict()
 170 +        if timestamp_value:
 171 +            pages = set()
 172 +            for line in log:
 173 +                if line.ed_time_usecs <= timestamp_value:
 174 +                    continue
 175 +                elif line.action in ('ATTNEW', 'ATTDEL'):
 176 +                    if line.pagename not in attachments:
 177 +                        attachments[line.pagename] = {}
 178 +                    attachments[line.pagename][line.extra] = line.action
 179 +                elif line.action == 'SAVE/RENAME':
 180 +                    attachment_from = _attachment_fn(outputdir, line.extra)
 181 +                    attachment_to = _attachment_fn(outputdir, line.pagename)
 182 +                    try:
 183 +                        os.rename(attachment_from, attachment_to)
 184 +                    except OSError, err:
 185 +                        if err.errno != errno.ENOENT:
 186 +                            script.fatal('Cannot move attachments from "%s" to "%s" (%s)!' % (line.extra, line.pagename, str(err)))
 187 +                    else:
 188 +                        script.log('Moving attachments from "%s" to "%s"' % (line.extra, line.pagename))
 189 +                    pages.add(line.extra)
 190 +                pages.add(line.pagename)
 191 +            pages = list(pages)
 192  
 193          wikiutil.quoteWikinameURL = lambda pagename, qfn=wikiutil.quoteWikinameFS: (qfn(pagename) + HTML_SUFFIX)
 194  
 195 @@ -184,34 +239,90 @@ General syntax: moin [options] export dump [dump-options]
 196          for p in [page_front_page, page_title_index, page_word_index]:
 197              navibar_html += '[<a href="%s">%s</a>]&nbsp;' % (wikiutil.quoteWikinameURL(p), wikiutil.escape(p))
 198  
 199 +        # Re-render the title and word indicies if anything changed
 200 +        if new_timestamp_value > timestamp_value:
 201 +            pages = list(set(pages+[page_title_index, page_word_index]))
 202 +
 203 +        if self.options.page: # did user request a particular page or group of pages?
 204 +            try:
 205 +                namematch = re.compile(self.options.page)
 206 +                pages = [page for page in pages if namematch.match(page)]
 207 +                if not pages:
 208 +                    pages = [self.options.page]
 209 +            except:
 210 +                pages = [self.options.page]
 211 +
 212 +        # Render the pages in alphabetical order
 213 +        pages.sort()
 214 +
 215          urlbase = request.url # save wiki base url
 216          for pagename in pages:
 217 +            # Process attachments for this page
 218 +            copy_attachments = []
 219 +            delete_attachments = []
 220 +            if pagename in attachments:
 221 +                for filename, action in attachments[pagename].items():
 222 +                    if action == 'ATTNEW':
 223 +                        copy_attachments.append(filename)
 224 +                    elif action == 'ATTDEL':
 225 +                        delete_attachments.append(filename)
 226 +            elif not timestamp_value:
 227 +                copy_attachments = AttachFile._get_files(request, pagename)
 228 +            for filename in copy_attachments:
 229 +                _attachment(request, pagename, filename, outputdir, copy=True)
 230 +            for filename in delete_attachments:
 231 +                try:
 232 +                    os.remove(_attachment_fn(outputdir, pagename, filename))
 233 +                except OSError, err:
 234 +                    if err.errno != errno.ENOENT:
 235 +                        script.fatal('Cannot remove "%s" attachment "%s" (%s)!' % (pagename, filename, str(err)))
 236 +                else:
 237 +                    script.log('Removed "%s" attachment "%s"...' % (pagename, filename))
 238 +
 239              # we have the same name in URL and FS
 240              file = wikiutil.quoteWikinameURL(pagename)
 241 -            script.log('Writing "%s"...' % file)
 242 +            filepath = os.path.join(outputdir, file)
 243 +            exists = os.path.exists(filepath)
 244 +            request.url = urlbase + pagename # add current pagename to url base
 245 +            page = Page.Page(request, pagename)
 246 +            missing = not page.exists()
 247 +            unreadable = not request.user.may.read(pagename)
 248 +            if missing or unreadable:
 249 +                try:
 250 +                    os.remove(filepath)
 251 +                except OSError, err:
 252 +                    if err.errno != errno.ENOENT:
 253 +                        script.fatal("Cannot remove '%s' (%s)!" % (file, str(err)))
 254 +                else:
 255 +                    script.log('Removed "%s"...' % pagename)
 256 +                continue
 257              try:
 258 +                script.log('Writing "%s"...' % pagename)
 259                  pagehtml = ''
 260 -                request.url = urlbase + pagename # add current pagename to url base
 261 -                page = Page.Page(request, pagename)
 262                  request.page = page
 263                  try:
 264                      request.reset()
 265                      pagehtml = request.redirectedOutput(page.send_page, count_hit=0, content_only=1)
 266 -                except:
 267 +                    attachments_html = AttachFile._build_filelist(request, pagename, 0, 1, downloadonly=True)
 268 +                    if attachments_html: attachments_html = '<h2>Attached Files</h2>' + attachments_html
 269 +                except Exception:
 270                      errcnt = errcnt + 1
 271                      print >> sys.stderr, "*** Caught exception while writing page!"
 272                      print >> errlog, "~" * 78
 273 -                    print >> errlog, file # page filename
 274 +                    print >> errlog, pagename
 275                      import traceback
 276                      traceback.print_exc(None, errlog)
 277 +                except:
 278 +                    os.remove(new_timestamp_file)
 279 +                    raise
 280              finally:
 281                  timestamp = time.strftime("%Y-%m-%d %H:%M")
 282 -                filepath = os.path.join(outputdir, file)
 283                  fileout = codecs.open(filepath, 'w', config.charset)
 284                  fileout.write(page_template % {
 285                      'charset': config.charset,
 286                      'pagename': pagename,
 287                      'pagehtml': pagehtml,
 288 +                    'attachments_html': attachments_html,
 289                      'logo_html': logo_html,
 290                      'navibar_html': navibar_html,
 291                      'timestamp': timestamp,
 292 @@ -231,4 +342,9 @@ General syntax: moin [options] export dump [dump-options]
 293          errlog.close()
 294          if errcnt:
 295              print >> sys.stderr, "*** %d error(s) occurred, see '%s'!" % (errcnt, errfile)
 296 +            os.remove(new_timestamp_file)
 297 +        else:
 298 +            os.rename(new_timestamp_file, timestamp_file)
 299  
 300 +        # Restore the script.fatal hook
 301 +        script.fatal = script_fatal
 302 -- 
 303 1.8.5.2
0001-Implement-an-incremental-dump-process-9.patch
Discussion

The exception handler at the bottom of the newly added code looks like it covers far too wide a scope of code.
fixed in patch 2
I guess you want to use a set. Otherwise the same item can appear multiple times in your lists.
Fixed in patch 6, which is much more comprehensive - deletes pages and attachments too.
Patch 7 is even more comprehensive: it copes with wiki configuration changes that make pages inaccessible or accessible.
Patch 8 fixes attachment dumping, improves the output and allows the script to be interrupted.
Patch 9 handles timestamps and fatal errors better, and also ensures that the WordIndex and TitleIndex pages are up-to-date.
Plan

Priority: low
Assigned to:
Status:
CategoryMoinMoinPatch
MoinMoin: MoinMoinPatch/IncrementalDump (last edited 2014-01-11 01:38:59 by d175-38-163-227)