Attachment 'time_antispam.py'

   1 """
   2 Time antispam overhead
   3 
   4 Usage: python time_antispam.py get | current | compiled | compiled-links
   5 """
   6 
   7 import sys, time, re, urllib, codecs
   8 
   9 try:
  10     import cPickle as pickle
  11 except ImportError:
  12     import pickle
  13 
  14 charset = 'utf-8'
  15 
  16 
  17 # From current antispam ------------------------------------------------
  18 
  19 def makelist(text):
  20     """ Split text into lines, strip them, skip # comments """
  21     lines = text.splitlines()
  22     list = []
  23     for line in lines:
  24         line = line.split(' # ', 1)[0] # rest of line comment
  25         line = line.strip()
  26         if line and not line.startswith('#'):
  27             list.append(line)
  28     return list
  29 
def match(res, text):
    for scanner in res:
        match = re.search(scanner, text, re.I)

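# Note: like matchCompiled() below, this simulates the worst case where every
# pattern has to be tried.  re.search(pattern, text) compiles each pattern on
# first use and caches the compiled object inside the re module, so each call
# still pays one cache lookup plus one search per pattern (and a full
# compilation per pattern whenever the pattern is not in the cache).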

# New ------------------------------------------------------------------

def makeReList(text, size):
    """ Create a list of compiled regexes, each with at most 'size' patterns

    @param text: bad content text, unicode
    @param size: maximum number of patterns joined into each re
    """
    lines = text.splitlines()
    res = []
    count = 0
    groups = 0
    current = []
    for line in lines:
        # Strip comments after a pattern, skip empty and comment-only lines
        line = line.split(' #', 1)[0]
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Count groups. The count is not very strict, but should be fine for
        # url patterns. A stricter re could be used here.
        linegroups = line.count('(')

        if count + 1 <= size and groups + linegroups <= 100:
            # Add to current and update counts.
            current.append(line)
            groups += linegroups
            count += 1
        else:
            # Compile current and add to regex list
            current = re.compile('|'.join(current), re.I)
            res.append(current)

            # Reset counts, start to append to a new list
            current = [line]
            groups = linegroups
            count = 1

    # Add last group
    if current:
        current = re.compile('|'.join(current), re.I)
        res.append(current)

    return res

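# Note on the limits in makeReList(): 'size' caps how many patterns get joined
# into one alternation, and the running group count is capped at 100,
# presumably to stay below the 100-capturing-group limit that Python's re
# engine enforced per compiled pattern at the time.  For example,
# makeReList(u"foo\\.example\nbar\\.example", size=1) would yield two compiled
# regexes, one per pattern.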

def getlinks(text):
    """ Return text with one link per line

    Should speed up the search by using only a tiny fragment of the original text.
    """
    linkscanner = re.compile(r'\bhttps?://(\S+)\b', re.I)
    # findall returns only the captured group, i.e. the link without its scheme
    matches = linkscanner.findall(text)
    print 'found %d links' % len(matches)
    return '\n'.join(matches)


def matchCompiled(res, body):
    # Simulate the worst case, we have to try all res
    for scanner in res:
        match = scanner.search(body)


if __name__ == '__main__':
    try:
        verb = sys.argv[1]
    except IndexError:
        print __doc__
        sys.exit(1)

    if verb == 'get':
        url = 'http://moinmaster.wikiwikiweb.de:8000/%s?action=raw'

        badcontent = urllib.urlopen(url % "BadContent").read()
        file('badcontent', 'wb').write(badcontent)
        pagetext = urllib.urlopen(url % "MoinMoinQuestions").read()
        file('pagetext', 'wb').write(pagetext)

        # pickle res
        badcontent = unicode(badcontent, charset)
        res = makeReList(badcontent, size=100)
        pickle.dump(res, file('badcontent.pickle', 'wb'), pickle.HIGHEST_PROTOCOL)
        print 'Created compiled bad content pickle (%d items)' % len(res)

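    # Note: Python 2's re module registers compiled pattern objects with
    # copy_reg, so the pickle above essentially stores each pattern string
    # plus its flags and recompiles them when loaded; the pickle.load() calls
    # below therefore still pay the regex compilation cost.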
    elif verb == 'compiled':
        print 'testing new code, using a few big compiled re objects'
        # Time new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        res = pickle.load(file('badcontent.pickle'))
        matchCompiled(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        res = pickle.load(file('badcontent.pickle'))
        matchCompiled(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'compiled-links':
        print 'testing new code, using compiled re objects on page links only'
        # Time new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(file('badcontent.pickle'))
        matchCompiled(res, linktext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(file('badcontent.pickle'))
        matchCompiled(res, linktext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'current':
        print 'testing current code, using re.search'
        # Time current code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)
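
A quick offline sanity check of the new code path is possible without fetching
the wiki pages; the sketch below uses made-up sample patterns and page text,
and assumes the script is importable as time_antispam:

    from time_antispam import makeReList, getlinks, matchCompiled

    badcontent = u"cheap-pills\\.example\ncasino-bonus\\.example  # comment\n"
    pagetext = u"Spam link: http://cheap-pills.example/buy?now=1 end"

    res = makeReList(badcontent, size=100)
    links = getlinks(pagetext)
    for scanner in res:
        if scanner.search(links):
            print 'page text matches the bad content list'
            break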
