Attachment 'time_antispam.py'
Download 1 """
2 Time antispam overhead
3
4 Usage: python time_antispam.py get | current | compiled | compiled-links
5 """
6
7 import sys, time, re, urllib, codecs
8
9 try:
10 import cPickle as pickle
11 except ImportError:
12 import pickle
13
14 charset = 'utf-8'
15
16
17 # From current antispam ------------------------------------------------
18
19 def makelist(text):
20 """ Split text into lines, strip them, skip # comments """
21 lines = text.splitlines()
22 list = []
23 for line in lines:
24 line = line.split(' # ', 1)[0] # rest of line comment
25 line = line.strip()
26 if line and not line.startswith('#'):
27 list.append(line)
28 return list
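# For example (illustrative), makelist(u'foo # comment\n# skipped\nbar\n')
# would return [u'foo', u'bar'].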

def match(res, text):
    # Worst case: nothing matches, so every pattern is tried against text.
    for scanner in res:
        match = re.search(scanner, text, re.I)


# New ------------------------------------------------------------------

def makeReList(text, size):
38 """ Create list of regex, each with max patterns
39
40 @param text: bad content text, unicode
41 @param size: how many patterns in each re
42 """
43 lines = text.splitlines()
44 res = []
45 count = 0
46 groups = 0
47 current = []
48 for line in lines:
49 # Strip comments after pattern, ignore comments lines
50 line = line.split(' #', 1)[0]
51 line = line.strip()
52 if line and line.startswith('#'):
53 continue
54
55 # Count groups. Groups is not very strict but should be fine for
56 # url patterens. We can use more strict re here.
57 linegroups = line.count('(')
58
59 if count + 1 <= size and groups + linegroups <= 100:
60 # Add to current and update counts.
61 current.append(line)
62 groups += linegroups
63 count += 1
64 else:
65 # Compile curent and add to regex list
66 current = re.compile('|'.join(current), re.I)
67 res.append(current)
68
69 # Reset counts, start to append to new list
70 current = [line]
71 groups = linegroups
72 count = 1
73
74 # Add last group
75 if current:
76 current = re.compile('|'.join(current), re.I)
77 res.append(current)
78
79 return res
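# For example (illustrative), makeReList(u'foo\nbar\nbaz', size=2) would
# return two compiled patterns, equivalent to re.compile(u'foo|bar', re.I)
# and re.compile(u'baz', re.I).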


def getlinks(text):
    """ Return text with one link per line

    Should speed up the search, as it uses only a tiny fragment of the
    original text.
    """
    linkscanner = re.compile(r'\bhttps?://(\S+)\b', re.I)
    matches = linkscanner.findall(text)
    print 'found %d links' % len(matches)
    return '\n'.join(matches)
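# Note that findall returns only the captured group here, so the scheme is
# stripped: getlinks(u'see http://example.com/spam here') would print
# 'found 1 links' and return u'example.com/spam'.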


def matchCompiled(res, body):
    # Simulate the worst case: nothing matches, so we have to try all res
    for scanner in res:
        match = scanner.search(body)


if __name__ == '__main__':
    try:
        verb = sys.argv[1]
    except IndexError:
        print __doc__
        sys.exit(1)

    if verb == 'get':
        url = 'http://moinmaster.wikiwikiweb.de:8000/%s?action=raw'

        badcontent = urllib.urlopen(url % "BadContent").read()
        file('badcontent', 'wb').write(badcontent)
        pagetext = urllib.urlopen(url % "MoinMoinQuestions").read()
        file('pagetext', 'wb').write(pagetext)

        # Pickle the compiled regex list
        badcontent = unicode(badcontent, charset)
        res = makeReList(badcontent, size=100)
        pickle.dump(res, file('badcontent.pickle', 'wb'), pickle.HIGHEST_PROTOCOL)
        print 'Created compiled bad content pickle (%d items)' % len(res)
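
        # Note: in Python 2 the re module registers a pickle reducer (via
        # copy_reg) that stores just (pattern, flags) and recompiles on
        # load; re's internal compile cache then memoizes the result, which
        # is what makes the "second request" timings below cheaper.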

    elif verb == 'compiled':
        print 'testing new code, using a few big compiled re objects'
        # Time the new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        res = pickle.load(file('badcontent.pickle', 'rb'))
        matchCompiled(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        res = pickle.load(file('badcontent.pickle', 'rb'))
        matchCompiled(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'compiled-links':
        print 'testing new code, using compiled re objects on page links only'
        # Time the new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(file('badcontent.pickle', 'rb'))
        matchCompiled(res, linktext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(file('badcontent.pickle', 'rb'))
        matchCompiled(res, linktext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'current':
        print 'testing current code, using re.search'
        # Time the current code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)
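# Typical flow: run "python time_antispam.py get" once to fetch the pages and
# build the pickle, then time each variant with the "current", "compiled" and
# "compiled-links" verbs.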