This script extracts external links from spammed pages and prints them as regular expressions that are ready to paste into BadContent.
1 """
2 Extract links from spam and return ready to paste regular expressions.
3 """
4
import re
import sys

try:
    import urlparse  # Python 2
except ImportError:
    # Python 3 moved the module to urllib.parse; keep the old name in scope.
    import urllib.parse as urlparse
8
# Matches http:// and https:// URLs, with the scheme in any letter case.
# The character class is generous: quotes, brackets and sentence punctuation
# that directly follow a link are captured as part of the match.
urlPattern = re.compile(
    r'\bhttps?://[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]+', re.I)
11
def extractPatterns(text, pattern=None):
    """
    Return the hosts of all links found in text as regular expressions.

    Each URL is reduced to its network location, a leading "www." is
    dropped, and every dot is escaped so that the result can be pasted
    into a list of regular expressions.

    text    -- the text to scan for links
    pattern -- compiled regular expression used to find the URLs; when
               omitted, the module level urlPattern is used

    Returns a list of unique patterns in order of first appearance.
    """
    if pattern is None:
        pattern = urlPattern

    patterns = []
    seen = set()
    for link in pattern.findall(text):
        # antispam cares only about the network location
        netloc = urlparse.urlparse(link)[1]
        # Ignore the www subdomain, but only as a real prefix: a plain
        # replace() would also mangle hosts such as "awww.example.org".
        if netloc.startswith('www.'):
            netloc = netloc[len('www.'):]
        # The URL regex is greedy, so a link followed by a quote, a closing
        # bracket or sentence punctuation drags those characters along.
        netloc = netloc.rstrip('.,:;!"\'()')
        # An empty pattern would match everything, so never emit one.
        if not netloc or netloc in seen:
            continue
        seen.add(netloc)
        patterns.append(netloc.replace('.', r'\.'))
    return patterns
22
23
def run():
    """
    Print the patterns extracted from the file named on the command line.

    The path is taken from the first command line argument; one pattern
    is printed per line. Exits with a usage message when no path is given.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: %s FILE' % sys.argv[0])
    # The with block closes the file even when reading fails.
    with open(sys.argv[1]) as spamFile:
        text = spamFile.read()
    patterns = extractPatterns(text)
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('\n'.join(patterns))
28
29
if __name__ == '__main__':
    # Executed as a script rather than imported: process the file given on
    # the command line.
    run()
32