"""
Time antispam overhead

Usage: python time_antispam.py get | current | compiled | compiled-links
"""

import sys, time, re, urllib, codecs

try:
    import cPickle as pickle
except ImportError:
    import pickle

charset = 'utf-8'


# From current antispam ------------------------------------------------

def makelist(text):
    """ Split text into lines, strip them, skip # comments """
    lines = text.splitlines()
    list = []
    for line in lines:
        line = line.split(' # ', 1)[0] # rest of line comment
        line = line.strip()
        if line and not line.startswith('#'):
            list.append(line)
    return list
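
# For reference (hypothetical input, not a real BadContent entry):
#
#     makelist(u'badsite\\.example # seller\n# whole-line comment\n')
#     # -> [u'badsite\\.example']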

def match(res, text):
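    # Simulate the worst case: try every pattern, never short-circuit on
    # a hit. re caches compiled patterns internally, which is why the
    # second timed run is cheaper.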
    for scanner in res:
        match = re.search(scanner, text, re.I)


# New ------------------------------------------------------------------

def makeReList(text, size):
    """ Create a list of compiled regex objects, each combining at most
    size patterns

    @param text: bad content text, unicode
    @param size: maximum number of patterns in each regex
    """
    lines = text.splitlines()
    res = []
    count = 0
    groups = 0
    current = []
    for line in lines:
        # Strip comments after pattern, skip empty and comment lines (an
        # empty pattern in the joined regex would match everything)
        line = line.split(' #', 1)[0]
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Count groups. Counting '(' is not strict (it also counts
        # non-capturing and escaped parens, so it overestimates), but
        # that errs on the safe side for url patterns. Stay below
        # CPython's limit of about 100 groups per pattern.
        linegroups = line.count('(')

        if count + 1 <= size and groups + linegroups < 100:
            # Add to current and update counts.
            current.append(line)
            groups += linegroups
            count += 1
        else:
            # Compile current and add to regex list
            current = re.compile('|'.join(current), re.I)
            res.append(current)

            # Reset counts, start to append to new list
            current = [line]
            groups = linegroups
            count = 1

    # Add last group
    if current:
        current = re.compile('|'.join(current), re.I)
        res.append(current)

    return res
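
# A quick sketch of how makeReList chunks patterns; the pattern strings
# below are made-up examples, not entries from the real BadContent page:
#
#     res = makeReList(u'badsite\\.example # seller\n'
#                      u'spam-\\w+\\.example\n', size=100)
#     # -> [re.compile(u'badsite\\.example|spam-\\w+\\.example', re.I)]
#     assert res[0].search(u'http://spam-pills.example/')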


def getlinks(text):
    """ Return text with one link per line

    Should speedup the search, using tiny fragment of the original text.
    """
    linkscanner = re.compile(r'\bhttps?://(\S+)\b', re.I)
    matches = linkscanner.findall(text)
    print 'found %d links' % len(matches)
    return '\n'.join(matches)
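
# findall returns only the captured group, i.e. the part after the
# scheme. A hypothetical example:
#
#     getlinks(u'see http://spam.example/buy or https://ok.example/x')
#     # prints 'found 2 links' and returns
#     # u'spam.example/buy\nok.example/x'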
    
    
def matchCompiled(res, body):
    # Simulate the worst case, we have to try all res
    for scanner in res:
        match = scanner.search(body)


if __name__ == '__main__':
    try:
        verb = sys.argv[1]
    except IndexError:
        print __doc__
        sys.exit(1)

    if verb == 'get':
        url = 'http://moinmaster.wikiwikiweb.de:8000/%s?action=raw'

        badcontent = urllib.urlopen(url % "BadContent").read()
        open('badcontent', 'wb').write(badcontent)
        pagetext = urllib.urlopen(url % "MoinMoinQuestions").read()
        open('pagetext', 'wb').write(pagetext)
        
        # pickle res
        badcontent = unicode(badcontent, charset)
        res = makeReList(badcontent, size=100)
        pickle.dump(res, file('badcontent.pickle', 'wb'), pickle.HIGHEST_PROTOCOL)
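        # Note: as far as I know, CPython 2 pickles a compiled pattern as
        # its (pattern, flags) pair and re-compiles it on load, so the
        # pickle does not save the re.compile work itself, only the
        # download, parsing and chunking above.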
        print 'Created compiled bad content pickle (%d items)' % len(res)

    elif verb == 'compiled':
        print 'testing new code, using a few big compiled re objects'
        # Time new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        res = pickle.load(open('badcontent.pickle', 'rb'))
        matchCompiled(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        res = pickle.load(open('badcontent.pickle', 'rb'))
        matchCompiled(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'compiled-links':
        print 'testing new code, using compiled re objects on page links only'
        # Time new compiling code
        pagetext = codecs.open('pagetext', 'r', charset).read()

        # First request - load from pickle and match
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(open('badcontent.pickle', 'rb'))
        matchCompiled(res, linktext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        linktext = getlinks(pagetext)
        res = pickle.load(open('badcontent.pickle', 'rb'))
        matchCompiled(res, linktext)
        print 'second request on long running process: %.8f' % (time.time() - start)

    elif verb == 'current':
        print 'testing current code, using re.search'
        # Time current code
        pagetext = codecs.open('pagetext', 'r', charset).read()
        
        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'first request on long running process / cgi: %.8f' % (time.time() - start)

        # Second request (re cached by python)
        start = time.time()
        badcontent = codecs.open('badcontent', 'r', charset).read()
        res = makelist(badcontent)
        match(res, pagetext)
        print 'second request on long running process: %.8f' % (time.time() - start)