"""Pre-compiled regular expressions for parsing squid log lines and URLs.

Every pattern written as a triple-quoted string is compiled with
``re.VERBOSE``: whitespace inside the pattern is ignored and ``#`` starts a
comment that runs to the end of the line.

NOTE(review): the named-group identifiers in this module were reconstructed.
The ``urlparts`` names follow the ``urlsplit`` component names; all others are
derived from the field comments next to each group.  Confirm them against the
code that calls ``match.group(...)`` / ``groupdict()`` before relying on them.
"""
import re

# Regex based on http://wikitech.wikimedia.org/view/Squid_log_format
# One squid log line: whitespace-separated fields, in the order listed below.
squidline = re.compile(
    r"""
        (?P<squid>[\S]+)                  # Name of the squid server
        \s[-]*
        (?P<sequence>[0-9]+)              # Sequence ID from the squid server
        \s
        (?P<timestamp>[0-9-]+T[0-9:.]+)   # Timestamp
        \s
        (?P<servicetime>[0-9.]+)          # Request service time
        \s
        (?P<client>[\S]+)                 # Client IP address
        \s
        (?P<squidstatus>[\S]+)            # Squid request status and HTTP status code
        \s
        (?P<reply>[0-9]+)                 # Reply size including HTTP headers
        \s
        (?P<request>[\S]+)                # Request type
        \s
        (?P<url>[\S]+)                    # Request URL
        \s
        (?P<squidhierarchy>[\S]+)         # Squid hierarchy status, peer IP
        \s
        (?P<mime>[\S]+)                   # MIME content type
        \s
        (?P<referer>[\S]+)                # Referer header
        \s
        (?P<xff>[\S]+)                    # X-Forwarded-For header
        \s
        (?P<useragent>[\S\s]+)            # User-Agent header
        \s
        (?P<acceptlanguage>[\S\s]+)       # Accept-Language header
        \s
        (?P<xcarrier>[\S\s]+)             # X-carrier header
    """,
    re.VERBOSE,
)

# Based on urlparse.urlsplit which is really slow but only does:
#   <scheme>://<netloc>/<path>?<query>#<fragment>
# This regex does not replicate all functionality, just optimizes
# even further for our purposes
urlparts = re.compile(
    r"""
        (?P<scheme>http|https)
        ://
        (?P<netloc>(?:(?!/|\?|\#)\S)*)    # everything up to the first / ? or #
        /?
        (?P<path>(?:(?!\?|\#)\S)*)        # everything up to the first ? or #
        \??
        (?P<query>(?:(?!\#)\S)*)          # everything up to the first #
        \#?
        (?P<fragment>[\S]*)
    """,
    re.VERBOSE,
)

# Splits a querystring into its constituent key/value pairs.
# Group 1 is the key (up to "="), group 2 the value (up to "&").
queryparts = re.compile(
    r"""
        ((?:(?!=)\S)*)
        =
        ((?:(?!&)\S)*)
        &?
    """,
    re.VERBOSE,
)

# some quick regexes to pare down the parse list and not try our best to
# parse everything
landingpages_ignore = [
    re.compile(
        r"""
            # ignore calls to Special:LandingCheck. They 302 and then result in a proper call to the landing page
            # also ignore Special:ContributionTracking and the MediaWiki namespace
            (http|https)
            ://wikimediafoundation.org/
            (
                wiki/
                | w/index.php\?title=
            )
            (
                Special:Landingcheck
                | Special:ContributionTracking
                | Special:RecentChanges
                | MediaWiki:
                | File:
                | Talk:
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
    re.compile(
        r"""
            # ignore calls to the api as well as the favicon and the MediaWiki namespace
            (http|https)
            ://wikimediafoundation.org/
            (
                upload/
                | w/
                    (
                        skins-
                        | api.php
                        | opensearch_desc.php
                        | index.php\?search=
                        | extensions/
                    )
                | favicon.ico
                | tracker/bannerImpression.php
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
    re.compile(
        r"""
            # ignore the ToU for now
            (http|https)
            ://wikimediafoundation.org/
            (
                wiki/
                | w/index.php\?title=
            )
            (
                Terms_of_Use
                | New_Terms_of_use
                | New%20Terms%20of%20use
                | Feedback_privacy_statement
                | Home
                | Main_Page
                | Donate/Benefactor
                | Donate/Stories
                | Donate/Thank_You
                | Donate/Transparency
                | SOPA/Blackoutpage
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
]

# Patterns that recognise landing-page URLs.  Each exposes the host as the
# "sitename" group; the first two also expose the page title as "landingpage".
landingpages = [
    re.compile(
        r"""
            # match all of the landing page patterns on wmfwiki
            (http|https)
            ://
            (?P<sitename>wikimediafoundation.org)/
            (wiki/|w/index.php\?title=)
            (?P<landingpage>
                (
                    L11                  # landing page naming scheme for 2011
                    | L12                # landing page naming scheme for 2012
                    | L2011              # potential landing page naming scheme for 2011
                    | L2012              # potential landing page naming scheme for 2012
                    | WMF                # eg WMFJA085
                    | Donate             # old forms, keeping so that we can possibly redirect them all
                    | Support_Wikipedia  # old forms
                    | Test_120511        # Test from 2012-05-11
                )
                (?:(?!\?|&)[\S])*        # this will give us the landing page up to the next ? or &
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
    re.compile(
        r"""
            # match all of the landing page patterns on donate.wikimedia.org
            (http|https)
            ://
            (?P<sitename>donate.wikimedia.org)/
            (wiki/|w/index.php\?title=)
            (?P<landingpage>
                (
                    L11                  # landing page naming scheme for 2011
                    | L12                # landing page naming scheme for 2012
                    | L2011              # potential landing page naming scheme for 2011
                    | L2012              # potential landing page naming scheme for 2012
                    | WMF                # eg WMFJA085
                )
                (?:(?!\?|&)[\S])*        # this will give us the landing page up to the next ? or &
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
    re.compile(
        r"""
            (http|https)
            ://
            (?P<sitename>donate.wikimedia.org)/
            (
                wiki/
                | w/index.php\?title=
            )
            (
                Special:FundraiserLandingPage
                # | Special:FundraiserRedirector  # these 302 and should result in a valid call to S:FLP
            )
        """,
        re.VERBOSE | re.IGNORECASE,
    ),
]

# User-agent patterns to ignore.
ignore_uas = [
    re.compile(r"""frontend_tester/p14"""),
    re.compile(r"""frontend_tester/p14_1"""),
    re.compile(r"""/home/mwaler/frontend_tester/p14"""),
    re.compile(r"""\./p12"""),
    re.compile(r"""\./p13"""),
    re.compile(r"""\./p14"""),
    re.compile(r"""\./p15"""),
    re.compile(r"""/usr/local/frontend_tester/p12"""),
    re.compile(r"""/usr/local/frontend_tester/p14"""),
    re.compile(r"""^bot""", re.IGNORECASE),
]

# Case-insensitive match for the literal text "phantomJS".
phantomJS = re.compile(r"phantomJS", re.IGNORECASE)

# Matches "bannerImpressions-sampled<digits>" (verbose mode drops the
# whitespace between the pieces) and captures the digits.
# NOTE(review): presumably the digits are a sampling rate - confirm.
sampled = re.compile(
    r"""
        bannerImpressions-
        sampled
        (?P<samplerate>[0-9]+)
    """,
    re.VERBOSE | re.IGNORECASE,
)