#!/usr/bin/python
"""
For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data
Reads in a file which should contain the output of:
ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log
Parses it, sums up the values for all wikis, and prints the sums to stdout.
Example content:
ben@fenari:~/storageStats$ cat sample_output.txt
-----------------------------------------------------------------
aawiki
-----------------------------------------------------------------
aawiki: Using bin size of 100
aawiki: 0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M
aawiki:
aawiki: Flags Class Count old_id range
aawiki: ------------------------------------------------------------------------------------------------------------------------
aawiki: gzip [none] 4568 0 - 4700
aawiki: [none] [none] 1615 4600 - 6300
aawiki: utf-8,gzip [none] 1883 5300 - 8300
aawiki: external,utf-8 CGZ pointer 626 6200 - 10300
aawiki: external,utf-8 DHB pointer 368 9100 - 10300
aawiki: utf-8,gzip,external simple pointer 975 8200 - 10400
aawiki: external,utf8 DHB pointer 211 9400 - 10200
-----------------------------------------------------------------
aawikibooks
-----------------------------------------------------------------
aawikibooks: Using bin size of 100
aawikibooks: 0^M1000^M2000^M3000^M
aawikibooks:
aawikibooks: Flags Class Count old_id range
aawikibooks: ------------------------------------------------------------------------------------------------------------------------
aawikibooks: [none] [none] 881 0 - 1000
aawikibooks: external,utf-8 CGZ pointer 187 0 - 3400
aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
aawikibooks: object historyblobcurstub 898 900 - 1900
aawikibooks: utf-8,gzip [none] 900 1800 - 2900
aawikibooks: utf-8,gzip,external simple pointer 431 2800 - 3400
aawikibooks: external,utf8 DHB pointer 25 3300 - 3400
"""
import re
import optparse
##
## Sum the storage type counts found in a storageTypeStats.php log.
## The script takes exactly one positional argument: the file to parse.
## Written to run unchanged on Python 2.6+ and Python 3.
usage = "usage: %prog <filename>"
desc = """Sum the storage types across all wikis. The input file should
contain the output of:
foreachwiki maintenance/storage/storageTypeStats.php
"""

# a section starts with nothing on the line but the name of the wiki db
#aawikibooks
start_section = re.compile(r"^(?P<dbname>[a-z0-9_]+)$")
# a data row carries the flags, the class and the row count
#aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
counter = re.compile(
    r"^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*")


def parse_storage_stats(lines):
    """Sum the per-wiki counts for every flags/class combination.

    lines -- any iterable of text lines (an open file works).

    Returns a tuple (wiki_count, content_counters):
      wiki_count       -- number of section-start lines (wiki db names) seen
      content_counters -- dict mapping "flags/class" to the summed count

    Lines matching neither pattern (separators, headers, progress output)
    are ignored.
    """
    wiki_count = 0
    content_counters = dict()
    for line in lines:
        if start_section.match(line):
            # the count isn't actually used yet, but is in here for when we
            # want more interesting stats and collect per-db
            wiki_count += 1
        match = counter.match(line)
        if match:
            # sum all unique class,flags combinations
            key = "%s/%s" % (match.group('flags'), match.group('class'))
            content_counters[key] = (
                content_counters.get(key, 0) + int(match.group('count')))
    return wiki_count, content_counters


def main(argv=None):
    """Parse the command line, read the stats file and print the totals.

    argv -- argument list without the program name; None makes optparse
            fall back to sys.argv[1:].

    Exits with status 1 (SystemExit) when the number of positional
    arguments is not exactly one. An IOError while opening or reading the
    file is reported on stdout and then re-raised.
    """
    parser = optparse.OptionParser(usage=usage, description=desc)
    (_opts, args) = parser.parse_args(argv)
    if len(args) != 1:
        print("I can't do anything without a file to parse. Sorry!")
        parser.print_help()
        raise SystemExit(1)
    filename = args[0]

    try:
        # the with block guarantees the file is closed, even on error
        with open(filename, 'r') as stats_file:
            # ok, parse the file and collect stats!
            _wiki_count, content_counters = parse_storage_stats(stats_file)
    except IOError as e:
        print("omg io error %s!" % e)
        # bare raise keeps the original traceback
        raise

    print("Results:")
    print(" Count Type")
    print("------------------------------------------")
    for key in sorted(content_counters):
        print("%12d %s" % (content_counters[key], key))
    print("all done!")


if __name__ == "__main__":
    main()