import filter_wiki
import copy
import re
import gzip
class Revision(object):
    """Plain record describing a single revision of a wiki article.

    Besides the identifying fields (article, revision, contributor) it
    carries the revision text and the length of the raw text as supplied
    by the caller.  ``revision_text`` starts out as whatever sequence the
    caller passed in and becomes a flat token list after ``tokenize()``.
    """

    def __init__(self, article_title, article_id, revision_id,
                 revision_timestamp, contributor_username, contributor_id,
                 revision_text, len_raw_rev_text):
        # article the revision belongs to
        self.article_title = article_title
        self.article_id = article_id
        # the revision itself
        self.revision_id = revision_id
        self.revision_timestamp = revision_timestamp
        # who made it
        self.contributor_username = contributor_username
        self.contributor_id = contributor_id
        # content and the length of the raw (unfiltered) content
        self.revision_text = revision_text
        self.len_raw_revision_text = len_raw_rev_text

    def tokenize(self):
        """Replace ``revision_text`` with the concatenated tokens of its items.

        Every item of the current ``revision_text`` is passed to
        ``line2tokens`` and the resulting token lists are joined, in order,
        into one flat list.
        """
        self.revision_text = [
            token
            for chunk in self.revision_text
            for token in line2tokens(chunk)
        ]
class WikiArticleRaw(object):
    """Holds one Wikipedia article together with its most recent revisions.

    Cleaning of the wiki markup and tokenization of a revision's text
    happen in ``add_new_revision``.

    Attributes:
        title, article_id: identity of the article.
        revisions: list of stored revision objects, oldest first; at most
            ``MAX_STORED_REVISIONS`` entries are kept.
        cr: the revision added most recently (``None`` before the first add).
        cr_number: zero-based running count of added revisions (``-1``
            before the first add); keeps growing even when old revisions
            are dropped from ``revisions``.
    """

    # Only this many of the newest revisions are kept in ``revisions``.
    MAX_STORED_REVISIONS = 100

    def __init__(self, title, article_id):
        self.title = title
        self.article_id = article_id
        self.revisions = []
        self.cr_number = -1
        self.cr = None

    def add_new_revision(self, new_rev):
        """Filter, tokenize and store ``new_rev`` as the current revision.

        ``new_rev.revision_id`` must be strictly greater than the id of the
        last stored revision.  Each item of ``new_rev.revision_text`` is
        replaced in place by ``filter_wiki.filter_wiki(item)`` and then
        ``new_rev.tokenize()`` is called, so ``self.cr`` ends up holding a
        revision with tokenized text.

        Raises:
            Exception: if the revision id is not greater than the id of the
                last stored revision (diagnostics are printed first).
        """
        if self.revisions:
            last_id = self.revisions[-1].revision_id
            out_of_order = new_rev.revision_id <= last_id
            if out_of_order:
                print("Error: revisions should be in ascending order")
                print("Error on revsion with id = %s" % new_rev.revision_id)
                print("self.revisions[-1].revision_id is %s" % last_id)
                print(" boolean expr %s" % out_of_order)
                raise Exception('revisions should be in ascending order')

        # Strip wiki markup item by item (in place), then tokenize.
        for idx, raw_item in enumerate(new_rev.revision_text):
            new_rev.revision_text[idx] = filter_wiki.filter_wiki(raw_item)
        new_rev.tokenize()

        self.revisions.append(new_rev)
        # Drop the oldest entry once the storage limit is exceeded.
        if len(self.revisions) > self.MAX_STORED_REVISIONS:
            del self.revisions[0]

        self.cr = new_rev
        self.cr_number += 1
def get_raw_revisions(filename):
"""Generator: read a gzip-compressed dump line by line and yield Revision objects.

filename is the name of the xml.gz dump file; it is opened with gzip.open.

Every yielded Revision is built from the most recently parsed article
title/id, revision id/timestamp and contributor username/id, plus the
revision text as a list of lines and the summed length of those lines.

Raises Exception when the field counter does not hold the expected value
(2 where the article header is checked, 6 right before a Revision is yielded).

NOTE(review): in this copy of the function the block indentation is missing
and the string literals passed to line.find() / re.findall() are empty.
Presumably they held the XML tag names (title, id, timestamp, username,
ip, text, ...) -- restore them from the original source and confirm before
relying on the behaviour described here.
"""
# NOTE(review): 'states' is not referenced again in this function.
states = ['page', 'revision', 'contributor']
# stack of the currently open states; it is passed to state_update() for every line
state_stack = []
# NOTE(review): only assigned (here and to False further down), never read.
fetching_text = False
# lines of a multi-line revision text
# NOTE(review): nothing visible in this function appends to this list.
text = []
# number of fields collected so far; compared against 2, 4 and 6 below
counter = 0 # for tracking missing information
# initial values used until the corresponding field has been parsed
article_title = -1
article_id = -1
revision_id = -1
revision_timestamp = -1
contributor_username = -1
contributor_id = -1
current_page = None
# NOTE(review): 'pages_all' is only used by commented-out code.
pages_all = []
# NOTE(review): the object returned by gzip.open is never closed explicitly.
for line in gzip.open(filename, 'rb'):
# no state is open and a page object exists: forget it and restart the counter
if len(state_stack) == 0 and current_page != None:
#pages_all.append(current_page)
current_page = None
counter = 0
# NOTE(review): the result of lower() is discarded, so 'line' is unchanged.
line.lower()
# lines that open or close a state carry no field value
if state_update(line, state_stack):
continue
# save text
# case when there is one line of text
# NOTE(review): empty pattern -- presumably a regex for a one-line text element; confirm.
if len(re.findall('', line)) != 0:
############ take care about next lines:#############
#
# 329879422
# 2009-12-05T15:11:55Z
#
#
#
#
# (the example above presumably showed a revision whose contributor and
# text are marked as deleted; its XML tags are not present in this copy)
if len(re.findall('deleted', line)) != 0:
counter = 2
continue
#####################################################
# if text tag is in one line like next
#
# and counter == 4 then there was no info about
# contributor, so we can skip this revision
if counter == 4:
counter = 2
continue
# all six fields must have been collected before a revision is emitted
if counter != 6:
print line
print "counter is %s"%counter
raise Exception('Error: counter should be 6')
len_raw_rev_text = len(line)
#current_page.add_new_revision(Revision(article_title,
# article_id,
# revision_id,
# revision_timestamp,
# contributor_username,
# contributor_id,
# [line],
# len_raw_rev_text))
# one-line text: the text is a single-element list holding this line
yield Revision(article_title, article_id, revision_id,
revision_timestamp, contributor_username,
contributor_id, [line], len_raw_rev_text)
# back to 2: presumably article title and id stay valid for the next revision
counter = 2
# NOTE(review): empty search string -- presumably the closing text tag; confirm.
if line.find('') != -1:
fetching_text = False
#print line
if counter != 6:
print line
print "counter is %s"%counter
raise Exception('Error: counter should be 6')
# raw length is the sum of the lengths of all collected text lines
len_raw_rev_text = 0
for x in text:
len_raw_rev_text += len(x)
#current_page.add_new_revision(Revision(article_title,
# article_id,
# revision_id,
# revision_timestamp,
# contributor_username,
# contributor_id,
# text,
# len_raw_rev_text))
#print len_raw_rev_text
#st = revision_timestamp
#print st
#print st[:4] + st[5:7] + st[8:10]
yield Revision(article_title, article_id, revision_id,
revision_timestamp, contributor_username,
contributor_id, text, len_raw_rev_text)
counter = 2
# page state - save title and id
if len(state_stack) > 0 and state_stack[-1] == 'page':
# NOTE(review): empty search string -- presumably the title tag; confirm.
if line.find('') != -1:
article_title = get_value_between_tags(line)
counter += 1
# NOTE(review): empty search string -- presumably the id tag; confirm.
if line.find('') != -1:
article_id = int(get_value_between_tags(line))
counter += 1
# title and id are expected to be the only fields counted at this point
if counter != 2:
raise Exception('Error: counter should be 2')
current_page = WikiArticleRaw(article_title, article_id)
# revision state - save id, timestamp
if len(state_stack) > 0 and state_stack[-1] == 'revision':
# NOTE(review): empty search string -- presumably the id tag; confirm.
if line.find('') != -1:
revision_id = int(get_value_between_tags(line))
#print 'revision_id is %s'%revision_id
counter += 1
# NOTE(review): empty search string -- presumably the timestamp tag; confirm.
if line.find('') != -1:
revision_timestamp = get_value_between_tags(line)
#print 'revision_timestamp is %s'%revision_timestamp
counter += 1
# contributor state - save username and id
if len(state_stack) > 0 and state_stack[-1] == 'contributor':
# NOTE(review): empty search string -- presumably the username tag; confirm.
if line.find('') != -1:
contributor_username = get_value_between_tags(line)
#print 'contributor username is %s'%contributor_username
counter += 1
# NOTE(review): empty search string -- presumably the id tag; confirm.
if line.find('') != -1:
contributor_id = get_value_between_tags(line)
#print 'contributor id is %s'%contributor_id
counter += 1
# NOTE(review): empty search string -- presumably the ip tag (anonymous edit); confirm.
# the same value is stored as both id and username, hence counter += 2
if line.find('') != -1:
contributor_id = get_value_between_tags(line)
contributor_username = get_value_between_tags(line)
#print "contributor ip is %s"%contributor_id
counter += 2
#return pages_all
def get_value_between_tags(line):
    """Return the text between the first ``>`` and the next ``<`` in ``line``.

    Intended for lines of the form ``<tag>value</tag>`` without nested
    tags; the value may be empty.  The ``>`` and ``<`` must be on the same
    line (the match does not span a newline).

    Raises:
        ValueError: if ``line`` contains no ``>...<`` span.  (Previously
            this case crashed with an ``AttributeError`` on ``None``.)
    """
    match = re.search(r'>(.*?)<', line)
    if match is None:
        raise ValueError('no tag-enclosed value found in line: %r' % (line,))
    return match.group(1)
def state_update(line, state_stack):
    """Track entering and leaving the page / revision / contributor elements.

    If ``line`` contains an opening tag (``<page>``, ``<revision>`` or
    ``<contributor>``) the matching state name is pushed onto
    ``state_stack``; if it contains the corresponding closing tag the top
    of the stack is popped and checked.  The states are tested in the
    order page, revision, contributor, opening tag before closing tag.

    The search strings were empty in the reviewed copy, which made the
    first test match every line (``str.find('')`` is always 0); the tag
    names are restored from the state names and error messages.
    NOTE(review): confirm the exact tag spelling against the dump format.

    Returns:
        True if the line opened or closed a state, False otherwise.

    Raises:
        Exception: if a closing tag does not match the state on top of the
            stack (the top element has already been popped at that point).
        IndexError: if a closing tag is seen while the stack is empty.
    """
    for state in ('page', 'revision', 'contributor'):
        if '<%s>' % state in line:
            state_stack.append(state)
            return True
        if '</%s>' % state in line:
            popped = state_stack.pop()
            if popped != state:
                raise Exception("Error: state should be '%s'" % state)
            return True
    return False
# Everything that is treated as a token separator: the wiki quote/heading
# markers (''' before '' so that the longer marker wins) and single
# punctuation characters, including the newline.
_JUNK_RE = re.compile('|'.join([
    r"'''",
    r"''",
    r'==',
    r'[,.!?;:"*\n-]',
]))


def line2tokens(line):
    """Split ``line`` into lower-case tokens.

    Each junk sequence (``, . ! ? ; : " * -``, newline, ``==``, ``'''``
    and ``''``) is replaced by a space, the result is lower-cased and
    split on whitespace.  A single apostrophe or a single ``=`` is not
    junk and stays part of its token.

    Returns:
        list of token strings (empty list for an empty or all-junk line).
    """
    return _JUNK_RE.sub(' ', line).lower().split()
def rawtext2tokens(text):
    """Turn the raw wiki string ``text`` into tokens.

    The string is first passed through ``filter_wiki.filter_wiki`` to
    strip the wiki markup; the filtered result is then handed to
    ``line2tokens`` and its return value is returned unchanged.
    """
    return line2tokens(filter_wiki.filter_wiki(text))
def yield_next_revision(input_gz):
    """Yield the current article once for every revision read from ``input_gz``.

    Revisions come from ``get_raw_revisions(input_gz)``.  Revisions whose
    article title starts with ``User``, ``Talk`` or ``File`` are skipped.
    Whenever the title differs from the title of the previously accepted
    revision a fresh ``WikiArticleRaw`` is created; the revision is then
    added to the current article and that article object is yielded, so
    the same object is yielded repeatedly with a growing revision history.
    Progress messages are printed to standard output.
    """
    skipped_prefixes = ('User', 'Talk', 'File')
    # sentinel that is not expected to equal any real article title
    previous_title = '-1title=1'
    for rev in get_raw_revisions(input_gz):
        if rev.article_title.startswith(skipped_prefixes):
            continue
        if rev.article_title != previous_title:
            # a new article begins (hook point for finishing the previous one)
            art = WikiArticleRaw(rev.article_title, rev.article_id)
            print("%s adding revisions ..." % art.title)
        art.add_new_revision(rev)
        print('%s; revision %s was added' % (art.title, art.cr_number))
        previous_title = rev.article_title
        yield art
if __name__ == '__main__':
    # Smoke run over a sample dump.  The original called get_raw_articles(),
    # which is not defined in this module; yield_next_revision() is the
    # generator here that yields article objects.
    filename = 'wiki-00002029.xml.gz'
    articles = yield_next_revision(filename)
    for art in articles:
        # blank line between the progress messages of successive revisions
        print('')