import filter_wiki
import copy
import re
import gzip

# NOTE: this module targets Python 2 (lines read from the gzip dump are
# handled as ``str``).  ``print`` is always called with one parenthesised
# string so the output is identical to the original print statements.

# Single-pass equivalent of the original sequential junk substitutions:
# , . ! ? newline ; : " * == ''' - ''   (''' must be tried before '').
_JUNK_RE = re.compile(r"'''|''|==|[,.!?\n;:\"*\-]")

# Shortest ">...<" span on a line, i.e. the value of a one-line element.
_TAG_VALUE_RE = re.compile('>.*?<')

# Elements tracked on the parser state stack.
_STATE_TAGS = ('page', 'revision', 'contributor')

# NOTE(review): the tag literals used for <text> and <title> detection were
# empty strings in the reviewed source (they look like they were removed by
# an HTML-tag stripper), which made every check match every line.  The
# values below are reconstructed from the surviving comments and from the
# MediaWiki XML export layout -- confirm against the original file.
_SELF_CLOSED_TEXT_RE = re.compile(r'<text[^>]*/>')
_TEXT_OPEN = '<text'
_TEXT_CLOSE = '</text>'
_TITLE_OPEN = '<title>'


class Revision(object):
    """
    class that stores information about one revision of the wiki article

    revision_text is a list of raw text lines until tokenize() is called;
    afterwards it is a flat list of tokens.
    len_raw_revision_text is the length of the raw text as supplied by the
    code that builds the revision.
    """

    def __init__(self
                 , article_title
                 , article_id
                 , revision_id
                 , revision_timestamp
                 , contributor_username
                 , contributor_id
                 #, revision_comment
                 , revision_text
                 , len_raw_rev_text):
        self.article_title = article_title
        self.article_id = article_id
        self.revision_id = revision_id
        self.revision_timestamp = revision_timestamp
        self.contributor_username = contributor_username
        self.contributor_id = contributor_id
        #self.revision_comment = revision_comment
        self.revision_text = revision_text
        self.len_raw_revision_text = len_raw_rev_text

    def tokenize(self):
        """
        replace revision_text (a list of lines) with the concatenated
        tokens of every line, as produced by line2tokens
        """
        tokens = []
        for text_line in self.revision_text:
            tokens.extend(line2tokens(text_line))
        self.revision_text = tokens


class WikiArticleRaw(object):
    """
    class stores raw information about wikipedia article
    cleaning text and tokenization happens in method add_new_revision
    """

    # only this many of the most recent revisions are kept in self.revisions
    MAX_STORED_REVISIONS = 100

    def __init__(self, title, article_id):
        self.title = title
        self.article_id = article_id
        # list of revisions; each revision is object of class Revision
        self.revisions = []
        # incremental revision number (index of the last added revision)
        self.cr_number = -1
        # the last added revision, an object of class Revision
        self.cr = None

    def add_new_revision(self, new_rev):
        """
        add new revision; id of new revision should be greater
        than id of any stored revision

        The text of new_rev is filtered from wiki markup and tokenized in
        place, so afterwards self.cr is a Revision with tokenized text.
        Raises Exception when new_rev.revision_id is not greater than the
        id of the last stored revision.
        """
        if self.revisions and new_rev.revision_id <= self.revisions[-1].revision_id:
            last_id = self.revisions[-1].revision_id
            print("Error: revisions should be in ascending order")
            print("Error on revsion with id = %s" % new_rev.revision_id)
            print("self.revisions[-1].revision_id is %s" % last_id)
            print(" boolean expr %s" % (new_rev.revision_id <= last_id))
            raise Exception('revisions should be in ascending order')

        # filtering raw text from markup (in place, line by line) and tokenize
        new_rev.revision_text[:] = [filter_wiki.filter_wiki(raw_line)
                                    for raw_line in new_rev.revision_text]
        new_rev.tokenize()
        self.revisions.append(new_rev)

        # store only the last MAX_STORED_REVISIONS revisions
        if len(self.revisions) > self.MAX_STORED_REVISIONS:
            del self.revisions[0]

        self.cr = new_rev
        self.cr_number += 1


def _require_complete_revision(counter, line):
    """
    raise Exception unless all six metadata fields were collected

    counter is the number of fields parsed so far for the current revision:
    2 after the page title and id, 4 after the revision id and timestamp,
    6 after the contributor username and id (an <ip> counts for both).
    """
    if counter != 6:
        print(line)
        print("counter is %s" % counter)
        raise Exception('Error: counter should be 6')


def _parse_revisions(lines):
    """
    yield a Revision for every revision found in an iterable of dump lines

    The text of each revision is a list of raw lines; no filtering or
    tokenization happens here.
    """
    state_stack = []
    fetching_text = False
    text = []
    # number of metadata fields collected so far, see
    # _require_complete_revision for the meaning of the values
    counter = 0
    # True once the id of the current page has been parsed
    page_parsed = False

    # -1 marks "not seen yet"
    article_title = -1
    article_id = -1
    revision_id = -1
    revision_timestamp = -1
    contributor_username = -1
    contributor_id = -1

    for line in lines:
        # previous page was closed: start counting from scratch
        if not state_stack and page_parsed:
            page_parsed = False
            counter = 0

        # lines that open or close page/revision/contributor carry no data
        if state_update(line, state_stack):
            continue

        # text element given as a single self-closed tag
        if _SELF_CLOSED_TEXT_RE.search(line):
            # deleted text: skip the revision, keep the page fields
            if 'deleted' in line:
                counter = 2
                continue
            # counter == 4 means there was no info about the contributor,
            # so this revision is skipped as well
            if counter == 4:
                counter = 2
                continue
            _require_complete_revision(counter, line)
            yield Revision(article_title, article_id, revision_id,
                           revision_timestamp, contributor_username,
                           contributor_id, [line], len(line))
            counter = 2
            # NOTE(review): added so that the self-closed tag is not also
            # treated as the start of a multi-line text below
            continue

        # NOTE(review): the start of text collection was missing from the
        # reviewed source; the next two checks are a reconstruction.
        if _TEXT_OPEN in line:
            fetching_text = True
            text = []
        if fetching_text:
            text.append(line)

        if _TEXT_CLOSE in line:
            fetching_text = False
            _require_complete_revision(counter, line)
            yield Revision(article_title, article_id, revision_id,
                           revision_timestamp, contributor_username,
                           contributor_id, text,
                           sum(len(text_line) for text_line in text))
            counter = 2

        current_state = state_stack[-1] if state_stack else None

        if current_state == 'page':
            # page state - save title and id
            if _TITLE_OPEN in line:
                article_title = get_value_between_tags(line)
                counter += 1
            if '<id>' in line:
                article_id = int(get_value_between_tags(line))
                counter += 1
                if counter != 2:
                    raise Exception('Error: counter should be 2')
                page_parsed = True
        elif current_state == 'revision':
            # revision state - save id, timestamp
            if '<id>' in line:
                revision_id = int(get_value_between_tags(line))
                counter += 1
            if '<timestamp>' in line:
                revision_timestamp = get_value_between_tags(line)
                counter += 1
        elif current_state == 'contributor':
            # contributor state - save username and id
            if '<username>' in line:
                contributor_username = get_value_between_tags(line)
                counter += 1
            if '<id>' in line:
                # kept as a string, unlike the page and revision ids
                contributor_id = get_value_between_tags(line)
                counter += 1
            if '<ip>' in line:
                # anonymous edit: the ip serves as both id and username
                contributor_id = get_value_between_tags(line)
                contributor_username = get_value_between_tags(line)
                counter += 2


def get_raw_revisions(filename):
    """
    method yields object of type Revision
    filename is a name of xml.gz dump file
    text represented as list of lines of raw wikitext

    The dump file is closed when the generator is exhausted or closed.
    """
    dump = gzip.open(filename, 'rb')
    try:
        for revision in _parse_revisions(dump):
            yield revision
    finally:
        dump.close()


def get_value_between_tags(line):
    """
    find value between tags in the line
    no nested tags implied

    Returns the text between the first '>' and the next '<'.
    Raises AttributeError when the line has no such span.
    """
    m = _TAG_VALUE_RE.search(line)
    return m.group(0)[1:-1]


def state_update(line, state_stack):
    """
    push or pop state_stack when line opens or closes a tracked element

    Returns True when the line contained an opening or closing tag of
    page, revision or contributor (checked in that order), else False.
    Raises Exception when a closing tag does not match the top of the
    stack, and IndexError when the stack is empty.
    """
    for tag in _STATE_TAGS:
        if '<%s>' % tag in line:
            state_stack.append(tag)
            return True
        if '</%s>' % tag in line:
            if state_stack.pop() != tag:
                raise Exception("Error: state should be '%s'" % tag)
            return True
    return False


def line2tokens(line):
    """
    replace all junk characters with spaces, lowercase the line and
    split it on whitespace; returns the list of tokens
    """
    return _JUNK_RE.sub(' ', line).lower().split()


def rawtext2tokens(text):
    """
    text is a string
    returns the tokens of text after filtering it from wiki markup
    """
    return line2tokens(filter_wiki.filter_wiki(text))


def yield_next_revision(input_gz):
    """
    yields an article, each time with new revision
    article is an object of class WikiArticleRaw

    Revisions whose article title starts with 'User', 'Talk' or 'File'
    are skipped.  A new WikiArticleRaw is started whenever the title
    differs from the title of the previously processed revision.
    """
    bad_title_prefixes = ('User', 'Talk', 'File')
    title_prev = '-1title=1'
    art = None

    for rev in get_raw_revisions(input_gz):
        if rev.article_title.startswith(bad_title_prefixes):
            continue

        if title_prev != rev.article_title:
            # initializing new article
            art = WikiArticleRaw(rev.article_title, rev.article_id)
            print("%s adding revisions ..." % art.title)

        art.add_new_revision(rev)
        print('%s; revision %s was added' % (art.title, art.cr_number))
        title_prev = rev.article_title
        yield art


if __name__ == '__main__':
    filename = 'wiki-00002029.xml.gz'
    # the original called get_raw_articles, which is not defined in this
    # file; yield_next_revision is the generator of articles
    articles = yield_next_revision(filename)
    for art in articles:
        print('')