## Functions to perform DB queries and process jobs from the job queue.
## `cursor` is a MySQLdb cursor object.

from TriesA5 import TriesA5
import MySQLdb
import wiki2text_v2 as w2t
from collections import defaultdict


def get_treejson(cursor, article_id):
    """ Returns the string with the JSON that represents the saved tree,
    or None if no tree is stored for the article. """
    query = "SELECT * FROM bm_triesJsons WHERE article_id=%s;"
    cursor.execute(query, (article_id,))
    jsons_rows = cursor.fetchall()
    if len(jsons_rows) == 0:
        return None
    return jsons_rows[0][2]


def save_covering(cursor, article_id, revision_id, timestamp, jsn):
    """ Saves the covering JSON of a revision. """
    query = "INSERT INTO bm_coveringsJsons VALUES (%s, %s, %s, %s);"
    cursor.execute(query, (article_id, revision_id, timestamp, jsn))


def save_treejson(cursor, article_id, revision_id, jsn):
    """ Saves (or updates) the tree JSON of an article. """
    query = ("INSERT INTO bm_triesJsons VALUES (%s, %s, %s) "
             "ON DUPLICATE KEY UPDATE article_id=%s, revision_id=%s, tree_json=%s;")
    cursor.execute(query, (article_id, revision_id, jsn,
                           article_id, revision_id, jsn))


def delete_job_from_queue(cursor, article_id, revision_id):
    """ Removes a processed job from the queue. """
    query = "DELETE FROM bm_jobQueue WHERE article_id=%s AND revision_id=%s;"
    cursor.execute(query, (article_id, revision_id))


def get_all_jobs(cursor):
    """ Returns a dictionary of all jobs:
    the key is article_id, the value is the list of revision ids
    sorted in ascending order.
    Returns None if the queue is empty.
    """
    cursor.execute("SELECT * FROM bm_jobQueue;")
    rows = cursor.fetchall()
    if len(rows) == 0:
        return None
    # there is at least one job in the queue
    d = defaultdict(list)
    for article_id, revision_id in rows:
        d[article_id].append(revision_id)
    # sorting revision ids in ascending order
    for article_id in d:
        d[article_id] = sorted(d[article_id])
    return d


def process_a_job(cursor, job, tree_json, N, K_REV, K_TIME):
    """ Given a job, the function reads the tree from the DB,
    updates it, calculates the covering,
    saves the tree and the covering to the DB,
    and deletes the job from the DB.
    A job is a row from the job queue DB table.
""" (article_id, revision_id) = job (timestamp, revision_text, user_id, user_name) = get_jobinfo(cursor, revision_id) a5 = TriesA5(N, K_REV, K_TIME) # if we are processing not the very first revison of the article # then load tree from json if tree_json != None: a5.load_from_json(tree_json) # adding new revision revision_text_tokens = w2t.rawtext2tokens(revision_text) a5.add_revision(revision_text_tokens, timestamp, user_id, user_name, wiki_revision_id=int(revision_id)) # save covering jsn = a5.dump_covering2json() save_covering(cursor, article_id, revision_id, timestamp, jsn) # save the tree save_treejson(cursor, article_id, revision_id, a5.dump2json()) # delete the job from the queue delete_job_from_queue(cursor, article_id, revision_id) def get_jobinfo(cursor, revision_id): """ function returns tuple (timestamp, text_id, user_id, user_name) """ query = "SELECT * FROM revision WHERE rev_id=%s;" cursor.execute(query % revision_id) rows = cursor.fetchall() if len(rows) == 0: return (None, None, None, None) row = rows[0] timestamp = row[6] text_id = row[2] user_id = row[5] user_name = row[6] # obtaining text itself query = "SELECT * FROM text WHERE old_id=%s;" cursor.execute(query % text_id) rows = cursor.fetchall() if len(rows) == 0: return (timestamp, None, user_id, user_name) text = rows[0][1] return (timestamp, text, user_id, user_name) def is_tree_uptodate(cursor, tree_json, article_id, revision_id): """ function returns true if tree stored in tree_json was previous revision of an article with article_id and revision_id """ if tree_json == None: return False last_inserted_rev_id = TriesA5.get_last_revision_id(tree_json) # getting id of previous revision rev_parent_id = get_previous_revision_id(cursor, revision_id) return rev_parent_id == last_inserted_rev_id def fetch_new_revisions(cursor, tree_json, article_id): """ function returns a list of id's of all revisions which have not been added to the tree """ if tree_json == None: last_inserted_rev_id = 0 else: last_inserted_rev_id = TriesA5.get_last_revision_id(tree_json) latest_revision_id = get_latest_revision_id(cursor, article_id) # starting from the latest revision iterate down # untill we get whether last inserted revision id or revision with id zero all_rev_id = [] current_rev_id = latest_revision_id while True: if current_rev_id == 0 or current_rev_id == last_inserted_rev_id: break all_rev_id.append(current_rev_id) current_rev_id = get_previous_revision_id(cursor, current_rev_id) return all_rev_id def get_latest_revision_id(cursor, article_id): """ function returns id of the latest revision of given article """ query = "SELECT * FROM page WHERE page_id=%s;" cursor.execute(query % article_id) row = cursor.fetchone() if row == None: raise Exception("there isn't article with id %s" % article_id) latest_revision_id = row[9] return latest_revision_id def get_previous_revision_id(cursor, revision_id): """ function returns id of previous revision """ query = "SELECT * FROM revision WHERE rev_id=%s;" % revision_id cursor.execute(query) row = cursor.fetchone() if row == None: raise Exception("there isn't revision with id %s" % revision_id) rev_parent_id = row[10] return rev_parent_id def put_job_in_queue(cursor, article_id, revision_id): query = "INSERT INTO bm_jobQueue (article_id, revision_id) VALUES (%s, %s);" cursor.execute(query % (article_id, revision_id))